diff --git a/README.md b/README.md
index 203b8b5..f2baf60 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,8 @@ This project contains deep neural networks, computer vision and control code, ha
 The project's deep neural networks (DNNs) can be trained from scratch using publicly available data. A few [pre-trained DNNs](../blob/master/models/pretrained/) are also available as a part of this project. In case you want to train TrailNet DNN from scratch, follow the steps on [this page](./Models).
 
+The project also contains [Stereo DNN](../blob/master/stereoDNN/) models and runtime, which allow estimating depth from a stereo camera on NVIDIA platforms.
+
 ## References and Demos
 * [arXiv paper](https://arxiv.org/abs/1705.02550)
 * GTC 2017 talk: [slides](http://on-demand.gputechconf.com/gtc/2017/presentation/s7172-nikolai-smolyanskiy-autonomous-drone-navigation-with-deep-learning.pdf), [video](http://on-demand.gputechconf.com/gtc/2017/video/s7172-smolyanskiy-autonomous-drone-navigation-with-deep-learning%20(1).PNG.mp4)
@@ -14,8 +16,15 @@ The project's deep neural networks (DNNs) can be trained from scratch using publ
 * [Demo video showing generalization to ground vehicle control and other environments](https://www.youtube.com/watch?v=ZKF5N8xUxfw)
 
 # News
-* **2018-02-15**: added support for the TBS discovery platform.
-  * Step by step instructions on how to assemble the [TBS discovery drone](../../wiki/Skypad-TBS-Discovery-Setup).
+**GTC 2018**: if you will be at [GTC 2018](https://www.nvidia.com/en-us/gtc/) next week, you may be interested in attending our [Stereo DNN session](https://2018gputechconf.smarteventscloud.com/connect/sessionDetail.ww?SESSION_ID=152050). We'll be happy to chat about redtail in general and some of the interesting work that we've been doing.
+
+* **2018-03-22**: redtail 2.0.
+  * Added Stereo DNN models and inference library (TensorFlow/TensorRT). For more details, see the [README](../blob/master/stereoDNN/).
+  * Migrated to JetPack 3.2. This change brings the latest components, such as CUDA 9.0, cuDNN 7.0, TensorRT 3.0 and OpenCV 3.3, to the Jetson platform. Note that this is a breaking change.
+  * Added support for INT8 inference. This enables fast inference on devices that have a hardware implementation of INT8 instructions. More details are on [our wiki](../../wiki/ROS-Nodes#int8-inference).
+
+* **2018-02-15**: added support for the TBS Discovery platform.
+  * Step-by-step instructions on how to assemble the [TBS Discovery drone](../../wiki/Skypad-TBS-Discovery-Setup).
   * Instructions on how to attach and use a [ZED stereo camera](https://www.stereolabs.com/zed/).
   * Detailed instructions on how to calibrate, test and fly the drone.
 
diff --git a/ros/packages/caffe_ros/CMakeLists.txt b/ros/packages/caffe_ros/CMakeLists.txt
index 73c0e43..32c59d5 100644
--- a/ros/packages/caffe_ros/CMakeLists.txt
+++ b/ros/packages/caffe_ros/CMakeLists.txt
@@ -12,64 +12,13 @@ find_package(catkin REQUIRED COMPONENTS
 ## System dependencies are found with CMake's conventions
 #find_package(Boost REQUIRED COMPONENTS system)
 find_package(CUDA)
-# OpenCV 2.4.13 is default on Jetson. Might be required to install in simulation environment.
-find_package(OpenCV 2.4.13 REQUIRED)
-
-## Uncomment this if the package has a setup.py.
This macro ensures -## modules and global scripts declared therein get installed -## See http://ros.org/doc/api/catkin/html/user_guide/setup_dot_py.html -# catkin_python_setup() - -################################################ -## Declare ROS messages, services and actions ## -################################################ - -## To declare and build messages, services or actions from within this -## package, follow these steps: -## * Let MSG_DEP_SET be the set of packages whose message types you use in -## your messages/services/actions (e.g. std_msgs, actionlib_msgs, ...). -## * In the file package.xml: -## * add a build_depend tag for "message_generation" -## * add a build_depend and a run_depend tag for each package in MSG_DEP_SET -## * If MSG_DEP_SET isn't empty the following dependency has been pulled in -## but can be declared for certainty nonetheless: -## * add a run_depend tag for "message_runtime" -## * In this file (CMakeLists.txt): -## * add "message_generation" and every package in MSG_DEP_SET to -## find_package(catkin REQUIRED COMPONENTS ...) -## * add "message_runtime" and every package in MSG_DEP_SET to -## catkin_package(CATKIN_DEPENDS ...) -## * uncomment the add_*_files sections below as needed -## and list every .msg/.srv/.action file to be processed -## * uncomment the generate_messages entry below -## * add every package in MSG_DEP_SET to generate_messages(DEPENDENCIES ...) - -## Generate messages in the 'msg' folder -# add_message_files( -# FILES -# Message1.msg -# Message2.msg -# ) - -## Generate services in the 'srv' folder -# add_service_files( -# FILES -# Service1.srv -# Service2.srv -# ) - -## Generate actions in the 'action' folder -# add_action_files( -# FILES -# Action1.action -# Action2.action -# ) - -## Generate added messages and services with any dependencies listed here -# generate_messages( -# DEPENDENCIES -# std_msgs -# ) +# OpenCV 3.3.1 is default on Jetson starting JetPack 3.2. Might be required to install in simulation environment. +# PATHS is required as ROS Kinetic installs its own version of OpenCV 3.3.1 without CUDA support. +find_package(OpenCV 3.3.1 REQUIRED + CONFIG + PATHS /usr/local /usr + NO_DEFAULT_PATH +) ################################### ## catkin specific configuration ## @@ -95,8 +44,8 @@ set (CMAKE_CXX_STANDARD 14) set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") set( - CUDA_NVCC_FLAGS - ${CUDA_NVCC_FLAGS}; + CUDA_NVCC_FLAGS + ${CUDA_NVCC_FLAGS}; -O3 -gencode arch=compute_53,code=sm_53 ) @@ -108,11 +57,6 @@ include_directories( ${catkin_INCLUDE_DIRS} ) -## Declare a C++ library -# add_library(caffe_ros -# src/${PROJECT_NAME}/caffe_ros.cpp -# ) - ## Add cmake target dependencies of the library ## as an example, code may need to be generated before libraries ## either from message generation or dynamic reconfigure @@ -134,7 +78,6 @@ target_link_libraries(caffe_ros_node nvinfer opencv_core opencv_imgproc - opencv_gpu opencv_highgui ) diff --git a/ros/packages/caffe_ros/include/caffe_ros/int8_calibrator.h b/ros/packages/caffe_ros/include/caffe_ros/int8_calibrator.h new file mode 100644 index 0000000..dadeb9b --- /dev/null +++ b/ros/packages/caffe_ros/include/caffe_ros/int8_calibrator.h @@ -0,0 +1,40 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. 
+
+#ifndef CAFFE_ROS_INT8_CALIBRATOR_H
+#define CAFFE_ROS_INT8_CALIBRATOR_H
+
+#include <deque>
+#include <NvInfer.h>
+#include "internal_utils.h"
+
+namespace caffe_ros
+{
+class Int8EntropyCalibrator : public nvinfer1::IInt8EntropyCalibrator
+{
+public:
+    Int8EntropyCalibrator(ConstStr& src, ConstStr& calib_cache);
+    ~Int8EntropyCalibrator();
+
+    int  getBatchSize() const override { return 1; }
+    bool getBatch(void* bindings[], const char* names[], int nbBindings) override;
+
+    const void* readCalibrationCache(size_t& length) override;
+    void        writeCalibrationCache(const void* cache, size_t length) override;
+
+    void setInputDims(nvinfer1::DimsCHW dims);
+
+private:
+    std::string src_;
+    std::string calib_cache_;
+
+    std::deque<std::string> files_;
+
+    nvinfer1::DimsCHW dims_ = nvinfer1::DimsCHW(0, 0, 0);
+
+    float* img_d_ = nullptr;
+};
+
+}
+
+#endif
\ No newline at end of file
diff --git a/ros/packages/caffe_ros/include/caffe_ros/internal_utils.h b/ros/packages/caffe_ros/include/caffe_ros/internal_utils.h
new file mode 100644
index 0000000..96505d0
--- /dev/null
+++ b/ros/packages/caffe_ros/include/caffe_ros/internal_utils.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+// Full license terms provided in LICENSE.md file.
+
+#ifndef CAFFE_ROS_INTERNAL_UTILS_H
+#define CAFFE_ROS_INTERNAL_UTILS_H
+
+#include <opencv2/opencv.hpp>
+
+namespace caffe_ros
+{
+using ConstStr = const std::string;
+
+// Formats of the input layer. BGR is usually used by most of the frameworks that use OpenCV.
+enum class InputFormat
+{
+    BGR = 0,
+    RGB
+};
+
+// Performs image preprocessing (resizing, scaling, format conversion etc)
+// that is done before feeding the image into the network.
+cv::Mat preprocessImage(cv::Mat img, int dst_img_w, int dst_img_h, InputFormat inp_fmt, ConstStr& encoding,
+                        float inp_scale, float inp_shift);
+
+}
+
+#endif
diff --git a/ros/packages/caffe_ros/include/caffe_ros/tensor_net.h b/ros/packages/caffe_ros/include/caffe_ros/tensor_net.h
index f67cd68..441fcb7 100644
--- a/ros/packages/caffe_ros/include/caffe_ros/tensor_net.h
+++ b/ros/packages/caffe_ros/include/caffe_ros/tensor_net.h
@@ -7,7 +7,10 @@
 #include
 #include
 #include
-#include
+// REVIEW alexeyk: OpenCV that comes with JetPack 3.2 is compiled without CUDA support.
+// #include
+
+#include "int8_calibrator.h"
 
 namespace caffe_ros
 {
@@ -15,14 +18,12 @@ namespace caffe_ros
 class TensorNet
 {
 public:
-    using ConstStr = const std::string;
-
     TensorNet();
    virtual ~TensorNet();
 
-    void loadNetwork(ConstStr& prototxtPath, ConstStr& modelPath,
-                     ConstStr& inputBlob = "data", ConstStr& outputBlob = "prob",
-                     bool useFP16 = true, bool use_cached_model = true);
+    void loadNetwork(ConstStr& prototxt_path, ConstStr& model_path,
+                     ConstStr& input_blob, ConstStr& output_blob,
+                     nvinfer1::DataType data_type, bool use_cached_model);
 
     void forward(const unsigned char* input, size_t w, size_t h, size_t c, const std::string& encoding);
 
@@ -36,15 +37,15 @@ class TensorNet
 
     const float* getOutput() const { return out_h_; }
 
-    void setInputFormat(ConstStr& inputFormat)
+    void setInputFormat(ConstStr& input_format)
     {
-        if (inputFormat == "BGR")
+        if (input_format == "BGR")
             inp_fmt_ = InputFormat::BGR;
-        else if (inputFormat == "RGB")
+        else if (input_format == "RGB")
             inp_fmt_ = InputFormat::RGB;
         else
         {
-            ROS_FATAL("Input format %s is not supported. Supported formats: BGR and RGB", inputFormat.c_str());
+            ROS_FATAL("Input format %s is not supported.
Supported formats: BGR and RGB", input_format.c_str());
             ros::shutdown();
         }
     }
@@ -68,16 +69,12 @@ class TensorNet
         context_->setProfiler(on ? &s_profiler : nullptr);
     }
 
-protected:
+    void createInt8Calibrator(ConstStr& int8_calib_src, ConstStr& int8_calib_cache);
 
-    // Formats of the input layer. BGR is usually used by most of the frameworks that use OpenCV.
-    enum class InputFormat
-    {
-        BGR = 0,
-        RGB
-    };
+protected:
 
-    void profileModel(ConstStr& prototxtPath, ConstStr& modelPath, bool useFP16, ConstStr& outputBlob, std::ostream& model);
+    void profileModel(ConstStr& prototxt_path, ConstStr& model_path, nvinfer1::DataType data_type,
+                      ConstStr& input_blob, ConstStr& output_blob, std::ostream& model);
 
     class Logger : public nvinfer1::ILogger
     {
@@ -109,9 +106,7 @@ class TensorNet
     cv::Mat in_h_;
     cv::Mat in_final_h_;
 
-    // cv::gpu::GpuMat m_inOrigD;
-    // cv::gpu::GpuMat m_inD;
-    cv::gpu::GpuMat in_d_;
+    float* in_d_ = nullptr;
 
     float* out_h_ = nullptr;
     float* out_d_ = nullptr;
@@ -121,8 +116,13 @@ class TensorNet
     // This is a separate flag from ROS_DEBUG to enable only specific profiling
     // of data preparation and DNN feed forward.
     bool debug_mode_ = false;
+
+    std::unique_ptr<Int8EntropyCalibrator> int8_calib_;
 };
 
+nvinfer1::DataType parseDataType(const std::string& src);
+std::string        toString(nvinfer1::DataType src);
+
 }
 
 #endif
diff --git a/ros/packages/caffe_ros/launch/caffe_gscam.launch b/ros/packages/caffe_ros/launch/caffe_gscam.launch
index d3df810..d5084b2 100644
--- a/ros/packages/caffe_ros/launch/caffe_gscam.launch
+++ b/ros/packages/caffe_ros/launch/caffe_gscam.launch
@@ -3,7 +3,7 @@
-
+
@@ -29,7 +29,7 @@
-
+
diff --git a/ros/packages/caffe_ros/launch/caffe_gscam_h264.launch b/ros/packages/caffe_ros/launch/caffe_gscam_h264.launch
index 7e80e0e..519abb4 100644
--- a/ros/packages/caffe_ros/launch/caffe_gscam_h264.launch
+++ b/ros/packages/caffe_ros/launch/caffe_gscam_h264.launch
@@ -3,7 +3,7 @@
-
+
@@ -26,7 +26,7 @@
-
+
diff --git a/ros/packages/caffe_ros/launch/caffe_gscam_h265.launch b/ros/packages/caffe_ros/launch/caffe_gscam_h265.launch
index 698f697..fdf2adc 100644
--- a/ros/packages/caffe_ros/launch/caffe_gscam_h265.launch
+++ b/ros/packages/caffe_ros/launch/caffe_gscam_h265.launch
@@ -3,7 +3,7 @@
-
+
@@ -26,7 +26,7 @@
-
+
diff --git a/ros/packages/caffe_ros/launch/everything.launch b/ros/packages/caffe_ros/launch/everything.launch
index 1734190..425de0c 100644
--- a/ros/packages/caffe_ros/launch/everything.launch
+++ b/ros/packages/caffe_ros/launch/everything.launch
@@ -2,13 +2,13 @@
-
+
-
+
@@ -40,7 +40,7 @@
-
+
@@ -54,7 +54,7 @@
-
+
diff --git a/ros/packages/caffe_ros/launch/mavros_caffe.launch b/ros/packages/caffe_ros/launch/mavros_caffe.launch
index e39195c..ef6a36c 100644
--- a/ros/packages/caffe_ros/launch/mavros_caffe.launch
+++ b/ros/packages/caffe_ros/launch/mavros_caffe.launch
@@ -3,7 +3,7 @@
-
+
@@ -24,7 +24,7 @@
-
+
diff --git a/ros/packages/caffe_ros/src/caffe_ros.cpp b/ros/packages/caffe_ros/src/caffe_ros.cpp
index 0a81a7e..73b628d 100644
--- a/ros/packages/caffe_ros/src/caffe_ros.cpp
+++ b/ros/packages/caffe_ros/src/caffe_ros.cpp
@@ -23,6 +23,9 @@ CaffeRos::CaffeRos()
     std::string output_layer;
     std::string inp_fmt;
     std::string post_proc;
+    std::string data_type_s;
+    std::string int8_calib_src;
+    std::string int8_calib_cache;
     bool  use_FP16;
     float inp_scale;
     float inp_shift;
@@ -37,7 +40,14 @@ CaffeRos::CaffeRos()
     nh.param<std::string>("output_layer", output_layer, "prob");
     nh.param<std::string>("inp_fmt", inp_fmt, "BGR");
     nh.param<std::string>("post_proc", post_proc, "");
+    nh.param<std::string>("data_type", data_type_s, "fp16");
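+    // data_type selects inference precision: "fp32", "fp16" or "int8" (parsed by
+    // parseDataType); INT8 additionally requires the calibration parameters below.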
nh.param("int8_calib_src", int8_calib_src, ""); + nh.param("int8_calib_cache", int8_calib_cache, ""); + + // Backward compatibility: (use_FP16 == false) means use FP32. nh.param("use_fp16", use_FP16, true); + data_type_s = use_FP16 ? data_type_s : "fp32"; + nh.param("inp_scale", inp_scale, 1.0f); nh.param("inp_shift", inp_shift, 0.0f); nh.param("camera_queue_size", camera_queue_size, DEFAULT_CAMERA_QUEUE_SIZE); @@ -54,26 +64,37 @@ CaffeRos::CaffeRos() ROS_INFO("Input : %s", input_layer.c_str()); ROS_INFO("Output: %s", output_layer.c_str()); ROS_INFO("In Fmt: %s", inp_fmt.c_str()); - ROS_INFO("FP16 : %s", use_FP16 ? "yes" : "no"); + ROS_INFO("DType : %s", data_type_s.c_str()); ROS_INFO("Scale : %.4f", inp_scale); ROS_INFO("Shift : %.2f", inp_shift); ROS_INFO("Cam Q : %d", camera_queue_size); ROS_INFO("DNN Q : %d", dnn_queue_size); - ROS_INFO("Post P: %s", post_proc.c_str()); + ROS_INFO("Post P: %s", post_proc.empty() ? "none" : post_proc.c_str()); ROS_INFO("Obj T : %.2f", obj_det_threshold_); ROS_INFO("IOU T : %.2f", iou_threshold_); ROS_INFO("Rate : %.1f", max_rate_hz_); ROS_INFO("Debug : %s", debug_mode_ ? "yes" : "no"); + ROS_INFO("INT8 calib src : %s", int8_calib_src.c_str()); + ROS_INFO("INT8 calib cache: %s", int8_calib_cache.c_str()); + // + ROS_WARN("The use_FP16 parameter is deprecated though still supported. " + "Please use data_type instead as use_FP16 will be removed in future release."); setPostProcessing(post_proc); - net_.loadNetwork(prototxt_path, model_path, input_layer, output_layer, use_FP16, use_cached_model); + auto data_type = parseDataType(data_type_s); + + if (data_type == nvinfer1::DataType::kINT8) + net_.createInt8Calibrator(int8_calib_src, int8_calib_cache); + + net_.loadNetwork(prototxt_path, model_path, input_layer, output_layer, + data_type, use_cached_model); net_.setInputFormat(inp_fmt); net_.setScale(inp_scale); net_.setShift(inp_shift); if (debug_mode_) net_.showProfile(true); - + image_sub_ = nh.subscribe(camera_topic, camera_queue_size, &CaffeRos::imageCallback, this); output_pub_ = nh.advertise("network/output", dnn_queue_size); } @@ -116,7 +137,7 @@ sensor_msgs::Image::ConstPtr CaffeRos::computeOutputs() out_msg->header.stamp.sec = img.header.stamp.sec; out_msg->header.stamp.nsec = img.header.stamp.nsec; out_msg->header.frame_id = img.header.frame_id; - + // Use single precision multidimensional array to represent outputs. // This can be useful in case DNN output is multidimensional such as in segmentation networks. // Note that encoding may not be compatible with other ROS code that uses Image in case number of channels > 4. @@ -155,7 +176,7 @@ sensor_msgs::Image::ConstPtr CaffeRos::computeOutputs() } ROS_ASSERT(i == msg_data.size()); // Create message. - // YOLO output is represented as a matrix where each row is + // YOLO output is represented as a matrix where each row is // a predicted object vector of size 6: label, prob and 4 bounding box coordinates. out_msg->encoding = "32FC1"; out_msg->width = num_col; diff --git a/ros/packages/caffe_ros/src/int8_calibrator.cpp b/ros/packages/caffe_ros/src/int8_calibrator.cpp new file mode 100644 index 0000000..8691007 --- /dev/null +++ b/ros/packages/caffe_ros/src/int8_calibrator.cpp @@ -0,0 +1,121 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. 
+ +#include "caffe_ros/int8_calibrator.h" +#include +#include +#include +#include +#include + +namespace fs = boost::filesystem; + +namespace caffe_ros +{ + +Int8EntropyCalibrator::Int8EntropyCalibrator(ConstStr& src, ConstStr& calib_cache) + : src_(src), calib_cache_(calib_cache) +{ + if (!src_.empty()) + { + // If calibration source is not empty then it should be either directory + // or single image file. + if (!fs::is_directory(src_) && !fs::is_regular_file(src_)) + { + ROS_FATAL("INT8 calibrator: not supported source \"%s\". Use directory or regular file name.", src_.c_str()); + ros::shutdown(); + } + // Set calibration file cache if it's empty. + if (calib_cache_.empty()) + calib_cache_ = (fs::path(src_) / "_int8_calib.cache").string(); + // Create file list. + if (fs::is_directory(src_)) + { + for(const auto& entry: boost::make_iterator_range(fs::directory_iterator(src_), {})) + files_.push_back(entry.path().string()); + } + else + files_.push_back(src_); + } +} + +Int8EntropyCalibrator::~Int8EntropyCalibrator() +{ + if (img_d_ != nullptr) + cudaFree(img_d_); + img_d_ = nullptr; +} + +bool Int8EntropyCalibrator::getBatch(void* bindings[], const char* names[], int nbBindings) +{ + if (files_.size() == 0) + return false; + + auto file = files_.front(); + files_.pop_front(); + ROS_DEBUG("INT8 calibrator: using \"%s\" as a source.", file.c_str()); + auto img = cv::imread(file, cv::IMREAD_COLOR); + auto img_h = preprocessImage(img, dims_.w(), dims_.h(), InputFormat::BGR, "bgr8", 1, 0); + // Copy to the device. + ROS_ASSERT(img_h.isContinuous()); + ROS_ASSERT(nbBindings == 1); + ROS_ASSERT(std::string("data") == names[0]); + + size_t size = dims_.c() * dims_.h() * dims_.w() * sizeof(float); + if (img_d_ == nullptr) + { + if (cudaMalloc((void**)&img_d_, size) != cudaSuccess) + { + ROS_ERROR("INT8 calibrator: could not allocate %zu bytes on the device, error: %u.", size, cudaGetLastError()); + return false; + } + } + if (cudaMemcpy(img_d_, img_h.ptr(0), size, cudaMemcpyHostToDevice) != cudaSuccess) + { + ROS_FATAL("INT8 calibrator: could not copy data to device, error: %u.", cudaGetLastError()); + return false; + } + bindings[0] = img_d_; + return true; +} + +const void* Int8EntropyCalibrator::readCalibrationCache(size_t& length) +{ + // If calibration source is not empty then it takes precedence over cache + // as we assume user has requested a calibration. + if (!src_.empty() || !fs::exists(calib_cache_)) + return nullptr; + + ROS_INFO("Reading INT8 calibration cache from: %s", calib_cache_.c_str()); + // Open file and seek to the end. + std::ifstream file(calib_cache_, std::ios::binary | std::ios::ate); + if (!file.is_open()) + { + ROS_ERROR("Could not open file %s for reading, INT8 cache will not be used.", calib_cache_.c_str()); + return nullptr; + } + // Get file size and read the data. + length = file.tellg(); + file.seekg(0, std::ios_base::beg); + char* data = new char[length]; + file.read(data, length); + return data; +} + +void Int8EntropyCalibrator::writeCalibrationCache(const void* cache, size_t length) +{ + ROS_INFO("Writing INT8 calibration cache to: %s", calib_cache_.c_str()); + auto file = std::ofstream(calib_cache_, std::ios::binary); + file.write((const char*)cache, length); +} + +void Int8EntropyCalibrator::setInputDims(nvinfer1::DimsCHW dims) +{ + dims_ = dims; + // Free image device cache. 
+ if (img_d_ != nullptr) + cudaFree(img_d_); + img_d_ = nullptr; +} + +} \ No newline at end of file diff --git a/ros/packages/caffe_ros/src/tensor_net.cpp b/ros/packages/caffe_ros/src/tensor_net.cpp index 51755bc..d056c4d 100644 --- a/ros/packages/caffe_ros/src/tensor_net.cpp +++ b/ros/packages/caffe_ros/src/tensor_net.cpp @@ -5,24 +5,24 @@ #include #include #include -#include -#include -#include #include "caffe_ros/tensor_net.h" +#include namespace caffe_ros { +using namespace nvinfer1; + TensorNet::Logger TensorNet::s_log; TensorNet::Profiler TensorNet::s_profiler; -static nvinfer1::DimsCHW DimsToCHW(nvinfer1::Dims dims) +static DimsCHW DimsToCHW(Dims dims) { ROS_ASSERT(dims.nbDims == 3); - ROS_ASSERT(dims.type[0] == nvinfer1::DimensionType::kCHANNEL); - ROS_ASSERT(dims.type[1] == nvinfer1::DimensionType::kSPATIAL); - ROS_ASSERT(dims.type[2] == nvinfer1::DimensionType::kSPATIAL); - return nvinfer1::DimsCHW(dims.d[0], dims.d[1], dims.d[2]); + ROS_ASSERT(dims.type[0] == DimensionType::kCHANNEL); + ROS_ASSERT(dims.type[1] == DimensionType::kSPATIAL); + ROS_ASSERT(dims.type[2] == DimensionType::kSPATIAL); + return DimsCHW(dims.d[0], dims.d[1], dims.d[2]); } void TensorNet::Logger::log(Severity severity, const char *msg) @@ -57,40 +57,91 @@ TensorNet::TensorNet() TensorNet::~TensorNet() { + cudaError_t err = cudaSuccess; + if (in_d_ != nullptr) + { + err = cudaFree(in_d_); + if (err != cudaSuccess) + ROS_WARN("cudaFree returned %d", (int)err); + } + if (out_h_ != nullptr) { - cudaError_t err = cudaFreeHost(out_h_); + err = cudaFreeHost(out_h_); if (err != cudaSuccess) ROS_WARN("cudaFreeHost returned %d", (int)err); - out_h_ = nullptr; - out_d_ = nullptr; } + in_d_ = nullptr; + out_h_ = nullptr; + out_d_ = nullptr; } -void TensorNet::profileModel(ConstStr& prototxt_path, ConstStr& model_path, bool use_FP16, - ConstStr& output_blob, std::ostream& model) +void TensorNet::profileModel(ConstStr& prototxt_path, ConstStr& model_path, DataType data_type, + ConstStr& input_blob, ConstStr& output_blob, std::ostream& model) { - auto builder = nvinfer1::createInferBuilder(s_log); + auto builder = createInferBuilder(s_log); auto network = builder->createNetwork(); - builder->setMinFindIterations(3); // allow time for TX1 GPU to spin up + // Set TRT autotuner parameters. + builder->setMinFindIterations(3); builder->setAverageFindIterations(2); auto parser = nvcaffeparser1::createCaffeParser(); + + DataType model_data_type = DataType::kFLOAT; + // Check for FP16. bool has_fast_FP16 = builder->platformHasFastFp16(); ROS_INFO("Hardware support of fast FP16: %s.", has_fast_FP16 ? "yes" : "no"); - if (has_fast_FP16 && !use_FP16) - ROS_INFO("... however, the model will be loaded as FP32."); - - nvinfer1::DataType model_data_type = (has_fast_FP16 && use_FP16) ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT; - auto blob_finder = parser->parse(prototxt_path.c_str(), model_path.c_str(), *network, model_data_type); + if (has_fast_FP16) + { + if (data_type == DataType::kHALF) + { + model_data_type = DataType::kHALF; + builder->setHalf2Mode(true); + } + else + ROS_INFO("... however, FP16 will not be used for this model."); + } + // Check for Int8. + bool has_fast_int8 = builder->platformHasFastInt8(); + ROS_INFO("Hardware support of fast INT8: %s.", has_fast_int8 ? 
"yes" : "no"); + if (has_fast_int8) + { + if (data_type == DataType::kINT8) + { + ROS_ASSERT(int8_calib_ != nullptr); + model_data_type = DataType::kINT8; + builder->setInt8Mode(true); + builder->setInt8Calibrator(int8_calib_.get()); + } + else + ROS_INFO("... however, INT8 will not be used for this model."); + } + + ROS_INFO("Using %s model data type.", toString(model_data_type).c_str()); + // Note: for INT8 models parsing, data type must be set to FP32, see TRT docs. + auto blob_finder = parser->parse(prototxt_path.c_str(), model_path.c_str(), *network, + model_data_type == DataType::kINT8 ? DataType::kFLOAT : model_data_type); if (blob_finder == nullptr) { ROS_FATAL("Failed to parse network: %s, %s", prototxt_path.c_str(), model_path.c_str()); ros::shutdown(); } ROS_INFO("Loaded model from: %s, %s", prototxt_path.c_str(), model_path.c_str()); - + + // Need to set input dimensions for INT8 calibrator. + if (data_type == DataType::kINT8) + { + auto in_b = blob_finder->find(input_blob.c_str()); + if (in_b == nullptr) + { + ROS_FATAL("Could not find input blob: %s", input_blob.c_str()); + ros::shutdown(); + } + int8_calib_->setInputDims(DimsToCHW(in_b->getDimensions())); + } + + // Find output blob and mark it as a network output. auto out_b = blob_finder->find(output_blob.c_str()); if (out_b == nullptr) { @@ -105,8 +156,6 @@ void TensorNet::profileModel(ConstStr& prototxt_path, ConstStr& model_path, bool builder->setMaxBatchSize(1); builder->setMaxWorkspaceSize(16 * 1024 * 1024); - builder->setHalf2Mode(has_fast_FP16 && use_FP16); - ROS_INFO("Building CUDA engine..."); auto engine = builder->buildCudaEngine(*network); if (engine == nullptr) @@ -116,11 +165,11 @@ void TensorNet::profileModel(ConstStr& prototxt_path, ConstStr& model_path, bool } // Save model. - nvinfer1::IHostMemory* model_ptr = engine->serialize(); + IHostMemory* model_ptr = engine->serialize(); ROS_ASSERT(model_ptr != nullptr); model.write(reinterpret_cast(model_ptr->data()), model_ptr->size()); model_ptr->destroy(); - + ROS_INFO("Done building."); // Cleanup. 
@@ -131,10 +180,10 @@ void TensorNet::profileModel(ConstStr& prototxt_path, ConstStr& model_path, bool } void TensorNet::loadNetwork(ConstStr& prototxt_path, ConstStr& model_path, - ConstStr& input_blob, ConstStr& output_blob, - bool use_FP16, bool use_cached_model) + ConstStr& input_blob, ConstStr& output_blob, + DataType data_type, bool use_cached_model) { - infer_ = nvinfer1::createInferRuntime(s_log); + infer_ = createInferRuntime(s_log); if (infer_ == nullptr) { ROS_FATAL("Failed to create inference runtime."); @@ -143,7 +192,7 @@ void TensorNet::loadNetwork(ConstStr& prototxt_path, ConstStr& model_path, std::stringstream model; if (!use_cached_model) - profileModel(prototxt_path, model_path, use_FP16, output_blob, model); + profileModel(prototxt_path, model_path, data_type, input_blob, output_blob, model); else { std::string cached_model_path(model_path + ".cache"); @@ -156,7 +205,7 @@ void TensorNet::loadNetwork(ConstStr& prototxt_path, ConstStr& model_path, } else { - profileModel(prototxt_path, model_path, use_FP16, output_blob, model); + profileModel(prototxt_path, model_path, data_type, input_blob, output_blob, model); ROS_INFO("Saving cached model to: %s", cached_model_path.c_str()); std::ofstream cacheFile(cached_model_path, std::ios::binary); cacheFile << model.rdbuf(); @@ -184,19 +233,23 @@ void TensorNet::loadNetwork(ConstStr& prototxt_path, ConstStr& model_path, int iinp = engine_->getBindingIndex(input_blob.c_str()); in_dims_ = DimsToCHW(engine_->getBindingDimensions(iinp)); ROS_INFO("Input : (W:%4u, H:%4u, C:%4u).", in_dims_.w(), in_dims_.h(), in_dims_.c()); - //cv::gpu::ensureSizeIsEnough(in_dims_.h(), in_dims_.w(), CV_8UC3, in_d_); - in_d_ = cv::gpu::createContinuous(in_dims_.c(), in_dims_.w() * in_dims_.h(), CV_32FC1); - assert(in_d_.isContinuous()); - + size_t in_size_bytes = in_dims_.c() * in_dims_.h() * in_dims_.w() * sizeof(float); + // Allocate memory for the inputs. + if (cudaMalloc(&in_d_, in_size_bytes) != cudaSuccess) + { + ROS_FATAL("Could not allocate %zu bytes for the input, error: %u.", in_size_bytes, cudaGetLastError()); + ros::shutdown(); + } + int iout = engine_->getBindingIndex(output_blob.c_str()); out_dims_ = DimsToCHW(engine_->getBindingDimensions(iout)); ROS_INFO("Output: (W:%4u, H:%4u, C:%4u).", out_dims_.w(), out_dims_.h(), out_dims_.c()); // Allocate mapped memory for the outputs. - size_t outSizeBytes = out_dims_.w() * out_dims_.h() * out_dims_.c() * sizeof(float); - if (cudaHostAlloc(&out_h_, outSizeBytes, cudaHostAllocMapped) != cudaSuccess) + size_t out_size_bytes = out_dims_.w() * out_dims_.h() * out_dims_.c() * sizeof(float); + if (cudaHostAlloc(&out_h_, out_size_bytes, cudaHostAllocMapped) != cudaSuccess) { - ROS_FATAL("Could not allocate %zu bytes for the output, error: %u.", outSizeBytes, cudaGetLastError()); + ROS_FATAL("Could not allocate %zu bytes for the output, error: %u.", out_size_bytes, cudaGetLastError()); ros::shutdown(); } if (cudaHostGetDevicePointer(&out_d_, out_h_, 0) != cudaSuccess) @@ -218,71 +271,97 @@ void TensorNet::forward(const unsigned char* input, size_t w, size_t h, size_t c ros::Time start = ros::Time::now(); in_h_ = cv::Mat((int)h, (int)w, CV_8UC3, (void*)input); + in_final_h_ = preprocessImage(in_h_, in_dims_.w(), in_dims_.h(), inp_fmt_, encoding, inp_scale_, inp_shift_); + // Copy to the device. 
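+    // (cudaMemcpy below assumes a single contiguous host buffer; preprocessImage()
+    // returns the transposed CHW data as one continuous cv::Mat, hence the assert.)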
+ ROS_ASSERT(in_final_h_.isContinuous()); + if (cudaMemcpy(in_d_, in_final_h_.ptr(0), + in_final_h_.size().area() * in_final_h_.channels() * sizeof(float), cudaMemcpyHostToDevice) != cudaSuccess) + { + ROS_FATAL("Could not copy data to device, error: %u.", cudaGetLastError()); + ros::shutdown(); + } + + if (debug_mode_) + ROS_INFO("Preproc time: %.3f", (ros::Time::now() - start).toSec() * 1000); + + void* bufs[] = {in_d_, out_d_}; + context_->execute(1, bufs); + if (debug_mode_) + s_profiler.printLayerTimes(); + ROS_DEBUG("Forward out (first 3 values): [%.4f, %.4f, %.4f]", out_h_[0], out_h_[1], out_h_[2]); +} + +void TensorNet::createInt8Calibrator(ConstStr& int8_calib_src, ConstStr& int8_calib_cache) +{ + ROS_ASSERT(!int8_calib_src.empty() || !int8_calib_cache.empty()); + + if (!int8_calib_src.empty()) + ROS_INFO("INT8 calibration is requested. This may take some time."); + + int8_calib_ = std::make_unique(int8_calib_src, int8_calib_cache); +} + +cv::Mat preprocessImage(cv::Mat img, int dst_img_w, int dst_img_h, InputFormat inp_fmt, ConstStr& encoding, + float inp_scale, float inp_shift) +{ // Handle encodings. - if (inp_fmt_ == InputFormat::BGR) + if (inp_fmt == InputFormat::BGR) { // Convert image from RGB to BGR format used by OpenCV if needed. if (encoding == "rgb8") - cv::cvtColor(in_h_, in_h_, CV_RGB2BGR); + cv::cvtColor(img, img, CV_RGB2BGR); } - else if (inp_fmt_ == InputFormat::RGB) + else if (inp_fmt == InputFormat::RGB) { // Input image in OpenCV BGR, convert to RGB. if (encoding == "bgr8") - cv::cvtColor(in_h_, in_h_, CV_BGR2RGB); + cv::cvtColor(img, img, CV_BGR2RGB); } - //ROS_INFO("Dims: (%zu, %zu) -> (%zu, %zu)", w, h, (size_t)in_dims_.w(), (size_t)in_dims_.h()); + //ROS_INFO("Dims: (%zu, %zu) -> (%zu, %zu)", w, h, (size_t)dst_img_w, (size_t)dst_img_h); // Convert to floating point type. - in_h_.convertTo(in_h_, CV_32F); + img.convertTo(img, CV_32F); // Resize (anisotropically) to input layer size. - cv::resize(in_h_, in_h_, cv::Size(in_dims_.w(), in_dims_.h()), 0, 0, cv::INTER_CUBIC); + cv::resize(img, img, cv::Size(dst_img_w, dst_img_h), 0, 0, cv::INTER_CUBIC); // Scale if needed. - if (inp_scale_ != 1) - in_h_ *= inp_scale_; + if (inp_scale != 1) + img *= inp_scale; // Shift if needed. - if (inp_shift_ != 0) - in_h_ = in_h_ + inp_shift_; + if (inp_shift != 0) + img += inp_shift; // Transpose to get CHW format. - in_final_h_ = in_h_.reshape(1, in_dims_.w() * in_dims_.h()).t(); - // Copy to the device. - ROS_ASSERT(in_final_h_.isContinuous()); - ROS_ASSERT(in_d_.isContinuous()); - ROS_ASSERT(in_d_.size().area() * in_d_.channels() == in_final_h_.size().area() * in_final_h_.channels()); - if (cudaMemcpy(in_d_.ptr(0), in_final_h_.ptr(0), - in_d_.size().area() * in_d_.channels() * sizeof(float), cudaMemcpyHostToDevice) != cudaSuccess) + return img.reshape(1, dst_img_w * dst_img_h).t(); +} + +DataType parseDataType(const std::string& src) +{ + if (boost::iequals(src, "FP32")) + return DataType::kFLOAT; + if (boost::iequals(src, "FP16")) + return DataType::kHALF; + if (boost::iequals(src, "INT8")) + return DataType::kINT8; + else { - ROS_FATAL("Could not copy data to device, error: %u.", cudaGetLastError()); + ROS_FATAL("Invalid data type: %s. Supported data types: FP32, FP16, INT8.", src.c_str()); ros::shutdown(); + // Will not get here (well, should not). 
+ return (DataType)-1; } - - if (debug_mode_) - ROS_INFO("Preproc time: %.3f", (ros::Time::now() - start).toSec() * 1000); - - // GPU version: - // in_orig_d_.upload(cv::Mat((int)h, (int)w, CV_8UC3, (void*)input)); - // // Resize to input layer size. - // cv::gpu::resize(in_orig_d_, in_d_, cv::Size(in_dims_.w(), in_dims_.h()), 0, 0, cv::INTER_CUBIC); - // // Convert to floating point type. - // in_d_.convertTo(in_f_d_, CV_32FC3); - // // Subtract shift. - // // REVIEW alexeyk: should be configurable as some models already have this in prototxt. - // cv::gpu::subtract(in_f_d_, 128.0f, in_f_d_); - // // Transpose to get CHW format. - // ROS_DEBUG("in_f_d_: %zu", in_f_d_.elemSize()); - // cv::gpu::transpose(in_f_d_, in_f_d_); - - // cv::Mat cpuM; - // in_d_.download(cpuM); - // std::ofstream file("/home/ubuntu/tmp.raw", std::ios::binary); - // file.write(cpuM.ptr(0), std::streamsize(in_dims_.w() * in_dims_.h() * c)); - // file.close(); - - void* bufs[] = {in_d_.ptr(0), out_d_}; - context_->execute(1, bufs); - if (debug_mode_) - s_profiler.printLayerTimes(); - ROS_DEBUG("Forward out (first 3 values): [%.4f, %.4f, %.4f]", out_h_[0], out_h_[1], out_h_[2]); } +std::string toString(DataType src) +{ + switch (src) + { + case DataType::kFLOAT: + return "FP32"; + case DataType::kHALF: + return "FP16"; + case DataType::kINT8: + return "INT8"; + default: + return ""; + } } +} diff --git a/ros/packages/caffe_ros/tests/data/trailnet_int8_calib.cache b/ros/packages/caffe_ros/tests/data/trailnet_int8_calib.cache new file mode 100644 index 0000000..58c4cc1 --- /dev/null +++ b/ros/packages/caffe_ros/tests/data/trailnet_int8_calib.cache @@ -0,0 +1,54 @@ +1 +(Unnamed ITensor* 12): 3c66f92e +conv1: 3c04519f +pool1: 3c016718 +res1_1_1: 3c010a14 +(Unnamed ITensor* 21): 3ce8219b +(Unnamed ITensor* 23): 3d1474bf +(Unnamed ITensor* 31): 3d8316b5 +res2_1_1: 3c01cfc8 +softmax_t: 3be25d0c +(Unnamed ITensor* 56): 3e0c8ef5 +(Unnamed ITensor* 37): 3d76540e +(Unnamed ITensor* 14): 3cb3e96f +(Unnamed ITensor* 18): 3ca98b20 +res4_2_1: 3c023151 +res1_2: 3cbbbdf3 +(Unnamed ITensor* 80): 3dcdbb3f +fc3_t: 3d8e6f0d +(Unnamed ITensor* 33): 3d9aba8f +res2_1: 3c01a251 +(Unnamed ITensor* 42): 3dc7db29 +(Unnamed ITensor* 50): 3e30779d +(Unnamed ITensor* 59): 3e49a689 +sub_mean: 3b9e3ebb +data: 400fabe8 +(Unnamed ITensor* 46): 3daa87d1 +res2_1_2: 3d2c4b58 +(Unnamed ITensor* 40): 3db93d03 +res3_1: 3d5a4a5b +res2_2_1: 3c01ca05 +(Unnamed ITensor* 78): 3e4fb8a4 +(Unnamed ITensor* 61): 3e51b72a +(Unnamed ITensor* 52): 3e368ce7 +res3_1_2: 3db36048 +res1_2_1: 3c497c69 +fc3: 3e91f7f1 +(Unnamed ITensor* 69): 3e1a721f +res2_2: 3c016a8c +res4_1_2: 3de4c1d9 +res1_1: 3c66f92f +res3_2_1: 3d455a9e +softmax: 3c00e5d4 +res4_1: 3d1a681c +(Unnamed ITensor* 27): 3d3b70b6 +res3_2: 3d3b9f83 +res4_2: 3c023e26 +(Unnamed ITensor* 9): 3c7091cc +(Unnamed ITensor* 75): 3e09c85e +res3_1_1: 3c01b81e +pool_avg: 3c159346 +(Unnamed ITensor* 65): 3e27c4e6 +(Unnamed ITensor* 4): 3c81c69e +res4_1_1: 3d669c33 +(Unnamed ITensor* 71): 3e18ef86 diff --git a/ros/packages/caffe_ros/tests/tests.cpp b/ros/packages/caffe_ros/tests/tests.cpp index 0cf4853..84ec4bd 100644 --- a/ros/packages/caffe_ros/tests/tests.cpp +++ b/ros/packages/caffe_ros/tests/tests.cpp @@ -53,9 +53,9 @@ TEST(CaffeRosTests, TrailNetPredictions) ASSERT_TRUE(fs::exists(test_data_dir)); CaffeRosTestsCallback t; - auto dnn_sub = nh.subscribe("/trails_dnn/network/output", 1, + auto dnn_sub = nh.subscribe("/trailnet/dnn/network/output", 1, &CaffeRosTestsCallback::dnnCallback, &t); - const char* camera_topic = 
"/camera_trails/image_raw"; + const char* camera_topic = "/trailnet/camera/image_raw"; auto img_pub = nh.advertise(camera_topic, 1); // Test images and expected predictions. @@ -111,9 +111,9 @@ TEST(CaffeRosTests, TrailNetPredictionsBGR8) ASSERT_TRUE(fs::exists(test_data_dir)); CaffeRosTestsCallback t; - auto dnn_sub = nh.subscribe("/trails_dnn/network/output", 1, + auto dnn_sub = nh.subscribe("/trailnet/dnn/network/output", 1, &CaffeRosTestsCallback::dnnCallback, &t); - const char* camera_topic = "/camera_trails/image_raw"; + const char* camera_topic = "/trailnet/camera/image_raw"; auto img_pub = nh.advertise(camera_topic, 1); // Test images and expected predictions. @@ -169,9 +169,9 @@ TEST(CaffeRosTests, TrailNetPredictionsFP16) ASSERT_TRUE(fs::exists(test_data_dir)); CaffeRosTestsCallback t; - auto dnn_sub = nh.subscribe("/trails_dnn_fp16/network/output", 1, + auto dnn_sub = nh.subscribe("/trailnet/dnn_fp16/network/output", 1, &CaffeRosTestsCallback::dnnCallback, &t); - const char* camera_topic = "/camera_trails_fp16/image_raw"; + const char* camera_topic = "/trailnet/camera_fp16/image_raw"; auto img_pub = nh.advertise(camera_topic, 1); // Test images and expected predictions. @@ -220,6 +220,65 @@ TEST(CaffeRosTests, TrailNetPredictionsFP16) } } +TEST(CaffeRosTests, TrailNetPredictionsINT8) +{ + ros::NodeHandle nh("~"); + std::string test_data_dir; + nh.param("test_data_dir", test_data_dir, ""); + ASSERT_TRUE(fs::exists(test_data_dir)); + + CaffeRosTestsCallback t; + auto dnn_sub = nh.subscribe("/trailnet/dnn_int8/network/output", 1, + &CaffeRosTestsCallback::dnnCallback, &t); + const char* camera_topic = "/trailnet/camera_int8/image_raw"; + auto img_pub = nh.advertise(camera_topic, 1); + + // Test images and expected predictions. + auto images = std::vector{"rot_l.jpg", "rot_c.jpg", "rot_r.jpg", "tran_l.jpg", "tran_r.jpg"}; + float predictions[][6] = {{0.685, 0.211, 0.104, 0.493, 0.050, 0.466}, + {0.112, 0.794, 0.093, 0.541, 0.127, 0.332}, + {0.000, 0.027, 0.971, 0.095, 0.068, 0.836}, + {0.100, 0.896, 0.000, 0.521, 0.101, 0.377}, + {0.156, 0.285, 0.558, 0.074, 0.098, 0.827}}; + + // When running using rostest, current directory is $HOME/.ros + fs::path data_dir{test_data_dir}; + + for (size_t i = 0; i < images.size(); i++) + { + auto img_msg = readImage((data_dir / images[i]).string()); + // Use image index as a unique timestamp. + img_msg->header.stamp.sec = 0; + img_msg->header.stamp.nsec = (int)i; + + ros::Rate rate(1000); + // Wait until DNN processes the current messages. There might be multiple messages + // in the queue so make sure to select the right one based on current index. + while (ros::ok() && (t.dnn_out_ == nullptr || t.dnn_out_->header.stamp.nsec != i)) + { + img_pub.publish(img_msg); + ros::spinOnce(); + rate.sleep(); + } + + EXPECT_TRUE(t.dnn_out_ != nullptr); + auto dnn_out = *t.dnn_out_; + // The output should be 1x1x6 (HxWxC). + EXPECT_EQ(dnn_out.width, 1); + EXPECT_EQ(dnn_out.height, 1); + // float32, channels == 6. + EXPECT_EQ(dnn_out.encoding, "32FC6"); + + auto data = reinterpret_cast(dnn_out.data.data()); + for (int col = 0; col < 6; col++) + { + // Must use proper floating point comparison. + // REVIEW alexeyk: the tolerance has to be much higher in INT8. 
+ EXPECT_NEAR(data[col], predictions[i][col], 0.1f) << "Values are not equal at (" << i << ", " << col <<")"; + } + } +} + TEST(CaffeRosTests, YoloNetPredictions) { ros::NodeHandle nh("~"); @@ -228,9 +287,9 @@ TEST(CaffeRosTests, YoloNetPredictions) ASSERT_TRUE(fs::exists(test_data_dir)); CaffeRosTestsCallback t; - auto dnn_sub = nh.subscribe("/object_dnn/network/output", 1, + auto dnn_sub = nh.subscribe("/yolo/dnn/network/output", 1, &CaffeRosTestsCallback::dnnCallback, &t); - const char* camera_topic = "/camera_object/image_raw"; + const char* camera_topic = "/yolo/camera/image_raw"; auto img_pub = nh.advertise(camera_topic, 1); // Test images and expected predictions. @@ -281,9 +340,9 @@ TEST(CaffeRosTests, YoloNetPredictionsFP16) ASSERT_TRUE(fs::exists(test_data_dir)); CaffeRosTestsCallback t; - auto dnn_sub = nh.subscribe("/object_dnn_fp16/network/output", 1, + auto dnn_sub = nh.subscribe("/yolo/dnn_fp16/network/output", 1, &CaffeRosTestsCallback::dnnCallback, &t); - const char* camera_topic = "/camera_object_fp16/image_raw"; + const char* camera_topic = "/yolo/camera_fp16/image_raw"; auto img_pub = nh.advertise(camera_topic, 1); // Test images and expected predictions. diff --git a/ros/packages/caffe_ros/tests/tests_basic.launch b/ros/packages/caffe_ros/tests/tests_basic.launch index 0585426..c14c0e4 100644 --- a/ros/packages/caffe_ros/tests/tests_basic.launch +++ b/ros/packages/caffe_ros/tests/tests_basic.launch @@ -4,74 +4,99 @@ rostest caffe_ros tests_basic.launch Use parameters to specify values for the arguments, for example: + 1. Simple version: + +rostest caffe_ros tests_basic.launch test_data_dir:=/data/src/redtail/ros/packages/caffe_ros/tests/data \ + model_dir:=/data/src/redtail/models/pretrained + + 2. Advanced version: + export REDTAIL_TEST_DIR=/data/src/redtail/ros/packages/caffe_ros/tests/data export REDTAIL_MODEL_DIR=/data/src/redtail/models/pretrained -rostest caffe_ros tests_basic.launch test_data_dir:=$REDTAIL_TEST_DIR \ +rostest caffe_ros tests_basic.launch test_data_dir:=$REDTAIL_TEST_DIR model_dir:=$REDTAIL_MODEL_DIR \ trail_prototxt_path:=$REDTAIL_MODEL_DIR/TrailNet_SResNet-18.prototxt trail_model_path:=$REDTAIL_MODEL_DIR/TrailNet_SResNet-18.caffemodel \ object_prototxt_path:=$REDTAIL_MODEL_DIR/yolo-relu.prototxt object_model_path:=$REDTAIL_MODEL_DIR/yolo-relu.caffemodel --> - - + + + - - + + - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - diff --git a/ros/packages/px4_controller/launch/mavros_caffe_ctl.launch b/ros/packages/px4_controller/launch/mavros_caffe_ctl.launch index 0ded215..4b78e8f 100644 --- a/ros/packages/px4_controller/launch/mavros_caffe_ctl.launch +++ b/ros/packages/px4_controller/launch/mavros_caffe_ctl.launch @@ -3,7 +3,7 @@ - + @@ -29,7 +29,7 @@ - + diff --git a/ros/packages/redtail_debug/launch/trailnet_debug_zed.launch b/ros/packages/redtail_debug/launch/trailnet_debug_zed.launch index f8ae0a3..0b3530f 100644 --- a/ros/packages/redtail_debug/launch/trailnet_debug_zed.launch +++ b/ros/packages/redtail_debug/launch/trailnet_debug_zed.launch @@ -6,7 +6,7 @@ - + @@ -20,7 +20,7 @@ - + diff --git a/ros/packages/redtail_debug/launch/trailnet_debug_zed_gscam.launch b/ros/packages/redtail_debug/launch/trailnet_debug_zed_gscam.launch index e390112..8955ef0 100644 --- a/ros/packages/redtail_debug/launch/trailnet_debug_zed_gscam.launch +++ 
b/ros/packages/redtail_debug/launch/trailnet_debug_zed_gscam.launch @@ -7,7 +7,7 @@ - + @@ -35,7 +35,7 @@ - + diff --git a/ros/scripts/jetson_ros_install.sh b/ros/scripts/jetson_ros_install.sh index 6a820eb..5086def 100755 --- a/ros/scripts/jetson_ros_install.sh +++ b/ros/scripts/jetson_ros_install.sh @@ -90,20 +90,10 @@ echo "Installing dependencies..." sudo apt-get install -y libgstreamer1.0-dev gstreamer1.0-tools libgstreamer-plugins-base1.0-dev libgstreamer-plugins-good1.0-dev libyaml-cpp-dev cd $HOME -# REVIEW alexeyk: replace with sudo apt-get install -y ros-kinetic-camera-info-manager ros-kinetic-camera-calibration-parsers ros-kinetic-image-transport -if [ ! -d "$HOME/image_common" ]; then - echo "Cloning image_common sources..." - git clone https://github.com/ros-perception/image_common.git - # Create symlinks to catkin workspace. - ln -s $HOME/image_common/camera_calibration_parsers $CATKIN_WS/src/ - ln -s $HOME/image_common/camera_info_manager $CATKIN_WS/src/ - ln -s $HOME/image_common/image_transport $CATKIN_WS/src/ -else - echo "Updating image_common sources..." - cd image_common - git pull -fi +# Install gscam dependencies. +sudo apt-get install -y ros-kinetic-camera-info-manager ros-kinetic-camera-calibration-parsers ros-kinetic-image-transport +# Install gscam from sources rather than apt-get install as the latter installs a lot of redundant stuff. cd $HOME if [ ! -d "$HOME/gscam" ]; then echo "Cloning gscam sources..." @@ -121,14 +111,14 @@ echo "Building gscam package..." cd $CATKIN_WS catkin_make -DGSTREAMER_VERSION_1_x=On -# Installing caffe_ros ROS package and its dependencies. +# Installing redtail ROS packages and dependencies. echo "${green}Starting installation of caffe_ros and px4_controller ROS packages...${reset}" cd $HOME if [ ! -d "$HOME/redtail" ]; then - echo "Cloning caffe_ros sources..." + echo "Cloning redtail sources..." git clone https://github.com/NVIDIA-Jetson/redtail else - echo "Updating caffe_ros sources..." + echo "Updating redtail sources..." cd redtail git checkout master git pull @@ -142,15 +132,7 @@ fi echo "Installing dependencies..." cd $HOME -if [ ! -d "$HOME/angles" ]; then - git clone https://github.com/ros/angles.git - # Create symlink to catkin workspace. - ln -s $HOME/angles/angles $CATKIN_WS/src/ -else - echo "Updating angles sources..." - cd angles - git pull -fi +sudo apt-get install -y ros-kinetic-angles cd $CATKIN_WS echo "Building caffe_ros package..." diff --git a/stereoDNN/.gitignore b/stereoDNN/.gitignore new file mode 100644 index 0000000..565f820 --- /dev/null +++ b/stereoDNN/.gitignore @@ -0,0 +1,4 @@ +bin/ +build/ +build_rel/ +*.pyc \ No newline at end of file diff --git a/stereoDNN/CMakeLists.txt b/stereoDNN/CMakeLists.txt new file mode 100644 index 0000000..619a0f0 --- /dev/null +++ b/stereoDNN/CMakeLists.txt @@ -0,0 +1,25 @@ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Full license terms provided in LICENSE.md file. + +# Build with: +# Debug: cmake -DCMAKE_BUILD_TYPE=Debug .. 
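+#   Release: cmake -DCMAKE_BUILD_TYPE=Release .. (also the default when no build type is given)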
+
+cmake_minimum_required(VERSION 3.5)
+
+set(default_build_type "Release")
+
+if (NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE "Release")
+endif()
+
+if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
+    set(TARGET_SUFFIX "_debug")
+    message("Using ${TARGET_SUFFIX} suffix.")
+endif()
+
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/../bin")
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/../bin")
+
+add_subdirectory(lib)
+add_subdirectory(sample_app)
+add_subdirectory(tests)
diff --git a/stereoDNN/README.md b/stereoDNN/README.md
new file mode 100644
index 0000000..2c8a515
--- /dev/null
+++ b/stereoDNN/README.md
@@ -0,0 +1,136 @@
+# Stereo DNN TensorRT inference library
+The goal of this project is to enable inference for [NVIDIA Stereo DNN](https://arxiv.org/) TensorFlow models on Jetson, as well as on other platforms supported by the NVIDIA TensorRT library.
+
+This is a 2-step process:
+1. Convert the TensorFlow model to a TensorRT C++ API model. This step is performed only once for each model and can be done in any environment, such as the user's desktop.
+2. Use the TRT C++ API model in an application. Once the model is built, it can be used in any environment (e.g. Jetson) to perform inference.
+
+Note: TensorFlow is **not** required for the inference step. The library needs only basic components such as CUDA 9.0, cuDNN 7.0 and TensorRT 3.0, so it will run as-is on Jetson with JetPack 3.2.
+
+The library implements the following TensorRT plugins:
+* `conv3d`          : implementation of TensorFlow-compatible 3D convolution
+* `conv3d_transpose`: implementation of TensorFlow-compatible 3D transposed convolution (aka deconvolution)
+* `cost_volume`     : implementation of the cost volume computation used in Stereo DNN
+* `softargmax`      : implementation of the specific softargmax variant used in Stereo DNN
+* `elu`             : implementation of the ELU activation function
+* `transform`       : implementation of tensor transformation required for certain operations
+* `slice`           : implementation of tensor slicing required for certain operations
+* `pad`             : implementation of tensor padding required for certain operations
+
+Note that these plugins make certain assumptions that are valid in the case of Stereo DNN.
+The `slice` and `pad` plugins implement only the tiny piece of functionality required to run the inference.
+
+## Models
+There are several Stereo DNN models included in this package; the following table provides a brief comparison. `TF` stands for TensorFlow and `TRT` for our implementation based on TensorRT and cuDNN. All times are in milliseconds per image, averaged over 200 images.
+
+| Model     | Input size  | Titan Xp (TF) | Titan Xp (TRT) | Jetson TX2 (TRT) | D1 error (%) |
+| --------- | ----------- | --------------| -------------- | ---------------- | ------------ |
+| NVSmall   | 1025x321    | 800           | 450            | 7800             | 9.8          |
+| NVTiny    | 513x161     | 75            | 40             | 360              | 11.12        |
+| ResNet-18 | 1025x321    | 950           | 650            | 11000            | 3.4(*)       |
+
+Notes:
+* We could not run TensorFlow on Jetson with our models, so no measurements were done in this case.
+* D1 error for `NVSmall` and `NVTiny` was measured on 200 training images from the [KITTI 2015 stereo benchmark](http://www.cvlibs.net/datasets/kitti/eval_scene_flow.php?benchmark=stereo). This dataset was **not** used to train the models.
+* `*` - measured on the [KITTI 2015 stereo test set](http://www.cvlibs.net/datasets/kitti/eval_scene_flow.php?benchmark=stereo). Note that this model was fine-tuned on the 200 training images, so reporting its error on that dataset is not useful.
+
+## Converting TensorFlow model to TensorRT C++ API model
+To convert a TensorFlow model to the TensorRT C++ API, run the `./scripts/model_builder.py` script, which takes several named parameters:
+* model type
+* network name to use in the generated C++ code
+* path to the TensorFlow model
+* path to the resulting binary weights file
+* path to the generated C++ file.
+
+You can also optionally specify the model data type (fp32 or fp16, with fp32 being the default).
+
+Example:
+```sh
+cd ./scripts/
+python ./model_builder.py --model_type nvsmall --net_name NVSmall1025x321 --checkpoint_path=../models/NVSmall/TensorFlow/model-inference-1025x321-0 --weights_file=../models/NVSmall/TensorRT/trt_weights.bin --cpp_file=../sample_app/nvsmall_1025x321_net.cpp --data_type fp32
+```
+Currently the supported model types are `nvsmall` and `resnet18`. `NVTiny` is a slight variation of `NVSmall`, so it works with the `nvsmall` model type. Adding new model types should be relatively easy; `./scripts/model_nvsmall.py` or `./scripts/model_resnet18.py` can provide a good starting point.
+
+Note: TensorFlow v1.5 or later is required. We strongly recommend using our [TensorFlow Docker container](../tools/tensorflow/docker) as it contains all the components required to use Stereo DNN with TensorFlow.
+
+## Building inference code
+Once the TensorRT C++ model is created, it can be used in any TensorRT-enabled application. The static inference library `nvstereo_inference`, located at `./lib/`, contains the implementation of the TensorRT plugins required to run Stereo DNN. A sample application located at `./sample_app/` provides an example of library usage. To build the library, sample application and tests, run the following commands:
+
+```sh
+# Build debug:
+mkdir build
+cd ./build/
+cmake -DCMAKE_BUILD_TYPE=Debug ..
+make
+# Build release:
+cd ..
+mkdir build_rel
+cd ./build_rel/
+cmake -DCMAKE_BUILD_TYPE=Release ..
+make
+```
+
+If you get a CMake error saying that `GTest` was not found, do the following:
+```sh
+cd /usr/src/gtest
+cmake CMakeLists.txt
+make
+```
+and then try building the library again (you may need to use `sudo` depending on your environment).
+
+The build will place binary files in the `./bin/` directory.
+
+It's a good idea to run the tests first to make sure everything is working as expected:
+```sh
+./bin/nvstereo_tests_debug ./tests/data
+```
+All tests should pass (obviously). We recommend running the debug version first to make sure all asserts in the code are enabled.
+
+To run the sample application:
+```sh
+./bin/nvstereo_sample_app_debug nvsmall 513 161 ./models/NVTiny/TensorRT/trt_weights.bin ./sample_app/data/img_left.png ./sample_app/data/img_right.png ./bin/disp.bin
+```
+The app takes 8 parameters:
+* model type (`nvsmall` or `resnet18`)
+* dimensions of the image (width and height - must be equal to the dimensions of the network input)
+* path to the weights file created by the model builder script
+* 2 images, left and right (e.g. PNG files)
+* path to the output file; the app will create 2 files: binary and PNG
+* [optional] data type (fp32 or fp16). Note that the FP16 implementation in cuDNN is currently not optimized for 3D convolutions, so results might be worse than with FP32.
+
+We recommend running the debug version first to make sure all asserts in the code are enabled.
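+To sanity-check the output, you can load the binary disparity file back in Python. The sketch below is not part of the sample app; it assumes the `.bin` file contains raw `float32` disparity values in row-major HxW order matching the network input size (the PNG written alongside is easier to inspect visually):
+```python
+import numpy as np
+
+# Assumed layout: raw float32 disparities, row-major, height x width.
+w, h = 513, 161
+disp = np.fromfile('./bin/disp.bin', dtype=np.float32).reshape(h, w)
+print('Disparity range: [%.2f, %.2f]' % (disp.min(), disp.max()))
+```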
+
+The following scripts demonstrate how to properly read and pre-process images for the Stereo DNN:
+
+Using OpenCV (a C++ version is in the `sample_app` as well):
+```python
+import numpy as np
+import cv2
+
+# Using OpenCV
+img = cv2.imread('left.png')
+img = cv2.resize(img, (1025, 321), interpolation = cv2.INTER_AREA)
+# Convert to RGB and then CHW.
+img = np.transpose(img[:, :, ::-1], [2, 0, 1]).astype(np.float32)
+img /= 255.0
+print(img.shape)
+with open('left.bin', 'wb') as w:
+    img.reshape(-1).tofile(w)
+
+```
+
+Using TensorFlow:
+```python
+import numpy as np
+import tensorflow as tf
+
+img = tf.image.decode_png(tf.read_file('left.png'), dtype=tf.uint8)
+img = tf.image.convert_image_dtype(img, tf.float32)
+img = tf.image.resize_images(img, [321, 1025], tf.image.ResizeMethod.AREA)
+# Convert to CHW.
+img_res = np.transpose(img.eval(), [2, 0, 1])
+with open('left.bin', 'wb') as w:
+    img_res.reshape(-1).tofile(w)
+
+```
+
+Note that, due to different implementations of the resizing algorithm in TensorFlow and OpenCV, the results will not be byte-wise equal.
\ No newline at end of file
diff --git a/stereoDNN/lib/CMakeLists.txt b/stereoDNN/lib/CMakeLists.txt
new file mode 100644
index 0000000..4729cc0
--- /dev/null
+++ b/stereoDNN/lib/CMakeLists.txt
@@ -0,0 +1,30 @@
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Full license terms provided in LICENSE.md file.
+
+cmake_minimum_required(VERSION 3.5)
+
+set(PROJECT_NAME nvstereo_inference)
+project(${PROJECT_NAME})
+
+find_package(CUDA 9.0 REQUIRED)
+
+include_directories(${CUDA_INCLUDE_DIRS})
+
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
+
+# Set CUDA NVCC flags:
+# Enable C++14
+list(APPEND CUDA_NVCC_FLAGS -std=c++14)
+list(APPEND CUDA_NVCC_FLAGS -lineinfo -Xcompiler -Wall -Xcompiler -Wextra)
+
+# Generate both PTX and SASS for Jetson TX2 and use base Maxwell PTX for everything else.
+list(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_50,code=compute_50)
+list(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_60,code=sm_61)
+list(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_62,code=sm_62)
+
+file(GLOB ${PROJECT_NAME}_sources ./*.cpp ./*.cu)
+set(PROJECT_SOURCES ${${PROJECT_NAME}_sources})
+
+set(TARGET_NAME ${PROJECT_NAME}${TARGET_SUFFIX})
+cuda_add_library(${TARGET_NAME} ${PROJECT_SOURCES} STATIC)
diff --git a/stereoDNN/lib/conv3d_plugin.cpp b/stereoDNN/lib/conv3d_plugin.cpp
new file mode 100644
index 0000000..ec3ad64
--- /dev/null
+++ b/stereoDNN/lib/conv3d_plugin.cpp
@@ -0,0 +1,376 @@
+// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+// Full license terms provided in LICENSE.md file.
+
+#include "internal_utils.h"
+#include <cassert>
+#include <cudnn.h>
+#include "conv_utils.h"
+
+namespace redtail { namespace tensorrt
+{
+
+using namespace nvinfer1;
+
+// -----------------------------------------------------------------
+// 3D convolution plugin.
+// For more information on how 3D convolution is implemented, see
+// the comments in conv_utils.h
+// -----------------------------------------------------------------
+class Conv3DPlugin: public IPlugin
+{
+public:
+    Conv3DPlugin(Conv3DType conv_type, Dims kernel_dims,
+                 Dims stride_dims, Dims pad_start_dims, Dims pad_end_dims,
+                 Weights kernel_weights, Weights bias_weights,
+                 ILogger& log, std::string name):
+        conv_type_(conv_type), w_dims_(kernel_dims),
+        stride_dims_(stride_dims), pad_start_dims_(pad_start_dims), pad_end_dims_(pad_end_dims),
+        kernel_weights_(kernel_weights), bias_weights_(bias_weights),
+        log_(log), name_(name)
+    {
+        // REVIEW alexeyk: TRT currently does not support FP16 data tensors so we
+        // use the weights tensor data type for all descriptors. In case weights
+        // are in FP16 we'll do the conversion on the fly. This should be changed
+        // when TRT adds full support for FP16.
+        // For FP16 we support only TRUE_HALF_CONFIG mode:
+        // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionForward
+        data_type_ = CUDNN_DATA_FLOAT;
+
+        // Expecting kernel to be a 5D tensor in KVCRS format.
+        assert(w_dims_.nbDims == 5);
+
+        // Expecting stride to be a 3D tensor in DHW format.
+        assert(stride_dims.nbDims == 3);
+
+        // Expecting padding to be 3D tensors in DHW format.
+        assert(pad_start_dims.nbDims == 3);
+        assert(pad_end_dims.nbDims == 3);
+        // Currently only symmetric padding is supported for the H,W dims.
+        assert(pad_start_dims_.d[1] == pad_end_dims_.d[1]);
+        assert(pad_start_dims_.d[2] == pad_end_dims_.d[2]);
+        // A special case (TF-compatible) of asymmetric padding is supported for the D dim.
+        assert(pad_start_dims_.d[0] == pad_end_dims_.d[0] || pad_start_dims_.d[0] == pad_end_dims_.d[0] - 1);
+
+        // TRT supports FP32/FP16 weights.
+        assert(kernel_weights_.type == DataType::kFLOAT || kernel_weights_.type == DataType::kHALF);
+        assert(kernel_weights_.count > 0 && kernel_weights_.values != nullptr);
+        // TRT supports FP32/FP16 weights.
+        assert(bias_weights_.type == DataType::kFLOAT || bias_weights_.type == DataType::kHALF);
+        assert((bias_weights_.count > 0 && bias_weights_.values != nullptr) ||
+               (bias_weights_.count == 0 && bias_weights_.values == nullptr));
+        // Assume same type for simplicity.
+        assert(bias_weights_.type == kernel_weights_.type);
+
+        weights_type_ = trtToCudnnDataType(kernel_weights_.type);
+    }
+
+    Conv3DPlugin(Conv3DPlugin&&) = delete;
+
+    int getNbOutputs() const override
+    {
+        return 1;
+    }
+
+    Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
+    {
+        assert(index == 0);
+        assert(nbInputDims == 1);
+        assert(inputs[0].nbDims == 4);
+
+        x_dims_ = DimsNCHW(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2], inputs[0].d[3]);
+
+        createDescriptors();
+        // Can use batch_size == 1 to set tensor descriptors initially.
+        // Set input descriptor.
+        ConvUtils::setConv3DTensorDescriptor(conv_type_, x_dims_, 1, weights_type_, x_desc_, log_);
+        // Set conv operation descriptors.
+        ConvUtils::setConv3DOperationDescriptors(conv_type_, w_dims_, stride_dims_, pad_start_dims_,
+                                                 weights_type_, w_desc_, c_desc_, log_);
+        // Compute output dims.
+        auto y_d = ConvUtils::getConv3DOutputDims(c_desc_, x_desc_, w_desc_, log_);
+        // Remove batch index dim.
+        y_dims_ = DimsNCHW(y_d.d[1], y_d.d[2], y_d.d[3], y_d.d[4]);
+        // Output tensor is always in cuDNN format.
+        ConvUtils::setConv3DTensorDescriptor(Conv3DType::kCuDnn, y_dims_, 1, weights_type_, y_desc_, log_);
+        // Set bias descriptor.
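+        // (The bias tensor holds one value per output feature map K; dims of
+        // {1, K, 1, 1, 1} let cudnnAddTensor broadcast it over D, H and W.)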
+ // REVIEW alexeyk: see the comment in tensorrt_model_builder.py re: the stride issue in Conv3D. + ConvUtils::setConv3DBiasDescriptor(Dims{5, {1, y_dims_.d[0], 1, 1, 1}}, weights_type_, b_desc_, log_); + + return y_dims_; + } + + void configure(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, int maxBatchSize) override + { + assert(isValid()); + assert(nbInputs == 1); + assert(nbOutputs == 1); + assert(DimsUtils::areEqual(inputDims[0], x_dims_)); + assert(DimsUtils::areEqual(outputDims[0], y_dims_)); + + max_batch_size_ = maxBatchSize; + // Update in/out descriptors and run auto-tuner to find best (fastest) algo. + ConvUtils::setConv3DTensorDescriptor(conv_type_, x_dims_, maxBatchSize, weights_type_, x_desc_, log_); + ConvUtils::setConv3DTensorDescriptor(Conv3DType::kCuDnn, y_dims_, maxBatchSize, weights_type_, y_desc_, log_); + findBestAlgo(); + + const size_t elt_size = getWeightsDataTypeSize(); + + // Need workspace for FP32 -> FP16 conversion. + if (isFP16()) + workspace_bytes_ += max_batch_size_ * std::max(DimsUtils::getTensorSize(x_dims_), DimsUtils::getTensorSize(y_dims_)) * elt_size; + + // Allocate memory and copy weights. + CHECK(cudaMalloc(&kernel_weights_d_, kernel_weights_.count * elt_size)); + CHECK(cudaMemcpy(kernel_weights_d_, kernel_weights_.values, + kernel_weights_.count * elt_size, cudaMemcpyHostToDevice)); + + if (bias_weights_.count > 0) + { + CHECK(cudaMalloc(&bias_weights_d_, bias_weights_.count * elt_size)); + CHECK(cudaMemcpy(bias_weights_d_, bias_weights_.values, + bias_weights_.count * elt_size, cudaMemcpyHostToDevice)); + + } + log_.log(ILogger::Severity::kINFO, (name_ + ": InDims : " + DimsUtils::toString(x_dims_)).c_str()); + log_.log(ILogger::Severity::kINFO, (name_ + ": OutDims : " + DimsUtils::toString(y_dims_)).c_str()); + } + + int initialize() override + { + assert(isValid()); + return 0; + } + + void terminate() override + { + assert(isValid()); + + if (c_desc_ != nullptr) + CHECK(cudnnDestroyConvolutionDescriptor(c_desc_)); + if (w_desc_ != nullptr) + CHECK(cudnnDestroyFilterDescriptor(w_desc_)); + if (x_desc_ != nullptr) + CHECK(cudnnDestroyTensorDescriptor(x_desc_)); + if (y_desc_ != nullptr) + CHECK(cudnnDestroyTensorDescriptor(y_desc_)); + if (b_desc_ != nullptr) + CHECK(cudnnDestroyTensorDescriptor(b_desc_)); + if (cudnn_ != nullptr) + CHECK(cudnnDestroy(cudnn_)); + + if (kernel_weights_d_ != nullptr) + CHECK(cudaFree(kernel_weights_d_)); + if (bias_weights_d_ != nullptr) + CHECK(cudaFree(bias_weights_d_)); + + c_desc_ = nullptr; + w_desc_ = nullptr; + x_desc_ = nullptr; + y_desc_ = nullptr; + b_desc_ = nullptr; + cudnn_ = nullptr; + + kernel_weights_d_ = nullptr; + bias_weights_d_ = nullptr; + + assert(!isValid()); + } + + size_t getWorkspaceSize(int maxBatchSize) const + { + assert(isValid()); + assert(max_batch_size_ == maxBatchSize); + + return workspace_bytes_; + } + + int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override + { + assert(isValid()); + // REVIEW alexeyk: for now assuming batch size always equals max batch size. + // That's pretty strict as it disables dynamic batch sizes but fine for now. + assert(batchSize == max_batch_size_); + + cudnnStatus_t status; + + CHECK(status = cudnnSetStream(cudnn_, stream)); + + size_t workspace_used_bytes = 0; + // Convert to FP16 first if needed. 
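+        // When weights are FP16, preprocessInput converts the FP32 input into the
+        // head of the workspace and reports the bytes used; the rest of the
+        // workspace is handed to cuDNN below.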
+ auto px = preprocessInput(batchSize, inputs[0], workspace, stream, workspace_used_bytes); + assert(px != nullptr); + assert(workspace_used_bytes <= workspace_bytes_); + + CHECK(status = cudnnConvolutionForward(cudnn_, &Consts::kOne, x_desc_, px, w_desc_, kernel_weights_d_, + c_desc_, best_algo_, + static_cast(workspace) + workspace_used_bytes, workspace_bytes_ - workspace_used_bytes, + &Consts::kZero, y_desc_, outputs[0])); + + if (bias_weights_.count > 0) + CHECK(status = cudnnAddTensor(cudnn_, &Consts::kOne, b_desc_, bias_weights_d_, &Consts::kOne, y_desc_, outputs[0])); + + // Convert back to FP32 if needed. + postprocessOutput(batchSize, outputs[0], workspace, stream); + + return status == CUDNN_STATUS_SUCCESS ? 0 : -1; + } + + size_t getSerializationSize() override + { + assert(isValid()); + return 0; + } + + void serialize(void* buffer) override + { + assert(isValid()); + // REVIEW alexeyk: implement. + assert(false); + } + +private: + bool isValid() const + { + return cudnn_ != nullptr; + } + + bool isFP16() const + { + return weights_type_ == CUDNN_DATA_HALF; + } + + size_t getWeightsDataTypeSize() const + { + return (isFP16() ? sizeof(uint16_t) : sizeof(float)); + } + + const void* preprocessInput(int batchSize, const void* x, void* workspace, cudaStream_t stream, size_t& workspace_used_bytes) + { + if (!isFP16()) + return x; + + assert(data_type_ == CUDNN_DATA_FLOAT); + + // Convert to FP16 using workspace. + size_t x_size = batchSize * DimsUtils::getTensorSize(x_dims_); + CHECK(CudaKernels::fp32Tofp16((const float*)x, (uint16_t*)workspace, x_size, stream)); + + workspace_used_bytes = x_size * sizeof(uint16_t); + return workspace; + } + + void postprocessOutput(int batchSize, void* y, void* workspace, cudaStream_t stream) + { + if (!isFP16()) + return; + + assert(data_type_ == CUDNN_DATA_FLOAT); + + size_t y_size = batchSize * DimsUtils::getTensorSize(y_dims_); + // Copy to workspace first. + CHECK(cudaMemcpyAsync(workspace, y, y_size * sizeof(uint16_t), cudaMemcpyDeviceToDevice, stream)); + // Convert to FP32 from workspace. + CHECK(CudaKernels::fp16Tofp32((const uint16_t*)workspace, (float*)y, y_size, stream)); + } + + void createDescriptors() + { + if (cudnn_ == nullptr) + CHECK(cudnnCreate(&cudnn_)); + if (x_desc_ == nullptr) + CHECK(cudnnCreateTensorDescriptor(&x_desc_)); + if (y_desc_ == nullptr) + CHECK(cudnnCreateTensorDescriptor(&y_desc_)); + if (w_desc_ == nullptr) + CHECK(cudnnCreateFilterDescriptor(&w_desc_)); + if (c_desc_ == nullptr) + CHECK(cudnnCreateConvolutionDescriptor(&c_desc_)); + if (b_desc_ == nullptr) + CHECK(cudnnCreateTensorDescriptor(&b_desc_)); + } + + void findBestAlgo() + { + // Let's hope cuDNN team will not come up with more than that number of algos (8 in cuDNN 7). + const int algo_count = 20; + int res_algo_count; + cudnnConvolutionFwdAlgoPerf_t algos[algo_count]; + CHECK(cudnnFindConvolutionForwardAlgorithm(cudnn_, x_desc_, w_desc_, c_desc_, y_desc_, + algo_count, &res_algo_count, algos)); + + assert(res_algo_count > 0); + assert(algos[0].status == CUDNN_STATUS_SUCCESS); + + // Best algo is the first. + best_algo_ = algos[0].algo; + workspace_bytes_ = algos[0].memory; + + // Log results. 
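+        // Each logged line reports the algo enum value, its measured time and the
+        // required workspace size, e.g. "1:      0.8ms,     4096B".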
+ log_.log(ILogger::Severity::kINFO, (name_ + ": --> Conv3D layer tuning results:").c_str()); + for (auto& a: algos) + { + if (a.status != CUDNN_STATUS_SUCCESS) + break; + std::ostringstream str; + str << a.algo << ": " << std::fixed << std::setw(8) << std::setprecision(1) << a.time << "ms, " + << std::fixed << std::setw(8) << a.memory << "B"; + log_.log(ILogger::Severity::kINFO, str.str().c_str()); + } + log_.log(ILogger::Severity::kINFO, (name_ + ": <-- Conv3D layer tuning results.").c_str()); + } + +private: + Conv3DType conv_type_; + cudnnDataType_t data_type_; + cudnnDataType_t weights_type_; + + // Using DimsNCHW to represent 3D convos input/output is an ugly workaround + // of TRT limitations which currently result in assert in the guts of TRT. + DimsNCHW x_dims_; + DimsNCHW y_dims_; + Dims w_dims_; + Dims stride_dims_; + Dims pad_start_dims_; + Dims pad_end_dims_; + + int max_batch_size_ = 0; + + // Kernel weights on the host. + Weights kernel_weights_; + // Kernel weights on the device. + float* kernel_weights_d_ = nullptr; + + // Bias weights on the host. + Weights bias_weights_; + // Bias weights on the device. + float* bias_weights_d_ = nullptr; + + cudnnHandle_t cudnn_ = nullptr; + cudnnTensorDescriptor_t x_desc_ = nullptr; + cudnnTensorDescriptor_t y_desc_ = nullptr; + cudnnFilterDescriptor_t w_desc_ = nullptr; + cudnnConvolutionDescriptor_t c_desc_ = nullptr; + cudnnTensorDescriptor_t b_desc_ = nullptr; + + cudnnConvolutionFwdAlgo_t best_algo_ = (cudnnConvolutionFwdAlgo_t)-1; + size_t workspace_bytes_ = 0; + + ILogger& log_; + std::string name_; +}; + +// Factory method. +IPlugin* PluginContainer::createConv3DPlugin(Conv3DType conv_type, Dims kernel_dims, + Dims stride_dims, Dims pad_start_dims, Dims pad_end_dims, + Weights kernel_weights, Weights bias_weights, + std::string name) +{ + std::lock_guard lock(lock_); + plugins_.push_back(new Conv3DPlugin(conv_type, kernel_dims, + stride_dims, pad_start_dims, pad_end_dims, + kernel_weights, bias_weights, + log_, name)); + return plugins_.back(); +} + +} } \ No newline at end of file diff --git a/stereoDNN/lib/conv3d_transpose_plugin.cpp b/stereoDNN/lib/conv3d_transpose_plugin.cpp new file mode 100644 index 0000000..55c50fe --- /dev/null +++ b/stereoDNN/lib/conv3d_transpose_plugin.cpp @@ -0,0 +1,413 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. + +#include "internal_utils.h" +#include +#include +#include +#include "conv_utils.h" + +namespace redtail { namespace tensorrt +{ + +using namespace nvinfer1; + +// ----------------------------------------------------------------- +// Transposed 3D convolution plugin. +// For dimension members naming we loosely follow the notation +// from cuDNN where: +// - transposed convolution input is denoted as dy. +// - transposed convolution output is denoted as dx. +// This makes a code more consistent as a convolution input is +// a transposed convolution output and vice versa. 
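+// For example, a forward convolution mapping x:(C, D, H, W) to y:(K, D/2, H/2, W/2)
+// with stride 2 corresponds to a transposed convolution mapping dy:(K, D/2, H/2, W/2)
+// back to dx:(C, D, H, W).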
+// ----------------------------------------------------------------- +class Conv3DTransposePlugin: public IPlugin +{ +public: + Conv3DTransposePlugin(Conv3DType conv_type, Dims kernel_dims, + Dims out_dims, Dims stride_dims, + Dims pad_start_dims, Dims pad_end_dims, + Weights kernel_weights, Weights bias_weights, + ILogger& log, std::string name): + conv_type_(conv_type), w_dims_(kernel_dims), + stride_dims_(stride_dims), pad_start_dims_(pad_start_dims), pad_end_dims_(pad_end_dims), + kernel_weights_(kernel_weights), bias_weights_(bias_weights), + log_(log), name_(name) + { + // REVIEW alexeyk: TRT currently does not support FP16 data tensors so we + // use weights tensor data type for all descriptors. In case weights + // are in FP16 we'll do the conversion on the fly. This should be changed + // when TRT adds full support for FP16. + // For FP16 we support only TRUE_HALF_CONFIG mode: + // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionForward + data_type_ = CUDNN_DATA_FLOAT; + + // Expecting kernel to be 5D tensor in KVCRS format. + assert(w_dims_.nbDims == 5); + + // Expecting stride to be 3D tensor in DHW format. + assert(stride_dims.nbDims == 3); + + // Expecting padding to be 3D tensors in DHW format. + assert(pad_start_dims.nbDims == 3); + assert(pad_end_dims.nbDims == 3); + // Currently only symmetric padding is supported for H,W dims. + assert(pad_start_dims_.d[1] == pad_end_dims_.d[1]); + assert(pad_start_dims_.d[2] == pad_end_dims_.d[2]); + // Special case (TF-compatible) of asymmetric padding is supported for D dim. + assert(pad_start_dims_.d[0] == pad_end_dims_.d[0] || pad_start_dims_.d[0] == pad_end_dims_.d[0] - 1); + + // Expecting output to be 4D tensor in CDHW format. + assert(out_dims.nbDims == 4); + x_dims_ = DimsNCHW(out_dims.d[0], out_dims.d[1], out_dims.d[2], out_dims.d[3]); + + b_dims_ = Dims{5, {1, 1, x_dims_.d[1], 1, 1}}; + + // TRT supprots FP32/FP16 weights. + assert(kernel_weights_.type == DataType::kFLOAT || kernel_weights_.type == DataType::kHALF); + assert(kernel_weights_.count > 0 && kernel_weights_.values != nullptr); + // TRT supprots FP32/FP16 weights. + assert(bias_weights_.type == DataType::kFLOAT || bias_weights_.type == DataType::kHALF); + assert((bias_weights_.count > 0 && bias_weights_.values != nullptr) || + (bias_weights_.count == 0 && bias_weights_.values == nullptr)); + // Assume same type for simplicity. + assert(bias_weights_.type == kernel_weights_.type); + + weights_type_ = trtToCudnnDataType(kernel_weights_.type); + } + + Conv3DTransposePlugin(Conv3DTransposePlugin&&) = delete; + + int getNbOutputs() const override + { + return 1; + } + + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override + { + assert(index == 0); + assert(nbInputDims == 1); + assert(inputs[0].nbDims == 4); + + y_dims_ = DimsNCHW(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2], inputs[0].d[3]); + + createDescriptors(); + // Can use batch_size == 1 to set tensor descriptors initially. + // Set input descriptor (i.e. convolution output). + // Note: convolution output tensor is always in cuDNN format. + ConvUtils::setConv3DTensorDescriptor(Conv3DType::kCuDnn, y_dims_, 1, weights_type_, y_desc_, log_); + // Set conv operation descriptors. + ConvUtils::setConv3DOperationDescriptors(conv_type_, w_dims_, stride_dims_, pad_start_dims_, + weights_type_, w_desc_, c_desc_, log_); + // Set output descriptors (i.e. convolution input). + // xo_desc_ is needed in bias addition operation. 
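+        // Two descriptors are kept for x: x_desc_ uses the conv_type_-specific layout
+        // (e.g. the TF-style reshaping), while xo_desc_ keeps the original cuDNN layout.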
+ ConvUtils::setConv3DTensorDescriptor(conv_type_, x_dims_, 1, weights_type_, x_desc_, log_); + ConvUtils::setConv3DTensorDescriptor(Conv3DType::kCuDnn, x_dims_, 1, weights_type_, xo_desc_, log_); + // Verify that dimensions match. + std::array out_dims; + CHECK(cudnnGetConvolutionNdForwardOutputDim(c_desc_, x_desc_, w_desc_, out_dims.size(), out_dims.data())); + assert(DimsUtils::areEqual(y_dims_, DimsNCHW{out_dims[1], out_dims[2], out_dims[3], out_dims[4]})); + // Set bias descriptor. + ConvUtils::setConv3DBiasDescriptor(b_dims_, weights_type_, b_desc_, log_); + + return x_dims_; + } + + void configure(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, int maxBatchSize) override + { + assert(isValid()); + assert(nbInputs == 1); + assert(nbOutputs == 1); + assert(DimsUtils::areEqual(inputDims[0], y_dims_)); + assert(DimsUtils::areEqual(outputDims[0], x_dims_)); + + max_batch_size_ = maxBatchSize; + // Update in/out descriptors and run auto-tuner to find best (fastest) algo. + ConvUtils::setConv3DTensorDescriptor(Conv3DType::kCuDnn, y_dims_, maxBatchSize, weights_type_, y_desc_, log_); + ConvUtils::setConv3DTensorDescriptor(conv_type_, x_dims_, maxBatchSize, weights_type_, x_desc_, log_); + ConvUtils::setConv3DTensorDescriptor(Conv3DType::kCuDnn, x_dims_, maxBatchSize, weights_type_, xo_desc_, log_); + findBestAlgo(); + + const size_t elt_size = getWeightsDataTypeSize(); + + // Need workspace for FP32 -> FP16 conversion. + if (isFP16()) + workspace_bytes_ += max_batch_size_ * std::max(DimsUtils::getTensorSize(x_dims_), DimsUtils::getTensorSize(y_dims_)) * elt_size; + + // Allocate memory and copy weights. + CHECK(cudaMalloc(&kernel_weights_d_, kernel_weights_.count * elt_size)); + CHECK(cudaMemcpy(kernel_weights_d_, kernel_weights_.values, + kernel_weights_.count * elt_size, cudaMemcpyHostToDevice)); + + if (bias_weights_.count > 0) + { + CHECK(cudaMalloc(&bias_weights_d_, bias_weights_.count * elt_size)); + CHECK(cudaMemcpy(bias_weights_d_, bias_weights_.values, + bias_weights_.count * elt_size, cudaMemcpyHostToDevice)); + + } + log_.log(ILogger::Severity::kINFO, (name_ + ": InDims : " + DimsUtils::toString(y_dims_)).c_str()); + log_.log(ILogger::Severity::kINFO, (name_ + ": OutDims : " + DimsUtils::toString(x_dims_)).c_str()); + } + + int initialize() override + { + assert(isValid()); + return 0; + } + + void terminate() override + { + assert(isValid()); + + if (c_desc_ != nullptr) + CHECK(cudnnDestroyConvolutionDescriptor(c_desc_)); + if (w_desc_ != nullptr) + CHECK(cudnnDestroyFilterDescriptor(w_desc_)); + if (x_desc_ != nullptr) + CHECK(cudnnDestroyTensorDescriptor(x_desc_)); + if (xo_desc_ != nullptr) + CHECK(cudnnDestroyTensorDescriptor(xo_desc_)); + if (y_desc_ != nullptr) + CHECK(cudnnDestroyTensorDescriptor(y_desc_)); + if (b_desc_ != nullptr) + CHECK(cudnnDestroyTensorDescriptor(b_desc_)); + if (cudnn_ != nullptr) + CHECK(cudnnDestroy(cudnn_)); + + if (kernel_weights_d_ != nullptr) + CHECK(cudaFree(kernel_weights_d_)); + if (bias_weights_d_ != nullptr) + CHECK(cudaFree(bias_weights_d_)); + + c_desc_ = nullptr; + w_desc_ = nullptr; + x_desc_ = nullptr; + xo_desc_ = nullptr; + y_desc_ = nullptr; + b_desc_ = nullptr; + cudnn_ = nullptr; + + kernel_weights_d_ = nullptr; + bias_weights_d_ = nullptr; + + assert(!isValid()); + } + + size_t getWorkspaceSize(int maxBatchSize) const + { + assert(isValid()); + assert(max_batch_size_ == maxBatchSize); + + return workspace_bytes_; + } + + int enqueue(int batchSize, const void*const * inputs, void** outputs, void* 
workspace, cudaStream_t stream) override + { + assert(isValid()); + // REVIEW alexeyk: for now assuming batch size always equals max batch size. + // That's pretty strict as it disables dynamic batch sizes but fine for now. + assert(batchSize == max_batch_size_); + + cudnnStatus_t status; + cudaError_t cu_status = cudaSuccess; + + CHECK(status = cudnnSetStream(cudnn_, stream)); + + size_t workspace_used_bytes = 0; + // Convert to FP16 first if needed. + auto py = preprocessInput(batchSize, inputs[0], workspace, stream, workspace_used_bytes); + assert(py != nullptr); + assert(workspace_used_bytes <= workspace_bytes_); + + CHECK(status = cudnnConvolutionBackwardData(cudnn_, &Consts::kOne, w_desc_, kernel_weights_d_, + y_desc_, inputs[0], c_desc_, best_algo_, + static_cast(workspace) + workspace_used_bytes, workspace_bytes_ - workspace_used_bytes, + &Consts::kZero, x_desc_, outputs[0])); + + if (bias_weights_.count > 0) + { + // REVIEW alexeyk: cuDNN limitation - only second outermost dimension is supported. + if (b_dims_.d[1] == x_dims_.d[0]) + CHECK(status = cudnnAddTensor(cudnn_, &Consts::kOne, b_desc_, bias_weights_d_, &Consts::kOne, x_desc_, outputs[0])); + else if (b_dims_.d[2] == x_dims_.d[1]) + CHECK(cu_status = CudaKernels::addDBiasTo3DConv(bias_weights_d_, b_dims_, static_cast(outputs[0]), x_dims_, stream)); + else + assert(false); + } + + // Convert back to FP32 if needed. + postprocessOutput(batchSize, outputs[0], workspace, stream); + + return (status == CUDNN_STATUS_SUCCESS && cu_status == cudaSuccess) ? 0 : -1; + } + + size_t getSerializationSize() override + { + assert(isValid()); + return 0; + } + + void serialize(void* buffer) override + { + assert(isValid()); + // REVIEW alexeyk: implement. + assert(false); + } + +private: + bool isValid() const + { + return cudnn_ != nullptr; + } + + bool isFP16() const + { + return weights_type_ == CUDNN_DATA_HALF; + } + + size_t getWeightsDataTypeSize() const + { + return (isFP16() ? sizeof(uint16_t) : sizeof(float)); + } + + const void* preprocessInput(int batchSize, const void* y, void* workspace, cudaStream_t stream, size_t& workspace_used_bytes) + { + if (!isFP16()) + return y; + + assert(data_type_ == CUDNN_DATA_FLOAT); + + // Convert to FP16 using workspace. + size_t y_size = batchSize * DimsUtils::getTensorSize(y_dims_); + CHECK(CudaKernels::fp32Tofp16((const float*)y, (uint16_t*)workspace, y_size, stream)); + + workspace_used_bytes = y_size * sizeof(uint16_t); + return workspace; + } + + void postprocessOutput(int batchSize, void* x, void* workspace, cudaStream_t stream) + { + if (!isFP16()) + return; + + assert(data_type_ == CUDNN_DATA_FLOAT); + + size_t x_size = batchSize * DimsUtils::getTensorSize(x_dims_); + // Copy to workspace first. + CHECK(cudaMemcpyAsync(workspace, x, x_size * sizeof(uint16_t), cudaMemcpyDeviceToDevice, stream)); + // Convert to FP32 from workspace. 
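+        // The copy above is required because the conversion cannot run in place:
+        // writing 4-byte floats would overwrite 2-byte halves that have not been read yet.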
+ CHECK(CudaKernels::fp16Tofp32((const uint16_t*)workspace, (float*)x, x_size, stream)); + } + + void createDescriptors() + { + if (cudnn_ == nullptr) + CHECK(cudnnCreate(&cudnn_)); + if (x_desc_ == nullptr) + CHECK(cudnnCreateTensorDescriptor(&x_desc_)); + if (xo_desc_ == nullptr) + CHECK(cudnnCreateTensorDescriptor(&xo_desc_)); + if (y_desc_ == nullptr) + CHECK(cudnnCreateTensorDescriptor(&y_desc_)); + if (w_desc_ == nullptr) + CHECK(cudnnCreateFilterDescriptor(&w_desc_)); + if (c_desc_ == nullptr) + CHECK(cudnnCreateConvolutionDescriptor(&c_desc_)); + if (b_desc_ == nullptr) + CHECK(cudnnCreateTensorDescriptor(&b_desc_)); + } + + void findBestAlgo() + { + // Let's hope cuDNN team will not come up with more than that number of algos (7 in cuDNN 7). + const int algo_count = 20; + int res_algo_count; + cudnnConvolutionBwdDataAlgoPerf_t algos[algo_count]; + // REVIEW alexeyk: cuDNN issue: it looks like cudnnConvolutionBackwardData hangs when + // CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 is chosen. Same problem with Ex version. +#if false + // CHECK(cudnnFindConvolutionBackwardDataAlgorithm(cudnn_, w_desc_, y_desc_, c_desc_, x_desc_, + // algo_count, &res_algo_count, algos)); +#else + CHECK(cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnn_, w_desc_, y_desc_, c_desc_, x_desc_, + algo_count, &res_algo_count, algos)); +#endif + assert(res_algo_count > 0); + assert(algos[0].status == CUDNN_STATUS_SUCCESS); + + // Best algo is the first. + best_algo_ = algos[0].algo; + workspace_bytes_ = algos[0].memory; + + // Log results. + log_.log(ILogger::Severity::kINFO, (name_ + ": --> Conv3DTranspose layer tuning results:").c_str()); + for (auto& a: algos) + { + if (a.status != CUDNN_STATUS_SUCCESS) + break; + std::ostringstream str; + str << a.algo << ": " << std::fixed << std::setw(8) << std::setprecision(1) << a.time << "ms, " + << std::fixed << std::setw(8) << a.memory << "B"; + log_.log(ILogger::Severity::kINFO, str.str().c_str()); + } + log_.log(ILogger::Severity::kINFO, (name_ + ": <-- Conv3DTranspose layer tuning results.").c_str()); + } + +private: + Conv3DType conv_type_; + cudnnDataType_t data_type_; + cudnnDataType_t weights_type_; + + // Using DimsNCHW to represent 3D convos input/output is an ugly workaround + // of TRT limitations which currently result in assert in the guts of TRT. + DimsNCHW x_dims_; + DimsNCHW y_dims_; + Dims w_dims_; + Dims stride_dims_; + Dims pad_start_dims_; + Dims pad_end_dims_; + Dims b_dims_; + + int max_batch_size_ = 0; + + // Kernel weights on the host. + Weights kernel_weights_; + // Kernel weights on the device. + float* kernel_weights_d_ = nullptr; + + // Bias weights on the host. + Weights bias_weights_; + // Bias weights on the device. + float* bias_weights_d_ = nullptr; + + cudnnHandle_t cudnn_ = nullptr; + cudnnTensorDescriptor_t x_desc_ = nullptr; + cudnnTensorDescriptor_t xo_desc_ = nullptr; // Original descriptor of x. + cudnnTensorDescriptor_t y_desc_ = nullptr; + cudnnFilterDescriptor_t w_desc_ = nullptr; + cudnnConvolutionDescriptor_t c_desc_ = nullptr; + cudnnTensorDescriptor_t b_desc_ = nullptr; + + cudnnConvolutionBwdDataAlgo_t best_algo_ = (cudnnConvolutionBwdDataAlgo_t)-1; + size_t workspace_bytes_ = 0; + + ILogger& log_; + std::string name_; +}; + +// Factory method. 
+IPlugin* PluginContainer::createConv3DTransposePlugin(Conv3DType conv_type, Dims kernel_dims, Dims out_dims, + Dims stride_dims, Dims pad_start_dims, Dims pad_end_dims, + Weights kernel_weights, Weights bias_weights, + std::string name) +{ + std::lock_guard lock(lock_); + plugins_.push_back(new Conv3DTransposePlugin(conv_type, kernel_dims, out_dims, + stride_dims, pad_start_dims, pad_end_dims, + kernel_weights, bias_weights, + log_, name)); + return plugins_.back(); +} + +} } \ No newline at end of file diff --git a/stereoDNN/lib/conv_utils.cpp b/stereoDNN/lib/conv_utils.cpp new file mode 100644 index 0000000..59f75d4 --- /dev/null +++ b/stereoDNN/lib/conv_utils.cpp @@ -0,0 +1,112 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. + +#include "conv_utils.h" +#include +#include +#include "internal_utils.h" + +namespace redtail { namespace tensorrt +{ + +using namespace nvinfer1; + +void ConvUtils::setConv3DTensorDescriptor(Conv3DType conv_type, Dims dims, int batch_size, + cudnnDataType_t data_type, cudnnTensorDescriptor_t& desc, + ILogger& log) +{ + assert(dims.nbDims == 4); + assert(batch_size > 0); + assert(desc != nullptr); + + int c = dims.d[0]; + int d = dims.d[1]; + int h = dims.d[2]; + int w = dims.d[3]; + + // Reshape input if needed. + if (conv_type == Conv3DType::kTensorFlow) + { + c = 1; + d = dims.d[0] * dims.d[1]; + } + + std::array full_dims = { batch_size, c, d, h, w }; + std::array strides = { c * d * h * w, + d * h * w, + h * w, + w, + 1}; + // Sanity check. + assert((int64_t)c * d * h * w == c * d * h * w); + + CHECKL(cudnnSetTensorNdDescriptor(desc, data_type, full_dims.size(), full_dims.data(), strides.data()), log); +} + +void ConvUtils::setConv3DOperationDescriptors(Conv3DType conv_type, Dims w_dims, Dims stride_dims, Dims pad_dims, + cudnnDataType_t data_type, + cudnnFilterDescriptor_t& w_desc, cudnnConvolutionDescriptor_t& c_desc, + ILogger& log) +{ + assert(w_dims.nbDims == 5); + assert(stride_dims.nbDims == 3); + assert(pad_dims.nbDims == 3); + assert(w_desc != nullptr); + assert(c_desc != nullptr); + + // Reshape/update if needed. + if (conv_type == Conv3DType::kTensorFlow) + { + int c = w_dims.d[1]; + int v = w_dims.d[2]; + + // Reshape filter. + w_dims.d[1] = 1; + w_dims.d[2] = c * v; + + // Update stride. + stride_dims.d[0] *= v; + + // Update padding. + pad_dims.d[0] *= v; + } + + // Set filter descriptor. + CHECKL(cudnnSetFilterNdDescriptor(w_desc, data_type, CUDNN_TENSOR_NCHW, w_dims.nbDims, w_dims.d), log); + + // Set convolution descriptor. + std::array dilation = { 1, 1, 1 }; + CHECKL(cudnnSetConvolutionNdDescriptor(c_desc, pad_dims.nbDims, pad_dims.d, stride_dims.d, dilation.data(), + CUDNN_CROSS_CORRELATION, data_type), log); +} + +Dims ConvUtils::getConv3DOutputDims(cudnnConvolutionDescriptor_t c_desc, cudnnTensorDescriptor_t x_desc, + cudnnFilterDescriptor_t w_desc, ILogger& log) +{ + assert(c_desc != nullptr); + assert(x_desc != nullptr); + assert(w_desc != nullptr); + + // 5D tensor in (N, K, D, H, W) format. + Dims y_dims; + y_dims.nbDims = 5; + CHECKL(cudnnGetConvolutionNdForwardOutputDim(c_desc, x_desc, w_desc, y_dims.nbDims, y_dims.d), log); + return y_dims; +} + +void ConvUtils::setConv3DBiasDescriptor(Dims dims, cudnnDataType_t data_type, + cudnnTensorDescriptor_t& desc, ILogger& log) +{ + assert(dims.nbDims == 5); + assert(desc != nullptr); + + // REVIEW alexeyk: see the comment in tensorrt_model_builder.py re: the stride issue in Conv3D. 
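+    // The strides below are the fully-packed strides of the 5D bias tensor, e.g.
+    // dims {1, K, 1, 1, 1} yield strides {K, 1, 1, 1, 1}.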
+    std::array<int, 5> strides = {dims.d[1] * dims.d[2] * dims.d[3] * dims.d[4],
+                                  dims.d[2] * dims.d[3] * dims.d[4],
+                                  dims.d[3] * dims.d[4],
+                                  dims.d[4],
+                                  1};
+    CHECKL(cudnnSetTensorNdDescriptor(desc, data_type, dims.nbDims, dims.d, strides.data()), log);
+}
+
+} }
\ No newline at end of file
diff --git a/stereoDNN/lib/conv_utils.h b/stereoDNN/lib/conv_utils.h
new file mode 100644
index 0000000..a479343
--- /dev/null
+++ b/stereoDNN/lib/conv_utils.h
@@ -0,0 +1,100 @@
+// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+// Full license terms provided in LICENSE.md file.
+
+#ifndef REDTAIL_CONV_UTILS_H
+#define REDTAIL_CONV_UTILS_H
+
+#include 
+#include 
+#include "redtail_tensorrt_plugins.h"
+
+namespace redtail { namespace tensorrt
+{
+
+using namespace nvinfer1;
+
+// -----------------------------------------------------------------
+// Contains various helper methods that set cuDNN convolution descriptors.
+//
+// Notes on the 3D convolution implementation in cuDNN:
+// 1. The convolution input is represented as a 4D tensor (ignoring the batch size dimension):
+//    (C, Di, Hi, Wi), where C is the number of 3D feature maps and Di, Hi, Wi are the
+//    depth, height and width of the input over which the convolution is performed.
+//    The convolution is performed by the filter (described in detail below) over the
+//    (Di, Hi, Wi) dimensions. This is similar to the 2D case, where the convolution
+//    is performed over the (Hi, Wi) dimensions of an input that has C feature maps.
+// 2. The convolution filter is represented as a 5D tensor (cuDNN-like notation):
+//    (K, C, V, R, S), where K is the number of output feature maps (as in 2D convolution),
+//    C must be equal to C of the input (as in 2D convolution), and V, R, S are the
+//    dimensions of the filter that go along the Di, Hi, Wi dimensions of the input.
+// 3. The convolution output is represented as a 4D tensor (ignoring the batch dimension):
+//    (K, Do, Ho, Wo), where K is the number of output feature maps (equal to the K dim
+//    of the filter), and Do, Ho, Wo are the dimensions of the output, computed
+//    according to the usual rules of convolution dimensions, taking into
+//    account padding and strides.
+//
+// This differs from the 3D convolution implementation in some other
+// DL toolkits; for example, the TensorFlow conv3d operator computes it differently.
+// REVIEW alexeyk: finish documenting Conv3DType.
+//
+// We also have to abuse the TensorRT DimsNCHW type in a few places to represent 3D convolutions,
+// as TRT (as of v3.0) does not support generic 4D/5D tensors - it fails with an assert
+// somewhere in the guts of TRT when using the generic Dims type.
+// -----------------------------------------------------------------
+class ConvUtils
+{
+public:
+    // -----------------------------------------------------------------
+    // Sets the descriptor for 3D convolution input/output.
+    // This method is used in:
+    // - Conv3DPlugin:
+    //   sets the plugin input descriptor, which is a convolution input.
+    //   sets the plugin output descriptor, which is a convolution output.
+    // - Conv3DTransposePlugin:
+    //   sets the plugin output descriptor, which is a transposed convolution output.
+    //   sets the plugin input descriptor, which is a transposed convolution input.
+    //
+    // dims      : tensor dimensions, must have rank 4.
+    // batch_size: batch size, must be positive.
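+    //
+    // For example, dims (C, Di, Hi, Wi) = (32, 17, 81, 257) with batch_size 1 produce
+    // a 5D cuDNN descriptor (1, 32, 17, 81, 257) for Conv3DType::kCuDnn, or
+    // (1, 1, 32*17, 81, 257) for Conv3DType::kTensorFlow (the TF-style reshape).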
+ // ----------------------------------------------------------------- + static void setConv3DTensorDescriptor(Conv3DType conv_type, Dims dims, int batch_size, + cudnnDataType_t data_type, cudnnTensorDescriptor_t& desc, + ILogger& log); + + // ----------------------------------------------------------------- + // Sets convolution filter and operation descriptors for 3D convolution. + // This method is used in: + // - Conv3DPlugin: + // - Conv3DTransposePlugin: + // + // w_dims: + // Filter dimensions, must have rank 5. The filter tensor is assumed to be in (K, C, V, R, S) format. + // pad_dims: + // Input padding dimensions, must have rank 3. The padding is for (Di, Hi, Wi) dimensions of the + // convolution input (transposed conovlution output). + // stride_dims: + // Filter stride dimensions, must have rank 3. The stride is for (Di, Hi, Wi) dimensions of the + // convolution input (transposed conovlution output). + // ----------------------------------------------------------------- + static void setConv3DOperationDescriptors(Conv3DType conv_type, Dims w_dims, Dims stride_dims, Dims pad_dims, + cudnnDataType_t data_type, + cudnnFilterDescriptor_t& w_desc, cudnnConvolutionDescriptor_t& c_desc, + ILogger& log); + + // ----------------------------------------------------------------- + // Returns convolution output dimensions given input, filter and + // convolution descriptors. + // ----------------------------------------------------------------- + static Dims getConv3DOutputDims(cudnnConvolutionDescriptor_t c_desc, cudnnTensorDescriptor_t x_desc, + cudnnFilterDescriptor_t w_desc, ILogger& log); + + static void setConv3DBiasDescriptor(Dims dims, cudnnDataType_t data_type, + cudnnTensorDescriptor_t& desc, ILogger& log); + +public: + ConvUtils(ConvUtils&&) = delete; +}; + +} } + +#endif \ No newline at end of file diff --git a/stereoDNN/lib/cost_volume_plugin.cpp b/stereoDNN/lib/cost_volume_plugin.cpp new file mode 100644 index 0000000..0a9fe0b --- /dev/null +++ b/stereoDNN/lib/cost_volume_plugin.cpp @@ -0,0 +1,118 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. + +#include "internal_utils.h" +#include + +namespace redtail { namespace tensorrt +{ + +using namespace nvinfer1; + +// ----------------------------------------------------------------- +// Cost volume plugin. +// ----------------------------------------------------------------- +class CostVolumePlugin: public IPlugin +{ +public: + CostVolumePlugin(int max_disparity, ILogger& log, std::string name): + max_disparity_(max_disparity), log_(log), name_(name) + { + assert(max_disparity_ > 0); + } + + CostVolumePlugin(CostVolumePlugin&&) = delete; + + int getNbOutputs() const override + { + return 1; + } + + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override + { + // Expecting 2 3D inputs, left and right, which are outputs of 2D convolutions + // and are of the same shape. 
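+        // The output volume is (max_disparity, 2*C, H, W): for each disparity plane d,
+        // the left features are stored together with the right features shifted by d
+        // pixels (zero-padded where the shift runs off the image).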
+ assert(nbInputDims == 2); + for (int i = 0; i < nbInputDims; i++) + { + assert(inputs[i].nbDims == 3); + assert(inputs[i].type[0] == DimensionType::kCHANNEL); + assert(inputs[i].type[1] == DimensionType::kSPATIAL); + assert(inputs[i].type[2] == DimensionType::kSPATIAL); + } + assert(DimsUtils::areEqual(inputs[0], inputs[1])); + + in_dims_ = inputs[0]; + out_dims_ = DimsNCHW(max_disparity_, 2 * in_dims_.d[0], in_dims_.d[1], in_dims_.d[2]); + return out_dims_; + } + + void configure(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, int maxBatchSize) override + { + assert(nbInputs == 2); + assert(nbOutputs == 1); + assert(DimsUtils::areEqual(inputDims[0], in_dims_)); + assert(DimsUtils::areEqual(inputDims[1], in_dims_)); + assert(DimsUtils::areEqual(outputDims[0], out_dims_)); + assert(maxBatchSize == 1); + + log_.log(ILogger::Severity::kINFO, (name_ + ": InDims(x2): " + DimsUtils::toString(in_dims_)).c_str()); + log_.log(ILogger::Severity::kINFO, (name_ + ": OutDims : " + DimsUtils::toString(out_dims_)).c_str()); + } + + int initialize() override + { + return 0; + } + + void terminate() override + { + } + + size_t getWorkspaceSize(int maxBatchSize) const + { + assert(maxBatchSize == 1); + return 0; + } + + int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override + { + assert(batchSize == 1); + + auto pleft = static_cast(inputs[0]); + auto pright = static_cast(inputs[1]); + auto pcost_vol = static_cast(outputs[0]); + + cudaError_t status; + CHECK(status = CudaKernels::computeCostVolume(pleft, pright, in_dims_, pcost_vol, out_dims_, stream)); + + return status; + } + + size_t getSerializationSize() override + { + return 0; + } + + void serialize(void* buffer) override + { + } + +private: + int max_disparity_; + Dims in_dims_; + DimsNCHW out_dims_; + + ILogger& log_; + std::string name_; +}; + +// Factory method. +IPlugin* PluginContainer::createCostVolumePlugin(int max_disparity, std::string name) +{ + std::lock_guard lock(lock_); + plugins_.push_back(new CostVolumePlugin(max_disparity, log_, name)); + return plugins_.back(); +} + +} } \ No newline at end of file diff --git a/stereoDNN/lib/elu_plugin.cpp b/stereoDNN/lib/elu_plugin.cpp new file mode 100644 index 0000000..762fc4e --- /dev/null +++ b/stereoDNN/lib/elu_plugin.cpp @@ -0,0 +1,163 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. + +#include "internal_utils.h" +#include + +namespace redtail { namespace tensorrt +{ + +using namespace nvinfer1; + +// ----------------------------------------------------------------- +// ELU activation function plugin. +// ----------------------------------------------------------------- +class EluPlugin: public IPlugin +{ +public: + EluPlugin(DataType data_type, ILogger& log, std::string name): + data_type_(trtToCudnnDataType(data_type)), log_(log), name_(name) + { + // REVIEW alexeyk: TRT currently does not support FP16 data tensors. + assert(data_type == DataType::kFLOAT); + } + + EluPlugin(EluPlugin&&) = delete; + + int getNbOutputs() const override + { + return 1; + } + + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override + { + assert(nbInputDims == 1); + in_dims_ = inputs[0]; + // No restrictions on input dims. 
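+        // ELU is applied elementwise, so the output shape is exactly the input shape.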
+ + return in_dims_; + } + + void configure(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, int maxBatchSize) override + { + assert(nbInputs == 1); + assert(nbOutputs == 1); + // Sanity check. + assert(DimsUtils::areEqual(in_dims_, inputDims[0])); + assert(DimsUtils::areEqual(in_dims_, outputDims[0])); + + cudnnStatus_t status; + + CHECK(status = cudnnCreate(&cudnn_)); + CHECK(status = cudnnCreateActivationDescriptor(&act_)); + CHECK(status = cudnnSetActivationDescriptor(act_, CUDNN_ACTIVATION_ELU, CUDNN_PROPAGATE_NAN, 1.0)); + CHECK(status = cudnnCreateTensorDescriptor(&t_desc_)); + + setTensorDescriptor(maxBatchSize); + + log_.log(ILogger::Severity::kINFO, (name_ + ": Dims: " + DimsUtils::toString(in_dims_)).c_str()); + } + + int initialize() override + { + return 0; + } + + void terminate() override + { + assert(isValid()); + + if (act_ != nullptr) + CHECK(cudnnDestroyActivationDescriptor(act_)); + if (t_desc_ != nullptr) + CHECK(cudnnDestroyTensorDescriptor(t_desc_)); + if (cudnn_ != nullptr) + CHECK(cudnnDestroy(cudnn_)); + act_ = nullptr; + t_desc_ = nullptr; + cudnn_ = nullptr; + + assert(!isValid()); + } + + size_t getWorkspaceSize(int maxBatchSize) const + { + return 0; + } + + int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override + { + assert(isValid()); + + cudnnStatus_t status; + + CHECK(status = cudnnSetStream(cudnn_, stream)); + if (batchSize != tensor_dims_.d[0]) + updateTensorDescriptor(batchSize); + CHECK(status = cudnnActivationForward(cudnn_, act_, &Consts::kOne, t_desc_, inputs[0], &Consts::kZero, t_desc_, outputs[0])); + + return status == CUDNN_STATUS_SUCCESS ? 0 : -1; + } + + size_t getSerializationSize() override + { + assert(isValid()); + return 0; + } + + void serialize(void* buffer) override + { + assert(isValid()); + } + +private: + void setTensorDescriptor(int batch_size) + { + assert(isValid()); + + tensor_dims_.nbDims = in_dims_.nbDims + 1; + tensor_dims_.d[0] = batch_size; + std::copy(in_dims_.d, in_dims_.d + in_dims_.nbDims, tensor_dims_.d + 1); + + tensor_strides_ = DimsUtils::getStrides(tensor_dims_); + + CHECK(cudnnSetTensorNdDescriptor(t_desc_, data_type_, tensor_dims_.nbDims, tensor_dims_.d, tensor_strides_.d)); + } + + // Updates tensor descriptor according to batch_size. + void updateTensorDescriptor(int batch_size) + { + // No other parameters require update. + tensor_dims_.d[0] = batch_size; + CHECK(cudnnSetTensorNdDescriptor(t_desc_, data_type_, tensor_dims_.nbDims, tensor_dims_.d, tensor_strides_.d)); + } + +private: + bool isValid() const + { + return cudnn_ != nullptr; + } + +private: + cudnnDataType_t data_type_; + cudnnHandle_t cudnn_ = nullptr; + cudnnActivationDescriptor_t act_ = nullptr; + cudnnTensorDescriptor_t t_desc_ = nullptr; + + Dims in_dims_; + Dims tensor_dims_; + Dims tensor_strides_; + + ILogger& log_; + std::string name_; +}; + +// Factory method. +IPlugin* PluginContainer::createEluPlugin(DataType data_type, std::string name) +{ + std::lock_guard lock(lock_); + plugins_.push_back(new EluPlugin(data_type, log_, name)); + return plugins_.back(); +} + +} } \ No newline at end of file diff --git a/stereoDNN/lib/internal_macros.h b/stereoDNN/lib/internal_macros.h new file mode 100644 index 0000000..30106e2 --- /dev/null +++ b/stereoDNN/lib/internal_macros.h @@ -0,0 +1,38 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. 
+ +#ifndef REDTAIL_INTERNAL_MACROS_H +#define REDTAIL_INTERNAL_MACROS_H + +// ----------------------------------------------------------------- +// CHECKL and CHECK macros check for status and log the message +// in case of status != 0. The difference between macros is CHECKL +// takes log argument explicitly while CHECK - assumes log_ variable +// is defined (used in class memebers). +// ----------------------------------------------------------------- +#undef CHECKL +#define CHECKL(status, log) do { \ + auto res = (status); \ + if ((int)res != 0) \ + redtail::tensorrt::reportError(res, __FILE__, __LINE__, __FUNCTION__, log); \ +} while(false) + +#undef CHECK +#define CHECK(status) CHECKL(status, log_) + +// ----------------------------------------------------------------- +// UNUSED is generic version of macro that allows to avoid unused +// variable warning. +// UNUSEDR disables such warning only in Release builds. +// ----------------------------------------------------------------- +#undef UNUSED +#define UNUSED(x) ((void)(x)) + +#undef UNUSEDR +#ifdef NDEBUG + #define UNUSEDR(x) ((void)(x)) +#else + #define UNUSEDR(x) +#endif + +#endif \ No newline at end of file diff --git a/stereoDNN/lib/internal_utils.cpp b/stereoDNN/lib/internal_utils.cpp new file mode 100644 index 0000000..8c2d232 --- /dev/null +++ b/stereoDNN/lib/internal_utils.cpp @@ -0,0 +1,236 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. + +#include "internal_utils.h" +#include +#include +#include +#include +#include + +namespace redtail { namespace tensorrt +{ + +const float Consts::kZero = 0.0f; +const float Consts::kOne = 1.0f; + +// ----------------------------------------------------------------- +// DimsUtils implementation. +// ----------------------------------------------------------------- +size_t DimsUtils::getTensorSize(Dims dims) +{ + assert(dims.nbDims >= 1); + size_t res = std::accumulate(dims.d, dims.d + dims.nbDims, (size_t)1, std::multiplies()); + // Dims.d is int, so overflow check. + assert(res == (size_t)std::accumulate(dims.d, dims.d + dims.nbDims, (int)1, std::multiplies())); + return res; +} + +Dims DimsUtils::getStrides(Dims dims) +{ + Dims strides; + strides.nbDims = dims.nbDims; + strides.d[strides.nbDims - 1] = 1; + for (int i = strides.nbDims - 2; i >= 0; i--) + { + strides.d[i] = strides.d[i + 1] * dims.d[i + 1]; + strides.type[i] = DimensionType::kSPATIAL; + } + return strides; +} + +// Equality check. The function checks for shape and size, +// but not for dimension types. +bool DimsUtils::areEqual(Dims d1, Dims d2) +{ + if (d1.nbDims != d2.nbDims) + return false; + return std::equal(d1.d, d1.d + d1.nbDims, d2.d); +} + +std::string DimsUtils::toString(Dims dims) +{ + std::ostringstream str; + str << "{"; + for (int i = 0; i < dims.nbDims; i++) + { + str << std::fixed << std::setw(4) << dims.d[i]; + if (i < dims.nbDims - 1) + str << ","; + } + str << "}"; + return str.str(); +} + +// ----------------------------------------------------------------- +// Conversion helpers. +// ----------------------------------------------------------------- +cudnnDataType_t trtToCudnnDataType(DataType trt_data_type) +{ + // REVIEW alexeyk: fp32/16 only for now. 
+ assert(trt_data_type == DataType::kFLOAT || trt_data_type == DataType::kHALF); + + if (trt_data_type == DataType::kFLOAT) + return CUDNN_DATA_FLOAT; + if (trt_data_type == DataType::kHALF) + return CUDNN_DATA_HALF; + return (cudnnDataType_t)-1; +} + +// ----------------------------------------------------------------- +// PluginContainer implementation. +// ----------------------------------------------------------------- + +std::unique_ptr IPluginContainer::create(ILogger& log) +{ + return std::make_unique(log); +} + +// ----------------------------------------------------------------- +// Plugins helper functions. +// ----------------------------------------------------------------- +ILayer* addElu(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + DataType data_type, const std::string& name) +{ + // Create plugin. + auto plugin = plugin_factory.createEluPlugin(data_type, name); + assert(plugin != nullptr); + // Add to the network. + ITensor* inputs[] = {&input}; + auto layer = network.addPlugin(inputs, 1, *plugin); + assert(layer != nullptr); + return layer; +} + +ILayer* addCostVolume(IPluginContainer& plugin_factory, INetworkDefinition& network, + ITensor& left_input, ITensor& right_input, int max_disparity, + const std::string& name) +{ + // Create plugin. + auto plugin = plugin_factory.createCostVolumePlugin(max_disparity, name); + assert(plugin != nullptr); + // Add to the network. + ITensor* inputs[] = {&left_input, &right_input}; + auto layer = network.addPlugin(inputs, 2, *plugin); + assert(layer != nullptr); + return layer; +} + +ILayer* addConv3D(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + Conv3DType conv_type, Dims kernel_dims, Dims stride_dims, + Dims pad_start_dims, Dims pad_end_dims, + Weights kernel_weights, Weights bias_weights, + const std::string& name) +{ + // Create plugin. + auto plugin = plugin_factory.createConv3DPlugin(conv_type, kernel_dims, + stride_dims, pad_start_dims, pad_end_dims, + kernel_weights, bias_weights, + name); + assert(plugin != nullptr); + // Add to the network. + ITensor* inputs[] = {&input}; + auto layer = network.addPlugin(inputs, 1, *plugin); + assert(layer != nullptr); + return layer; +} + +ILayer* addConv3DTranspose(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + Conv3DType conv_type, Dims kernel_dims, Dims out_dims, + Dims stride_dims, Dims pad_start_dims, Dims pad_end_dims, + Weights kernel_weights, Weights bias_weights, + const std::string& name) +{ + // Create plugin. + auto plugin = plugin_factory.createConv3DTransposePlugin(conv_type, kernel_dims, out_dims, + stride_dims, pad_start_dims, pad_end_dims, + kernel_weights, bias_weights, + name); + assert(plugin != nullptr); + // Add to the network. + ITensor* inputs[] = {&input}; + auto layer = network.addPlugin(inputs, 1, *plugin); + assert(layer != nullptr); + return layer; +} + +ILayer* addSlice(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + Dims dims, Dims slice_start, Dims slice_end, + const std::string& name) +{ + // Create plugin. + auto plugin = plugin_factory.createSlicePlugin(dims, slice_start, slice_end, name); + assert(plugin != nullptr); + // Add to the network. 
+ ITensor* inputs[] = {&input}; + auto layer = network.addPlugin(inputs, 1, *plugin); + assert(layer != nullptr); + return layer; +} + +ILayer* addTransform(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + Permutation permutation, + const std::string& name) +{ + // Create plugin. + auto plugin = plugin_factory.createTransformPlugin(permutation, name); + assert(plugin != nullptr); + // Add to the network. + ITensor* inputs[] = {&input}; + auto layer = network.addPlugin(inputs, 1, *plugin); + assert(layer != nullptr); + return layer; +} + +ILayer* addPad(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + DimsNCHW pad_start, DimsNCHW pad_end, + const std::string& name) +{ + // Create plugin. + auto plugin = plugin_factory.createPaddingPlugin(pad_start, pad_end, name); + assert(plugin != nullptr); + // Add to the network. + ITensor* inputs[] = {&input}; + auto layer = network.addPlugin(inputs, 1, *plugin); + assert(layer != nullptr); + return layer; +} + +ILayer* addSoftargmax(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + const std::string& name) +{ + // Create plugin. + auto plugin = plugin_factory.createSoftargmaxPlugin(name); + assert(plugin != nullptr); + // Add to the network. + ITensor* inputs[] = {&input}; + auto layer = network.addPlugin(inputs, 1, *plugin); + assert(layer != nullptr); + return layer; +} + +// ----------------------------------------------------------------- +// Error checking and reporting methods. +// ----------------------------------------------------------------- +void reportError(cudaError_t status, const char* file, int line, const char* func, ILogger& log) +{ + if (status == cudaSuccess) + return; + std::ostringstream str; + str << file << ":" << line << ": " << func << ": CUDA error " + << status << " (" << cudaGetErrorName(status) << ": " << cudaGetErrorString(status) << ")."; + log.log(ILogger::Severity::kERROR, str.str().c_str()); + assert(status == cudaSuccess); +} + +void reportError(cudnnStatus_t status, const char* file, int line, const char* func, ILogger& log) +{ + if (status == CUDNN_STATUS_SUCCESS) + return; + std::ostringstream str; + str << file << ":" << line << ": " << func << ": cuDNN error " << status << " (" << cudnnGetErrorString(status) << ")."; + log.log(ILogger::Severity::kERROR, str.str().c_str()); + assert(status == CUDNN_STATUS_SUCCESS); +} + +} } \ No newline at end of file diff --git a/stereoDNN/lib/internal_utils.h b/stereoDNN/lib/internal_utils.h new file mode 100644 index 0000000..d2742e0 --- /dev/null +++ b/stereoDNN/lib/internal_utils.h @@ -0,0 +1,147 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. + +#ifndef REDTAIL_INTERNAL_UTILS_H +#define REDTAIL_INTERNAL_UTILS_H + +#include +#include +#include +#include + +#include +#include +#include + +#include "internal_macros.h" +#include "redtail_tensorrt_plugins.h" + +namespace redtail { namespace tensorrt +{ + +using namespace nvinfer1; + +// ----------------------------------------------------------------- +// Constants. +// REVIEW alexeyk: float ony for now, refactor to support FP16/Int8. +// ----------------------------------------------------------------- +struct Consts +{ + static const float kZero; + static const float kOne; + +public: + Consts(Consts&&) = delete; +}; + +// ----------------------------------------------------------------- +// Various Dims helper functions. 
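+// For example, for dims {4, 8, 16}: getTensorSize returns 4*8*16 = 512 elements and
+// getStrides returns the fully-packed strides {128, 16, 1}.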
+// ----------------------------------------------------------------- +class DimsUtils +{ +public: + static size_t getTensorSize(Dims dims); + + static Dims getStrides(Dims dims); + + static bool areEqual(Dims d1, Dims d2); + + static std::string toString(Dims dims); + +public: + DimsUtils(DimsUtils&&) = delete; +}; + +// ----------------------------------------------------------------- +// Conversion helpers. +// ----------------------------------------------------------------- +cudnnDataType_t trtToCudnnDataType(DataType trt_data_type); + +// ----------------------------------------------------------------- +// CUDA kernels interface. +// ----------------------------------------------------------------- +class CudaKernels +{ +public: + template + static cudaError_t computeCostVolume(const T* left, const T* right, Dims in_dims, T* cost_vol, Dims out_dims, cudaStream_t stream); + + template + static cudaError_t addDBiasTo3DConv(const T* bias, Dims bias_dims, T* conv, Dims conv_dims, cudaStream_t stream); + + static cudaError_t fp32Tofp16(const float* src, uint16_t* dst, size_t size, cudaStream_t stream); + static cudaError_t fp16Tofp32(const uint16_t* src, float* dst, size_t size, cudaStream_t stream); + +public: + CudaKernels(CudaKernels&&) = delete; +}; + +// Template instantiation. +template<> +cudaError_t CudaKernels::computeCostVolume(const float*, const float*, Dims, float*, Dims, cudaStream_t); + +template<> +cudaError_t CudaKernels::addDBiasTo3DConv(const float*, Dims, float*, Dims, cudaStream_t); + +// ----------------------------------------------------------------- +// Simple implementation of IPluginContainer. +// The container keeps pointers to created plugins. +// ----------------------------------------------------------------- +class PluginContainer: public IPluginContainer +{ +public: + PluginContainer(ILogger& log): + log_(log) + { + } + + ~PluginContainer() noexcept = default; + + // Also disables copy ctor/assign ops. + PluginContainer(PluginContainer&&) = delete; + + // ELU plugin. + IPlugin* createEluPlugin(DataType data_type, std::string name) override; + + // Cost volume plugin. + IPlugin* createCostVolumePlugin(int max_disparity, std::string name) override; + + // 3D convolution. + IPlugin* createConv3DPlugin(Conv3DType conv_type, Dims kernel_dims, + Dims stride_dims, Dims pad_start_dims, Dims pad_end_dims, + Weights kernel_weights, Weights bias_weights, + std::string name) override; + + // Transposed 3D convolution. + IPlugin* createConv3DTransposePlugin(Conv3DType conv_type, Dims kernel_dims, Dims out_dims, + Dims stride_dims, Dims pad_start_dims, Dims pad_end_dims, + Weights kernel_weights, Weights bias_weights, + std::string name) override; + + // Transform. + IPlugin* createTransformPlugin(Permutation permutation, std::string name) override; + + // Padding. + IPlugin* createPaddingPlugin(DimsNCHW pad_start, DimsNCHW pad_end, + std::string name) override; + + IPlugin* createSlicePlugin(Dims dims, Dims slice_start, Dims slice_end, + std::string name) override; + + IPlugin* createSoftargmaxPlugin(std::string name) override; + +private: + // TensorRT IPlugin interface has protected dtor so cannot use unique_ptr + // or delete plugins in the container dtor. 
+ std::vector plugins_; + std::mutex lock_; + + ILogger& log_; +}; + +void reportError(cudaError_t status, const char* file, int line, const char* func, ILogger& log); +void reportError(cudnnStatus_t status, const char* file, int line, const char* func, ILogger& log); + +} } + +#endif \ No newline at end of file diff --git a/stereoDNN/lib/kernels.cu b/stereoDNN/lib/kernels.cu new file mode 100644 index 0000000..31dc60a --- /dev/null +++ b/stereoDNN/lib/kernels.cu @@ -0,0 +1,261 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. + +#include "internal_utils.h" +#include + +// Check async error. +// Sync and get kernel status in Debug builds. +#ifndef NDEBUG + #define SYNC_AND_CHECK_STREAM(stream) do { \ + cudaError_t status = cudaStreamSynchronize(stream); \ + if (status != cudaSuccess) \ + return status; \ +}while(false) +#else + #define SYNC_AND_CHECK_STREAM(stream) +#endif + +#define CHECKK(stream) do { \ + cudaError_t status = cudaGetLastError(); \ + if (status != cudaSuccess) \ + return status; \ + SYNC_AND_CHECK_STREAM(stream); \ +}while(false) + +namespace redtail { namespace tensorrt +{ + +using namespace nvinfer1; + +static const int kMaxGridSizeY = 65535; +static const int kMaxGridSizeZ = 65535; + +// ----------------------------------------------------------------- +// Helper function to get block count. +// ----------------------------------------------------------------- +static uint32_t getBlockCount(uint32_t total_size, uint32_t block_size) +{ + uint32_t res = (total_size + block_size - 1) / block_size; + assert(res > 0); + assert((size_t)res * block_size >= total_size); + return res; +} + +// REVIEW alexeyk: kernels are not optimized for now. + +// ----------------------------------------------------------------- +// Cost volume kernels. 
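+// The volume is built by two kernels: costVolumeCopyKernel replicates the left
+// features into every disparity plane, while costVolumeCopyPadKernel writes the
+// right features shifted by each plane's disparity, zeroing pixels where x < disparity.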
+// ----------------------------------------------------------------- +template +__global__ void costVolumeCopyKernel(const T* src, int32_t c, int32_t h, int32_t w, int32_t disp, T* dst) +{ + assert(src != nullptr); + assert(dst != nullptr); + + const uint32_t ix = blockIdx.x * blockDim.x + threadIdx.x; + const uint32_t iy = blockIdx.y * blockDim.y + threadIdx.y; + const uint32_t iz = blockIdx.z * blockDim.z + threadIdx.z; + if (ix >= w || iy >= h || iz >= c) + return; + const size_t isrc = iz * h * w + iy * w + ix; + const size_t stride = 2 * c * h * w; + T val = src[isrc]; + T* pdst = dst + isrc; + for (int32_t idst = 0; idst < disp; idst++) + { + *pdst = val; + pdst += stride; + } +} + +template +__global__ void costVolumeCopyPadKernel(const T* src, int32_t c, int32_t h, int32_t w, int32_t disp, T* dst) +{ + assert(src != nullptr); + assert(dst != nullptr); + + const uint32_t ix = blockIdx.x * blockDim.x + threadIdx.x; + const uint32_t iy = blockIdx.y * blockDim.y + threadIdx.y; + const uint32_t iz = blockIdx.z * blockDim.z + threadIdx.z; + if (ix >= w || iy >= h || iz >= c) + return; + const size_t isrc = iz * h * w + iy * w + ix; + size_t stride = c * h * w; + const size_t idst = isrc + stride; + stride *= 2; + + T* pdst = dst + idst; + for (int32_t pad = 0; pad < disp; pad++) + { + if (ix < pad) + *pdst = 0; + else + *pdst = src[isrc - pad]; + pdst += stride; + } +} + +template +__global__ void costVolumeKernel(const T* left, const T* right, int32_t c, int32_t h, int32_t w, int32_t disp, T* dst) +{ + assert(left != nullptr); + assert(right != nullptr); + assert(dst != nullptr); + + const uint32_t ix = blockIdx.x * blockDim.x + threadIdx.x; + const uint32_t iy = blockIdx.y * blockDim.y + threadIdx.y; + const uint32_t iz = blockIdx.z * blockDim.z + threadIdx.z; + if (ix >= w || iy >= h || iz >= c) + return; + + // Setup initial indices. + size_t stride = c * h * w; + // Left and right source is the same. + const size_t ileft = iz * h * w + iy * w + ix; + T* pdst_l = dst + ileft; + const size_t iright = ileft; + // Right destination is offset by 1 in c dimension. + T* pdst_r = dst + iright + stride; + // Final stride is 2 in c dimension. + stride *= 2; + + T val_l = left[ileft]; + for (int32_t pad = 0; pad < disp; pad++) + { + if (ix < pad) + *pdst_r = 0; + else + *pdst_r = right[iright - pad]; + *pdst_l = val_l; + pdst_l += stride; + pdst_r += stride; + } +} + +template<> +cudaError_t CudaKernels::computeCostVolume(const float* left, const float* right, Dims in_dims, float* cost_vol, Dims out_dims, cudaStream_t stream) +{ + assert(in_dims.nbDims == 3); + assert(out_dims.nbDims == 4); + + dim3 b_dim{16, 16, 1}; + dim3 g_dim; + g_dim.x = getBlockCount(in_dims.d[2], b_dim.x); + g_dim.y = getBlockCount(in_dims.d[1], b_dim.y); + g_dim.z = getBlockCount(in_dims.d[0], b_dim.z); + + // REVIEW alexeyk: using 2 kernels instead of one as it's not yet optimized so 2 kernels are faster. + // REVIEW alexeyk: optimize, see gld_efficiency,gst_efficiency,gld_transactions,gst_transactions. + // costVolumeKernel<<>>(left, right, in_dims.d[0], in_dims.d[1], in_dims.d[2], out_dims.d[0], + // cost_vol); + costVolumeCopyKernel<<>>(left, in_dims.d[0], in_dims.d[1], in_dims.d[2], out_dims.d[0], + cost_vol); + CHECKK(stream); + costVolumeCopyPadKernel<<>>(right, in_dims.d[0], in_dims.d[1], in_dims.d[2], out_dims.d[0], + cost_vol); + CHECKK(stream); + return cudaSuccess; +} + +// ----------------------------------------------------------------- +// Some convolution-related kernels. 
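+// addDBiasTo3DConvKernel adds a per-depth bias (one value per D slice) to a (C, D, H, W)
+// tensor. cudnnAddTensor can only broadcast over the second outermost dimension, so the
+// D-dimension case is handled by this custom kernel (see Conv3DTransposePlugin::enqueue).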
+// -----------------------------------------------------------------
+template<typename T>
+__global__ void addDBiasTo3DConvKernel(const T* bias, int32_t c, int32_t d, int32_t h, int32_t w, T* conv)
+{
+    assert(bias != nullptr);
+    assert(conv != nullptr);
+
+    const uint32_t ix = blockIdx.x * blockDim.x + threadIdx.x;
+    const uint32_t iy = blockIdx.y * blockDim.y + threadIdx.y;
+    const uint32_t iz = blockIdx.z * blockDim.z + threadIdx.z;
+    if (ix >= w || iy >= h || iz >= d * c)
+        return;
+
+    int32_t cur_d = iz % d;
+    const size_t idst = iz * h * w + iy * w + ix;
+
+    conv[idst] += bias[cur_d];
+}
+
+template<>
+cudaError_t CudaKernels::addDBiasTo3DConv(const float* bias, Dims bias_dims, float* conv, Dims conv_dims, cudaStream_t stream)
+{
+    assert(bias_dims.nbDims == 5);
+    assert(conv_dims.nbDims == 4);
+    // REVIEW alexeyk: minibatch size 1 for now.
+    assert(bias_dims.d[0] == 1);
+    assert(bias_dims.d[2] == conv_dims.d[1]);
+    UNUSEDR(bias_dims);
+
+    dim3 b_dim{16, 16, 1};
+    dim3 g_dim;
+    g_dim.x = getBlockCount(conv_dims.d[3], b_dim.x);
+    g_dim.y = getBlockCount(conv_dims.d[2], b_dim.y);
+    g_dim.z = getBlockCount(conv_dims.d[0] * conv_dims.d[1], b_dim.z);
+    // REVIEW alexeyk: no block striding for now.
+    assert(g_dim.y <= kMaxGridSizeY);
+    assert(g_dim.z <= kMaxGridSizeZ);
+    UNUSEDR(kMaxGridSizeY);
+    UNUSEDR(kMaxGridSizeZ);
+
+    addDBiasTo3DConvKernel<<<g_dim, b_dim, 0, stream>>>(bias, conv_dims.d[0], conv_dims.d[1], conv_dims.d[2], conv_dims.d[3], conv);
+    CHECKK(stream);
+
+    return cudaSuccess;
+}
+
+// -----------------------------------------------------------------
+// Conversion kernels.
+// -----------------------------------------------------------------
+__global__ void fp32Tofp16Kernel(const float* src, uint16_t* dst, size_t size)
+{
+    assert(src != nullptr);
+    assert(dst != nullptr);
+
+    const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= size)
+        return;
+
+    __half val(src[tid]);
+    dst[tid] = *(uint16_t*)&val;
+}
+
+cudaError_t CudaKernels::fp32Tofp16(const float* src, uint16_t* dst, size_t size, cudaStream_t stream)
+{
+    dim3 b_dim{256, 1, 1};
+    dim3 g_dim;
+    g_dim.x = getBlockCount(size, b_dim.x);
+
+    fp32Tofp16Kernel<<<g_dim, b_dim, 0, stream>>>(src, dst, size);
+    CHECKK(stream);
+
+    return cudaSuccess;
+}
+
+__global__ void fp16Tofp32Kernel(const uint16_t* src, float* dst, size_t size)
+{
+    assert(src != nullptr);
+    assert(dst != nullptr);
+
+    const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= size)
+        return;
+
+    dst[tid] = (float)(*(__half*)(src + tid));
+}
+
+cudaError_t CudaKernels::fp16Tofp32(const uint16_t* src, float* dst, size_t size, cudaStream_t stream)
+{
+    dim3 b_dim{256, 1, 1};
+    dim3 g_dim;
+    g_dim.x = getBlockCount(size, b_dim.x);
+
+    fp16Tofp32Kernel<<<g_dim, b_dim, 0, stream>>>(src, dst, size);
+    CHECKK(stream);
+
+    return cudaSuccess;
+}
+
+} }
\ No newline at end of file
diff --git a/stereoDNN/lib/padding_plugin.cpp b/stereoDNN/lib/padding_plugin.cpp
new file mode 100644
index 0000000..3101641
--- /dev/null
+++ b/stereoDNN/lib/padding_plugin.cpp
@@ -0,0 +1,125 @@
+// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+// Full license terms provided in LICENSE.md file.
+
+#include "internal_utils.h"
+#include <cassert>
+
+namespace redtail { namespace tensorrt
+{
+
+using namespace nvinfer1;
+
+// -----------------------------------------------------------------
+// Tensor padding plugin.
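+// Pads only the end of the outermost (D) dimension of a 4D tensor, which
+// keeps the implementation down to a single device-to-device copy plus a
+// memset of the tail region (see enqueue below).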
+// ----------------------------------------------------------------- +class PaddingPlugin: public IPlugin +{ +public: + PaddingPlugin(DimsNCHW pad_start, DimsNCHW pad_end, ILogger& log, std::string name): + pad_start_(pad_start), pad_end_(pad_end), log_(log), name_(name) + { + // Only D end padding is currently supported. + assert(pad_start_.n() == 0); + assert(pad_start_.c() == 0); + assert(pad_start_.h() == 0); + assert(pad_start_.w() == 0); + + assert(pad_end_.n() >= 0); + assert(pad_end_.c() == 0); + assert(pad_end_.h() == 0); + assert(pad_end_.w() == 0); + } + + PaddingPlugin(PaddingPlugin&&) = delete; + + int getNbOutputs() const override + { + return 1; + } + + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override + { + assert(nbInputDims == 1); + // Only NCHW format is supported for now, TRT does not work (assert) with generic Dims. + assert(inputs[0].nbDims == 4); + + in_dims_ = DimsNCHW(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2], inputs[0].d[3]); + out_dims_ = DimsNCHW(in_dims_.n() + pad_start_.n() + pad_end_.n(), + in_dims_.c() + pad_start_.c() + pad_end_.c(), + in_dims_.h() + pad_start_.h() + pad_end_.h(), + in_dims_.w() + pad_start_.w() + pad_end_.w()); + return out_dims_; + } + + void configure(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, int maxBatchSize) override + { + assert(nbInputs == 1); + assert(nbOutputs == 1); + assert(DimsUtils::areEqual(inputDims[0], in_dims_)); + assert(DimsUtils::areEqual(outputDims[0], out_dims_)); + + log_.log(ILogger::Severity::kINFO, (name_ + ": InDims : " + DimsUtils::toString(in_dims_)).c_str()); + log_.log(ILogger::Severity::kINFO, (name_ + ": OutDims: " + DimsUtils::toString(out_dims_)).c_str()); + } + + int initialize() override + { + return 0; + } + + void terminate() override + { + } + + size_t getWorkspaceSize(int maxBatchSize) const + { + return 0; + } + + int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override + { + // Copy original tensor. + size_t in_size_bytes = DimsUtils::getTensorSize(in_dims_) * sizeof(float); + CHECK(cudaMemcpyAsync(outputs[0], inputs[0], in_size_bytes, cudaMemcpyDeviceToDevice, stream)); + + // Zero out end D padding if needed. + size_t out_size_bytes = DimsUtils::getTensorSize(out_dims_) * sizeof(float); + if (out_size_bytes > in_size_bytes) + { + auto pdst = static_cast(outputs[0]) + in_size_bytes; + CHECK(cudaMemsetAsync(pdst, 0, out_size_bytes - in_size_bytes, stream)); + } + + return 0; + } + + size_t getSerializationSize() override + { + return 0; + } + + void serialize(void* buffer) override + { + } + +private: + +private: + DimsNCHW pad_start_; + DimsNCHW pad_end_; + DimsNCHW in_dims_; + DimsNCHW out_dims_; + + ILogger& log_; + std::string name_; +}; + +// Factory method. +IPlugin* PluginContainer::createPaddingPlugin(DimsNCHW pad_start, DimsNCHW pad_end, std::string name) +{ + std::lock_guard lock(lock_); + plugins_.push_back(new PaddingPlugin(pad_start, pad_end, log_, name)); + return plugins_.back(); +} + +} } \ No newline at end of file diff --git a/stereoDNN/lib/redtail_tensorrt_plugins.h b/stereoDNN/lib/redtail_tensorrt_plugins.h new file mode 100644 index 0000000..11deba1 --- /dev/null +++ b/stereoDNN/lib/redtail_tensorrt_plugins.h @@ -0,0 +1,105 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. 
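+//
+// Public interface of the stereo DNN TensorRT inference library: declares
+// the plugin factory and the helper functions used to add the custom
+// plugins to a TensorRT network definition.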
+
+#ifndef REDTAIL_TENSORRT_PLUGINS_H
+#define REDTAIL_TENSORRT_PLUGINS_H
+
+#include <memory>
+#include <string>
+
+namespace redtail { namespace tensorrt
+{
+
+using namespace nvinfer1;
+
+// -----------------------------------------------------------------
+// 3D convolution type used to setup convolution descriptors.
+// -----------------------------------------------------------------
+enum class Conv3DType
+{
+    kCuDnn      = 0,
+    kTensorFlow = 1
+};
+
+// -----------------------------------------------------------------
+// Plugin container/factory.
+// TensorRT does not manage plugins and requires a plugin lifetime
+// to be the same as any TRT engine.
+// Each plugin create* function returns naked pointer as expected by TRT,
+// with IPluginContainer managing the plugin's lifetime.
+// -----------------------------------------------------------------
+class IPluginContainer
+{
+public:
+    virtual ~IPluginContainer() = default;
+
+    // ELU plugin.
+    virtual IPlugin* createEluPlugin(DataType data_type, std::string name) = 0;
+
+    // Cost volume plugin.
+    virtual IPlugin* createCostVolumePlugin(int max_disparity, std::string name) = 0;
+
+    // 3D convolution.
+    virtual IPlugin* createConv3DPlugin(Conv3DType conv_type, Dims kernel_dims,
+                                        Dims stride_dims, Dims pad_start_dims, Dims pad_end_dims,
+                                        Weights kernel_weights, Weights bias_weights,
+                                        std::string name) = 0;
+
+    virtual IPlugin* createConv3DTransposePlugin(Conv3DType conv_type, Dims kernel_dims, Dims out_dims,
+                                                 Dims stride_dims, Dims pad_start_dims, Dims pad_end_dims,
+                                                 Weights kernel_weights, Weights bias_weights,
+                                                 std::string name) = 0;
+
+    virtual IPlugin* createTransformPlugin(Permutation permutation, std::string name) = 0;
+
+    virtual IPlugin* createPaddingPlugin(DimsNCHW pad_start, DimsNCHW pad_end,
+                                         std::string name) = 0;
+
+    virtual IPlugin* createSlicePlugin(Dims dims, Dims slice_start, Dims slice_end,
+                                       std::string name) = 0;
+
+    virtual IPlugin* createSoftargmaxPlugin(std::string name) = 0;
+
+    static std::unique_ptr<IPluginContainer> create(ILogger& log);
+};
+
+// -----------------------------------------------------------------
+// Plugins helper functions.
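+// Each helper creates the corresponding plugin through the factory and adds
+// it to the given network as a plugin layer, returning the new layer.
+// A typical call (sketch; `factory`, `network` and `input` are the caller's
+// objects) looks like:
+//
+//     ILayer* elu = addElu(factory, network, *input, DataType::kFLOAT, "elu1");
+//     assert(elu != nullptr);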
+// ----------------------------------------------------------------- +ILayer* addElu(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + DataType data_type, const std::string& name); + +ILayer* addCostVolume(IPluginContainer& plugin_factory, INetworkDefinition& network, + ITensor& left_input, ITensor& right_input, int max_disparity, + const std::string& name); + +ILayer* addConv3D(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + Conv3DType conv_type, Dims kernel_dims, Dims stride_dims, + Dims pad_start_dims, Dims pad_end_dims, + Weights kernel_weights, Weights bias_weights, + const std::string& name); + +ILayer* addConv3DTranspose(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + Conv3DType conv_type, Dims kernel_dims, Dims out_dims, + Dims stride_dims, Dims pad_start_dims, Dims pad_end_dims, + Weights kernel_weights, Weights bias_weights, + const std::string& name); + +ILayer* addSlice(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + Dims dims, Dims slice_start, Dims slice_end, + const std::string& name); + +ILayer* addTransform(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + Permutation permutation, + const std::string& name); + +ILayer* addPad(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + DimsNCHW pad_start, DimsNCHW pad_end, + const std::string& name); + +ILayer* addSoftargmax(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + const std::string& name); + +} } + +#endif // REDTAIL_TENSORRT_PLUGINS_H diff --git a/stereoDNN/lib/slice_plugin.cpp b/stereoDNN/lib/slice_plugin.cpp new file mode 100644 index 0000000..bbcfff9 --- /dev/null +++ b/stereoDNN/lib/slice_plugin.cpp @@ -0,0 +1,121 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. + +#include "internal_utils.h" +#include + +namespace redtail { namespace tensorrt +{ + +using namespace nvinfer1; + +// ----------------------------------------------------------------- +// Tensor slice plugin. +// REVIEW alexey: currently the only supported slicing is for the +// second outermost dimension of 5D tensor as this is all we need so far. +// ----------------------------------------------------------------- +class SlicePlugin: public IPlugin +{ +public: + SlicePlugin(Dims dims, Dims slice_start, Dims slice_end, ILogger& log, std::string name): + in_dims_(dims), slice_start_(slice_start), slice_end_(slice_end), + log_(log), name_(name) + { + assert(in_dims_.nbDims == 4); + assert(in_dims_.nbDims == slice_start_.nbDims); + assert(in_dims_.nbDims == slice_end_.nbDims); + // For now no negative indices or empty tensors either. + assert(0 <= slice_start_.d[0] && slice_start_.d[0] < slice_end_.d[0]); + assert(0 < slice_end_.d[0] && slice_end_.d[0] <= in_dims_.d[0]); + + for (int i = 1; i < in_dims_.nbDims; i++) + { + assert(slice_start_.d[i] == 0); + assert(slice_end_.d[i] == in_dims_.d[i]); + } + } + + SlicePlugin(SlicePlugin&&) = delete; + + int getNbOutputs() const override + { + return 1; + } + + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override + { + assert(nbInputDims == 1); + // Only NCHW format is supported for now. 
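+        // Only d[0] (the D dimension of the implicit-batch NDCHW tensor)
+        // may shrink; all remaining dimensions pass through unchanged.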
+ assert(inputs[0].nbDims == 4); + + out_dims_ = DimsNCHW(slice_end_.d[0] - slice_start_.d[0], inputs[0].d[1], inputs[0].d[2], inputs[0].d[3]); + return out_dims_; + } + + void configure(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, int maxBatchSize) override + { + assert(nbInputs == 1); + assert(nbOutputs == 1); + assert(DimsUtils::areEqual(inputDims[0], in_dims_)); + assert(DimsUtils::areEqual(outputDims[0], out_dims_)); + + log_.log(ILogger::Severity::kINFO, (name_ + ": InDims : " + DimsUtils::toString(in_dims_)).c_str()); + log_.log(ILogger::Severity::kINFO, (name_ + ": OutDims: " + DimsUtils::toString(out_dims_)).c_str()); + } + + int initialize() override + { + return 0; + } + + void terminate() override + { + } + + size_t getWorkspaceSize(int maxBatchSize) const + { + return 0; + } + + int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override + { + size_t d_dim_size = DimsUtils::getTensorSize(in_dims_) / in_dims_.d[0]; + assert(d_dim_size * in_dims_.d[0] == DimsUtils::getTensorSize(in_dims_)); + + auto psrc = static_cast(inputs[0]) + d_dim_size * slice_start_.d[0]; + size_t size = out_dims_.d[0] * d_dim_size; + + cudaError_t status; + CHECK(status = cudaMemcpyAsync(outputs[0], psrc, size * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + + return status; + } + + size_t getSerializationSize() override + { + return 0; + } + + void serialize(void* buffer) override + { + } + +private: + Dims in_dims_; + DimsNCHW out_dims_; + Dims slice_start_; + Dims slice_end_; + + ILogger& log_; + std::string name_; +}; + +// Factory method. +IPlugin* PluginContainer::createSlicePlugin(Dims dims, Dims slice_start, Dims slice_end, std::string name) +{ + std::lock_guard lock(lock_); + plugins_.push_back(new SlicePlugin(dims, slice_start, slice_end, log_, name)); + return plugins_.back(); +} + +} } \ No newline at end of file diff --git a/stereoDNN/lib/softargmax_plugin.cpp b/stereoDNN/lib/softargmax_plugin.cpp new file mode 100644 index 0000000..1dc0882 --- /dev/null +++ b/stereoDNN/lib/softargmax_plugin.cpp @@ -0,0 +1,236 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. + +#include "internal_utils.h" +#include +#include + +namespace redtail { namespace tensorrt +{ + +using namespace nvinfer1; + +// ----------------------------------------------------------------- +// Soft argmax plugin. +// ----------------------------------------------------------------- +class SoftargmaxPlugin: public IPlugin +{ +public: + SoftargmaxPlugin(ILogger& log, std::string name): + log_(log), name_(name) + { + // REVIEW alexeyk: FP32 only for now. + data_type_ = CUDNN_DATA_FLOAT; + } + + SoftargmaxPlugin(SoftargmaxPlugin&&) = delete; + + int getNbOutputs() const override + { + return 1; + } + + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override + { + assert(nbInputDims == 1); + // Input must be 5D tensor in NDCHW format (batch index implicit) and + // C dim must be equal to 1. 
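+        // Softargmax computes disp = sum_d d * softmax(-x)_d over the D
+        // dimension, i.e. a differentiable argmin over the cost volume,
+        // so the D dimension collapses and the output is a 1 x H x W map.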
+ assert(inputs[0].nbDims == 4); + assert(inputs[0].d[1] == 1); + + in_dims_ = {3, {inputs[0].d[0], inputs[0].d[2], inputs[0].d[3]}}; + out_dims_ = DimsCHW(1, in_dims_.d[1], in_dims_.d[2]); + + return out_dims_; + } + + void configure(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, int maxBatchSize) override + { + assert(nbInputs == 1); + assert(nbOutputs == 1); + assert(DimsUtils::areEqual(inputDims[0], Dims{4, {in_dims_.d[0], 1, in_dims_.d[1], in_dims_.d[2]}})); + assert(DimsUtils::areEqual(outputDims[0], out_dims_)); + + last_batch_size_ = maxBatchSize; + + createDescriptors(); + setTensorDescriptors(maxBatchSize); + + log_.log(ILogger::Severity::kINFO, (name_ + ": InDims : " + DimsUtils::toString(in_dims_)).c_str()); + log_.log(ILogger::Severity::kINFO, (name_ + ": OutDims: " + DimsUtils::toString(out_dims_)).c_str()); + + assert(isValid()); + } + + int initialize() override + { + assert(isValid()); + + // Create and copy indices. Not using workspace as amount of memory required + // for indices is tiny. + std::vector indices(in_dims_.d[0]); + std::iota(indices.begin(), indices.end(), 0); + size_t size_bytes = indices.size() * sizeof(float); + CHECK(cudaMalloc(&indices_d_, size_bytes)); + CHECK(cudaMemcpy(indices_d_, indices.data(), size_bytes, cudaMemcpyHostToDevice)); + + return 0; + } + + void terminate() override + { + assert(isValid()); + + if (in_desc_ != nullptr) + CHECK(cudnnDestroyTensorDescriptor(in_desc_)); + if (out_desc_ != nullptr) + CHECK(cudnnDestroyTensorDescriptor(out_desc_)); + if (idx_desc_ != nullptr) + CHECK(cudnnDestroyTensorDescriptor(idx_desc_)); + if (op_desc_ != nullptr) + CHECK(cudnnDestroyOpTensorDescriptor(op_desc_)); + if (sum_desc_ != nullptr) + CHECK(cudnnDestroyReduceTensorDescriptor(sum_desc_)); + if (cudnn_ != nullptr) + CHECK(cudnnDestroy(cudnn_)); + if (indices_d_ != nullptr) + CHECK(cudaFree(indices_d_)); + in_desc_ = nullptr; + out_desc_ = nullptr; + idx_desc_ = nullptr; + cudnn_ = nullptr; + indices_d_ = nullptr; + sum_desc_ = nullptr; + + assert(!isValid()); + } + + size_t getWorkspaceSize(int maxBatchSize) const + { + // Need a copy of the input. + return 2 * (DimsUtils::getTensorSize(in_dims_)) * maxBatchSize * sizeof(float); + } + + int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override + { + assert(isValid()); + + cudnnStatus_t status = CUDNN_STATUS_SUCCESS; + cudaError_t cu_status = cudaSuccess; + + CHECK(status = cudnnSetStream(cudnn_, stream)); + + if (batchSize != last_batch_size_) + setTensorDescriptors(batchSize); + + auto pdst = static_cast(workspace); + // Copy input to workspace. + size_t in_size = batchSize * DimsUtils::getTensorSize(in_dims_); + size_t in_size_bytes = in_size * sizeof(float); + CHECK(cu_status = cudaMemcpyAsync(pdst, inputs[0], in_size_bytes, cudaMemcpyDeviceToDevice, stream)); + + // Scale. + const float kMinusOne = -1.0f; + CHECK(status = cudnnScaleTensor(cudnn_, in_desc_, pdst, &kMinusOne)); + + // Do softmax over D dim. + CHECK(status = cudnnSoftmaxForward(cudnn_, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, &Consts::kOne, in_desc_, pdst, + &Consts::kZero, in_desc_, pdst)); + + // Multiply and sum-reduce over D dim. 
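+        // cudnnOpTensor multiplies every D plane by its index (the idx
+        // tensor is [1, D, 1, 1] and broadcasts over H and W); the
+        // reduction then sums over D, yielding the expected disparity.
+        // The second half of the workspace serves as reduction scratch.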
+ CHECK(status = cudnnOpTensor(cudnn_, op_desc_, &Consts::kOne, in_desc_, pdst, + &Consts::kOne, idx_desc_, indices_d_, + &Consts::kZero, in_desc_, pdst)); + CHECK(status = cudnnReduceTensor(cudnn_, sum_desc_, nullptr, 0, pdst + in_size, in_size_bytes, + &Consts::kOne, in_desc_, pdst, + &Consts::kZero, out_desc_, outputs[0])); + + return (status == CUDNN_STATUS_SUCCESS && cu_status == cudaSuccess) ? 0 : -1; + } + + size_t getSerializationSize() override + { + return 0; + } + + void serialize(void* buffer) override + { + } + +private: + bool isValid() const + { + return cudnn_ != nullptr; + } + + void createDescriptors() + { + if (cudnn_ == nullptr) + CHECK(cudnnCreate(&cudnn_)); + if (in_desc_ == nullptr) + CHECK(cudnnCreateTensorDescriptor(&in_desc_)); + if (out_desc_ == nullptr) + CHECK(cudnnCreateTensorDescriptor(&out_desc_)); + if (idx_desc_ == nullptr) + CHECK(cudnnCreateTensorDescriptor(&idx_desc_)); + if (op_desc_ == nullptr) + CHECK(cudnnCreateOpTensorDescriptor(&op_desc_)); + if (sum_desc_ == nullptr) + CHECK(cudnnCreateReduceTensorDescriptor(&sum_desc_)); + } + + void setTensorDescriptors(int batch_size) + { + // Set input tensor descriptor as NDHW (add batch dim and remove C dim). + auto tensor_dims = Dims{4, {batch_size, in_dims_.d[0], in_dims_.d[1], in_dims_.d[2]}}; + auto tensor_strides = DimsUtils::getStrides(tensor_dims); + CHECK(cudnnSetTensorNdDescriptor(in_desc_, data_type_, tensor_dims.nbDims, tensor_dims.d, tensor_strides.d)); + + // Set output tensor descriptor as NCHW where C == 1. + tensor_dims = Dims{4, {batch_size, out_dims_.d[0], out_dims_.d[1], out_dims_.d[2]}}; + tensor_strides = DimsUtils::getStrides(tensor_dims); + CHECK(cudnnSetTensorNdDescriptor(out_desc_, data_type_, tensor_dims.nbDims, tensor_dims.d, tensor_strides.d)); + + // Set index tensor. + tensor_dims = Dims{4, {1, in_dims_.d[0], 1, 1}}; + tensor_strides = DimsUtils::getStrides(tensor_dims); + CHECK(cudnnSetTensorNdDescriptor(idx_desc_, data_type_, tensor_dims.nbDims, tensor_dims.d, tensor_strides.d)); + + // Set tensor op for index multiplication. + CHECK(cudnnSetOpTensorDescriptor(op_desc_, CUDNN_OP_TENSOR_MUL, data_type_, CUDNN_PROPAGATE_NAN)); + + // Set reduce descriptor. + CHECK(cudnnSetReduceTensorDescriptor(sum_desc_, CUDNN_REDUCE_TENSOR_ADD, data_type_, CUDNN_PROPAGATE_NAN, + CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES)); + + last_batch_size_ = batch_size; + } + +private: + cudnnDataType_t data_type_; + cudnnHandle_t cudnn_ = nullptr; + cudnnTensorDescriptor_t in_desc_ = nullptr; + cudnnTensorDescriptor_t out_desc_ = nullptr; + cudnnTensorDescriptor_t idx_desc_ = nullptr; + cudnnOpTensorDescriptor_t op_desc_ = nullptr; + cudnnReduceTensorDescriptor_t sum_desc_ = nullptr; + + Dims in_dims_; + DimsCHW out_dims_; + int last_batch_size_ = 0; + + float* indices_d_ = nullptr; + + ILogger& log_; + std::string name_; +}; + +// Factory method. +IPlugin* PluginContainer::createSoftargmaxPlugin(std::string name) +{ + std::lock_guard lock(lock_); + plugins_.push_back(new SoftargmaxPlugin(log_, name)); + return plugins_.back(); +} + +} } \ No newline at end of file diff --git a/stereoDNN/lib/transform_plugin.cpp b/stereoDNN/lib/transform_plugin.cpp new file mode 100644 index 0000000..4963155 --- /dev/null +++ b/stereoDNN/lib/transform_plugin.cpp @@ -0,0 +1,192 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. 
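+//
+// Implements tensor transpose (dimension permutation) on top of
+// cudnnTransformTensor by giving the output descriptor permuted strides.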
+ +#include "internal_utils.h" +#include +#include + +namespace redtail { namespace tensorrt +{ + +using namespace nvinfer1; + +// ----------------------------------------------------------------- +// Tensor transform plugin. +// ----------------------------------------------------------------- +class TransformPlugin: public IPlugin +{ +public: + TransformPlugin(Permutation permutation, ILogger& log, std::string name): + permutation_(permutation), log_(log), name_(name) + { + } + + TransformPlugin(TransformPlugin&&) = delete; + + int getNbOutputs() const override + { + return 1; + } + + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override + { + assert(nbInputDims == 1); + // Only NCHW format is supported for now, TRT does not work (assert) with generic Dims. + assert(inputs[0].nbDims == 4); + + in_dims_ = DimsNCHW(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2], inputs[0].d[3]); + // Sanity check. + int actual_sum = std::accumulate(permutation_.order, permutation_.order + in_dims_.nbDims, 0, std::plus()); + int expected_sum = in_dims_.nbDims * (in_dims_.nbDims - 1) / 2; + assert(actual_sum == expected_sum); + UNUSEDR(actual_sum); + UNUSEDR(expected_sum); + // Transposed output dimensions. + for (int i = 0; i < in_dims_.nbDims; i++) + out_dims_.d[i] = in_dims_.d[permutation_.order[i]]; + return out_dims_; + } + + void configure(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, int maxBatchSize) override + { + assert(nbInputs == 1); + assert(nbOutputs == 1); + assert(DimsUtils::areEqual(inputDims[0], in_dims_)); + assert(DimsUtils::areEqual(outputDims[0], out_dims_)); + + createDescriptors(); + setTensorDescriptors(maxBatchSize); + + log_.log(ILogger::Severity::kINFO, (name_ + ": InDims : " + DimsUtils::toString(in_dims_)).c_str()); + log_.log(ILogger::Severity::kINFO, (name_ + ": OutDims: " + DimsUtils::toString(out_dims_)).c_str()); + + assert(isValid()); + } + + int initialize() override + { + assert(isValid()); + return 0; + } + + void terminate() override + { + assert(isValid()); + + if (in_desc_ != nullptr) + CHECK(cudnnDestroyTensorDescriptor(in_desc_)); + if (out_desc_ != nullptr) + CHECK(cudnnDestroyTensorDescriptor(out_desc_)); + if (cudnn_ != nullptr) + CHECK(cudnnDestroy(cudnn_)); + in_desc_ = nullptr; + out_desc_ = nullptr; + cudnn_ = nullptr; + + assert(!isValid()); + } + + size_t getWorkspaceSize(int maxBatchSize) const + { + return 0; + } + + int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override + { + assert(isValid()); + + cudnnStatus_t status; + + CHECK(status = cudnnSetStream(cudnn_, stream)); + + if (batchSize != tensor_dims_.d[0]) + setTensorDescriptors(batchSize); + + CHECK(status = cudnnTransformTensor(cudnn_, &Consts::kOne, in_desc_, inputs[0], &Consts::kZero, out_desc_, outputs[0])); + + return status == CUDNN_STATUS_SUCCESS ? 0 : -1; + } + + size_t getSerializationSize() override + { + return 0; + } + + void serialize(void* buffer) override + { + } + +private: + bool isValid() const + { + return cudnn_ != nullptr; + } + + void createDescriptors() + { + if (cudnn_ == nullptr) + CHECK(cudnnCreate(&cudnn_)); + if (in_desc_ == nullptr) + CHECK(cudnnCreateTensorDescriptor(&in_desc_)); + if (out_desc_ == nullptr) + CHECK(cudnnCreateTensorDescriptor(&out_desc_)); + } + + void setTensorDescriptors(int batch_size) + { + // cudnnTransformTensor requires dimensions of both tensors to be the same. + // The transform is represented by different strides. 
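+        // Example: with permutation {1, 0, 2, 3} on a DKHW input, the output
+        // descriptor keeps the input dimensions but swaps the strides of the
+        // first two axes, so element [d, k, h, w] is written to [k, d, h, w].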
+ + // Add batch dimension to final tensor and copy dims from input. + tensor_dims_.nbDims = in_dims_.nbDims + 1; + tensor_dims_.d[0] = batch_size; + std::copy(in_dims_.d, in_dims_.d + in_dims_.nbDims, tensor_dims_.d + 1); + + in_tensor_strides_ = DimsUtils::getStrides(tensor_dims_); + CHECK(cudnnSetTensorNdDescriptor(in_desc_, CUDNN_DATA_FLOAT, + tensor_dims_.nbDims, tensor_dims_.d, in_tensor_strides_.d)); + + // Compute output tensor strides. + // 1. Create output tensor dims with batch dimension. + Dims out_tensor_dims; + out_tensor_dims.nbDims = out_dims_.nbDims + 1; + out_tensor_dims.d[0] = batch_size; + std::copy(out_dims_.d, out_dims_.d + out_dims_.nbDims, out_tensor_dims.d + 1); + // 2. Compute the strides for the output tensor. We need a copy to do a transform later. + Dims out_strides = DimsUtils::getStrides(out_tensor_dims); + out_tensor_strides_ = out_strides; + // 3. As cudnnTransformTensor requires both dims to be the same, we need + // to change strides on the _output_ tensor to achieve desired transform effect. + for (int i = 1; i < out_strides.nbDims; i++) + out_tensor_strides_.d[i] = out_strides.d[permutation_.order[i - 1] + 1]; + CHECK(cudnnSetTensorNdDescriptor(out_desc_, CUDNN_DATA_FLOAT, + tensor_dims_.nbDims, tensor_dims_.d, out_tensor_strides_.d)); + + } + +private: + cudnnHandle_t cudnn_ = nullptr; + cudnnTensorDescriptor_t in_desc_ = nullptr; + cudnnTensorDescriptor_t out_desc_ = nullptr; + + Permutation permutation_; + DimsNCHW in_dims_; + DimsNCHW out_dims_; + + Dims tensor_dims_; + Dims in_tensor_strides_; + Dims out_tensor_strides_; + + ILogger& log_; + std::string name_; +}; + +// Factory method. +IPlugin* PluginContainer::createTransformPlugin(Permutation permutation, std::string name) +{ + std::lock_guard lock(lock_); + plugins_.push_back(new TransformPlugin(permutation, log_, name)); + return plugins_.back(); +} + +} } \ No newline at end of file diff --git a/stereoDNN/models/NVSmall/TensorFlow/model-inference-1025x321-0.data-00000-of-00001 b/stereoDNN/models/NVSmall/TensorFlow/model-inference-1025x321-0.data-00000-of-00001 new file mode 100644 index 0000000..1ebde5a Binary files /dev/null and b/stereoDNN/models/NVSmall/TensorFlow/model-inference-1025x321-0.data-00000-of-00001 differ diff --git a/stereoDNN/models/NVSmall/TensorFlow/model-inference-1025x321-0.index b/stereoDNN/models/NVSmall/TensorFlow/model-inference-1025x321-0.index new file mode 100644 index 0000000..6205fb9 Binary files /dev/null and b/stereoDNN/models/NVSmall/TensorFlow/model-inference-1025x321-0.index differ diff --git a/stereoDNN/models/NVSmall/TensorFlow/model-inference-1025x321-0.meta b/stereoDNN/models/NVSmall/TensorFlow/model-inference-1025x321-0.meta new file mode 100644 index 0000000..2b2c9fc Binary files /dev/null and b/stereoDNN/models/NVSmall/TensorFlow/model-inference-1025x321-0.meta differ diff --git a/stereoDNN/models/NVSmall/TensorRT/trt_weights.bin b/stereoDNN/models/NVSmall/TensorRT/trt_weights.bin new file mode 100644 index 0000000..5fee420 Binary files /dev/null and b/stereoDNN/models/NVSmall/TensorRT/trt_weights.bin differ diff --git a/stereoDNN/models/NVSmall/TensorRT/trt_weights_fp16.bin b/stereoDNN/models/NVSmall/TensorRT/trt_weights_fp16.bin new file mode 100644 index 0000000..a0807bd Binary files /dev/null and b/stereoDNN/models/NVSmall/TensorRT/trt_weights_fp16.bin differ diff --git a/stereoDNN/models/NVTiny/TensorFlow/model-inference-513x161-0.data-00000-of-00001 b/stereoDNN/models/NVTiny/TensorFlow/model-inference-513x161-0.data-00000-of-00001 new file mode 
100644 index 0000000..fd007a5 Binary files /dev/null and b/stereoDNN/models/NVTiny/TensorFlow/model-inference-513x161-0.data-00000-of-00001 differ diff --git a/stereoDNN/models/NVTiny/TensorFlow/model-inference-513x161-0.index b/stereoDNN/models/NVTiny/TensorFlow/model-inference-513x161-0.index new file mode 100644 index 0000000..5f868ca Binary files /dev/null and b/stereoDNN/models/NVTiny/TensorFlow/model-inference-513x161-0.index differ diff --git a/stereoDNN/models/NVTiny/TensorFlow/model-inference-513x161-0.meta b/stereoDNN/models/NVTiny/TensorFlow/model-inference-513x161-0.meta new file mode 100644 index 0000000..5cc0eb2 Binary files /dev/null and b/stereoDNN/models/NVTiny/TensorFlow/model-inference-513x161-0.meta differ diff --git a/stereoDNN/models/NVTiny/TensorRT/trt_weights.bin b/stereoDNN/models/NVTiny/TensorRT/trt_weights.bin new file mode 100644 index 0000000..3f95079 Binary files /dev/null and b/stereoDNN/models/NVTiny/TensorRT/trt_weights.bin differ diff --git a/stereoDNN/models/NVTiny/TensorRT/trt_weights_fp16.bin b/stereoDNN/models/NVTiny/TensorRT/trt_weights_fp16.bin new file mode 100644 index 0000000..4090a2a Binary files /dev/null and b/stereoDNN/models/NVTiny/TensorRT/trt_weights_fp16.bin differ diff --git a/stereoDNN/models/ResNet-18/TensorFlow/model-inference-1025x321-0.data-00000-of-00001 b/stereoDNN/models/ResNet-18/TensorFlow/model-inference-1025x321-0.data-00000-of-00001 new file mode 100644 index 0000000..b6224e9 Binary files /dev/null and b/stereoDNN/models/ResNet-18/TensorFlow/model-inference-1025x321-0.data-00000-of-00001 differ diff --git a/stereoDNN/models/ResNet-18/TensorFlow/model-inference-1025x321-0.index b/stereoDNN/models/ResNet-18/TensorFlow/model-inference-1025x321-0.index new file mode 100644 index 0000000..359528c Binary files /dev/null and b/stereoDNN/models/ResNet-18/TensorFlow/model-inference-1025x321-0.index differ diff --git a/stereoDNN/models/ResNet-18/TensorFlow/model-inference-1025x321-0.meta b/stereoDNN/models/ResNet-18/TensorFlow/model-inference-1025x321-0.meta new file mode 100644 index 0000000..81ebbce Binary files /dev/null and b/stereoDNN/models/ResNet-18/TensorFlow/model-inference-1025x321-0.meta differ diff --git a/stereoDNN/models/ResNet-18/TensorRT/trt_weights.bin b/stereoDNN/models/ResNet-18/TensorRT/trt_weights.bin new file mode 100644 index 0000000..e766aea Binary files /dev/null and b/stereoDNN/models/ResNet-18/TensorRT/trt_weights.bin differ diff --git a/stereoDNN/models/ResNet-18/TensorRT/trt_weights_fp16.bin b/stereoDNN/models/ResNet-18/TensorRT/trt_weights_fp16.bin new file mode 100644 index 0000000..c38612e Binary files /dev/null and b/stereoDNN/models/ResNet-18/TensorRT/trt_weights_fp16.bin differ diff --git a/stereoDNN/sample_app/CMakeLists.txt b/stereoDNN/sample_app/CMakeLists.txt new file mode 100644 index 0000000..3f1e320 --- /dev/null +++ b/stereoDNN/sample_app/CMakeLists.txt @@ -0,0 +1,32 @@ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Full license terms provided in LICENSE.md file. 
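+#
+# Builds the stereo DNN sample application and links it against the
+# nvstereo inference library, CUDA, cuDNN, TensorRT and OpenCV.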
+ +cmake_minimum_required(VERSION 3.5) + +set(PROJECT_NAME nvstereo_sample_app) +project(${PROJECT_NAME}) + +find_package(CUDA 9.0 REQUIRED) +find_package(OpenCV 3.3.1 REQUIRED) + +include_directories(${CUDA_INCLUDE_DIRS}) +include_directories(${CMAKE_SOURCE_DIR}/lib) + +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") + +file(GLOB ${PROJECT_NAME}_sources ./*.cpp) +set(PROJECT_SOURCES ${${PROJECT_NAME}_sources}) + +set(TARGET_NAME ${PROJECT_NAME}${TARGET_SUFFIX}) + +add_executable(${TARGET_NAME} ${PROJECT_SOURCES}) + +target_link_libraries(${TARGET_NAME} + nvstereo_inference${TARGET_SUFFIX} + ${CUDA_LIBRARIES} + cudnn + nvinfer + opencv_core + opencv_imgproc + opencv_highgui) diff --git a/stereoDNN/sample_app/data/img_left.bin b/stereoDNN/sample_app/data/img_left.bin new file mode 100644 index 0000000..e7a5d35 Binary files /dev/null and b/stereoDNN/sample_app/data/img_left.bin differ diff --git a/stereoDNN/sample_app/data/img_left.png b/stereoDNN/sample_app/data/img_left.png new file mode 100644 index 0000000..cf07f6d Binary files /dev/null and b/stereoDNN/sample_app/data/img_left.png differ diff --git a/stereoDNN/sample_app/data/img_right.bin b/stereoDNN/sample_app/data/img_right.bin new file mode 100644 index 0000000..c6aa131 Binary files /dev/null and b/stereoDNN/sample_app/data/img_right.bin differ diff --git a/stereoDNN/sample_app/data/img_right.png b/stereoDNN/sample_app/data/img_right.png new file mode 100644 index 0000000..f49813b Binary files /dev/null and b/stereoDNN/sample_app/data/img_right.png differ diff --git a/stereoDNN/sample_app/main.cpp b/stereoDNN/sample_app/main.cpp new file mode 100644 index 0000000..c3aa46b --- /dev/null +++ b/stereoDNN/sample_app/main.cpp @@ -0,0 +1,282 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. + +#include "NvInfer.h" +#include "NvCaffeParser.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#include "redtail_tensorrt_plugins.h" + +#define UNUSED(x) ((void)(x)) + +#define CHECK(status) do { \ + int res = (int)(status); \ + assert(res == 0); \ + UNUSED(res); \ +} while(false) + +using namespace nvinfer1; +using namespace redtail::tensorrt; + +namespace redtail { namespace tensorrt +{ +using weight_map = std::unordered_map; + +// NVSmall DNN: 1025x321 input, 96 max disparity. +INetworkDefinition* createNVSmall1025x321Network(IBuilder& builder, IPluginContainer& plugin_factory, + DimsCHW img_dims, const weight_map& weights, DataType data_type, ILogger& log); + +// Tiny version of NVSmall DNN: 513x161 input, 48 max disparity. +INetworkDefinition* createNVTiny513x161Network(IBuilder& builder, IPluginContainer& plugin_factory, + DimsCHW img_dims, const weight_map& weights, DataType data_type, + ILogger& log); + +// Baseline ResNet-18 DNN: 1025x321 input, 136 max disparity. +INetworkDefinition* createResNet18_1025x321Network(IBuilder& builder, IPluginContainer& plugin_factory, + DimsCHW img_dims, const weight_map& weights, DataType data_type, + ILogger& log); +} } + +class Logger : public nvinfer1::ILogger +{ + public: + void log(nvinfer1::ILogger::Severity severity, const char* msg) override + { + // Skip info (verbose) messages. 
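+        // The check below is left commented out so that all TRT messages,
+        // including kINFO, are printed; uncomment it to silence INFO output.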
+        // if (severity == Severity::kINFO)
+        //     return;
+
+        switch (severity)
+        {
+        case Severity::kINTERNAL_ERROR: std::cerr << "TRT INTERNAL_ERROR: "; break;
+        case Severity::kERROR:          std::cerr << "TRT ERROR: ";          break;
+        case Severity::kWARNING:        std::cerr << "TRT WARNING: ";        break;
+        case Severity::kINFO:           std::cerr << "TRT INFO: ";           break;
+        default:                        std::cerr << "TRT UNKNOWN: ";        break;
+        }
+        std::cerr << msg << std::endl;
+    }
+};
+
+static Logger gLogger;
+
+std::vector<float> readImgFile(const std::string& filename, int w, int h)
+{
+    auto img = cv::imread(filename);
+    assert(img.data != nullptr);
+    // 0. Convert to float.
+    img.convertTo(img, CV_32F);
+    // 1. Resize.
+    cv::resize(img, img, cv::Size(w, h), 0, 0, cv::INTER_AREA);
+    // 2. Convert BGR -> RGB.
+    cv::cvtColor(img, img, CV_BGR2RGB);
+    // 3. Convert HWC -> CHW.
+    cv::Mat res = img.reshape(1, w * h).t();
+    // 4. Scale.
+    res /= 255.0;
+    return std::vector<float>(res.ptr<float>(0), res.ptr<float>(0) + w * h * 3);
+}
+
+std::vector<float> readBinFile(const std::string& filename)
+{
+    std::ifstream input_file(filename, std::ios::binary | std::ios::ate);
+    assert(input_file.is_open());
+    size_t size = input_file.tellg();
+    input_file.seekg(0, std::ios_base::beg);
+    std::vector<float> data(size / sizeof(float));
+    input_file.read((char*)data.data(), size);
+    return data;
+}
+
+std::unordered_map<std::string, Weights> readWeights(const std::string& filename, DataType data_type)
+{
+    assert(data_type == DataType::kFLOAT || data_type == DataType::kHALF);
+
+    std::unordered_map<std::string, Weights> weights;
+    std::ifstream weights_file(filename, std::ios::binary);
+    assert(weights_file.is_open());
+    while (weights_file.peek() != std::ifstream::traits_type::eof())
+    {
+        std::string name;
+        uint32_t    count;
+        Weights     w {data_type, nullptr, 0};
+        std::getline(weights_file, name, '\0');
+        weights_file.read(reinterpret_cast<char*>(&count), sizeof(uint32_t));
+        w.count = count;
+        size_t el_size_bytes = data_type == DataType::kFLOAT ? 4 : 2;
+        auto p = new uint8_t[count * el_size_bytes];
+        weights_file.read(reinterpret_cast<char*>(p), count * el_size_bytes);
+        w.values = p;
+        assert(weights.find(name) == weights.cend());
+        weights[name] = w;
+    }
+    return weights;
+}
+
+int main(int argc, char** argv)
+{
+    if (argc < 8)
+    {
+        printf("\n"
+               "Usage  : nvstereo_sample_app[_debug] <model type> <width> <height> <weights file> <left image> <right image> <disparity output> [data_type]\n"
+               "where  : model type is the type of the DNN, supported are: nvsmall and resnet18\n"
+               "         width and height are dimensions of the network (e.g. 1025 321)\n"
+               "         weights file is the output of TensorRT model builder script\n"
+               "         left and right are images that will be scaled to <width> x <height>\n"
+               "         disparity output is the output of the network of size <width> x <height> (bin and PNG files are created)\n"
+               "         data type(optional) is the data type of the model: fp32 (default) or fp16\n"
+               "See /models directory for model files\n"
+               "Example: nvstereo_sample_app nvsmall 1025 321 trt_weights.bin img_left.png img_right.png out_disp.bin\n\n");
+        return 1;
+    }
+    //getchar();
+
+    auto model_type = std::string(argv[1]);
+    if (model_type != "nvsmall" && model_type != "resnet18")
+    {
+        printf("Invalid model type %s, supported: nvsmall, resnet18.\n", model_type.c_str());
+        exit(1);
+    }
+
+    DataType data_type = DataType::kFLOAT;
+    if (argc >= 9)
+    {
+        auto d_type = std::string(argv[8]);
+        if (d_type == "fp32")
+            data_type = DataType::kFLOAT;
+        else if (d_type == "fp16")
+            data_type = DataType::kHALF;
+        else
+        {
+            printf("Data type %s is not supported, supported types: fp32, fp16.\n", d_type.c_str());
+            exit(1);
+        }
+    }
+    printf("Using %s data type.\n", data_type == DataType::kFLOAT ?
"fp32" : "fp16"); + + // Read weights. + // Note: the weights object lifetime must be at least the same as engine. + auto weights = readWeights(argv[4], data_type); + printf("Loaded %zu weight sets.\n", weights.size()); + + //const int b = 1; + const int c = 3; + const int h = std::stoi(argv[3]); + const int w = std::stoi(argv[2]); + printf("Using [%d, %d](width, height) as network input dimensions.\n", w, h); + + // Read images. + auto img_left = readImgFile(argv[5], w, h); + //auto img_left = readBinFile(argv[5]); + assert(img_left.size() == (size_t)c * h * w); + auto img_right = readImgFile(argv[6], w, h); + //auto img_right = readBinFile(argv[6]); + assert(img_right.size() == (size_t)c * h * w); + + // Create builder and network. + IBuilder* builder = createInferBuilder(gLogger); + + // Note: the plugin_factory object lifetime must be at least the same as engine. + auto plugin_factory = IPluginContainer::create(gLogger); + // TRT v3 supports FP16 only for the weights (e.g. convolutions) but not the data so use float data type. + INetworkDefinition* network = nullptr; + if (model_type == "nvsmall") + { + if (w == 1025) + network = createNVSmall1025x321Network(*builder, *plugin_factory, DimsCHW { c, h, w }, weights, DataType::kFLOAT, gLogger); + else if (w == 513) + network = createNVTiny513x161Network( *builder, *plugin_factory, DimsCHW { c, h, w }, weights, DataType::kFLOAT, gLogger); + else + assert(false); + } + else if (model_type == "resnet18") + { + if (w == 1025) + network = createResNet18_1025x321Network(*builder, *plugin_factory, DimsCHW { c, h, w }, weights, DataType::kFLOAT, gLogger); + else + { + printf("ResNet-18 model supports only 1025x321 input image.\n"); + exit(1); + } + } + else + assert(false); + + builder->setMaxBatchSize(1); + size_t workspace_bytes = 1024 * 1024 * 1024; + builder->setMaxWorkspaceSize(workspace_bytes); + + builder->setHalf2Mode(data_type == DataType::kHALF); + // Build the network. + auto engine = builder->buildCudaEngine(*network); + network->destroy(); + + // REVIEW alexeyk: serialization is not yet supported. Need to implement IPluginFactory properly. + // IHostMemory *model_stream = engine->serialize(); + // engine->destroy(); + // builder->destroy(); + + // IRuntime* runtime = createInferRuntime(gLogger); + // engine = runtime->deserializeCudaEngine(model_stream->data(), model_stream->size(), nullptr); + // model_stream->destroy(); + + assert(engine->getNbBindings() == 3); + void* buffers[3]; + int in_idx_left = engine->getBindingIndex("left"); + assert(in_idx_left == 0); + int in_idx_right = engine->getBindingIndex("right"); + assert(in_idx_right == 1); + int out_idx = engine->getBindingIndex("disp"); + assert(out_idx == 2); + + IExecutionContext *context = engine->createExecutionContext(); + + std::vector output(h * w); + + // Allocate GPU memory and copy data. + CHECK(cudaMalloc(&buffers[in_idx_left], img_left.size() * sizeof(float))); + CHECK(cudaMalloc(&buffers[in_idx_right], img_right.size() * sizeof(float))); + CHECK(cudaMalloc(&buffers[out_idx], output.size() * sizeof(float))); + + CHECK(cudaMemcpy(buffers[in_idx_left], img_left.data(), img_left.size() * sizeof(float), cudaMemcpyHostToDevice)); + CHECK(cudaMemcpy(buffers[in_idx_right], img_right.data(), img_right.size() * sizeof(float), cudaMemcpyHostToDevice)); + + // Do the inference. 
+ auto host_start = std::chrono::high_resolution_clock::now(); + auto err = context->execute(1, buffers); + auto host_end = std::chrono::high_resolution_clock::now(); + assert(err); + UNUSED(err); + auto host_elapsed_ms = std::chrono::duration(host_end - host_start).count(); + printf("Host time: %.4fms\n", host_elapsed_ms); + + // Copy output back to host. + CHECK(cudaMemcpy(output.data(), buffers[out_idx], output.size() * sizeof(float), cudaMemcpyDeviceToHost)); + + // Write results. + // 1. As binary file. + auto res_file = std::ofstream(argv[7], std::ios::binary); + res_file.write((char*)output.data(), output.size() * sizeof(float)); + // 2. As PNG image. + auto img_f = cv::Mat(h, w, CV_32F, output.data()); + img_f *= 256; + cv::Mat img_u16; + img_f.convertTo(img_u16, CV_16U); + cv::imwrite(std::string(argv[7]) + ".png", img_u16); + + // Cleanup. + context->destroy(); + engine->destroy(); + for (auto b: buffers) + CHECK(cudaFree(b)); + + printf("Done\n"); + return 0; +} \ No newline at end of file diff --git a/stereoDNN/sample_app/nvsmall_1025x321_net.cpp b/stereoDNN/sample_app/nvsmall_1025x321_net.cpp new file mode 100644 index 0000000..26ba02a --- /dev/null +++ b/stereoDNN/sample_app/nvsmall_1025x321_net.cpp @@ -0,0 +1,369 @@ +// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. + +//------------------------------------------------------------------- +// !!! This file was automatically generated. Do not edit. !!! +//------------------------------------------------------------------- + +#include +#include +#include +#include +#include "redtail_tensorrt_plugins.h" + +namespace redtail { namespace tensorrt +{ + +using namespace nvinfer1; + +using weight_map = std::unordered_map; + +INetworkDefinition* createNVSmall1025x321Network(IBuilder& builder, IPluginContainer& plugin_factory, + DimsCHW img_dims, const weight_map& weights, DataType data_type, + ILogger& log) +{ + INetworkDefinition* network = builder.createNetwork(); + assert(network != nullptr); + // Input tensor. + auto left = network->addInput("left", DataType::kFLOAT, img_dims); + assert(left != nullptr); + + // Input tensor. + auto right = network->addInput("right", DataType::kFLOAT, img_dims); + assert(right != nullptr); + + // Scaling op. + auto left_scale = network->addScale(*left, ScaleMode::kUNIFORM, + weights.at("left_scale_shift"), weights.at("left_scale_scale"), weights.at("left_scale_power")); + assert(left_scale != nullptr); + + // Scaling op. + auto right_scale = network->addScale(*right, ScaleMode::kUNIFORM, + weights.at("right_scale_shift"), weights.at("right_scale_scale"), weights.at("right_scale_power")); + assert(right_scale != nullptr); + + // left_conv1 convolution op. + auto left_conv1 = network->addConvolution(*left_scale->getOutput(0), 32, DimsHW {5, 5}, + weights.at("left_conv1_k"), weights.at("left_conv1_b")); + assert(left_conv1 != nullptr); + left_conv1->setStride( DimsHW {2, 2}); + left_conv1->setPadding(DimsHW {2, 2}); + + // left_conv1_act ELU activation op. + auto left_conv1_act = addElu(plugin_factory, *network, *left_conv1->getOutput(0), data_type, "left_conv1_act"); + assert(left_conv1_act != nullptr); + + // right_conv1 convolution op. 
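+    // (The right tower mirrors the left one layer for layer; its weights
+    // are stored under separate "right_*" keys in the weight map.)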
+ auto right_conv1 = network->addConvolution(*right_scale->getOutput(0), 32, DimsHW {5, 5}, + weights.at("right_conv1_k"), weights.at("right_conv1_b")); + assert(right_conv1 != nullptr); + right_conv1->setStride( DimsHW {2, 2}); + right_conv1->setPadding(DimsHW {2, 2}); + + // right_conv1_act ELU activation op. + auto right_conv1_act = addElu(plugin_factory, *network, *right_conv1->getOutput(0), data_type, "right_conv1_act"); + assert(right_conv1_act != nullptr); + + // left_conv2 convolution op. + auto left_conv2 = network->addConvolution(*left_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_conv2_k"), weights.at("left_conv2_b")); + assert(left_conv2 != nullptr); + left_conv2->setStride( DimsHW {1, 1}); + left_conv2->setPadding(DimsHW {1, 1}); + + // left_conv2_act ELU activation op. + auto left_conv2_act = addElu(plugin_factory, *network, *left_conv2->getOutput(0), data_type, "left_conv2_act"); + assert(left_conv2_act != nullptr); + + // right_conv2 convolution op. + auto right_conv2 = network->addConvolution(*right_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_conv2_k"), weights.at("right_conv2_b")); + assert(right_conv2 != nullptr); + right_conv2->setStride( DimsHW {1, 1}); + right_conv2->setPadding(DimsHW {1, 1}); + + // right_conv2_act ELU activation op. + auto right_conv2_act = addElu(plugin_factory, *network, *right_conv2->getOutput(0), data_type, "right_conv2_act"); + assert(right_conv2_act != nullptr); + + // left_conv3 convolution op. + auto left_conv3 = network->addConvolution(*left_conv2_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_conv3_k"), weights.at("left_conv3_b")); + assert(left_conv3 != nullptr); + left_conv3->setStride( DimsHW {1, 1}); + left_conv3->setPadding(DimsHW {1, 1}); + + // left_conv3_act ELU activation op. + auto left_conv3_act = addElu(plugin_factory, *network, *left_conv3->getOutput(0), data_type, "left_conv3_act"); + assert(left_conv3_act != nullptr); + + // right_conv3 convolution op. + auto right_conv3 = network->addConvolution(*right_conv2_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_conv3_k"), weights.at("right_conv3_b")); + assert(right_conv3 != nullptr); + right_conv3->setStride( DimsHW {1, 1}); + right_conv3->setPadding(DimsHW {1, 1}); + + // right_conv3_act ELU activation op. + auto right_conv3_act = addElu(plugin_factory, *network, *right_conv3->getOutput(0), data_type, "right_conv3_act"); + assert(right_conv3_act != nullptr); + + // left_conv4 convolution op. + auto left_conv4 = network->addConvolution(*left_conv3_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_conv4_k"), weights.at("left_conv4_b")); + assert(left_conv4 != nullptr); + left_conv4->setStride( DimsHW {1, 1}); + left_conv4->setPadding(DimsHW {1, 1}); + + // left_conv4_act ELU activation op. + auto left_conv4_act = addElu(plugin_factory, *network, *left_conv4->getOutput(0), data_type, "left_conv4_act"); + assert(left_conv4_act != nullptr); + + // right_conv4 convolution op. + auto right_conv4 = network->addConvolution(*right_conv3_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_conv4_k"), weights.at("right_conv4_b")); + assert(right_conv4 != nullptr); + right_conv4->setStride( DimsHW {1, 1}); + right_conv4->setPadding(DimsHW {1, 1}); + + // right_conv4_act ELU activation op. + auto right_conv4_act = addElu(plugin_factory, *network, *right_conv4->getOutput(0), data_type, "right_conv4_act"); + assert(right_conv4_act != nullptr); + + // left_conv5 convolution op. 
+ auto left_conv5 = network->addConvolution(*left_conv4_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_conv5_k"), weights.at("left_conv5_b")); + assert(left_conv5 != nullptr); + left_conv5->setStride( DimsHW {1, 1}); + left_conv5->setPadding(DimsHW {1, 1}); + + // right_conv5 convolution op. + auto right_conv5 = network->addConvolution(*right_conv4_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_conv5_k"), weights.at("right_conv5_b")); + assert(right_conv5 != nullptr); + right_conv5->setStride( DimsHW {1, 1}); + right_conv5->setPadding(DimsHW {1, 1}); + + // cost_vol cost volume op. + auto cost_vol = addCostVolume(plugin_factory, *network, *left_conv5->getOutput(0), *right_conv5->getOutput(0), 48, "cost_vol"); + assert(cost_vol != nullptr); + + // conv3D_1 3D convolution op. + auto conv3D_1 = addConv3D(plugin_factory, *network, *cost_vol->getOutput(0), + Conv3DType::kTensorFlow, {5, {32, 3, 64, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_1_k"), weights.at("conv3D_1_b"), + "conv3D_1"); + assert(conv3D_1 != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_1_tran = addTransform(plugin_factory, *network, *conv3D_1->getOutput(0), {1, 0, 2, 3}, "conv3D_1_tran_transform"); + assert(conv3D_1_tran != nullptr); + + // conv3D_1_act ELU activation op. + auto conv3D_1_act = addElu(plugin_factory, *network, *conv3D_1_tran->getOutput(0), data_type, "conv3D_1_act"); + assert(conv3D_1_act != nullptr); + + // conv3D_2 3D convolution op. + auto conv3D_2 = addConv3D(plugin_factory, *network, *conv3D_1_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {32, 3, 32, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_2_k"), weights.at("conv3D_2_b"), + "conv3D_2"); + assert(conv3D_2 != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_2_tran = addTransform(plugin_factory, *network, *conv3D_2->getOutput(0), {1, 0, 2, 3}, "conv3D_2_tran_transform"); + assert(conv3D_2_tran != nullptr); + + // conv3D_2_act ELU activation op. + auto conv3D_2_act = addElu(plugin_factory, *network, *conv3D_2_tran->getOutput(0), data_type, "conv3D_2_act"); + assert(conv3D_2_act != nullptr); + + // conv3D_3ds_pad padding op. + auto conv3D_3ds_pad = addPad(plugin_factory, *network, *conv3D_2_act->getOutput(0), {0, 0, 0, 0}, {1, 0, 0, 0}, "conv3D_3ds_pad"); + assert(conv3D_3ds_pad != nullptr); + + // conv3D_3ds 3D convolution op. + auto conv3D_3ds = addConv3D(plugin_factory, *network, *conv3D_3ds_pad->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 32, 3, 3}}, + Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_3ds_k"), weights.at("conv3D_3ds_b"), + "conv3D_3ds"); + assert(conv3D_3ds != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_3ds_tran = addTransform(plugin_factory, *network, *conv3D_3ds->getOutput(0), {1, 0, 2, 3}, "conv3D_3ds_tran_transform"); + assert(conv3D_3ds_tran != nullptr); + + // conv3D_3ds_act ELU activation op. + auto conv3D_3ds_act = addElu(plugin_factory, *network, *conv3D_3ds_tran->getOutput(0), data_type, "conv3D_3ds_act"); + assert(conv3D_3ds_act != nullptr); + + // conv3D_4 3D convolution op. 
+ auto conv3D_4 = addConv3D(plugin_factory, *network, *conv3D_3ds_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_4_k"), weights.at("conv3D_4_b"), + "conv3D_4"); + assert(conv3D_4 != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_4_tran = addTransform(plugin_factory, *network, *conv3D_4->getOutput(0), {1, 0, 2, 3}, "conv3D_4_tran_transform"); + assert(conv3D_4_tran != nullptr); + + // conv3D_4_act ELU activation op. + auto conv3D_4_act = addElu(plugin_factory, *network, *conv3D_4_tran->getOutput(0), data_type, "conv3D_4_act"); + assert(conv3D_4_act != nullptr); + + // conv3D_5 3D convolution op. + auto conv3D_5 = addConv3D(plugin_factory, *network, *conv3D_4_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_5_k"), weights.at("conv3D_5_b"), + "conv3D_5"); + assert(conv3D_5 != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_5_tran = addTransform(plugin_factory, *network, *conv3D_5->getOutput(0), {1, 0, 2, 3}, "conv3D_5_tran_transform"); + assert(conv3D_5_tran != nullptr); + + // conv3D_5_act ELU activation op. + auto conv3D_5_act = addElu(plugin_factory, *network, *conv3D_5_tran->getOutput(0), data_type, "conv3D_5_act"); + assert(conv3D_5_act != nullptr); + + // conv3D_6ds_pad padding op. + auto conv3D_6ds_pad = addPad(plugin_factory, *network, *conv3D_5_act->getOutput(0), {0, 0, 0, 0}, {1, 0, 0, 0}, "conv3D_6ds_pad"); + assert(conv3D_6ds_pad != nullptr); + + // conv3D_6ds 3D convolution op. + auto conv3D_6ds = addConv3D(plugin_factory, *network, *conv3D_6ds_pad->getOutput(0), + Conv3DType::kTensorFlow, {5, {128, 3, 64, 3, 3}}, + Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_6ds_k"), weights.at("conv3D_6ds_b"), + "conv3D_6ds"); + assert(conv3D_6ds != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_6ds_tran = addTransform(plugin_factory, *network, *conv3D_6ds->getOutput(0), {1, 0, 2, 3}, "conv3D_6ds_tran_transform"); + assert(conv3D_6ds_tran != nullptr); + + // conv3D_6ds_act ELU activation op. + auto conv3D_6ds_act = addElu(plugin_factory, *network, *conv3D_6ds_tran->getOutput(0), data_type, "conv3D_6ds_act"); + assert(conv3D_6ds_act != nullptr); + + // conv3D_7 3D convolution op. + auto conv3D_7 = addConv3D(plugin_factory, *network, *conv3D_6ds_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {128, 3, 128, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_7_k"), weights.at("conv3D_7_b"), + "conv3D_7"); + assert(conv3D_7 != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_7_tran = addTransform(plugin_factory, *network, *conv3D_7->getOutput(0), {1, 0, 2, 3}, "conv3D_7_tran_transform"); + assert(conv3D_7_tran != nullptr); + + // conv3D_7_act ELU activation op. + auto conv3D_7_act = addElu(plugin_factory, *network, *conv3D_7_tran->getOutput(0), data_type, "conv3D_7_act"); + assert(conv3D_7_act != nullptr); + + // conv3D_8 3D convolution op. 
+ auto conv3D_8 = addConv3D(plugin_factory, *network, *conv3D_7_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {128, 3, 128, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_8_k"), weights.at("conv3D_8_b"), + "conv3D_8"); + assert(conv3D_8 != nullptr); + + // conv3D_8_act ELU activation op. + auto conv3D_8_act = addElu(plugin_factory, *network, *conv3D_8->getOutput(0), data_type, "conv3D_8_act"); + assert(conv3D_8_act != nullptr); + + // deconv3D_1 3D transposed convolution op. + Dims deconv3D_1_out_dims{4, {25, 64, 81, 257}}; + auto deconv3D_1 = addConv3DTranspose(plugin_factory, *network, *conv3D_8_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {128, 3, 64, 3, 3}}, deconv3D_1_out_dims, + Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {0, 1, 1}}, + weights.at("deconv3D_1_k"), weights.at("deconv3D_1_b"), + "deconv3D_1"); + assert(deconv3D_1 != nullptr); + + // deconv3D_1 output slice op. + auto deconv3D_1_slice_layer = addSlice(plugin_factory, *network, *deconv3D_1->getOutput(0), + deconv3D_1_out_dims, + {4, {0, 0, 0, 0}}, + {4, {deconv3D_1_out_dims.d[0] - 1, deconv3D_1_out_dims.d[1], deconv3D_1_out_dims.d[2], deconv3D_1_out_dims.d[3]}}, + "deconv3D_1_slice"); + assert(deconv3D_1_slice_layer != nullptr); + + // deconv3D_1_add_skip tensor add op. + auto deconv3D_1_add_skip = network->addElementWise(*(deconv3D_1_slice_layer->getOutput(0)), *(conv3D_5_act->getOutput(0)), ElementWiseOperation::kSUM); + assert(deconv3D_1_add_skip != nullptr); + + // deconv3D_1_act ELU activation op. + auto deconv3D_1_act = addElu(plugin_factory, *network, *deconv3D_1_add_skip->getOutput(0), data_type, "deconv3D_1_act"); + assert(deconv3D_1_act != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto deconv3D_1_transform = addTransform(plugin_factory, *network, *deconv3D_1_act->getOutput(0), {1, 0, 2, 3}, "deconv3D_1_transform_transform"); + assert(deconv3D_1_transform != nullptr); + + // deconv3D_2 3D transposed convolution op. + Dims deconv3D_2_out_dims{4, {49, 32, 161, 513}}; + auto deconv3D_2 = addConv3DTranspose(plugin_factory, *network, *deconv3D_1_transform->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 32, 3, 3}}, deconv3D_2_out_dims, + Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {0, 1, 1}}, + weights.at("deconv3D_2_k"), weights.at("deconv3D_2_b"), + "deconv3D_2"); + assert(deconv3D_2 != nullptr); + + // deconv3D_2 output slice op. + auto deconv3D_2_slice_layer = addSlice(plugin_factory, *network, *deconv3D_2->getOutput(0), + deconv3D_2_out_dims, + {4, {0, 0, 0, 0}}, + {4, {deconv3D_2_out_dims.d[0] - 1, deconv3D_2_out_dims.d[1], deconv3D_2_out_dims.d[2], deconv3D_2_out_dims.d[3]}}, + "deconv3D_2_slice"); + assert(deconv3D_2_slice_layer != nullptr); + + // deconv3D_2_add_skip tensor add op. + auto deconv3D_2_add_skip = network->addElementWise(*(deconv3D_2_slice_layer->getOutput(0)), *(conv3D_2_act->getOutput(0)), ElementWiseOperation::kSUM); + assert(deconv3D_2_add_skip != nullptr); + + // deconv3D_2_act ELU activation op. 
+    auto deconv3D_2_act = addElu(plugin_factory, *network, *deconv3D_2_add_skip->getOutput(0), data_type, "deconv3D_2_act");
+    assert(deconv3D_2_act != nullptr);
+
+    // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose
+    auto deconv3D_2_transform = addTransform(plugin_factory, *network, *deconv3D_2_act->getOutput(0), {1, 0, 2, 3}, "deconv3D_2_transform_transform");
+    assert(deconv3D_2_transform != nullptr);
+
+    // deconv3D_3 3D transposed convolution op.
+    Dims deconv3D_3_out_dims{4, {97, 1, 321, 1025}};
+    auto deconv3D_3 = addConv3DTranspose(plugin_factory, *network, *deconv3D_2_transform->getOutput(0),
+                                         Conv3DType::kTensorFlow, {5, {32, 3, 1, 3, 3}}, deconv3D_3_out_dims,
+                                         Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {0, 1, 1}},
+                                         weights.at("deconv3D_3_k"), weights.at("deconv3D_3_b"),
+                                         "deconv3D_3");
+    assert(deconv3D_3 != nullptr);
+
+    // deconv3D_3 output slice op.
+    auto deconv3D_3_slice_layer = addSlice(plugin_factory, *network, *deconv3D_3->getOutput(0),
+                                           deconv3D_3_out_dims,
+                                           {4, {0, 0, 0, 0}},
+                                           {4, {deconv3D_3_out_dims.d[0] - 1, deconv3D_3_out_dims.d[1], deconv3D_3_out_dims.d[2], deconv3D_3_out_dims.d[3]}},
+                                           "deconv3D_3_slice");
+    assert(deconv3D_3_slice_layer != nullptr);
+
+    // Softargmax.
+    auto disp = addSoftargmax(plugin_factory, *network, *deconv3D_3_slice_layer->getOutput(0), "disp_softargmax");
+    assert(disp != nullptr);
+
+    auto disp_out = disp->getOutput(0);
+    disp_out->setName("disp");
+    network->markOutput(*disp_out);
+
+    return network;
+}
+
+} } // namespace
diff --git a/stereoDNN/sample_app/nvtiny_513x161_net.cpp b/stereoDNN/sample_app/nvtiny_513x161_net.cpp
new file mode 100644
index 0000000..5ed7a10
--- /dev/null
+++ b/stereoDNN/sample_app/nvtiny_513x161_net.cpp
@@ -0,0 +1,369 @@
+// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+// Full license terms provided in LICENSE.md file.
+
+//-------------------------------------------------------------------
+// !!! This file was automatically generated. Do not edit. !!!
+//-------------------------------------------------------------------
+
+#include <NvInfer.h>
+#include <cassert>
+#include <unordered_map>
+#include "redtail_tensorrt_plugins.h"
+
+namespace redtail { namespace tensorrt
+{
+
+using namespace nvinfer1;
+
+using weight_map = std::unordered_map<std::string, Weights>;
+
+INetworkDefinition* createNVTiny513x161Network(IBuilder& builder, IPluginContainer& plugin_factory,
+                                               DimsCHW img_dims, const weight_map& weights, DataType data_type,
+                                               ILogger& log)
+{
+    INetworkDefinition* network = builder.createNetwork();
+    assert(network != nullptr);
+    // Input tensor.
+    auto left = network->addInput("left", DataType::kFLOAT, img_dims);
+    assert(left != nullptr);
+
+    // Input tensor.
+    auto right = network->addInput("right", DataType::kFLOAT, img_dims);
+    assert(right != nullptr);
+
+    // Scaling op.
+    auto left_scale = network->addScale(*left, ScaleMode::kUNIFORM,
+                                        weights.at("left_scale_shift"), weights.at("left_scale_scale"), weights.at("left_scale_power"));
+    assert(left_scale != nullptr);
+
+    // Scaling op.
+    auto right_scale = network->addScale(*right, ScaleMode::kUNIFORM,
+                                         weights.at("right_scale_shift"), weights.at("right_scale_scale"), weights.at("right_scale_power"));
+    assert(right_scale != nullptr);
+
+    // left_conv1 convolution op.
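// [Note] The two addScale layers above use TensorRT's uniform scale mode,
// which computes y = (x * scale + shift) ^ power element-wise; the generator
// stores the input-normalization constants in the weights file under the
// "*_scale_shift" / "*_scale_scale" / "*_scale_power" keys. A sketch of the
// equivalent per-element math, with illustrative values only (the trained
// weights file supplies the real ones):

#include <cmath>

float uniformScale(float x, float shift, float scale, float power)
{
    return std::pow(x * scale + shift, power);
}

// e.g. mapping 8-bit pixels into [0, 1] would be scale = 1.0f / 255.0f,
// shift = 0.0f, power = 1.0f.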
+ auto left_conv1 = network->addConvolution(*left_scale->getOutput(0), 32, DimsHW {5, 5}, + weights.at("left_conv1_k"), weights.at("left_conv1_b")); + assert(left_conv1 != nullptr); + left_conv1->setStride( DimsHW {2, 2}); + left_conv1->setPadding(DimsHW {2, 2}); + + // left_conv1_act ELU activation op. + auto left_conv1_act = addElu(plugin_factory, *network, *left_conv1->getOutput(0), data_type, "left_conv1_act"); + assert(left_conv1_act != nullptr); + + // right_conv1 convolution op. + auto right_conv1 = network->addConvolution(*right_scale->getOutput(0), 32, DimsHW {5, 5}, + weights.at("right_conv1_k"), weights.at("right_conv1_b")); + assert(right_conv1 != nullptr); + right_conv1->setStride( DimsHW {2, 2}); + right_conv1->setPadding(DimsHW {2, 2}); + + // right_conv1_act ELU activation op. + auto right_conv1_act = addElu(plugin_factory, *network, *right_conv1->getOutput(0), data_type, "right_conv1_act"); + assert(right_conv1_act != nullptr); + + // left_conv2 convolution op. + auto left_conv2 = network->addConvolution(*left_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_conv2_k"), weights.at("left_conv2_b")); + assert(left_conv2 != nullptr); + left_conv2->setStride( DimsHW {1, 1}); + left_conv2->setPadding(DimsHW {1, 1}); + + // left_conv2_act ELU activation op. + auto left_conv2_act = addElu(plugin_factory, *network, *left_conv2->getOutput(0), data_type, "left_conv2_act"); + assert(left_conv2_act != nullptr); + + // right_conv2 convolution op. + auto right_conv2 = network->addConvolution(*right_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_conv2_k"), weights.at("right_conv2_b")); + assert(right_conv2 != nullptr); + right_conv2->setStride( DimsHW {1, 1}); + right_conv2->setPadding(DimsHW {1, 1}); + + // right_conv2_act ELU activation op. + auto right_conv2_act = addElu(plugin_factory, *network, *right_conv2->getOutput(0), data_type, "right_conv2_act"); + assert(right_conv2_act != nullptr); + + // left_conv3 convolution op. + auto left_conv3 = network->addConvolution(*left_conv2_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_conv3_k"), weights.at("left_conv3_b")); + assert(left_conv3 != nullptr); + left_conv3->setStride( DimsHW {1, 1}); + left_conv3->setPadding(DimsHW {1, 1}); + + // left_conv3_act ELU activation op. + auto left_conv3_act = addElu(plugin_factory, *network, *left_conv3->getOutput(0), data_type, "left_conv3_act"); + assert(left_conv3_act != nullptr); + + // right_conv3 convolution op. + auto right_conv3 = network->addConvolution(*right_conv2_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_conv3_k"), weights.at("right_conv3_b")); + assert(right_conv3 != nullptr); + right_conv3->setStride( DimsHW {1, 1}); + right_conv3->setPadding(DimsHW {1, 1}); + + // right_conv3_act ELU activation op. + auto right_conv3_act = addElu(plugin_factory, *network, *right_conv3->getOutput(0), data_type, "right_conv3_act"); + assert(right_conv3_act != nullptr); + + // left_conv4 convolution op. + auto left_conv4 = network->addConvolution(*left_conv3_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_conv4_k"), weights.at("left_conv4_b")); + assert(left_conv4 != nullptr); + left_conv4->setStride( DimsHW {1, 1}); + left_conv4->setPadding(DimsHW {1, 1}); + + // left_conv4_act ELU activation op. + auto left_conv4_act = addElu(plugin_factory, *network, *left_conv4->getOutput(0), data_type, "left_conv4_act"); + assert(left_conv4_act != nullptr); + + // right_conv4 convolution op. 
+ auto right_conv4 = network->addConvolution(*right_conv3_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_conv4_k"), weights.at("right_conv4_b")); + assert(right_conv4 != nullptr); + right_conv4->setStride( DimsHW {1, 1}); + right_conv4->setPadding(DimsHW {1, 1}); + + // right_conv4_act ELU activation op. + auto right_conv4_act = addElu(plugin_factory, *network, *right_conv4->getOutput(0), data_type, "right_conv4_act"); + assert(right_conv4_act != nullptr); + + // left_conv5 convolution op. + auto left_conv5 = network->addConvolution(*left_conv4_act->getOutput(0), 8, DimsHW {3, 3}, + weights.at("left_conv5_k"), weights.at("left_conv5_b")); + assert(left_conv5 != nullptr); + left_conv5->setStride( DimsHW {1, 1}); + left_conv5->setPadding(DimsHW {1, 1}); + + // right_conv5 convolution op. + auto right_conv5 = network->addConvolution(*right_conv4_act->getOutput(0), 8, DimsHW {3, 3}, + weights.at("right_conv5_k"), weights.at("right_conv5_b")); + assert(right_conv5 != nullptr); + right_conv5->setStride( DimsHW {1, 1}); + right_conv5->setPadding(DimsHW {1, 1}); + + // cost_vol cost volume op. + auto cost_vol = addCostVolume(plugin_factory, *network, *left_conv5->getOutput(0), *right_conv5->getOutput(0), 24, "cost_vol"); + assert(cost_vol != nullptr); + + // conv3D_1 3D convolution op. + auto conv3D_1 = addConv3D(plugin_factory, *network, *cost_vol->getOutput(0), + Conv3DType::kTensorFlow, {5, {16, 3, 16, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_1_k"), weights.at("conv3D_1_b"), + "conv3D_1"); + assert(conv3D_1 != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_1_tran = addTransform(plugin_factory, *network, *conv3D_1->getOutput(0), {1, 0, 2, 3}, "conv3D_1_tran_transform"); + assert(conv3D_1_tran != nullptr); + + // conv3D_1_act ELU activation op. + auto conv3D_1_act = addElu(plugin_factory, *network, *conv3D_1_tran->getOutput(0), data_type, "conv3D_1_act"); + assert(conv3D_1_act != nullptr); + + // conv3D_2 3D convolution op. + auto conv3D_2 = addConv3D(plugin_factory, *network, *conv3D_1_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {16, 3, 16, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_2_k"), weights.at("conv3D_2_b"), + "conv3D_2"); + assert(conv3D_2 != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_2_tran = addTransform(plugin_factory, *network, *conv3D_2->getOutput(0), {1, 0, 2, 3}, "conv3D_2_tran_transform"); + assert(conv3D_2_tran != nullptr); + + // conv3D_2_act ELU activation op. + auto conv3D_2_act = addElu(plugin_factory, *network, *conv3D_2_tran->getOutput(0), data_type, "conv3D_2_act"); + assert(conv3D_2_act != nullptr); + + // conv3D_3ds_pad padding op. + auto conv3D_3ds_pad = addPad(plugin_factory, *network, *conv3D_2_act->getOutput(0), {0, 0, 0, 0}, {1, 0, 0, 0}, "conv3D_3ds_pad"); + assert(conv3D_3ds_pad != nullptr); + + // conv3D_3ds 3D convolution op. 
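// [Note] addCostVolume above is the hand-off from 2D to 3D processing: for
// each of the 24 candidate disparities it pairs the left feature map with the
// right feature map shifted by that disparity. The channel counts suggest a
// concatenation-style (GC-Net-like) volume: conv5 produces 8 maps per side and
// conv3D_1's kernel consumes 16 input channels. A naive CPU sketch of that
// indexing (hypothetical, for intuition only; the CostVolume plugin does this
// on the GPU):

// left/right: C x H x W feature maps; out: D x 2C x H x W.
void costVolumeNaive(const float* left, const float* right, float* out,
                     int C, int H, int W, int D)
{
    for (int d = 0; d < D; ++d)
        for (int c = 0; c < 2 * C; ++c)
            for (int y = 0; y < H; ++y)
                for (int x = 0; x < W; ++x)
                {
                    float v = 0.0f;
                    if (c < C)               // left features, unshifted
                        v = left[(c * H + y) * W + x];
                    else if (x - d >= 0)     // right features shifted by d
                        v = right[((c - C) * H + y) * W + (x - d)];
                    out[((d * 2 * C + c) * H + y) * W + x] = v;
                }
}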
+ auto conv3D_3ds = addConv3D(plugin_factory, *network, *conv3D_3ds_pad->getOutput(0), + Conv3DType::kTensorFlow, {5, {32, 3, 16, 3, 3}}, + Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_3ds_k"), weights.at("conv3D_3ds_b"), + "conv3D_3ds"); + assert(conv3D_3ds != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_3ds_tran = addTransform(plugin_factory, *network, *conv3D_3ds->getOutput(0), {1, 0, 2, 3}, "conv3D_3ds_tran_transform"); + assert(conv3D_3ds_tran != nullptr); + + // conv3D_3ds_act ELU activation op. + auto conv3D_3ds_act = addElu(plugin_factory, *network, *conv3D_3ds_tran->getOutput(0), data_type, "conv3D_3ds_act"); + assert(conv3D_3ds_act != nullptr); + + // conv3D_4 3D convolution op. + auto conv3D_4 = addConv3D(plugin_factory, *network, *conv3D_3ds_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {32, 3, 32, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_4_k"), weights.at("conv3D_4_b"), + "conv3D_4"); + assert(conv3D_4 != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_4_tran = addTransform(plugin_factory, *network, *conv3D_4->getOutput(0), {1, 0, 2, 3}, "conv3D_4_tran_transform"); + assert(conv3D_4_tran != nullptr); + + // conv3D_4_act ELU activation op. + auto conv3D_4_act = addElu(plugin_factory, *network, *conv3D_4_tran->getOutput(0), data_type, "conv3D_4_act"); + assert(conv3D_4_act != nullptr); + + // conv3D_5 3D convolution op. + auto conv3D_5 = addConv3D(plugin_factory, *network, *conv3D_4_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {32, 3, 32, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_5_k"), weights.at("conv3D_5_b"), + "conv3D_5"); + assert(conv3D_5 != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_5_tran = addTransform(plugin_factory, *network, *conv3D_5->getOutput(0), {1, 0, 2, 3}, "conv3D_5_tran_transform"); + assert(conv3D_5_tran != nullptr); + + // conv3D_5_act ELU activation op. + auto conv3D_5_act = addElu(plugin_factory, *network, *conv3D_5_tran->getOutput(0), data_type, "conv3D_5_act"); + assert(conv3D_5_act != nullptr); + + // conv3D_6ds_pad padding op. + auto conv3D_6ds_pad = addPad(plugin_factory, *network, *conv3D_5_act->getOutput(0), {0, 0, 0, 0}, {1, 0, 0, 0}, "conv3D_6ds_pad"); + assert(conv3D_6ds_pad != nullptr); + + // conv3D_6ds 3D convolution op. + auto conv3D_6ds = addConv3D(plugin_factory, *network, *conv3D_6ds_pad->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 32, 3, 3}}, + Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_6ds_k"), weights.at("conv3D_6ds_b"), + "conv3D_6ds"); + assert(conv3D_6ds != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_6ds_tran = addTransform(plugin_factory, *network, *conv3D_6ds->getOutput(0), {1, 0, 2, 3}, "conv3D_6ds_tran_transform"); + assert(conv3D_6ds_tran != nullptr); + + // conv3D_6ds_act ELU activation op. + auto conv3D_6ds_act = addElu(plugin_factory, *network, *conv3D_6ds_tran->getOutput(0), data_type, "conv3D_6ds_act"); + assert(conv3D_6ds_act != nullptr); + + // conv3D_7 3D convolution op. 
+ auto conv3D_7 = addConv3D(plugin_factory, *network, *conv3D_6ds_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_7_k"), weights.at("conv3D_7_b"), + "conv3D_7"); + assert(conv3D_7 != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_7_tran = addTransform(plugin_factory, *network, *conv3D_7->getOutput(0), {1, 0, 2, 3}, "conv3D_7_tran_transform"); + assert(conv3D_7_tran != nullptr); + + // conv3D_7_act ELU activation op. + auto conv3D_7_act = addElu(plugin_factory, *network, *conv3D_7_tran->getOutput(0), data_type, "conv3D_7_act"); + assert(conv3D_7_act != nullptr); + + // conv3D_8 3D convolution op. + auto conv3D_8 = addConv3D(plugin_factory, *network, *conv3D_7_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_8_k"), weights.at("conv3D_8_b"), + "conv3D_8"); + assert(conv3D_8 != nullptr); + + // conv3D_8_act ELU activation op. + auto conv3D_8_act = addElu(plugin_factory, *network, *conv3D_8->getOutput(0), data_type, "conv3D_8_act"); + assert(conv3D_8_act != nullptr); + + // deconv3D_1 3D transposed convolution op. + Dims deconv3D_1_out_dims{4, {13, 32, 41, 129}}; + auto deconv3D_1 = addConv3DTranspose(plugin_factory, *network, *conv3D_8_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 32, 3, 3}}, deconv3D_1_out_dims, + Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {0, 1, 1}}, + weights.at("deconv3D_1_k"), weights.at("deconv3D_1_b"), + "deconv3D_1"); + assert(deconv3D_1 != nullptr); + + // deconv3D_1 output slice op. + auto deconv3D_1_slice_layer = addSlice(plugin_factory, *network, *deconv3D_1->getOutput(0), + deconv3D_1_out_dims, + {4, {0, 0, 0, 0}}, + {4, {deconv3D_1_out_dims.d[0] - 1, deconv3D_1_out_dims.d[1], deconv3D_1_out_dims.d[2], deconv3D_1_out_dims.d[3]}}, + "deconv3D_1_slice"); + assert(deconv3D_1_slice_layer != nullptr); + + // deconv3D_1_add_skip tensor add op. + auto deconv3D_1_add_skip = network->addElementWise(*(deconv3D_1_slice_layer->getOutput(0)), *(conv3D_5_act->getOutput(0)), ElementWiseOperation::kSUM); + assert(deconv3D_1_add_skip != nullptr); + + // deconv3D_1_act ELU activation op. + auto deconv3D_1_act = addElu(plugin_factory, *network, *deconv3D_1_add_skip->getOutput(0), data_type, "deconv3D_1_act"); + assert(deconv3D_1_act != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto deconv3D_1_transform = addTransform(plugin_factory, *network, *deconv3D_1_act->getOutput(0), {1, 0, 2, 3}, "deconv3D_1_transform_transform"); + assert(deconv3D_1_transform != nullptr); + + // deconv3D_2 3D transposed convolution op. + Dims deconv3D_2_out_dims{4, {25, 16, 81, 257}}; + auto deconv3D_2 = addConv3DTranspose(plugin_factory, *network, *deconv3D_1_transform->getOutput(0), + Conv3DType::kTensorFlow, {5, {32, 3, 16, 3, 3}}, deconv3D_2_out_dims, + Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {0, 1, 1}}, + weights.at("deconv3D_2_k"), weights.at("deconv3D_2_b"), + "deconv3D_2"); + assert(deconv3D_2 != nullptr); + + // deconv3D_2 output slice op. 
+    auto deconv3D_2_slice_layer = addSlice(plugin_factory, *network, *deconv3D_2->getOutput(0),
+                                           deconv3D_2_out_dims,
+                                           {4, {0, 0, 0, 0}},
+                                           {4, {deconv3D_2_out_dims.d[0] - 1, deconv3D_2_out_dims.d[1], deconv3D_2_out_dims.d[2], deconv3D_2_out_dims.d[3]}},
+                                           "deconv3D_2_slice");
+    assert(deconv3D_2_slice_layer != nullptr);
+
+    // deconv3D_2_add_skip tensor add op.
+    auto deconv3D_2_add_skip = network->addElementWise(*(deconv3D_2_slice_layer->getOutput(0)), *(conv3D_2_act->getOutput(0)), ElementWiseOperation::kSUM);
+    assert(deconv3D_2_add_skip != nullptr);
+
+    // deconv3D_2_act ELU activation op.
+    auto deconv3D_2_act = addElu(plugin_factory, *network, *deconv3D_2_add_skip->getOutput(0), data_type, "deconv3D_2_act");
+    assert(deconv3D_2_act != nullptr);
+
+    // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose
+    auto deconv3D_2_transform = addTransform(plugin_factory, *network, *deconv3D_2_act->getOutput(0), {1, 0, 2, 3}, "deconv3D_2_transform_transform");
+    assert(deconv3D_2_transform != nullptr);
+
+    // deconv3D_3 3D transposed convolution op.
+    Dims deconv3D_3_out_dims{4, {49, 1, 161, 513}};
+    auto deconv3D_3 = addConv3DTranspose(plugin_factory, *network, *deconv3D_2_transform->getOutput(0),
+                                         Conv3DType::kTensorFlow, {5, {16, 3, 1, 3, 3}}, deconv3D_3_out_dims,
+                                         Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {0, 1, 1}},
+                                         weights.at("deconv3D_3_k"), weights.at("deconv3D_3_b"),
+                                         "deconv3D_3");
+    assert(deconv3D_3 != nullptr);
+
+    // deconv3D_3 output slice op.
+    auto deconv3D_3_slice_layer = addSlice(plugin_factory, *network, *deconv3D_3->getOutput(0),
+                                           deconv3D_3_out_dims,
+                                           {4, {0, 0, 0, 0}},
+                                           {4, {deconv3D_3_out_dims.d[0] - 1, deconv3D_3_out_dims.d[1], deconv3D_3_out_dims.d[2], deconv3D_3_out_dims.d[3]}},
+                                           "deconv3D_3_slice");
+    assert(deconv3D_3_slice_layer != nullptr);
+
+    // Softargmax.
+    auto disp = addSoftargmax(plugin_factory, *network, *deconv3D_3_slice_layer->getOutput(0), "disp_softargmax");
+    assert(disp != nullptr);
+
+    auto disp_out = disp->getOutput(0);
+    disp_out->setName("disp");
+    network->markOutput(*disp_out);
+
+    return network;
+}
+
+} } // namespace
diff --git a/stereoDNN/sample_app/resnet18_1025x321_net.cpp b/stereoDNN/sample_app/resnet18_1025x321_net.cpp
new file mode 100644
index 0000000..426a5fc
--- /dev/null
+++ b/stereoDNN/sample_app/resnet18_1025x321_net.cpp
@@ -0,0 +1,880 @@
+// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+// Full license terms provided in LICENSE.md file.
+
+//-------------------------------------------------------------------
+// !!! This file was automatically generated. Do not edit. !!!
+//-------------------------------------------------------------------
+
+#include <NvInfer.h>
+#include <cassert>
+#include <unordered_map>
+#include "redtail_tensorrt_plugins.h"
+
+namespace redtail { namespace tensorrt
+{
+
+using namespace nvinfer1;
+
+using weight_map = std::unordered_map<std::string, Weights>;
+
+INetworkDefinition* createResNet18_1025x321Network(IBuilder& builder, IPluginContainer& plugin_factory,
+                                                   DimsCHW img_dims, const weight_map& weights, DataType data_type,
+                                                   ILogger& log)
+{
+    INetworkDefinition* network = builder.createNetwork();
+    assert(network != nullptr);
+    // Input tensor.
+    auto left = network->addInput("left", DataType::kFLOAT, img_dims);
+    assert(left != nullptr);
+
+    // Input tensor.
+    auto right = network->addInput("right", DataType::kFLOAT, img_dims);
+    assert(right != nullptr);
+
+    // Scaling op.
+ auto left_scale = network->addScale(*left, ScaleMode::kUNIFORM, + weights.at("left_scale_shift"), weights.at("left_scale_scale"), weights.at("left_scale_power")); + assert(left_scale != nullptr); + + // Scaling op. + auto right_scale = network->addScale(*right, ScaleMode::kUNIFORM, + weights.at("right_scale_shift"), weights.at("right_scale_scale"), weights.at("right_scale_power")); + assert(right_scale != nullptr); + + // left_conv1 convolution op. + auto left_conv1 = network->addConvolution(*left_scale->getOutput(0), 32, DimsHW {5, 5}, + weights.at("left_conv1_k"), weights.at("left_conv1_b")); + assert(left_conv1 != nullptr); + left_conv1->setStride( DimsHW {2, 2}); + left_conv1->setPadding(DimsHW {2, 2}); + + // left_conv1_act ELU activation op. + auto left_conv1_act = addElu(plugin_factory, *network, *left_conv1->getOutput(0), data_type, "left_conv1_act"); + assert(left_conv1_act != nullptr); + + // right_conv1 convolution op. + auto right_conv1 = network->addConvolution(*right_scale->getOutput(0), 32, DimsHW {5, 5}, + weights.at("right_conv1_k"), weights.at("right_conv1_b")); + assert(right_conv1 != nullptr); + right_conv1->setStride( DimsHW {2, 2}); + right_conv1->setPadding(DimsHW {2, 2}); + + // right_conv1_act ELU activation op. + auto right_conv1_act = addElu(plugin_factory, *network, *right_conv1->getOutput(0), data_type, "right_conv1_act"); + assert(right_conv1_act != nullptr); + + // left_resblock1_conv1 convolution op. + auto left_resblock1_conv1 = network->addConvolution(*left_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock1_conv1_k"), weights.at("left_resblock1_conv1_b")); + assert(left_resblock1_conv1 != nullptr); + left_resblock1_conv1->setStride( DimsHW {1, 1}); + left_resblock1_conv1->setPadding(DimsHW {1, 1}); + + // left_resblock1_conv1_act ELU activation op. + auto left_resblock1_conv1_act = addElu(plugin_factory, *network, *left_resblock1_conv1->getOutput(0), data_type, "left_resblock1_conv1_act"); + assert(left_resblock1_conv1_act != nullptr); + + // left_resblock1_conv2 convolution op. + auto left_resblock1_conv2 = network->addConvolution(*left_resblock1_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock1_conv2_k"), weights.at("left_resblock1_conv2_b")); + assert(left_resblock1_conv2 != nullptr); + left_resblock1_conv2->setStride( DimsHW {1, 1}); + left_resblock1_conv2->setPadding(DimsHW {1, 1}); + + // left_resblock1_conv2_add tensor add op. + auto left_resblock1_conv2_add = network->addElementWise(*(left_resblock1_conv2->getOutput(0)), *(left_conv1_act->getOutput(0)), ElementWiseOperation::kSUM); + assert(left_resblock1_conv2_add != nullptr); + + // left_resblock1_conv2_add_act ELU activation op. + auto left_resblock1_conv2_add_act = addElu(plugin_factory, *network, *left_resblock1_conv2_add->getOutput(0), data_type, "left_resblock1_conv2_add_act"); + assert(left_resblock1_conv2_add_act != nullptr); + + // right_resblock1_conv1 convolution op. + auto right_resblock1_conv1 = network->addConvolution(*right_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock1_conv1_k"), weights.at("right_resblock1_conv1_b")); + assert(right_resblock1_conv1 != nullptr); + right_resblock1_conv1->setStride( DimsHW {1, 1}); + right_resblock1_conv1->setPadding(DimsHW {1, 1}); + + // right_resblock1_conv1_act ELU activation op. 
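// [Note] left_resblock1 above is the template for all sixteen residual blocks
// in this file (resblock1 through resblock8 on each side): conv(3x3) -> ELU ->
// conv(3x3) -> element-wise add of the block input -> ELU, all stride 1 /
// pad 1, so the skip shapes match by construction. A hypothetical factored
// form (not part of this diff; assumes the surrounding addElu wrapper and
// weight_map typedef):

static ITensor* addResBlock(IPluginContainer& plugin_factory, INetworkDefinition& network,
                            ITensor& input, int num_maps, const weight_map& weights,
                            const std::string& name, DataType data_type)
{
    auto conv1 = network.addConvolution(input, num_maps, DimsHW{3, 3},
                                        weights.at(name + "_conv1_k"), weights.at(name + "_conv1_b"));
    assert(conv1 != nullptr);
    conv1->setStride( DimsHW{1, 1});
    conv1->setPadding(DimsHW{1, 1});
    auto act1 = addElu(plugin_factory, network, *conv1->getOutput(0), data_type, name + "_conv1_act");
    assert(act1 != nullptr);

    auto conv2 = network.addConvolution(*act1->getOutput(0), num_maps, DimsHW{3, 3},
                                        weights.at(name + "_conv2_k"), weights.at(name + "_conv2_b"));
    assert(conv2 != nullptr);
    conv2->setStride( DimsHW{1, 1});
    conv2->setPadding(DimsHW{1, 1});

    auto add = network.addElementWise(*conv2->getOutput(0), input, ElementWiseOperation::kSUM);
    assert(add != nullptr);

    auto act2 = addElu(plugin_factory, network, *add->getOutput(0), data_type, name + "_conv2_add_act");
    assert(act2 != nullptr);
    return act2->getOutput(0);
}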
+ auto right_resblock1_conv1_act = addElu(plugin_factory, *network, *right_resblock1_conv1->getOutput(0), data_type, "right_resblock1_conv1_act"); + assert(right_resblock1_conv1_act != nullptr); + + // right_resblock1_conv2 convolution op. + auto right_resblock1_conv2 = network->addConvolution(*right_resblock1_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock1_conv2_k"), weights.at("right_resblock1_conv2_b")); + assert(right_resblock1_conv2 != nullptr); + right_resblock1_conv2->setStride( DimsHW {1, 1}); + right_resblock1_conv2->setPadding(DimsHW {1, 1}); + + // right_resblock1_conv2_add tensor add op. + auto right_resblock1_conv2_add = network->addElementWise(*(right_resblock1_conv2->getOutput(0)), *(right_conv1_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(right_resblock1_conv2_add != nullptr); + + // right_resblock1_conv2_add_act ELU activation op. + auto right_resblock1_conv2_add_act = addElu(plugin_factory, *network, *right_resblock1_conv2_add->getOutput(0), data_type, "right_resblock1_conv2_add_act"); + assert(right_resblock1_conv2_add_act != nullptr); + + // left_resblock2_conv1 convolution op. + auto left_resblock2_conv1 = network->addConvolution(*left_resblock1_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock2_conv1_k"), weights.at("left_resblock2_conv1_b")); + assert(left_resblock2_conv1 != nullptr); + left_resblock2_conv1->setStride( DimsHW {1, 1}); + left_resblock2_conv1->setPadding(DimsHW {1, 1}); + + // left_resblock2_conv1_act ELU activation op. + auto left_resblock2_conv1_act = addElu(plugin_factory, *network, *left_resblock2_conv1->getOutput(0), data_type, "left_resblock2_conv1_act"); + assert(left_resblock2_conv1_act != nullptr); + + // left_resblock2_conv2 convolution op. + auto left_resblock2_conv2 = network->addConvolution(*left_resblock2_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock2_conv2_k"), weights.at("left_resblock2_conv2_b")); + assert(left_resblock2_conv2 != nullptr); + left_resblock2_conv2->setStride( DimsHW {1, 1}); + left_resblock2_conv2->setPadding(DimsHW {1, 1}); + + // left_resblock2_conv2_add tensor add op. + auto left_resblock2_conv2_add = network->addElementWise(*(left_resblock2_conv2->getOutput(0)), *(left_resblock1_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(left_resblock2_conv2_add != nullptr); + + // left_resblock2_conv2_add_act ELU activation op. + auto left_resblock2_conv2_add_act = addElu(plugin_factory, *network, *left_resblock2_conv2_add->getOutput(0), data_type, "left_resblock2_conv2_add_act"); + assert(left_resblock2_conv2_add_act != nullptr); + + // right_resblock2_conv1 convolution op. + auto right_resblock2_conv1 = network->addConvolution(*right_resblock1_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock2_conv1_k"), weights.at("right_resblock2_conv1_b")); + assert(right_resblock2_conv1 != nullptr); + right_resblock2_conv1->setStride( DimsHW {1, 1}); + right_resblock2_conv1->setPadding(DimsHW {1, 1}); + + // right_resblock2_conv1_act ELU activation op. + auto right_resblock2_conv1_act = addElu(plugin_factory, *network, *right_resblock2_conv1->getOutput(0), data_type, "right_resblock2_conv1_act"); + assert(right_resblock2_conv1_act != nullptr); + + // right_resblock2_conv2 convolution op. 
+ auto right_resblock2_conv2 = network->addConvolution(*right_resblock2_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock2_conv2_k"), weights.at("right_resblock2_conv2_b")); + assert(right_resblock2_conv2 != nullptr); + right_resblock2_conv2->setStride( DimsHW {1, 1}); + right_resblock2_conv2->setPadding(DimsHW {1, 1}); + + // right_resblock2_conv2_add tensor add op. + auto right_resblock2_conv2_add = network->addElementWise(*(right_resblock2_conv2->getOutput(0)), *(right_resblock1_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(right_resblock2_conv2_add != nullptr); + + // right_resblock2_conv2_add_act ELU activation op. + auto right_resblock2_conv2_add_act = addElu(plugin_factory, *network, *right_resblock2_conv2_add->getOutput(0), data_type, "right_resblock2_conv2_add_act"); + assert(right_resblock2_conv2_add_act != nullptr); + + // left_resblock3_conv1 convolution op. + auto left_resblock3_conv1 = network->addConvolution(*left_resblock2_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock3_conv1_k"), weights.at("left_resblock3_conv1_b")); + assert(left_resblock3_conv1 != nullptr); + left_resblock3_conv1->setStride( DimsHW {1, 1}); + left_resblock3_conv1->setPadding(DimsHW {1, 1}); + + // left_resblock3_conv1_act ELU activation op. + auto left_resblock3_conv1_act = addElu(plugin_factory, *network, *left_resblock3_conv1->getOutput(0), data_type, "left_resblock3_conv1_act"); + assert(left_resblock3_conv1_act != nullptr); + + // left_resblock3_conv2 convolution op. + auto left_resblock3_conv2 = network->addConvolution(*left_resblock3_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock3_conv2_k"), weights.at("left_resblock3_conv2_b")); + assert(left_resblock3_conv2 != nullptr); + left_resblock3_conv2->setStride( DimsHW {1, 1}); + left_resblock3_conv2->setPadding(DimsHW {1, 1}); + + // left_resblock3_conv2_add tensor add op. + auto left_resblock3_conv2_add = network->addElementWise(*(left_resblock3_conv2->getOutput(0)), *(left_resblock2_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(left_resblock3_conv2_add != nullptr); + + // left_resblock3_conv2_add_act ELU activation op. + auto left_resblock3_conv2_add_act = addElu(plugin_factory, *network, *left_resblock3_conv2_add->getOutput(0), data_type, "left_resblock3_conv2_add_act"); + assert(left_resblock3_conv2_add_act != nullptr); + + // right_resblock3_conv1 convolution op. + auto right_resblock3_conv1 = network->addConvolution(*right_resblock2_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock3_conv1_k"), weights.at("right_resblock3_conv1_b")); + assert(right_resblock3_conv1 != nullptr); + right_resblock3_conv1->setStride( DimsHW {1, 1}); + right_resblock3_conv1->setPadding(DimsHW {1, 1}); + + // right_resblock3_conv1_act ELU activation op. + auto right_resblock3_conv1_act = addElu(plugin_factory, *network, *right_resblock3_conv1->getOutput(0), data_type, "right_resblock3_conv1_act"); + assert(right_resblock3_conv1_act != nullptr); + + // right_resblock3_conv2 convolution op. + auto right_resblock3_conv2 = network->addConvolution(*right_resblock3_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock3_conv2_k"), weights.at("right_resblock3_conv2_b")); + assert(right_resblock3_conv2 != nullptr); + right_resblock3_conv2->setStride( DimsHW {1, 1}); + right_resblock3_conv2->setPadding(DimsHW {1, 1}); + + // right_resblock3_conv2_add tensor add op. 
+ auto right_resblock3_conv2_add = network->addElementWise(*(right_resblock3_conv2->getOutput(0)), *(right_resblock2_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(right_resblock3_conv2_add != nullptr); + + // right_resblock3_conv2_add_act ELU activation op. + auto right_resblock3_conv2_add_act = addElu(plugin_factory, *network, *right_resblock3_conv2_add->getOutput(0), data_type, "right_resblock3_conv2_add_act"); + assert(right_resblock3_conv2_add_act != nullptr); + + // left_resblock4_conv1 convolution op. + auto left_resblock4_conv1 = network->addConvolution(*left_resblock3_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock4_conv1_k"), weights.at("left_resblock4_conv1_b")); + assert(left_resblock4_conv1 != nullptr); + left_resblock4_conv1->setStride( DimsHW {1, 1}); + left_resblock4_conv1->setPadding(DimsHW {1, 1}); + + // left_resblock4_conv1_act ELU activation op. + auto left_resblock4_conv1_act = addElu(plugin_factory, *network, *left_resblock4_conv1->getOutput(0), data_type, "left_resblock4_conv1_act"); + assert(left_resblock4_conv1_act != nullptr); + + // left_resblock4_conv2 convolution op. + auto left_resblock4_conv2 = network->addConvolution(*left_resblock4_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock4_conv2_k"), weights.at("left_resblock4_conv2_b")); + assert(left_resblock4_conv2 != nullptr); + left_resblock4_conv2->setStride( DimsHW {1, 1}); + left_resblock4_conv2->setPadding(DimsHW {1, 1}); + + // left_resblock4_conv2_add tensor add op. + auto left_resblock4_conv2_add = network->addElementWise(*(left_resblock4_conv2->getOutput(0)), *(left_resblock3_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(left_resblock4_conv2_add != nullptr); + + // left_resblock4_conv2_add_act ELU activation op. + auto left_resblock4_conv2_add_act = addElu(plugin_factory, *network, *left_resblock4_conv2_add->getOutput(0), data_type, "left_resblock4_conv2_add_act"); + assert(left_resblock4_conv2_add_act != nullptr); + + // right_resblock4_conv1 convolution op. + auto right_resblock4_conv1 = network->addConvolution(*right_resblock3_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock4_conv1_k"), weights.at("right_resblock4_conv1_b")); + assert(right_resblock4_conv1 != nullptr); + right_resblock4_conv1->setStride( DimsHW {1, 1}); + right_resblock4_conv1->setPadding(DimsHW {1, 1}); + + // right_resblock4_conv1_act ELU activation op. + auto right_resblock4_conv1_act = addElu(plugin_factory, *network, *right_resblock4_conv1->getOutput(0), data_type, "right_resblock4_conv1_act"); + assert(right_resblock4_conv1_act != nullptr); + + // right_resblock4_conv2 convolution op. + auto right_resblock4_conv2 = network->addConvolution(*right_resblock4_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock4_conv2_k"), weights.at("right_resblock4_conv2_b")); + assert(right_resblock4_conv2 != nullptr); + right_resblock4_conv2->setStride( DimsHW {1, 1}); + right_resblock4_conv2->setPadding(DimsHW {1, 1}); + + // right_resblock4_conv2_add tensor add op. + auto right_resblock4_conv2_add = network->addElementWise(*(right_resblock4_conv2->getOutput(0)), *(right_resblock3_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(right_resblock4_conv2_add != nullptr); + + // right_resblock4_conv2_add_act ELU activation op. 
+ auto right_resblock4_conv2_add_act = addElu(plugin_factory, *network, *right_resblock4_conv2_add->getOutput(0), data_type, "right_resblock4_conv2_add_act"); + assert(right_resblock4_conv2_add_act != nullptr); + + // left_resblock5_conv1 convolution op. + auto left_resblock5_conv1 = network->addConvolution(*left_resblock4_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock5_conv1_k"), weights.at("left_resblock5_conv1_b")); + assert(left_resblock5_conv1 != nullptr); + left_resblock5_conv1->setStride( DimsHW {1, 1}); + left_resblock5_conv1->setPadding(DimsHW {1, 1}); + + // left_resblock5_conv1_act ELU activation op. + auto left_resblock5_conv1_act = addElu(plugin_factory, *network, *left_resblock5_conv1->getOutput(0), data_type, "left_resblock5_conv1_act"); + assert(left_resblock5_conv1_act != nullptr); + + // left_resblock5_conv2 convolution op. + auto left_resblock5_conv2 = network->addConvolution(*left_resblock5_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock5_conv2_k"), weights.at("left_resblock5_conv2_b")); + assert(left_resblock5_conv2 != nullptr); + left_resblock5_conv2->setStride( DimsHW {1, 1}); + left_resblock5_conv2->setPadding(DimsHW {1, 1}); + + // left_resblock5_conv2_add tensor add op. + auto left_resblock5_conv2_add = network->addElementWise(*(left_resblock5_conv2->getOutput(0)), *(left_resblock4_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(left_resblock5_conv2_add != nullptr); + + // left_resblock5_conv2_add_act ELU activation op. + auto left_resblock5_conv2_add_act = addElu(plugin_factory, *network, *left_resblock5_conv2_add->getOutput(0), data_type, "left_resblock5_conv2_add_act"); + assert(left_resblock5_conv2_add_act != nullptr); + + // right_resblock5_conv1 convolution op. + auto right_resblock5_conv1 = network->addConvolution(*right_resblock4_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock5_conv1_k"), weights.at("right_resblock5_conv1_b")); + assert(right_resblock5_conv1 != nullptr); + right_resblock5_conv1->setStride( DimsHW {1, 1}); + right_resblock5_conv1->setPadding(DimsHW {1, 1}); + + // right_resblock5_conv1_act ELU activation op. + auto right_resblock5_conv1_act = addElu(plugin_factory, *network, *right_resblock5_conv1->getOutput(0), data_type, "right_resblock5_conv1_act"); + assert(right_resblock5_conv1_act != nullptr); + + // right_resblock5_conv2 convolution op. + auto right_resblock5_conv2 = network->addConvolution(*right_resblock5_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock5_conv2_k"), weights.at("right_resblock5_conv2_b")); + assert(right_resblock5_conv2 != nullptr); + right_resblock5_conv2->setStride( DimsHW {1, 1}); + right_resblock5_conv2->setPadding(DimsHW {1, 1}); + + // right_resblock5_conv2_add tensor add op. + auto right_resblock5_conv2_add = network->addElementWise(*(right_resblock5_conv2->getOutput(0)), *(right_resblock4_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(right_resblock5_conv2_add != nullptr); + + // right_resblock5_conv2_add_act ELU activation op. + auto right_resblock5_conv2_add_act = addElu(plugin_factory, *network, *right_resblock5_conv2_add->getOutput(0), data_type, "right_resblock5_conv2_add_act"); + assert(right_resblock5_conv2_add_act != nullptr); + + // left_resblock6_conv1 convolution op. 
+ auto left_resblock6_conv1 = network->addConvolution(*left_resblock5_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock6_conv1_k"), weights.at("left_resblock6_conv1_b")); + assert(left_resblock6_conv1 != nullptr); + left_resblock6_conv1->setStride( DimsHW {1, 1}); + left_resblock6_conv1->setPadding(DimsHW {1, 1}); + + // left_resblock6_conv1_act ELU activation op. + auto left_resblock6_conv1_act = addElu(plugin_factory, *network, *left_resblock6_conv1->getOutput(0), data_type, "left_resblock6_conv1_act"); + assert(left_resblock6_conv1_act != nullptr); + + // left_resblock6_conv2 convolution op. + auto left_resblock6_conv2 = network->addConvolution(*left_resblock6_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock6_conv2_k"), weights.at("left_resblock6_conv2_b")); + assert(left_resblock6_conv2 != nullptr); + left_resblock6_conv2->setStride( DimsHW {1, 1}); + left_resblock6_conv2->setPadding(DimsHW {1, 1}); + + // left_resblock6_conv2_add tensor add op. + auto left_resblock6_conv2_add = network->addElementWise(*(left_resblock6_conv2->getOutput(0)), *(left_resblock5_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(left_resblock6_conv2_add != nullptr); + + // left_resblock6_conv2_add_act ELU activation op. + auto left_resblock6_conv2_add_act = addElu(plugin_factory, *network, *left_resblock6_conv2_add->getOutput(0), data_type, "left_resblock6_conv2_add_act"); + assert(left_resblock6_conv2_add_act != nullptr); + + // right_resblock6_conv1 convolution op. + auto right_resblock6_conv1 = network->addConvolution(*right_resblock5_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock6_conv1_k"), weights.at("right_resblock6_conv1_b")); + assert(right_resblock6_conv1 != nullptr); + right_resblock6_conv1->setStride( DimsHW {1, 1}); + right_resblock6_conv1->setPadding(DimsHW {1, 1}); + + // right_resblock6_conv1_act ELU activation op. + auto right_resblock6_conv1_act = addElu(plugin_factory, *network, *right_resblock6_conv1->getOutput(0), data_type, "right_resblock6_conv1_act"); + assert(right_resblock6_conv1_act != nullptr); + + // right_resblock6_conv2 convolution op. + auto right_resblock6_conv2 = network->addConvolution(*right_resblock6_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock6_conv2_k"), weights.at("right_resblock6_conv2_b")); + assert(right_resblock6_conv2 != nullptr); + right_resblock6_conv2->setStride( DimsHW {1, 1}); + right_resblock6_conv2->setPadding(DimsHW {1, 1}); + + // right_resblock6_conv2_add tensor add op. + auto right_resblock6_conv2_add = network->addElementWise(*(right_resblock6_conv2->getOutput(0)), *(right_resblock5_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(right_resblock6_conv2_add != nullptr); + + // right_resblock6_conv2_add_act ELU activation op. + auto right_resblock6_conv2_add_act = addElu(plugin_factory, *network, *right_resblock6_conv2_add->getOutput(0), data_type, "right_resblock6_conv2_add_act"); + assert(right_resblock6_conv2_add_act != nullptr); + + // left_resblock7_conv1 convolution op. + auto left_resblock7_conv1 = network->addConvolution(*left_resblock6_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock7_conv1_k"), weights.at("left_resblock7_conv1_b")); + assert(left_resblock7_conv1 != nullptr); + left_resblock7_conv1->setStride( DimsHW {1, 1}); + left_resblock7_conv1->setPadding(DimsHW {1, 1}); + + // left_resblock7_conv1_act ELU activation op. 
+ auto left_resblock7_conv1_act = addElu(plugin_factory, *network, *left_resblock7_conv1->getOutput(0), data_type, "left_resblock7_conv1_act"); + assert(left_resblock7_conv1_act != nullptr); + + // left_resblock7_conv2 convolution op. + auto left_resblock7_conv2 = network->addConvolution(*left_resblock7_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock7_conv2_k"), weights.at("left_resblock7_conv2_b")); + assert(left_resblock7_conv2 != nullptr); + left_resblock7_conv2->setStride( DimsHW {1, 1}); + left_resblock7_conv2->setPadding(DimsHW {1, 1}); + + // left_resblock7_conv2_add tensor add op. + auto left_resblock7_conv2_add = network->addElementWise(*(left_resblock7_conv2->getOutput(0)), *(left_resblock6_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(left_resblock7_conv2_add != nullptr); + + // left_resblock7_conv2_add_act ELU activation op. + auto left_resblock7_conv2_add_act = addElu(plugin_factory, *network, *left_resblock7_conv2_add->getOutput(0), data_type, "left_resblock7_conv2_add_act"); + assert(left_resblock7_conv2_add_act != nullptr); + + // right_resblock7_conv1 convolution op. + auto right_resblock7_conv1 = network->addConvolution(*right_resblock6_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock7_conv1_k"), weights.at("right_resblock7_conv1_b")); + assert(right_resblock7_conv1 != nullptr); + right_resblock7_conv1->setStride( DimsHW {1, 1}); + right_resblock7_conv1->setPadding(DimsHW {1, 1}); + + // right_resblock7_conv1_act ELU activation op. + auto right_resblock7_conv1_act = addElu(plugin_factory, *network, *right_resblock7_conv1->getOutput(0), data_type, "right_resblock7_conv1_act"); + assert(right_resblock7_conv1_act != nullptr); + + // right_resblock7_conv2 convolution op. + auto right_resblock7_conv2 = network->addConvolution(*right_resblock7_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock7_conv2_k"), weights.at("right_resblock7_conv2_b")); + assert(right_resblock7_conv2 != nullptr); + right_resblock7_conv2->setStride( DimsHW {1, 1}); + right_resblock7_conv2->setPadding(DimsHW {1, 1}); + + // right_resblock7_conv2_add tensor add op. + auto right_resblock7_conv2_add = network->addElementWise(*(right_resblock7_conv2->getOutput(0)), *(right_resblock6_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(right_resblock7_conv2_add != nullptr); + + // right_resblock7_conv2_add_act ELU activation op. + auto right_resblock7_conv2_add_act = addElu(plugin_factory, *network, *right_resblock7_conv2_add->getOutput(0), data_type, "right_resblock7_conv2_add_act"); + assert(right_resblock7_conv2_add_act != nullptr); + + // left_resblock8_conv1 convolution op. + auto left_resblock8_conv1 = network->addConvolution(*left_resblock7_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock8_conv1_k"), weights.at("left_resblock8_conv1_b")); + assert(left_resblock8_conv1 != nullptr); + left_resblock8_conv1->setStride( DimsHW {1, 1}); + left_resblock8_conv1->setPadding(DimsHW {1, 1}); + + // left_resblock8_conv1_act ELU activation op. + auto left_resblock8_conv1_act = addElu(plugin_factory, *network, *left_resblock8_conv1->getOutput(0), data_type, "left_resblock8_conv1_act"); + assert(left_resblock8_conv1_act != nullptr); + + // left_resblock8_conv2 convolution op. 
+ auto left_resblock8_conv2 = network->addConvolution(*left_resblock8_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock8_conv2_k"), weights.at("left_resblock8_conv2_b")); + assert(left_resblock8_conv2 != nullptr); + left_resblock8_conv2->setStride( DimsHW {1, 1}); + left_resblock8_conv2->setPadding(DimsHW {1, 1}); + + // left_resblock8_conv2_add tensor add op. + auto left_resblock8_conv2_add = network->addElementWise(*(left_resblock8_conv2->getOutput(0)), *(left_resblock7_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(left_resblock8_conv2_add != nullptr); + + // left_resblock8_conv2_add_act ELU activation op. + auto left_resblock8_conv2_add_act = addElu(plugin_factory, *network, *left_resblock8_conv2_add->getOutput(0), data_type, "left_resblock8_conv2_add_act"); + assert(left_resblock8_conv2_add_act != nullptr); + + // right_resblock8_conv1 convolution op. + auto right_resblock8_conv1 = network->addConvolution(*right_resblock7_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock8_conv1_k"), weights.at("right_resblock8_conv1_b")); + assert(right_resblock8_conv1 != nullptr); + right_resblock8_conv1->setStride( DimsHW {1, 1}); + right_resblock8_conv1->setPadding(DimsHW {1, 1}); + + // right_resblock8_conv1_act ELU activation op. + auto right_resblock8_conv1_act = addElu(plugin_factory, *network, *right_resblock8_conv1->getOutput(0), data_type, "right_resblock8_conv1_act"); + assert(right_resblock8_conv1_act != nullptr); + + // right_resblock8_conv2 convolution op. + auto right_resblock8_conv2 = network->addConvolution(*right_resblock8_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock8_conv2_k"), weights.at("right_resblock8_conv2_b")); + assert(right_resblock8_conv2 != nullptr); + right_resblock8_conv2->setStride( DimsHW {1, 1}); + right_resblock8_conv2->setPadding(DimsHW {1, 1}); + + // right_resblock8_conv2_add tensor add op. + auto right_resblock8_conv2_add = network->addElementWise(*(right_resblock8_conv2->getOutput(0)), *(right_resblock7_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(right_resblock8_conv2_add != nullptr); + + // right_resblock8_conv2_add_act ELU activation op. + auto right_resblock8_conv2_add_act = addElu(plugin_factory, *network, *right_resblock8_conv2_add->getOutput(0), data_type, "right_resblock8_conv2_add_act"); + assert(right_resblock8_conv2_add_act != nullptr); + + // left_encoder2D_out convolution op. + auto left_encoder2D_out = network->addConvolution(*left_resblock8_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_encoder2D_out_k"), weights.at("left_encoder2D_out_b")); + assert(left_encoder2D_out != nullptr); + left_encoder2D_out->setStride( DimsHW {1, 1}); + left_encoder2D_out->setPadding(DimsHW {1, 1}); + + // right_encoder2D_out convolution op. + auto right_encoder2D_out = network->addConvolution(*right_resblock8_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_encoder2D_out_k"), weights.at("right_encoder2D_out_b")); + assert(right_encoder2D_out != nullptr); + right_encoder2D_out->setStride( DimsHW {1, 1}); + right_encoder2D_out->setPadding(DimsHW {1, 1}); + + // cost_vol cost volume op. + auto cost_vol = addCostVolume(plugin_factory, *network, *left_encoder2D_out->getOutput(0), *right_encoder2D_out->getOutput(0), 68, "cost_vol"); + assert(cost_vol != nullptr); + + // conv3D_1a 3D convolution op. 
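// [Note] The cost volume here uses 68 disparity levels versus 24 in
// nvtiny_513x161 above. Both volumes are built right after the single
// stride-2 conv1 (the residual blocks are all stride 1), i.e. at half input
// resolution, so each level appears to span two full-resolution pixels of
// disparity. Back-of-envelope, under that assumption:

constexpr int kCostVolLevels = 68;  // addCostVolume argument above
constexpr int kDownsample    = 2;   // conv1 stride
constexpr int kFullResDisp   = kCostVolLevels * kDownsample;  // ~136 px at 1025 px width
// nvtiny_513x161 by the same reasoning: 24 * 2 = ~48 px at 513 px width.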
+ auto conv3D_1a = addConv3D(plugin_factory, *network, *cost_vol->getOutput(0), + Conv3DType::kTensorFlow, {5, {32, 3, 64, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_1a_k"), weights.at("conv3D_1a_b"), + "conv3D_1a"); + assert(conv3D_1a != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_1a_tran = addTransform(plugin_factory, *network, *conv3D_1a->getOutput(0), {1, 0, 2, 3}, "conv3D_1a_tran_transform"); + assert(conv3D_1a_tran != nullptr); + + // conv3D_1a_act ELU activation op. + auto conv3D_1a_act = addElu(plugin_factory, *network, *conv3D_1a_tran->getOutput(0), data_type, "conv3D_1a_act"); + assert(conv3D_1a_act != nullptr); + + // conv3D_1b 3D convolution op. + auto conv3D_1b = addConv3D(plugin_factory, *network, *conv3D_1a_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {32, 3, 32, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_1b_k"), weights.at("conv3D_1b_b"), + "conv3D_1b"); + assert(conv3D_1b != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_1b_tran = addTransform(plugin_factory, *network, *conv3D_1b->getOutput(0), {1, 0, 2, 3}, "conv3D_1b_tran_transform"); + assert(conv3D_1b_tran != nullptr); + + // conv3D_1b_act ELU activation op. + auto conv3D_1b_act = addElu(plugin_factory, *network, *conv3D_1b_tran->getOutput(0), data_type, "conv3D_1b_act"); + assert(conv3D_1b_act != nullptr); + + // conv3D_1ds_pad padding op. + auto conv3D_1ds_pad = addPad(plugin_factory, *network, *conv3D_1b_act->getOutput(0), {0, 0, 0, 0}, {1, 0, 0, 0}, "conv3D_1ds_pad"); + assert(conv3D_1ds_pad != nullptr); + + // conv3D_1ds 3D convolution op. + auto conv3D_1ds = addConv3D(plugin_factory, *network, *conv3D_1ds_pad->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 32, 3, 3}}, + Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_1ds_k"), weights.at("conv3D_1ds_b"), + "conv3D_1ds"); + assert(conv3D_1ds != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_1ds_tran = addTransform(plugin_factory, *network, *conv3D_1ds->getOutput(0), {1, 0, 2, 3}, "conv3D_1ds_tran_transform"); + assert(conv3D_1ds_tran != nullptr); + + // conv3D_1ds_act ELU activation op. + auto conv3D_1ds_act = addElu(plugin_factory, *network, *conv3D_1ds_tran->getOutput(0), data_type, "conv3D_1ds_act"); + assert(conv3D_1ds_act != nullptr); + + // conv3D_2a 3D convolution op. + auto conv3D_2a = addConv3D(plugin_factory, *network, *conv3D_1ds_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_2a_k"), weights.at("conv3D_2a_b"), + "conv3D_2a"); + assert(conv3D_2a != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_2a_tran = addTransform(plugin_factory, *network, *conv3D_2a->getOutput(0), {1, 0, 2, 3}, "conv3D_2a_tran_transform"); + assert(conv3D_2a_tran != nullptr); + + // conv3D_2a_act ELU activation op. + auto conv3D_2a_act = addElu(plugin_factory, *network, *conv3D_2a_tran->getOutput(0), data_type, "conv3D_2a_act"); + assert(conv3D_2a_act != nullptr); + + // conv3D_2b 3D convolution op. 
+ auto conv3D_2b = addConv3D(plugin_factory, *network, *conv3D_2a_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_2b_k"), weights.at("conv3D_2b_b"), + "conv3D_2b"); + assert(conv3D_2b != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_2b_tran = addTransform(plugin_factory, *network, *conv3D_2b->getOutput(0), {1, 0, 2, 3}, "conv3D_2b_tran_transform"); + assert(conv3D_2b_tran != nullptr); + + // conv3D_2b_act ELU activation op. + auto conv3D_2b_act = addElu(plugin_factory, *network, *conv3D_2b_tran->getOutput(0), data_type, "conv3D_2b_act"); + assert(conv3D_2b_act != nullptr); + + // conv3D_2ds_pad padding op. + auto conv3D_2ds_pad = addPad(plugin_factory, *network, *conv3D_2b_act->getOutput(0), {0, 0, 0, 0}, {1, 0, 0, 0}, "conv3D_2ds_pad"); + assert(conv3D_2ds_pad != nullptr); + + // conv3D_2ds 3D convolution op. + auto conv3D_2ds = addConv3D(plugin_factory, *network, *conv3D_2ds_pad->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, + Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_2ds_k"), weights.at("conv3D_2ds_b"), + "conv3D_2ds"); + assert(conv3D_2ds != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_2ds_tran = addTransform(plugin_factory, *network, *conv3D_2ds->getOutput(0), {1, 0, 2, 3}, "conv3D_2ds_tran_transform"); + assert(conv3D_2ds_tran != nullptr); + + // conv3D_2ds_act ELU activation op. + auto conv3D_2ds_act = addElu(plugin_factory, *network, *conv3D_2ds_tran->getOutput(0), data_type, "conv3D_2ds_act"); + assert(conv3D_2ds_act != nullptr); + + // conv3D_3a 3D convolution op. + auto conv3D_3a = addConv3D(plugin_factory, *network, *conv3D_2ds_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_3a_k"), weights.at("conv3D_3a_b"), + "conv3D_3a"); + assert(conv3D_3a != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_3a_tran = addTransform(plugin_factory, *network, *conv3D_3a->getOutput(0), {1, 0, 2, 3}, "conv3D_3a_tran_transform"); + assert(conv3D_3a_tran != nullptr); + + // conv3D_3a_act ELU activation op. + auto conv3D_3a_act = addElu(plugin_factory, *network, *conv3D_3a_tran->getOutput(0), data_type, "conv3D_3a_act"); + assert(conv3D_3a_act != nullptr); + + // conv3D_3b 3D convolution op. + auto conv3D_3b = addConv3D(plugin_factory, *network, *conv3D_3a_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_3b_k"), weights.at("conv3D_3b_b"), + "conv3D_3b"); + assert(conv3D_3b != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_3b_tran = addTransform(plugin_factory, *network, *conv3D_3b->getOutput(0), {1, 0, 2, 3}, "conv3D_3b_tran_transform"); + assert(conv3D_3b_tran != nullptr); + + // conv3D_3b_act ELU activation op. + auto conv3D_3b_act = addElu(plugin_factory, *network, *conv3D_3b_tran->getOutput(0), data_type, "conv3D_3b_act"); + assert(conv3D_3b_act != nullptr); + + // conv3D_3ds_pad padding op. 
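// [Note] Each downsampling stage in this 3D encoder pads one extra plane onto
// the disparity axis (the {1, 0, 0, 0} post-padding in the addPad calls, as in
// conv3D_1ds_pad and conv3D_2ds_pad above and conv3D_3ds_pad below) ahead of
// its stride-2 convolution. Assuming the standard relation
//   out = (in + total_pad - kernel) / stride + 1   (integer division),
// the extra plane makes the stride-2 conv land exactly on half the plane
// count, which the decoder's skip connections rely on. Working backward from
// the deconv output dims later in this file, the D axis runs
// 68 -> 34 -> 17 -> 9 -> 5 (a sketch, not part of this diff):

#include <cassert>

int convOutExtent(int in, int total_pad, int kernel, int stride)
{
    return (in + total_pad - kernel) / stride + 1;
}

int main()
{
    assert(convOutExtent(68 + 1, 0, 3, 2) == 34); // conv3D_1ds: addPad plane, no conv D-padding
    assert(convOutExtent(34 + 1, 0, 3, 2) == 17); // conv3D_2ds
    assert(convOutExtent(17 + 1, 2, 3, 2) == 9);  // conv3D_3ds: conv D-padding 1 on both sides
    assert(convOutExtent(9 + 1, 2, 3, 2) == 5);   // conv3D_4ds
    return 0;
}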
+ auto conv3D_3ds_pad = addPad(plugin_factory, *network, *conv3D_3b_act->getOutput(0), {0, 0, 0, 0}, {1, 0, 0, 0}, "conv3D_3ds_pad"); + assert(conv3D_3ds_pad != nullptr); + + // conv3D_3ds 3D convolution op. + auto conv3D_3ds = addConv3D(plugin_factory, *network, *conv3D_3ds_pad->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, + Dims{3, {2, 2, 2}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_3ds_k"), weights.at("conv3D_3ds_b"), + "conv3D_3ds"); + assert(conv3D_3ds != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_3ds_tran = addTransform(plugin_factory, *network, *conv3D_3ds->getOutput(0), {1, 0, 2, 3}, "conv3D_3ds_tran_transform"); + assert(conv3D_3ds_tran != nullptr); + + // conv3D_3ds_act ELU activation op. + auto conv3D_3ds_act = addElu(plugin_factory, *network, *conv3D_3ds_tran->getOutput(0), data_type, "conv3D_3ds_act"); + assert(conv3D_3ds_act != nullptr); + + // conv3D_4a 3D convolution op. + auto conv3D_4a = addConv3D(plugin_factory, *network, *conv3D_3ds_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_4a_k"), weights.at("conv3D_4a_b"), + "conv3D_4a"); + assert(conv3D_4a != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_4a_tran = addTransform(plugin_factory, *network, *conv3D_4a->getOutput(0), {1, 0, 2, 3}, "conv3D_4a_tran_transform"); + assert(conv3D_4a_tran != nullptr); + + // conv3D_4a_act ELU activation op. + auto conv3D_4a_act = addElu(plugin_factory, *network, *conv3D_4a_tran->getOutput(0), data_type, "conv3D_4a_act"); + assert(conv3D_4a_act != nullptr); + + // conv3D_4b 3D convolution op. + auto conv3D_4b = addConv3D(plugin_factory, *network, *conv3D_4a_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_4b_k"), weights.at("conv3D_4b_b"), + "conv3D_4b"); + assert(conv3D_4b != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_4b_tran = addTransform(plugin_factory, *network, *conv3D_4b->getOutput(0), {1, 0, 2, 3}, "conv3D_4b_tran_transform"); + assert(conv3D_4b_tran != nullptr); + + // conv3D_4b_act ELU activation op. + auto conv3D_4b_act = addElu(plugin_factory, *network, *conv3D_4b_tran->getOutput(0), data_type, "conv3D_4b_act"); + assert(conv3D_4b_act != nullptr); + + // conv3D_4ds_pad padding op. + auto conv3D_4ds_pad = addPad(plugin_factory, *network, *conv3D_4b_act->getOutput(0), {0, 0, 0, 0}, {1, 0, 0, 0}, "conv3D_4ds_pad"); + assert(conv3D_4ds_pad != nullptr); + + // conv3D_4ds 3D convolution op. + auto conv3D_4ds = addConv3D(plugin_factory, *network, *conv3D_4ds_pad->getOutput(0), + Conv3DType::kTensorFlow, {5, {128, 3, 64, 3, 3}}, + Dims{3, {2, 2, 2}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_4ds_k"), weights.at("conv3D_4ds_b"), + "conv3D_4ds"); + assert(conv3D_4ds != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_4ds_tran = addTransform(plugin_factory, *network, *conv3D_4ds->getOutput(0), {1, 0, 2, 3}, "conv3D_4ds_tran_transform"); + assert(conv3D_4ds_tran != nullptr); + + // conv3D_4ds_act ELU activation op. 
+ auto conv3D_4ds_act = addElu(plugin_factory, *network, *conv3D_4ds_tran->getOutput(0), data_type, "conv3D_4ds_act"); + assert(conv3D_4ds_act != nullptr); + + // conv3D_5a 3D convolution op. + auto conv3D_5a = addConv3D(plugin_factory, *network, *conv3D_4ds_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {128, 3, 128, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_5a_k"), weights.at("conv3D_5a_b"), + "conv3D_5a"); + assert(conv3D_5a != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_5a_tran = addTransform(plugin_factory, *network, *conv3D_5a->getOutput(0), {1, 0, 2, 3}, "conv3D_5a_tran_transform"); + assert(conv3D_5a_tran != nullptr); + + // conv3D_5a_act ELU activation op. + auto conv3D_5a_act = addElu(plugin_factory, *network, *conv3D_5a_tran->getOutput(0), data_type, "conv3D_5a_act"); + assert(conv3D_5a_act != nullptr); + + // conv3D_5b 3D convolution op. + auto conv3D_5b = addConv3D(plugin_factory, *network, *conv3D_5a_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {128, 3, 128, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_5b_k"), weights.at("conv3D_5b_b"), + "conv3D_5b"); + assert(conv3D_5b != nullptr); + + // conv3D_5b_act ELU activation op. + auto conv3D_5b_act = addElu(plugin_factory, *network, *conv3D_5b->getOutput(0), data_type, "conv3D_5b_act"); + assert(conv3D_5b_act != nullptr); + + // deconv3D_1 3D transposed convolution op. + Dims deconv3D_1_out_dims{4, {9, 64, 21, 65}}; + auto deconv3D_1 = addConv3DTranspose(plugin_factory, *network, *conv3D_5b_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {128, 3, 64, 3, 3}}, deconv3D_1_out_dims, + Dims{3, {2, 2, 2}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("deconv3D_1_k"), weights.at("deconv3D_1_b"), + "deconv3D_1"); + assert(deconv3D_1 != nullptr); + + // deconv3D_1_add_skip tensor add op. + auto deconv3D_1_add_skip = network->addElementWise(*(deconv3D_1->getOutput(0)), *(conv3D_4b_act->getOutput(0)), ElementWiseOperation::kSUM); + assert(deconv3D_1_add_skip != nullptr); + + // deconv3D_1_act ELU activation op. + auto deconv3D_1_act = addElu(plugin_factory, *network, *deconv3D_1_add_skip->getOutput(0), data_type, "deconv3D_1_act"); + assert(deconv3D_1_act != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto deconv3D_1_transform = addTransform(plugin_factory, *network, *deconv3D_1_act->getOutput(0), {1, 0, 2, 3}, "deconv3D_1_transform_transform"); + assert(deconv3D_1_transform != nullptr); + + // deconv3D_2 3D transposed convolution op. + Dims deconv3D_2_out_dims{4, {17, 64, 41, 129}}; + auto deconv3D_2 = addConv3DTranspose(plugin_factory, *network, *deconv3D_1_transform->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, deconv3D_2_out_dims, + Dims{3, {2, 2, 2}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("deconv3D_2_k"), weights.at("deconv3D_2_b"), + "deconv3D_2"); + assert(deconv3D_2 != nullptr); + + // deconv3D_2_add_skip tensor add op. + auto deconv3D_2_add_skip = network->addElementWise(*(deconv3D_2->getOutput(0)), *(conv3D_3b_act->getOutput(0)), ElementWiseOperation::kSUM); + assert(deconv3D_2_add_skip != nullptr); + + // deconv3D_2_act ELU activation op. 
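+    // (As with deconv3D_1 above, each decoder stage sums the transposed
+    //  convolution output with the matching encoder activation before the ELU.)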
+ auto deconv3D_2_act = addElu(plugin_factory, *network, *deconv3D_2_add_skip->getOutput(0), data_type, "deconv3D_2_act"); + assert(deconv3D_2_act != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto deconv3D_2_transform = addTransform(plugin_factory, *network, *deconv3D_2_act->getOutput(0), {1, 0, 2, 3}, "deconv3D_2_transform_transform"); + assert(deconv3D_2_transform != nullptr); + + // deconv3D_3 3D transposed convolution op. + Dims deconv3D_3_out_dims{4, {35, 64, 81, 257}}; + auto deconv3D_3 = addConv3DTranspose(plugin_factory, *network, *deconv3D_2_transform->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, deconv3D_3_out_dims, + Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {0, 1, 1}}, + weights.at("deconv3D_3_k"), weights.at("deconv3D_3_b"), + "deconv3D_3"); + assert(deconv3D_3 != nullptr); + + // deconv3D_3 output slice op. + auto deconv3D_3_slice_layer = addSlice(plugin_factory, *network, *deconv3D_3->getOutput(0), + deconv3D_3_out_dims, + {4, {0, 0, 0, 0}}, + {4, {deconv3D_3_out_dims.d[0] - 1, deconv3D_3_out_dims.d[1], deconv3D_3_out_dims.d[2], deconv3D_3_out_dims.d[3]}}, + "deconv3D_3_slice"); + assert(deconv3D_3_slice_layer != nullptr); + + // deconv3D_3_add_skip tensor add op. + auto deconv3D_3_add_skip = network->addElementWise(*(deconv3D_3_slice_layer->getOutput(0)), *(conv3D_2b_act->getOutput(0)), ElementWiseOperation::kSUM); + assert(deconv3D_3_add_skip != nullptr); + + // deconv3D_3_act ELU activation op. + auto deconv3D_3_act = addElu(plugin_factory, *network, *deconv3D_3_add_skip->getOutput(0), data_type, "deconv3D_3_act"); + assert(deconv3D_3_act != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto deconv3D_3_transform = addTransform(plugin_factory, *network, *deconv3D_3_act->getOutput(0), {1, 0, 2, 3}, "deconv3D_3_transform_transform"); + assert(deconv3D_3_transform != nullptr); + + // deconv3D_4 3D transposed convolution op. + Dims deconv3D_4_out_dims{4, {69, 32, 161, 513}}; + auto deconv3D_4 = addConv3DTranspose(plugin_factory, *network, *deconv3D_3_transform->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 32, 3, 3}}, deconv3D_4_out_dims, + Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {0, 1, 1}}, + weights.at("deconv3D_4_k"), weights.at("deconv3D_4_b"), + "deconv3D_4"); + assert(deconv3D_4 != nullptr); + + // deconv3D_4 output slice op. + auto deconv3D_4_slice_layer = addSlice(plugin_factory, *network, *deconv3D_4->getOutput(0), + deconv3D_4_out_dims, + {4, {0, 0, 0, 0}}, + {4, {deconv3D_4_out_dims.d[0] - 1, deconv3D_4_out_dims.d[1], deconv3D_4_out_dims.d[2], deconv3D_4_out_dims.d[3]}}, + "deconv3D_4_slice"); + assert(deconv3D_4_slice_layer != nullptr); + + // deconv3D_4_add_skip tensor add op. + auto deconv3D_4_add_skip = network->addElementWise(*(deconv3D_4_slice_layer->getOutput(0)), *(conv3D_1b_act->getOutput(0)), ElementWiseOperation::kSUM); + assert(deconv3D_4_add_skip != nullptr); + + // deconv3D_4_act ELU activation op. + auto deconv3D_4_act = addElu(plugin_factory, *network, *deconv3D_4_add_skip->getOutput(0), data_type, "deconv3D_4_act"); + assert(deconv3D_4_act != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto deconv3D_4_transform = addTransform(plugin_factory, *network, *deconv3D_4_act->getOutput(0), {1, 0, 2, 3}, "deconv3D_4_transform_transform"); + assert(deconv3D_4_transform != nullptr); + + // deconv3D_5 3D transposed convolution op. 
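+    // (This final stage reduces to a single channel; its sliced output feeds
+    //  the softargmax that produces the disparity map.)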
+ Dims deconv3D_5_out_dims{4, {137, 1, 321, 1025}}; + auto deconv3D_5 = addConv3DTranspose(plugin_factory, *network, *deconv3D_4_transform->getOutput(0), + Conv3DType::kTensorFlow, {5, {32, 3, 1, 3, 3}}, deconv3D_5_out_dims, + Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {0, 1, 1}}, + weights.at("deconv3D_5_k"), weights.at("deconv3D_5_b"), + "deconv3D_5"); + assert(deconv3D_5 != nullptr); + + // deconv3D_5 output slice op. + auto deconv3D_5_slice_layer = addSlice(plugin_factory, *network, *deconv3D_5->getOutput(0), + deconv3D_5_out_dims, + {4, {0, 0, 0, 0}}, + {4, {deconv3D_5_out_dims.d[0] - 1, deconv3D_5_out_dims.d[1], deconv3D_5_out_dims.d[2], deconv3D_5_out_dims.d[3]}}, + "deconv3D_5_slice"); + assert(deconv3D_5_slice_layer != nullptr); + + // Softargmax. + auto disp = addSoftargmax(plugin_factory, *network, *deconv3D_5_slice_layer->getOutput(0), "disp_softargmax"); + assert(disp != nullptr); + + auto disp_out = disp->getOutput(0); + disp_out->setName("disp"); + network->markOutput(*disp_out); + + return network; +} + +} } // namespace diff --git a/stereoDNN/scripts/data_converters.py b/stereoDNN/scripts/data_converters.py new file mode 100644 index 0000000..3aa28f1 --- /dev/null +++ b/stereoDNN/scripts/data_converters.py @@ -0,0 +1,59 @@ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Full license terms provided in LICENSE.md file. + +""" +Contains data conversion routines used in other scripts. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +def nhwc_to_nchw(src): + """Converts tensor from NHWC to NCHW format. Usually + used for converting input/output tensors. + NHWC format is commonly used by TensorFlow + while NCHW - TensorRT/cuDNN.""" + assert len(src.shape) == 4, "Expected 4D tensor." + return np.transpose(src, [0, 3, 1, 2]) + +def rsck_to_kcrs(src): + """Converts tensor from RSCK to KCRS format. Usually + used for converting convolution filter tensors. + cuDNN notation is used for dimensions where RS are + spatial dimensions (height and width), C - number + of input channels and K - number of output channels. + """ + assert len(src.shape) == 4, "Expected 4D tensor." + return np.transpose(src, [3, 2, 0, 1]) + +def ndhwc_to_ndchw(src): + """Converts tensor from NDHWC to NDCHW format. Usually + used for converting input/output tensors. + NDHWC format is commonly used by TensorFlow + while NDCHW - TensorRT/cuDNN.""" + assert len(src.shape) == 5, "Expected 5D tensor." + return np.transpose(src, [0, 1, 4, 2, 3]) + +def ndhwc_to_ncdhw(src): + """Converts tensor from NDHWC to NCDHW format. Usually + used for converting input/output tensors. + NDHWC format is commonly used by TensorFlow + while NCDHW - TensorRT/cuDNN. This particular format + is used to convert input of 3D convolution from TensorFlow + to cuDNN format.""" + assert len(src.shape) == 5, "Expected 5D tensor." + return np.transpose(src, [0, 4, 1, 2, 3]) + +def vrsck_to_kvcrs(src): + """Converts tensor from VRSCK to KVCRS format. Usually + used for converting convolution filter tensors. + cuDNN notation is used for dimensions where RS are + spatial dimensions (height and width), V - depth + dimension, C - number of input channels + and K - number of output channels. + """ + assert len(src.shape) == 5, "Expected 5D tensor." 
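+    # For example, a hypothetical kernel of shape [3, 5, 5, 16, 32]
+    # (V=3, R=S=5, C=16, K=32) comes out as [32, 3, 16, 5, 5] (KVCRS):
+    #   w = np.zeros([3, 5, 5, 16, 32])
+    #   vrsck_to_kvcrs(w).shape == (32, 3, 16, 5, 5)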
+    return np.transpose(src, [4, 0, 3, 1, 2])
+
diff --git a/stereoDNN/scripts/model_builder.py b/stereoDNN/scripts/model_builder.py
new file mode 100644
index 0000000..7e49d44
--- /dev/null
+++ b/stereoDNN/scripts/model_builder.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Full license terms provided in LICENSE.md file.
+
+"""
+Generates TensorRT C++ API code from TensorFlow model.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+
+import warnings
+# Ignore 'FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated' warning.
+with warnings.catch_warnings():
+    warnings.filterwarnings("ignore", category=FutureWarning, module='h5py')
+    import tensorflow as tf
+
+import tensorrt_model_builder
+import model_nvsmall
+import model_resnet18
+
+def check_model_type(src):
+    if src == 'nvsmall' or src == 'resnet18':
+        return src
+    else:
+        raise argparse.ArgumentTypeError('Invalid model type {}. Supported: nvsmall, resnet18'.format(src))
+
+def check_data_type(src):
+    if src == 'fp32' or src == 'fp16':
+        return src
+    else:
+        raise argparse.ArgumentTypeError('Invalid data type {}. Supported: fp32, fp16'.format(src))
+
+parser = argparse.ArgumentParser(description='Stereo DNN TensorRT C++ code generator')
+
+parser.add_argument('--model_type', type=check_model_type, help='model type, supported: nvsmall, resnet18', required=True)
+parser.add_argument('--net_name', type=str, help='network name to use in C++ code generation', required=True)
+parser.add_argument('--checkpoint_path', type=str, help='path to checkpoint file (without extension)', required=True)
+parser.add_argument('--weights_file', type=str, help='path to generated weights file', required=True)
+parser.add_argument('--cpp_file', type=str, help='path to generated TensorRT C++ model file', required=True)
+parser.add_argument('--data_type', type=check_data_type, help='model data type, supported: fp32, fp16', default='fp32')
+
+args = parser.parse_args()
+
+def read_model(model_path, session):
+    print('Reading model...')
+    saver = tf.train.import_meta_graph(model_path + '.meta')
+    print('Loaded graph meta.')
+    saver.restore(session, model_path)
+    print('Loaded weights.')
+    print('Done reading model.')
+
+def main():
+    config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
+    config.gpu_options.allow_growth = True
+    sess = tf.InteractiveSession(config=config)
+
+    model = read_model(args.checkpoint_path, sess)
+
+    with open(args.weights_file, 'wb') as weights_w:
+        with open(args.cpp_file, 'w') as cpp_w:
+            builder = tensorrt_model_builder.TrtModelBuilder(model, args.net_name, cpp_w, weights_w, args.data_type)
+            if args.model_type == 'nvsmall':
+                model_nvsmall.create(builder)
+            elif args.model_type == 'resnet18':
+                model_resnet18.create(builder)
+            else:
+                # Should never happen given the argparse validation above.
+                assert False, 'Not supported.'
+    print('Done.')
+
+if __name__ == '__main__':
+    main()
diff --git a/stereoDNN/scripts/model_nvsmall.py b/stereoDNN/scripts/model_nvsmall.py
new file mode 100644
index 0000000..cf559ea
--- /dev/null
+++ b/stereoDNN/scripts/model_nvsmall.py
@@ -0,0 +1,68 @@
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Full license terms provided in LICENSE.md file.
+
+"""
+Generates TensorRT code for NVSmall model.
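+The create() function below emits the 2D feature encoder for both cameras,
+the cost volume, the 3D encoder/decoder and the final softargmax, in order.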
+""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +def create(builder): + def write_2d_encoder(): + sides = ['left', 'right'] + builder.write_input(sides[0]) + builder.write_input(sides[1]) + layer_inp = [sides[0] + '_scale', sides[1] + '_scale'] + builder.write_scale(sides[0], layer_inp[0]) + builder.write_scale(sides[1], layer_inp[1]) + for l in ['conv1', 'conv2', 'conv3', 'conv4']: + for i in range(len(sides)): + cur = '{}_{}'.format(sides[i], l) + builder.write_2d_convolution(layer_inp[i], cur, os.path.join('model/encoder2D', l)) + builder.write_elu(cur, cur + '_act') + layer_inp[i] = cur + '_act' + left = builder.write_2d_convolution(layer_inp[0], sides[0] + '_conv5', 'model/encoder2D/conv5') + right = builder.write_2d_convolution(layer_inp[1], sides[1] + '_conv5', 'model/encoder2D/conv5') + return left, right + + def write_3d_encoder(input): + input = 'cost_vol' + for l in ['conv3D_1', 'conv3D_2', 'conv3D_3ds', 'conv3D_4', 'conv3D_5', 'conv3D_6ds', 'conv3D_7', 'conv3D_8']: + # Pad after each D stride == 2. + if l in ['conv3D_3ds', 'conv3D_6ds']: + input = builder.write_conv3d_pad(input, l + '_pad') + input = builder.write_3d_convolution(input, l, 'model/encoder3D') + # No transpose for conv3D_8 as it goes directly to decoder. + if l != 'conv3D_8': + input = builder.write_conv3d_transform(input, l + '_tran') + input = builder.write_elu(input, l + '_act') + return input + + def write_3d_decoder(input): + # deconv3D_1 + cur = builder.write_3d_convolution_transpose(input, 'deconv3D_1', 'model/decoder3D') + cur = builder.write_add_tensors(cur, 'conv3D_5_act', 'deconv3D_1_add_skip') + cur = builder.write_elu(cur, 'deconv3D_1_act') + # deconv3D_2 + cur = builder.write_conv3d_transform(cur, 'deconv3D_1_transform') + cur = builder.write_3d_convolution_transpose(cur, 'deconv3D_2', 'model/decoder3D') + cur = builder.write_add_tensors(cur, 'conv3D_2_act', 'deconv3D_2_add_skip') + cur = builder.write_elu(cur, 'deconv3D_2_act') + # deconv3D_3 + cur = builder.write_conv3d_transform(cur, 'deconv3D_2_transform') + cur = builder.write_3d_convolution_transpose(cur, 'deconv3D_3', 'model/decoder3D') + return cur + + builder.write_header() + builder.do_indent() + left, right = write_2d_encoder() + cur = builder.write_cost_vol(left, right, 'cost_vol', 'model/cost_vol/cost_volume_left') + cur = write_3d_encoder(cur) + cur = write_3d_decoder(cur) + # Softargmax + cur = builder.write_softargmax(cur, 'disp') + builder.write_output(cur) + builder.write_footer() diff --git a/stereoDNN/scripts/model_resnet18.py b/stereoDNN/scripts/model_resnet18.py new file mode 100644 index 0000000..42b93b6 --- /dev/null +++ b/stereoDNN/scripts/model_resnet18.py @@ -0,0 +1,82 @@ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Full license terms provided in LICENSE.md file. + +""" +Generates TensorRT code for NVSmall model. 
+""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +def create(builder): + def write_residual_block(input, name, path): + cur = builder.write_2d_convolution(input, name + '_conv1', os.path.join(path, 'res_conv1')) + cur = builder.write_elu(cur, cur + '_act') + cur = builder.write_2d_convolution(cur, name + '_conv2', os.path.join(path, 'res_conv2')) + cur = builder.write_add_tensors(cur, input, cur + '_add') + cur = builder.write_elu(cur, cur + '_act') + return cur + + def write_2d_encoder(): + sides = ['left', 'right'] + builder.write_input(sides[0]) + builder.write_input(sides[1]) + layer_inp = [sides[0] + '_scale', sides[1] + '_scale'] + builder.write_scale(sides[0], layer_inp[0]) + builder.write_scale(sides[1], layer_inp[1]) + # conv1 + for i in range(len(sides)): + cur = builder.write_2d_convolution(layer_inp[i], '{}_{}'.format(sides[i], 'conv1'), os.path.join('model/encoder2D', 'conv1')) + cur = builder.write_elu(cur, cur + '_act') + layer_inp[i] = cur + # resblock 1 - 8 + for l in ['resblock1', 'resblock2', 'resblock3', 'resblock4', 'resblock5', 'resblock6', 'resblock7', 'resblock8']: + for i in range(len(sides)): + cur = '{}_{}'.format(sides[i], l) + layer_inp[i] = write_residual_block(layer_inp[i], cur, os.path.join('model/encoder2D', l)) + # encoder2D_out + left = builder.write_2d_convolution(layer_inp[0], sides[0] + '_encoder2D_out', 'model/encoder2D/encoder2D_out') + right = builder.write_2d_convolution(layer_inp[1], sides[1] + '_encoder2D_out', 'model/encoder2D/encoder2D_out') + return left, right + + def write_3d_encoder(input): + input = 'cost_vol' + for l in ['conv3D_1a', 'conv3D_1b', 'conv3D_1ds', + 'conv3D_2a', 'conv3D_2b', 'conv3D_2ds', + 'conv3D_3a', 'conv3D_3b', 'conv3D_3ds', + 'conv3D_4a', 'conv3D_4b', 'conv3D_4ds', + 'conv3D_5a', 'conv3D_5b']: + # Pad after each D stride == 2. + if l in ['conv3D_1ds', 'conv3D_2ds', 'conv3D_3ds', 'conv3D_4ds']: + input = builder.write_conv3d_pad(input, l + '_pad') + input = builder.write_3d_convolution(input, l, 'model/encoder3D') + # No transpose for conv3D_5b as it goes directly to decoder. + if l != 'conv3D_5b': + input = builder.write_conv3d_transform(input, l + '_tran') + input = builder.write_elu(input, l + '_act') + return input + + def write_3d_decoder(input): + cur = input + for i in range(1, 5): + l = 'deconv3D_{}'.format(i) + cur = builder.write_3d_convolution_transpose(cur, l, 'model/decoder3D') + cur = builder.write_add_tensors(cur, 'conv3D_{}b_act'.format(5 - i), l + '_add_skip') + cur = builder.write_elu(cur, l + '_act') + cur = builder.write_conv3d_transform(cur, l + '_transform') + # deconv3D_5 + cur = builder.write_3d_convolution_transpose(cur, 'deconv3D_5', 'model/decoder3D') + return cur + + builder.write_header() + builder.do_indent() + left, right = write_2d_encoder() + cur = builder.write_cost_vol(left, right, 'cost_vol', 'model/cost_vol/cost_volume_left') + cur = write_3d_encoder(cur) + cur = write_3d_decoder(cur) + # Softargmax + cur = builder.write_softargmax(cur, 'disp') + builder.write_output(cur) + builder.write_footer() diff --git a/stereoDNN/scripts/tensorrt_model_builder.py b/stereoDNN/scripts/tensorrt_model_builder.py new file mode 100644 index 0000000..b7da802 --- /dev/null +++ b/stereoDNN/scripts/tensorrt_model_builder.py @@ -0,0 +1,462 @@ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Full license terms provided in LICENSE.md file. + +""" +Generates TensorRT C++ API code from TensorFlow model. 
+""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import os +import textwrap as tw +import time +import struct +import sys + +import warnings +# Ignore 'FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated' warning. +with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning, module='h5py') + import tensorflow as tf + +from data_converters import * + +class TrtModelBuilder(object): + def __init__(self, model, net_name, code_writer, weights_writer, data_type): + self.default_indent = 4 + self.cur_indent = 0 + self.max_line_width = 160 + self.indent = tw.TextWrapper(initial_indent=' '*self.cur_indent, + subsequent_indent=' '*(self.cur_indent + self.default_indent), + width=self.max_line_width, break_long_words=False) + self.model = model + self.net_name = net_name + self.code_writer = code_writer + self.weights_writer = weights_writer + self.data_type = data_type + + def _indent_lines(self, src): + src = src.split('\n') + for i in range(len(src)): + src[i] = self.indent.fill(src[i]) + return '\n'.join(src) + + def do_indent(self): + self.cur_indent = self.cur_indent + self.default_indent + self.indent.initial_indent = ' '*self.cur_indent + + def _write_weights(self, name, src): + # Write name as null-terminated string. + self.weights_writer.write(struct.pack('%ds'%len(name), name.encode())) + self.weights_writer.write(struct.pack('B', 0)) + # Weight count and data. + src_flat = np.reshape(src, -1) + src_flat = src_flat.astype(np.float16) if self.data_type == 'fp16' else src_flat.astype(np.float32) + self.weights_writer.write(struct.pack(' +#include +#include +#include +#include "redtail_tensorrt_plugins.h" + +namespace redtail {{ namespace tensorrt +{{ + +using namespace nvinfer1; + +using weight_map = std::unordered_map; + +INetworkDefinition* create{0}Network(IBuilder& builder, IPluginContainer& plugin_factory, + DimsCHW img_dims, const weight_map& weights, DataType data_type, + ILogger& log) +{{ + INetworkDefinition* network = builder.createNetwork(); + assert(network != nullptr); +""" + self.code_writer.write(code.format(self.net_name)) + + def write_footer(self): + code = """\ + return network; +} + +} } // namespace +""" + self.code_writer.write(code) + + def write_input(self, name): + code = """\ +// Input tensor. +auto {0} = network->addInput("{0}", DataType::kFLOAT, img_dims); +assert({0} != nullptr); + +""" + code = code.format(name) + self.code_writer.write(self._indent_lines(code)) + + def write_output(self, output): + code = """\ +auto {0}_out = {0}->getOutput(0); +{0}_out->setName("{0}"); +network->markOutput(*{0}_out); + + """ + self.code_writer.write(self._indent_lines(code.format(output))) + + def write_scale(self, input, name): + # Write code. + code = """\ +// Scaling op. +auto {1} = network->addScale(*{0}, ScaleMode::kUNIFORM, + weights.at("{1}_shift"), weights.at("{1}_scale"), weights.at("{1}_power")); +assert({1} != nullptr); + +""" + code = code.format(input, name) + self.code_writer.write(self._indent_lines(code)) + # REVIEW alexeyk: default for now. 
+        self._write_weights(name + '_shift', [0.0])
+        self._write_weights(name + '_scale', [1.0])
+        self._write_weights(name + '_power', [1.0])
+
+    # TensorFlow padding computation, taken from:
+    # https://www.tensorflow.org/api_guides/python/nn#Convolution
+    def _compute_tf_padding(self, in_dim, kern_dim, stride_dim):
+        if in_dim % stride_dim == 0:
+            pad_along = max(kern_dim - stride_dim, 0)
+        else:
+            pad_along = max(kern_dim - (in_dim % stride_dim), 0)
+        pad_start = pad_along // 2
+        pad_end = pad_along - pad_start
+        return pad_start, pad_end
+
+    def write_2d_convolution(self, input, name, op_path):
+        # Convolution code.
+        conv_code = """\
+// {1} convolution op.
+auto {1} = network->addConvolution(*{0}->getOutput(0), {2}, DimsHW {{{3}, {4}}},
+                                   weights.at("{1}_k"), weights.at("{1}_b"));
+assert({1} != nullptr);
+{1}->setStride( DimsHW {{{5}, {6}}});
+{1}->setPadding(DimsHW {{{7}, {8}}});
+
+"""
+        # Padding code.
+        pad_code = """\
+// {1} padding op.
+auto {1}_pad = network->addPadding(*{0}->getOutput(0), DimsHW {{{2}, {3}}}, DimsHW {{{4}, {5}}});
+assert({1}_pad != nullptr);
+
+"""
+        g = tf.get_default_graph()
+        assert g is not None, "No default graph set"
+        # Get and check convolution operation.
+        conv_op = g.get_operation_by_name(os.path.join(op_path, 'Conv2D'))
+        # Assuming input/output of 2D convolution are in NHWC format.
+        assert conv_op.type == 'Conv2D', 'Expected Conv2D operation but got {}.'.format(conv_op.type)
+        assert conv_op.get_attr('data_format').decode() == 'NHWC', 'Only NHWC format is currently supported for 2D convolutions.'
+        assert len(conv_op.inputs) == 2, 'Convolution expected to have 2 inputs.'
+        assert len(conv_op.outputs) == 1, 'Convolution expected to have only one output.'
+        # Get and check bias operation.
+        bias_op = g.get_operation_by_name(os.path.join(op_path, 'BiasAdd'))
+        assert bias_op.type == 'BiasAdd', 'Expected BiasAdd operation but got {}.'.format(bias_op.type)
+        assert bias_op.get_attr('data_format').decode() == 'NHWC', 'Only NHWC format is currently supported for 2D convolution biases.'
+        assert len(bias_op.inputs) == 2, 'BiasAdd expected to have 2 inputs.'
+        assert len(bias_op.outputs) == 1, 'BiasAdd expected to have only one output.'
+
+        # Get weights, stride and padding.
+        kernel_weights = conv_op.inputs[1].eval()
+        kernel_shape = kernel_weights.shape
+        assert len(kernel_shape) == 4, 'Convolution kernel weights tensor expected to have rank 4.'
+        # Weights in TF are in RSCK format (in TensorRT speak).
+        kh, kw, kc, kk = kernel_shape
+        strides = conv_op.get_attr('strides')
+        assert len(strides) == 4, 'Convolution strides tensor expected to have length 4.'
+        # Compute padding.
+        in_shape = conv_op.inputs[0].shape.as_list()
+        out_shape = conv_op.outputs[0].shape.as_list()
+        pad_top, pad_bottom = self._compute_tf_padding(in_shape[1], kh, strides[1])
+        pad_left, pad_right = self._compute_tf_padding(in_shape[2], kw, strides[2])
+        # If padding is symmetrical, use TensorRT convolution padding;
+        # otherwise, explicitly pad the input.
+        trt_conv_pad_h = 0
+        trt_conv_pad_w = 0
+        code = ''
+        conv_input = input
+        if pad_top == pad_bottom and pad_left == pad_right:
+            trt_conv_pad_h = pad_top
+            trt_conv_pad_w = pad_left
+        else:
+            assert False, 'Not supported at the moment due to bug (#3199) in TRT.'
+            p_b = pad_bottom - pad_top
+            p_r = pad_right - pad_left
+            assert 0 <= p_b and p_b <= 1, 'Bottom pad should not be greater than top pad by more than 1.'
+            assert 0 <= p_r and p_r <= 1, 'Right pad should not be greater than left pad by more than 1.'
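+            # Worked example of the padding arithmetic above: in_dim=8, kern=3,
+            # stride=2 gives pad_along = max(3 - 2, 0) = 1, i.e. pad_start=0 and
+            # pad_end=1 (asymmetrical, landing here); in_dim=9, kern=3, stride=2
+            # gives pad_along=2, i.e. pad_start=pad_end=1 (symmetrical path above).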
+            # Write padding layer.
+            code = pad_code.format(input, name, pad_top, pad_left, pad_bottom, pad_right)
+            conv_input = name + '_pad'
+        # Write code.
+        code += conv_code.format(conv_input, name, kk, kh, kw, strides[1], strides[2], trt_conv_pad_h, trt_conv_pad_w)
+        self.code_writer.write(self._indent_lines(code))
+        # TRT requires kernel weights to be in KCRS format while TensorFlow uses RSCK.
+        kernel_weights = rsck_to_kcrs(kernel_weights)
+        # Write kernel weights.
+        self._write_weights(name + '_k', kernel_weights)
+        # Write bias weights.
+        bias_weights = bias_op.inputs[1].eval()
+        bias_shape = bias_weights.shape
+        assert len(bias_shape) == 1, 'Convolution bias weights tensor expected to have rank 1.'
+        assert bias_shape[0] == kernel_shape[3], 'Convolution bias size does not match convolution output channels.'
+        self._write_weights(name + '_b', bias_weights)
+        return name
+
+    def write_3d_convolution(self, input, name, op_path):
+        # Write code.
+        code = """\
+// {1} 3D convolution op.
+auto {1} = addConv3D(plugin_factory, *network, *{0}->getOutput(0),
+                     Conv3DType::kTensorFlow, {{5, {{{k_dims}}}}},
+                     Dims{{3, {{{stride_dims}}}}}, Dims{{3, {{{pad_start_dims}}}}}, Dims{{3, {{{pad_end_dims}}}}},
+                     weights.at("{1}_k"), weights.at("{1}_b"),
+                     "{1}");
+assert({1} != nullptr);
+
+"""
+        g = tf.get_default_graph()
+        assert g is not None, "No default graph set"
+        # Get and check convolution operation.
+        conv_op = g.get_operation_by_name(os.path.join(op_path, os.path.join(name, 'Conv3D')))
+        # Assuming input/output of 3D convolution are in NDHWC format.
+        assert conv_op.type == 'Conv3D', 'Expected Conv3D operation but got {}.'.format(conv_op.type)
+        assert conv_op.get_attr('data_format').decode() == 'NDHWC', 'Only NDHWC format is currently supported for 3D convolutions.'
+        assert len(conv_op.inputs) == 2, 'Convolution expected to have 2 inputs.'
+        assert len(conv_op.outputs) == 1, 'Convolution expected to have only one output.'
+        # Get and check bias operation.
+        bias_op = g.get_operation_by_name(os.path.join(op_path, os.path.join(name, 'BiasAdd')))
+        assert bias_op.type == 'BiasAdd', 'Expected BiasAdd operation but got {}.'.format(bias_op.type)
+        # REVIEW alexeyk: is this a bug in our model code? Should the bias tensor be NDCHW dim as well?
+        assert bias_op.get_attr('data_format').decode() == 'NHWC', 'Only NHWC format is currently supported for 3D convolution biases.'
+        assert len(bias_op.inputs) == 2, 'BiasAdd expected to have 2 inputs.'
+        assert len(bias_op.outputs) == 1, 'BiasAdd expected to have only one output.'
+
+        # Get weights, stride and padding.
+        kernel_weights = conv_op.inputs[1].eval()
+        kernel_shape = kernel_weights.shape
+        assert len(kernel_shape) == 5, '3D convolution kernel weights tensor expected to have rank 5.'
+        # Weights in TF are in VRSCK format (in TensorRT speak).
+        kd, kh, kw, kc, kk = kernel_shape
+        strides = conv_op.get_attr('strides')
+        assert len(strides) == 5, '3D convolution strides tensor expected to have length 5.'
+        # Compute padding.
+        in_shape = conv_op.inputs[0].shape.as_list()
+        out_shape = conv_op.outputs[0].shape.as_list()
+        pad_c_start, pad_c_end = self._compute_tf_padding(in_shape[1], kd, strides[1])
+        pad_h_start, pad_h_end = self._compute_tf_padding(in_shape[2], kh, strides[2])
+        pad_w_start, pad_w_end = self._compute_tf_padding(in_shape[3], kw, strides[3])
+        # cuDNN limitations...
+        assert pad_h_start == pad_h_end, 'Only symmetrical padding is currently supported for H dimension.'
+        assert pad_w_start == pad_w_end, 'Only symmetrical padding is currently supported for W dimension.'
+        # REVIEW alexeyk: C padding is done by padding plugin now, not by conv3d plugin.
+        p_c = pad_c_end - pad_c_start
+        assert 0 <= p_c and p_c <= 1, 'Depth end pad should not be greater than start pad by more than 1.'
+
+        code = code.format(input, name,
+                           k_dims ='{}, {}, {}, {}, {}'.format(kk, kd, kc, kh, kw),
+                           stride_dims ='{}, {}, {}'.format(strides[1], strides[2], strides[3]),
+                           pad_start_dims ='{}, {}, {}'.format(pad_c_start, pad_h_start, pad_w_start),
+                           pad_end_dims ='{}, {}, {}'.format(pad_c_end, pad_h_end, pad_w_end))
+        self.code_writer.write(self._indent_lines(code))
+        # TRT requires kernel weights to be in KVCRS format while TensorFlow uses VRSCK.
+        kernel_weights = vrsck_to_kvcrs(kernel_weights)
+        # Write kernel weights.
+        self._write_weights(name + '_k', kernel_weights)
+        # Write bias weights.
+        bias_weights = bias_op.inputs[1].eval()
+        bias_shape = bias_weights.shape
+        assert len(bias_shape) == 1, '3D convolution bias weights tensor expected to have rank 1.'
+        # REVIEW alexeyk: should really assert against D?
+        assert bias_shape[0] == kernel_shape[4], '3D convolution bias size does not match convolution output channels.'
+        self._write_weights(name + '_b', bias_weights)
+        return name
+
+    def write_3d_convolution_transpose(self, input, name, op_path):
+        # Write code.
+        code = """\
+// {1} 3D transposed convolution op.
+Dims {1}_out_dims{{4, {{{out_dims}}}}};
+auto {1} = addConv3DTranspose(plugin_factory, *network, *{0}->getOutput(0),
+                              Conv3DType::kTensorFlow, {{5, {{{k_dims}}}}}, {1}_out_dims,
+                              Dims{{3, {{{stride_dims}}}}}, Dims{{3, {{{pad_start_dims}}}}}, Dims{{3, {{{pad_end_dims}}}}},
+                              weights.at("{1}_k"), weights.at("{1}_b"),
+                              "{1}");
+assert({1} != nullptr);
+
+"""
+        slice_code = """\
+// {0} output slice op.
+auto {0}_slice_layer = addSlice(plugin_factory, *network, *{0}->getOutput(0),
+                                {0}_out_dims,
+                                {{4, {{0, 0, 0, 0}}}},
+                                {{4, {{{0}_out_dims.d[0] - 1, {0}_out_dims.d[1], {0}_out_dims.d[2], {0}_out_dims.d[3]}}}},
+                                "{0}_slice");
+assert({0}_slice_layer != nullptr);
+
+"""
+        g = tf.get_default_graph()
+        assert g is not None, "No default graph set"
+        # Get and check transposed convolution operation.
+        conv_op = g.get_operation_by_name(os.path.join(op_path, os.path.join(name, 'conv3d_transpose')))
+        # Assuming input/output of 3D transposed convolution are in NDHWC format.
+        assert conv_op.type == 'Conv3DBackpropInputV2', 'Expected Conv3DBackpropInputV2 operation but got {}.'.format(conv_op.type)
+        assert conv_op.get_attr('data_format').decode() == 'NDHWC', 'Only NDHWC format is currently supported for 3D transposed convolutions.'
+        assert len(conv_op.inputs) == 3, 'Transposed convolution expected to have 3 inputs.'
+        assert len(conv_op.outputs) == 1, 'Transposed convolution expected to have only one output.'
+        # Get and check bias operation.
+        bias_op = g.get_operation_by_name(os.path.join(op_path, os.path.join(name, 'BiasAdd')))
+        assert bias_op.type == 'BiasAdd', 'Expected BiasAdd operation but got {}.'.format(bias_op.type)
+        # REVIEW alexeyk: is this a bug in our model code? Should the bias tensor be NDCHW dim as well?
+        assert bias_op.get_attr('data_format').decode() == 'NHWC', 'Only NHWC format is currently supported for 3D transposed convolution biases.'
+        assert len(bias_op.inputs) == 2, 'BiasAdd expected to have 2 inputs.'
+        assert len(bias_op.outputs) == 1, 'BiasAdd expected to have only one output.'
+
+        # Get weights, stride and padding.
+ kernel_weights = conv_op.inputs[1].eval() + kernel_shape = kernel_weights.shape + assert len(kernel_shape) == 5, '3D transposed convolution kernel weights tensor expected to have rank 5.' + # Weights in TF are in VRSCK format. + kd, kh, kw, kc, kk = kernel_shape + strides = conv_op.get_attr('strides') + assert len(strides) == 5, '3D transposed convolution strides tensor expected to have length 5.' + # Compute padding. + # Note that padding is with respect to convolution input, that is transposed convolution output. + conv_out_shape = conv_op.inputs[0].shape.as_list() + conv_in_shape = np.array(conv_op.outputs[0].shape.as_list()) + pad_c_start, pad_c_end = self._compute_tf_padding(conv_in_shape[1], kd, strides[1]) + pad_h_start, pad_h_end = self._compute_tf_padding(conv_in_shape[2], kh, strides[2]) + pad_w_start, pad_w_end = self._compute_tf_padding(conv_in_shape[3], kw, strides[3]) + # cuDNN limitations... + assert pad_h_start == pad_h_end, 'Only symmetrical padding is currently supported for H dimension.' + assert pad_w_start == pad_w_end, 'Only symmetrical padding is currently supported for W dimension.' + p_c = pad_c_end - pad_c_start + assert 0 <= p_c and p_c <= 1, 'Depth end pad should not be greater than start pad by more than 1.' + # This is a special case for 3D transposed convolution: + # Do not pad in C dimension in cuDNN but rather increase corresponding output (i.e. convo input) dimension. + # In such configuration, this transposed convolution should be followed by slicing plugin. + if pad_c_end != pad_c_start: + conv_in_shape[1] += 1 + pad_c_start = 0 + pad_c_end = 0 + code = code.format(input, name, + k_dims ='{}, {}, {}, {}, {}'.format(kk, kd, kc, kh, kw), + out_dims ='{}, {}, {}, {}'.format(*(conv_in_shape[1:])[[0, 3, 1, 2]]), + stride_dims ='{}, {}, {}'.format(strides[1], strides[2], strides[3]), + pad_start_dims ='{}, {}, {}'.format(pad_c_start, pad_h_start, pad_w_start), + pad_end_dims ='{}, {}, {}'.format(pad_c_end, pad_h_end, pad_w_end)) + self.code_writer.write(self._indent_lines(code)) + out_name = name + # Write slice layer in case of asymmetric C padding. + if p_c != 0: + self.code_writer.write(self._indent_lines(slice_code.format(name))) + out_name += '_slice_layer' + # TRT requires kernel weights to be in KVCRS format while TensorFlow uses VRSCK. + kernel_weights = vrsck_to_kvcrs(kernel_weights) + # Write kernel weights. + self._write_weights(name + '_k', kernel_weights) + # Write bias weights. + bias_weights = bias_op.inputs[1].eval() + bias_shape = bias_weights.shape + assert len(bias_shape) == 1, '3D transposed convolution bias weights tensor expected to have rank 1.' + # REVIEW alexeyk: should really assert against D? + assert bias_shape[0] == kernel_shape[3], '3D transposed convolution bias size does not match convolution output channels.' + self._write_weights(name + '_b', bias_weights) + return out_name + + def write_elu(self, input, name): + # Write code. + code = """\ +// {1} ELU activation op. +auto {1} = addElu(plugin_factory, *network, *{0}->getOutput(0), data_type, "{1}"); +assert({1} != nullptr); + +""" + code = code.format(input, name) + self.code_writer.write(self._indent_lines(code)) + return name + + def write_cost_vol(self, left_input, right_input, name, op_path): + # Write code. + code = """\ +// {2} cost volume op. 
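+// (Concatenates left features with disparity-shifted right features; the
+// output has max_disp disparity levels and twice the input channel count.)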
+auto {2} = addCostVolume(plugin_factory, *network, *{0}->getOutput(0), *{1}->getOutput(0), {3}, "{2}");
+assert({2} != nullptr);
+
+"""
+        g = tf.get_default_graph()
+        assert g is not None, "No default graph set"
+        last_op = g.get_operation_by_name(os.path.join(op_path, 'concat'))
+        out_shape = last_op.outputs[0].shape
+        assert len(out_shape) == 5, 'Cost volume output tensor expected to have rank 5 but got {}.'.format(len(out_shape))
+        # Max disparity is the second outermost dimension in the output.
+        max_disp = out_shape[1]
+        code = code.format(left_input, right_input, name, max_disp)
+        self.code_writer.write(self._indent_lines(code))
+        return name
+
+    def write_conv3d_pad(self, input, name):
+        # Write code.
+        code = """\
+// {1} padding op.
+auto {1} = addPad(plugin_factory, *network, *{0}->getOutput(0), {{0, 0, 0, 0}}, {{1, 0, 0, 0}}, "{1}");
+assert({1} != nullptr);
+
+"""
+        code = code.format(input, name)
+        self.code_writer.write(self._indent_lines(code))
+        return name
+
+    def write_conv3d_transform(self, input, name):
+        code = """\
+// Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose
+auto {1} = addTransform(plugin_factory, *network, *{0}->getOutput(0), {{1, 0, 2, 3}}, "{1}_transform");
+assert({1} != nullptr);
+
+"""
+        code = code.format(input, name)
+        self.code_writer.write(self._indent_lines(code))
+        return name
+
+    def write_add_tensors(self, t1, t2, name):
+        # Write code.
+        code = """\
+// {2} tensor add op.
+auto {2} = network->addElementWise(*({0}->getOutput(0)), *({1}->getOutput(0)), ElementWiseOperation::kSUM);
+assert({2} != nullptr);
+
+"""
+        code = code.format(t1, t2, name)
+        self.code_writer.write(self._indent_lines(code))
+        return name
+
+    def write_softargmax(self, input, name):
+        code = """\
+// Softargmax.
+auto {1} = addSoftargmax(plugin_factory, *network, *{0}->getOutput(0), "{1}_softargmax");
+assert({1} != nullptr);
+
+"""
+        code = code.format(input, name)
+        self.code_writer.write(self._indent_lines(code))
+        return name
diff --git a/stereoDNN/scripts/test_data_generator.py b/stereoDNN/scripts/test_data_generator.py
new file mode 100644
index 0000000..bcf4948
--- /dev/null
+++ b/stereoDNN/scripts/test_data_generator.py
@@ -0,0 +1,311 @@
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Full license terms provided in LICENSE.md file.
+
+"""
+Generates data for nvstereo_inference library tests.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import numpy as np
+import os
+import struct
+import time
+
+import warnings
+# Ignore 'FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated' warning.
+with warnings.catch_warnings():
+    warnings.filterwarnings("ignore", category=FutureWarning, module='h5py')
+    import tensorflow as tf
+
+from data_converters import *
+
+parser = argparse.ArgumentParser(description='Stereo DNN TensorRT C++ test data generator')
+
+parser.add_argument('--data_dir', type=str, help='directory path to store generated test data', required=True)
+
+args = parser.parse_args()
+
+def get_rand(shape):
+    return np.random.randn(*shape).astype(np.float32)
+
+def write_bin(src, filename):
+    with open(filename, 'wb') as w:
+        w.write(struct.pack('<%df' % src.size, *np.reshape(src, -1)))
+
+def create_conv3d_plugin_data(data_dir):
+    print("---")
+    print("Creating data for Conv3D plugin...")
+    print("---")
+    # Testing D padding and strides with 2 subsequent convolutions.
+    # Note: similar to conv3D_3ds->conv3D_4 blocks in NVSmall.
+    np.random.seed(1)
+    x = get_rand([1, 8, 9, 9, 3]) # NDHWC
+    w = get_rand([3, 3, 3, 3, 3]) # VRSCK
+    s_1 = [1, 1, 1, 1, 1]
+    s_2 = [1, 2, 2, 2, 1]
+    # Second convo has non-symmetrical D padding.
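+    # (D=8, kernel 3, stride 2: pad_along = max(3 - 2, 0) = 1, so the 'SAME'
+    #  padding is (0, 1) along D, which the plugin must reproduce.)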
+ y_1 = tf.nn.conv3d(x, w, s_1, padding='SAME') + y_2 = tf.nn.conv3d(y_1, w, s_2, padding='SAME').eval() + write_bin(ndhwc_to_ndchw(x), os.path.join(data_dir, 'conv3d_07_x.bin')) + write_bin(vrsck_to_kvcrs(w), os.path.join(data_dir, 'conv3d_07_w.bin')) + write_bin(ndhwc_to_ndchw(y_2), os.path.join(data_dir, 'conv3d_07_y.bin')) + +def create_conv3d_tran_plugin_data(data_dir): + print("---") + print("Creating data for Conv3dTranspose plugin...") + print("---") + # The most basic test: single input/feature map, no padding/unit strides. + np.random.seed(1) + y = get_rand([1, 1, 1, 1, 1]) # NDHWK + w = get_rand([1, 3, 3, 3, 1]) # VRSCK + s = [1, 1, 1, 1, 1] + x_shape = [1, 1, 3, 3, 3] # NDHWC + x = tf.nn.conv3d_transpose(y, w, output_shape=x_shape, strides=s, padding='VALID').eval() + write_bin(ndhwc_to_ndchw(y), os.path.join(data_dir, 'conv3d_tran_01_y.bin')) + write_bin(vrsck_to_kvcrs(w), os.path.join(data_dir, 'conv3d_tran_01_w.bin')) + write_bin(ndhwc_to_ndchw(x), os.path.join(data_dir, 'conv3d_tran_01_x.bin')) + + # Testing HW padding and strides (D == 1, K == 1). + np.random.seed(1) + y = get_rand([1, 1, 3, 3, 1]) # NDHWK + w = get_rand([1, 3, 3, 3, 1]) # VRSCK + s = [1, 1, 2, 2, 1] + x_shape = [1, 1, 5, 5, 3] # NDHWC + x = tf.nn.conv3d_transpose(y, w, output_shape=x_shape, strides=s, padding='SAME').eval() + write_bin(ndhwc_to_ndchw(y), os.path.join(data_dir, 'conv3d_tran_02_y.bin')) + write_bin(vrsck_to_kvcrs(w), os.path.join(data_dir, 'conv3d_tran_02_w.bin')) + write_bin(ndhwc_to_ndchw(x), os.path.join(data_dir, 'conv3d_tran_02_x.bin')) + + # Testing DHW padding and strides. + # Note: similar to deconv3D_1 convo in NVSmall. + np.random.seed(1) + y = get_rand([1, 4, 5, 5, 8]) # NDHWK + w = get_rand([3, 3, 3, 4, 8]) # VRSCK + s = [1, 2, 2, 2, 1] + x_shape = [1, 8, 9, 9, 4] # NDHWC + x = tf.nn.conv3d_transpose(y, w, output_shape=x_shape, strides=s, padding='SAME').eval() + write_bin(ndhwc_to_ncdhw(y), os.path.join(data_dir, 'conv3d_tran_03_y.bin')) + write_bin(vrsck_to_kvcrs(w), os.path.join(data_dir, 'conv3d_tran_03_w.bin')) + write_bin(ndhwc_to_ndchw(x), os.path.join(data_dir, 'conv3d_tran_03_x.bin')) + + # Testing DHW padding and strides with bias and ELU. + # Note: similar to deconv3D_1 convo in NVSmall. + np.random.seed(1) + y = get_rand([1, 4, 5, 5, 8]) # NDHWK + w = get_rand([3, 3, 3, 4, 8]) # VRSCK + b = get_rand([4]) # C + s = [1, 2, 2, 2, 1] + x_shape = [1, 8, 9, 9, 4] # NDHWC + x = tf.nn.conv3d_transpose(y, w, output_shape=x_shape, strides=s, padding='SAME') + x_b = tf.nn.bias_add(x, b) + x_a = tf.nn.elu(x_b).eval() + write_bin(ndhwc_to_ncdhw(y), os.path.join(data_dir, 'conv3d_tran_04_y.bin')) + write_bin(vrsck_to_kvcrs(w), os.path.join(data_dir, 'conv3d_tran_04_w.bin')) + write_bin(ndhwc_to_ndchw(x_a), os.path.join(data_dir, 'conv3d_tran_04_x.bin')) + write_bin(b, os.path.join(data_dir, 'conv3d_tran_04_b.bin')) + + # Testing DHW padding and strides in 2 subsequent deconvolutions. + # Note: similar to deconv3D_1->deconv3D_2 convo in NVSmall. + np.random.seed(1) + # First convo^T. + y = get_rand([1, 4, 5, 5, 16]) # NDHWK + w1 = get_rand([3, 3, 3, 8, 16]) # VRSCK + s1 = [1, 2, 2, 2, 1] + x1_shape = [1, 8, 9, 9, 8] # NDHWC + # Second convo^T. 
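+    # (Shapes grow through the chain: D 4->8->16 and H/W 5->9->17, i.e.
+    #  in*stride and in*stride-1 respectively for these 'SAME' transposes.)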
+    w2 = get_rand([3, 3, 3, 4, 8]) # VRSCK
+    s2 = [1, 2, 2, 2, 1]
+    x2_shape = [1, 16, 17, 17, 4]  # NDHWC
+    x1 = tf.nn.conv3d_transpose(y, w1, output_shape=x1_shape, strides=s1, padding='SAME')
+    x2 = tf.nn.conv3d_transpose(x1, w2, output_shape=x2_shape, strides=s2, padding='SAME').eval()
+    write_bin(ndhwc_to_ncdhw(y), os.path.join(data_dir, 'conv3d_tran_05_y.bin'))
+    write_bin(vrsck_to_kvcrs(w1), os.path.join(data_dir, 'conv3d_tran_05_w1.bin'))
+    write_bin(vrsck_to_kvcrs(w2), os.path.join(data_dir, 'conv3d_tran_05_w2.bin'))
+    write_bin(ndhwc_to_ndchw(x2), os.path.join(data_dir, 'conv3d_tran_05_x.bin'))
+
+def create_cost_volume_data(data_dir):
+    def cost_volume(left, right, max_disp):
+        height = int(left.shape[1])
+        width = int(left.shape[2])
+        depth = int(left.shape[3])
+
+        right_padded = tf.pad(right, [[0, 0], [0, 0], [max_disp - 1, 0], [0, 0]], "CONSTANT")
+        right_disp = tf.extract_image_patches(right_padded, [1, height, width, 1], [1, 1, 1, 1], [1, 1, 1, 1], padding="VALID")
+        right_disp = tf.squeeze(right_disp, axis=1)
+        disp_dim = int(right_disp.shape[1])
+        right_disp = tf.reshape(right_disp, [-1, disp_dim, height, width, depth])
+        right_disp = tf.reverse(right_disp, [1])
+
+        left_disp = tf.expand_dims(left, axis=1)
+        left_disp = tf.tile(left_disp, [1, disp_dim, 1, 1, 1])
+
+        cost_volume = tf.concat([left_disp, right_disp], axis=4)
+
+        return cost_volume
+
+    print("---")
+    print("Creating data for CostVolume plugin...")
+    print("---")
+
+    # Basic test: 6x6x4 input, max_disp == 2, output is 2x6x6x8.
+    np.random.seed(1)
+    in_shape = [1, 6, 6, 4] # NHWC
+    left = get_rand(in_shape)
+    right = get_rand(in_shape)
+    max_disp = 2
+    cost_vol = cost_volume(left, right, max_disp).eval()
+    write_bin(nhwc_to_nchw(left), os.path.join(data_dir, 'cost_vol_01_l.bin'))
+    write_bin(nhwc_to_nchw(right), os.path.join(data_dir, 'cost_vol_01_r.bin'))
+    write_bin(ndhwc_to_ndchw(cost_vol), os.path.join(data_dir, 'cost_vol_01_cv.bin'))
+
+    # More realistic test: 32x33x8 input, max_disp == 12, output is 12x32x33x16.
+    np.random.seed(1)
+    in_shape = [1, 32, 33, 8] # NHWC
+    #in_shape = [1, 128, 257, 32] # NHWC
+    left = get_rand(in_shape)
+    right = get_rand(in_shape)
+    max_disp = 12
+    #max_disp = 24
+    cost_vol = cost_volume(left, right, max_disp).eval()
+    write_bin(nhwc_to_nchw(left), os.path.join(data_dir, 'cost_vol_02_l.bin'))
+    write_bin(nhwc_to_nchw(right), os.path.join(data_dir, 'cost_vol_02_r.bin'))
+    write_bin(ndhwc_to_ndchw(cost_vol), os.path.join(data_dir, 'cost_vol_02_cv.bin'))
+
+def create_softargmax_data(data_dir):
+    def softargmax(volume):
+        input_depth = int(volume.shape[1])
+        index_steps = tf.constant(np.reshape(np.array(range(input_depth)), (1, input_depth, 1, 1, 1)), dtype=tf.float32)
+        prob_volume = tf.nn.softmax(tf.multiply(volume, -1.0), dim=1)
+        softargmax_result = tf.reduce_sum(tf.multiply(prob_volume, index_steps), axis=1)
+
+        return softargmax_result
+
+    print("---")
+    print("Creating data for Softargmax plugin...")
+    print("---")
+
+    # Basic test: 4x5x7x1 input, output is 5x7x1.
+    np.random.seed(1)
+    x = get_rand([1, 4, 5, 7, 1]) # NDHWC
+    y = softargmax(x).eval()
+    write_bin(ndhwc_to_ndchw(x), os.path.join(data_dir, 'softargmax_01_x.bin'))
+    write_bin(nhwc_to_nchw(y), os.path.join(data_dir, 'softargmax_01_y.bin'))
+
+    # Large test: 2x12x33x65x1 input, output is 2x33x65x1.
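+    # (Softargmax weights each disparity index by softmax(-cost), so the result
+    #  is the expected disparity per pixel and the D axis collapses away.)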
+ np.random.seed(1) + x = get_rand([2, 12, 33, 65, 1]) # NDHWC + y = softargmax(x).eval() + write_bin(ndhwc_to_ndchw(x), os.path.join(data_dir, 'softargmax_02_x.bin')) + write_bin(nhwc_to_nchw(y), os.path.join(data_dir, 'softargmax_02_y.bin')) + +def main(): + config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True) + config.gpu_options.allow_growth = True + sess = tf.InteractiveSession(config=config) + + create_elu_plugin_data(args.data_dir) + create_conv3d_plugin_data(args.data_dir) + create_conv3d_tran_plugin_data(args.data_dir) + create_cost_volume_data(args.data_dir) + create_softargmax_data(args.data_dir) + + print("Done.") + +if __name__ == '__main__': + main() diff --git a/stereoDNN/tests/CMakeLists.txt b/stereoDNN/tests/CMakeLists.txt new file mode 100644 index 0000000..37bd2db --- /dev/null +++ b/stereoDNN/tests/CMakeLists.txt @@ -0,0 +1,33 @@ +cmake_minimum_required(VERSION 3.5) + +set(PROJECT_NAME nvstereo_tests) +project(${PROJECT_NAME}) + +find_package(CUDA 9.0 REQUIRED) +find_package(OpenCV 3.3.1 REQUIRED) + +set(GTEST_ROOT "/usr/src/gtest") +find_package(GTest REQUIRED) + +include_directories(${CUDA_INCLUDE_DIRS}) +include_directories(${GTEST_INCLUDE_DIR}) +include_directories(${CMAKE_SOURCE_DIR}/lib) + +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") + +file(GLOB ${PROJECT_NAME}_sources ./*.cpp) +set(PROJECT_SOURCES ${${PROJECT_NAME}_sources}) + +set(TARGET_NAME ${PROJECT_NAME}${TARGET_SUFFIX}) + +add_executable(${TARGET_NAME} ${PROJECT_SOURCES}) + +target_link_libraries(${TARGET_NAME} + nvstereo_inference${TARGET_SUFFIX} + ${CUDA_LIBRARIES} + cudnn + nvinfer + ${GTEST_LIBRARIES} + pthread + opencv_core) diff --git a/stereoDNN/tests/data/conv3d_01_w.bin b/stereoDNN/tests/data/conv3d_01_w.bin new file mode 100644 index 0000000..ef006ca Binary files /dev/null and b/stereoDNN/tests/data/conv3d_01_w.bin differ diff --git a/stereoDNN/tests/data/conv3d_01_x.bin b/stereoDNN/tests/data/conv3d_01_x.bin new file mode 100644 index 0000000..deb71fe Binary files /dev/null and b/stereoDNN/tests/data/conv3d_01_x.bin differ diff --git a/stereoDNN/tests/data/conv3d_01_y.bin b/stereoDNN/tests/data/conv3d_01_y.bin new file mode 100644 index 0000000..223950d Binary files /dev/null and b/stereoDNN/tests/data/conv3d_01_y.bin differ diff --git a/stereoDNN/tests/data/conv3d_02_w.bin b/stereoDNN/tests/data/conv3d_02_w.bin new file mode 100644 index 0000000..9d5fe0f Binary files /dev/null and b/stereoDNN/tests/data/conv3d_02_w.bin differ diff --git a/stereoDNN/tests/data/conv3d_02_x.bin b/stereoDNN/tests/data/conv3d_02_x.bin new file mode 100644 index 0000000..6ec89cd Binary files /dev/null and b/stereoDNN/tests/data/conv3d_02_x.bin differ diff --git a/stereoDNN/tests/data/conv3d_02_y.bin b/stereoDNN/tests/data/conv3d_02_y.bin new file mode 100644 index 0000000..9629d27 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_02_y.bin differ diff --git a/stereoDNN/tests/data/conv3d_03_w.bin b/stereoDNN/tests/data/conv3d_03_w.bin new file mode 100644 index 0000000..88c9fad Binary files /dev/null and b/stereoDNN/tests/data/conv3d_03_w.bin differ diff --git a/stereoDNN/tests/data/conv3d_03_x.bin b/stereoDNN/tests/data/conv3d_03_x.bin new file mode 100644 index 0000000..95ae1ec Binary files /dev/null and b/stereoDNN/tests/data/conv3d_03_x.bin differ diff --git a/stereoDNN/tests/data/conv3d_03_y.bin b/stereoDNN/tests/data/conv3d_03_y.bin new file mode 100644 index 0000000..61cab77 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_03_y.bin differ 
diff --git a/stereoDNN/tests/data/conv3d_04_w.bin b/stereoDNN/tests/data/conv3d_04_w.bin new file mode 100644 index 0000000..04464ff Binary files /dev/null and b/stereoDNN/tests/data/conv3d_04_w.bin differ diff --git a/stereoDNN/tests/data/conv3d_04_x.bin b/stereoDNN/tests/data/conv3d_04_x.bin new file mode 100644 index 0000000..6b565e0 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_04_x.bin differ diff --git a/stereoDNN/tests/data/conv3d_04_y.bin b/stereoDNN/tests/data/conv3d_04_y.bin new file mode 100644 index 0000000..51b3bc3 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_04_y.bin differ diff --git a/stereoDNN/tests/data/conv3d_05_w.bin b/stereoDNN/tests/data/conv3d_05_w.bin new file mode 100644 index 0000000..04464ff Binary files /dev/null and b/stereoDNN/tests/data/conv3d_05_w.bin differ diff --git a/stereoDNN/tests/data/conv3d_05_x.bin b/stereoDNN/tests/data/conv3d_05_x.bin new file mode 100644 index 0000000..6b565e0 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_05_x.bin differ diff --git a/stereoDNN/tests/data/conv3d_05_y.bin b/stereoDNN/tests/data/conv3d_05_y.bin new file mode 100644 index 0000000..68ae000 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_05_y.bin differ diff --git a/stereoDNN/tests/data/conv3d_06_b.bin b/stereoDNN/tests/data/conv3d_06_b.bin new file mode 100644 index 0000000..9eba91c Binary files /dev/null and b/stereoDNN/tests/data/conv3d_06_b.bin differ diff --git a/stereoDNN/tests/data/conv3d_06_w.bin b/stereoDNN/tests/data/conv3d_06_w.bin new file mode 100644 index 0000000..f281e5f Binary files /dev/null and b/stereoDNN/tests/data/conv3d_06_w.bin differ diff --git a/stereoDNN/tests/data/conv3d_06_x.bin b/stereoDNN/tests/data/conv3d_06_x.bin new file mode 100644 index 0000000..6b565e0 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_06_x.bin differ diff --git a/stereoDNN/tests/data/conv3d_06_y.bin b/stereoDNN/tests/data/conv3d_06_y.bin new file mode 100644 index 0000000..d94cdd0 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_06_y.bin differ diff --git a/stereoDNN/tests/data/conv3d_07_w.bin b/stereoDNN/tests/data/conv3d_07_w.bin new file mode 100644 index 0000000..78711c2 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_07_w.bin differ diff --git a/stereoDNN/tests/data/conv3d_07_x.bin b/stereoDNN/tests/data/conv3d_07_x.bin new file mode 100644 index 0000000..6b565e0 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_07_x.bin differ diff --git a/stereoDNN/tests/data/conv3d_07_y.bin b/stereoDNN/tests/data/conv3d_07_y.bin new file mode 100644 index 0000000..f2b3793 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_07_y.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_01_w.bin b/stereoDNN/tests/data/conv3d_tran_01_w.bin new file mode 100644 index 0000000..53dad4a Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_01_w.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_01_x.bin b/stereoDNN/tests/data/conv3d_tran_01_x.bin new file mode 100644 index 0000000..88f9fd8 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_01_x.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_01_y.bin b/stereoDNN/tests/data/conv3d_tran_01_y.bin new file mode 100644 index 0000000..3100966 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_01_y.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_02_w.bin b/stereoDNN/tests/data/conv3d_tran_02_w.bin new file mode 100644 index 0000000..e7deb6e Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_02_w.bin 
differ diff --git a/stereoDNN/tests/data/conv3d_tran_02_x.bin b/stereoDNN/tests/data/conv3d_tran_02_x.bin new file mode 100644 index 0000000..c92034e Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_02_x.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_02_y.bin b/stereoDNN/tests/data/conv3d_tran_02_y.bin new file mode 100644 index 0000000..a0e96b9 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_02_y.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_03_w.bin b/stereoDNN/tests/data/conv3d_tran_03_w.bin new file mode 100644 index 0000000..0be9855 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_03_w.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_03_x.bin b/stereoDNN/tests/data/conv3d_tran_03_x.bin new file mode 100644 index 0000000..10ec61f Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_03_x.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_03_y.bin b/stereoDNN/tests/data/conv3d_tran_03_y.bin new file mode 100644 index 0000000..cf0bea3 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_03_y.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_04_b.bin b/stereoDNN/tests/data/conv3d_tran_04_b.bin new file mode 100644 index 0000000..bd752ea Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_04_b.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_04_w.bin b/stereoDNN/tests/data/conv3d_tran_04_w.bin new file mode 100644 index 0000000..0be9855 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_04_w.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_04_x.bin b/stereoDNN/tests/data/conv3d_tran_04_x.bin new file mode 100644 index 0000000..2dfad65 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_04_x.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_04_y.bin b/stereoDNN/tests/data/conv3d_tran_04_y.bin new file mode 100644 index 0000000..cf0bea3 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_04_y.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_05_w1.bin b/stereoDNN/tests/data/conv3d_tran_05_w1.bin new file mode 100644 index 0000000..c6e49dd Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_05_w1.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_05_w2.bin b/stereoDNN/tests/data/conv3d_tran_05_w2.bin new file mode 100644 index 0000000..84cd269 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_05_w2.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_05_x.bin b/stereoDNN/tests/data/conv3d_tran_05_x.bin new file mode 100644 index 0000000..ef1f2d0 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_05_x.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_05_y.bin b/stereoDNN/tests/data/conv3d_tran_05_y.bin new file mode 100644 index 0000000..ea06255 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_05_y.bin differ diff --git a/stereoDNN/tests/data/cost_vol_01_cv.bin b/stereoDNN/tests/data/cost_vol_01_cv.bin new file mode 100644 index 0000000..a1543e5 Binary files /dev/null and b/stereoDNN/tests/data/cost_vol_01_cv.bin differ diff --git a/stereoDNN/tests/data/cost_vol_01_l.bin b/stereoDNN/tests/data/cost_vol_01_l.bin new file mode 100644 index 0000000..f20a15b Binary files /dev/null and b/stereoDNN/tests/data/cost_vol_01_l.bin differ diff --git a/stereoDNN/tests/data/cost_vol_01_r.bin b/stereoDNN/tests/data/cost_vol_01_r.bin new file mode 100644 index 0000000..421fadf Binary files /dev/null and b/stereoDNN/tests/data/cost_vol_01_r.bin differ diff --git 
a/stereoDNN/tests/data/cost_vol_02_cv.bin b/stereoDNN/tests/data/cost_vol_02_cv.bin new file mode 100644 index 0000000..dd4cfe8 Binary files /dev/null and b/stereoDNN/tests/data/cost_vol_02_cv.bin differ diff --git a/stereoDNN/tests/data/cost_vol_02_l.bin b/stereoDNN/tests/data/cost_vol_02_l.bin new file mode 100644 index 0000000..b21dca5 Binary files /dev/null and b/stereoDNN/tests/data/cost_vol_02_l.bin differ diff --git a/stereoDNN/tests/data/cost_vol_02_r.bin b/stereoDNN/tests/data/cost_vol_02_r.bin new file mode 100644 index 0000000..f194f58 Binary files /dev/null and b/stereoDNN/tests/data/cost_vol_02_r.bin differ diff --git a/stereoDNN/tests/data/elu_i_01.bin b/stereoDNN/tests/data/elu_i_01.bin new file mode 100644 index 0000000..b933285 Binary files /dev/null and b/stereoDNN/tests/data/elu_i_01.bin differ diff --git a/stereoDNN/tests/data/elu_i_02.bin b/stereoDNN/tests/data/elu_i_02.bin new file mode 100644 index 0000000..53e745c Binary files /dev/null and b/stereoDNN/tests/data/elu_i_02.bin differ diff --git a/stereoDNN/tests/data/elu_o_01.bin b/stereoDNN/tests/data/elu_o_01.bin new file mode 100644 index 0000000..58c4fe7 Binary files /dev/null and b/stereoDNN/tests/data/elu_o_01.bin differ diff --git a/stereoDNN/tests/data/elu_o_02.bin b/stereoDNN/tests/data/elu_o_02.bin new file mode 100644 index 0000000..f034c43 Binary files /dev/null and b/stereoDNN/tests/data/elu_o_02.bin differ diff --git a/stereoDNN/tests/data/softargmax_01_x.bin b/stereoDNN/tests/data/softargmax_01_x.bin new file mode 100644 index 0000000..b62ba78 Binary files /dev/null and b/stereoDNN/tests/data/softargmax_01_x.bin differ diff --git a/stereoDNN/tests/data/softargmax_01_y.bin b/stereoDNN/tests/data/softargmax_01_y.bin new file mode 100644 index 0000000..8914737 Binary files /dev/null and b/stereoDNN/tests/data/softargmax_01_y.bin differ diff --git a/stereoDNN/tests/data/softargmax_02_x.bin b/stereoDNN/tests/data/softargmax_02_x.bin new file mode 100644 index 0000000..1523e7c Binary files /dev/null and b/stereoDNN/tests/data/softargmax_02_x.bin differ diff --git a/stereoDNN/tests/data/softargmax_02_y.bin b/stereoDNN/tests/data/softargmax_02_y.bin new file mode 100644 index 0000000..2d75a09 Binary files /dev/null and b/stereoDNN/tests/data/softargmax_02_y.bin differ diff --git a/stereoDNN/tests/tests_main.cpp b/stereoDNN/tests/tests_main.cpp new file mode 100644 index 0000000..4a54fcf --- /dev/null +++ b/stereoDNN/tests/tests_main.cpp @@ -0,0 +1,988 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. + +#include <chrono> +#include <fstream> +#include <iostream> +#include <memory> +#include <string> +#include <vector> + +#include <gtest/gtest.h> +#include <opencv2/opencv.hpp> + +#include "redtail_tensorrt_plugins.h" +#include "internal_utils.h" + +using namespace nvinfer1; +using namespace redtail::tensorrt; + +using FloatVec = std::vector<float>; + +// ----------------------------------------------------------------- +// TensorRT logger.
+// ----------------------------------------------------------------- +class Logger : public nvinfer1::ILogger +{ +public: + Logger(ILogger::Severity max_log_level) : + max_log_level_(max_log_level) + { + } + + void log(nvinfer1::ILogger::Severity severity, const char* msg) override + { + if (severity > max_log_level_) + return; + + switch (severity) + { + case Severity::kINTERNAL_ERROR: std::cerr << "TRT INTERNAL_ERROR: "; break; + case Severity::kERROR: std::cerr << "TRT ERROR: "; break; + case Severity::kWARNING: std::cerr << "TRT WARNING: "; break; + case Severity::kINFO: std::cerr << "TRT INFO: "; break; + default: std::cerr << "TRT UNKNOWN: "; break; + } + std::cerr << msg << std::endl; + } + +private: + Severity max_log_level_; +}; + +static std::unique_ptr<Logger> g_logger; + +std::string g_data_dir(""); + +// ----------------------------------------------------------------- +// Helper struct to store network input information. +// ----------------------------------------------------------------- +struct NetInput +{ + std::string name; + Dims dims; + FloatVec data; +}; + +// ----------------------------------------------------------------- +// Main test driver function. Creates all required TensorRT components +// to run a TRT plugin. The plugin is created by the provided factory. +// Returns the output of the plugin. +// ----------------------------------------------------------------- +template<typename FactoryOp, typename PostProcOp> +FloatVec runPlugin(int batch_size, const std::vector<NetInput>& inputs, Dims out_dims, + FactoryOp factory_op, PostProcOp post_proc_op, + IPluginContainer& factory, DataType data_type = DataType::kFLOAT) +{ + // REVIEW alexeyk: assuming single output for now. + const char* output_name = "output"; + + IBuilder* builder = createInferBuilder(*g_logger); + // Note: must use EXPECT_* as ASSERT_ contains return statement. + EXPECT_NE(builder, nullptr); + + INetworkDefinition* network = builder->createNetwork(); + EXPECT_NE(network, nullptr); + + // Add inputs. + std::vector<ITensor*> plugin_inputs(inputs.size()); + for (size_t i = 0; i < inputs.size(); i++) + { + // Must have at least rank 2 and same batch size. + EXPECT_GT(inputs[i].dims.nbDims, 1); + EXPECT_EQ(batch_size, inputs[i].dims.d[0]); + // Get input dims without batch index dim. + Dims in_dims; + in_dims.nbDims = inputs[i].dims.nbDims - 1; + std::copy(inputs[i].dims.d + 1, inputs[i].dims.d + inputs[i].dims.nbDims, in_dims.d); + // addInput currently supports only 1D-3D input. + if (in_dims.nbDims <= 3) + { + auto input = network->addInput(inputs[i].name.c_str(), data_type, + DimsCHW(in_dims.d[0], in_dims.d[1], in_dims.d[2])); + EXPECT_NE(input, nullptr); + plugin_inputs[i] = input; + } + else if (in_dims.nbDims == 4) + { + // Create input with flattened dims. + EXPECT_EQ(DimsUtils::getTensorSize(in_dims), (int)DimsUtils::getTensorSize(in_dims)); + DimsCHW flat_dims{1, 1, (int)DimsUtils::getTensorSize(in_dims)}; + auto input = network->addInput(inputs[i].name.c_str(), data_type, flat_dims); + EXPECT_NE(input, nullptr); + // Add reshape layer to restore original dims. + auto shuf = network->addShuffle(*input); + EXPECT_NE(shuf, nullptr); + shuf->setReshapeDimensions(DimsNCHW(in_dims.d[0], in_dims.d[1], in_dims.d[2], in_dims.d[3])); + plugin_inputs[i] = shuf->getOutput(0); + } + else + assert(false); // TRT does not support input tensors with rank > 4. + } + + // Create plugin. The factory method can create additional layers/plugins. + IPlugin* plugin = factory_op(); + EXPECT_NE(plugin, nullptr); + + // Add the plugin to the network.
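+    // (addPlugin takes the array of input tensors, the number of inputs and the
+    // plugin instance; the layer it returns exposes the plugin's outputs, which
+    // is a single output tensor in all of the tests below.)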
+ auto plugin_layer = network->addPlugin(&plugin_inputs[0], inputs.size(), *plugin); + EXPECT_NE(plugin_layer, nullptr); + + ILayer* out_layer = post_proc_op(network, plugin_layer, factory); + + // Setup network output. + auto out_layer_out = out_layer->getOutput(0); + EXPECT_NE(out_layer_out, nullptr); + out_layer_out->setName(output_name); + network->markOutput(*out_layer_out); + + // Build the engine. + builder->setMaxBatchSize(batch_size); + // "ought to be enough for anybody." + builder->setMaxWorkspaceSize(1024 * 1024 * 1024); + + builder->setHalf2Mode(data_type == DataType::kHALF); + + auto engine = builder->buildCudaEngine(*network); + EXPECT_NE(engine, nullptr); + // Network and builder can be destroyed right after network is built. + // This follows the behavior in real (non-test) code. + builder->destroy(); + network->destroy(); + + // Setup input and output buffers. + std::vector<void*> buffers(inputs.size() + 1); + EXPECT_EQ(engine->getNbBindings(), buffers.size()); + + size_t elt_size_bytes = data_type == DataType::kHALF ? 2 : 4; + + for (size_t i = 0; i < inputs.size(); i++) + { + // Expecting binding indices to match inputs. + EXPECT_EQ(engine->getBindingIndex(inputs[i].name.c_str()), i); + // Allocate and copy. + CHECKL(cudaMalloc(&buffers[i], inputs[i].data.size() * elt_size_bytes), *g_logger); + // Do FP32 -> FP16 of input if necessary. + if (data_type == DataType::kFLOAT) + CHECKL(cudaMemcpy(buffers[i], inputs[i].data.data(), inputs[i].data.size() * elt_size_bytes, cudaMemcpyHostToDevice), *g_logger); + else + { + cv::Mat dst; + cv::convertFp16(cv::Mat(inputs[i].data), dst); + CHECKL(cudaMemcpy(buffers[i], dst.data, inputs[i].data.size() * elt_size_bytes, cudaMemcpyHostToDevice), *g_logger); + } + } + + int out_idx = engine->getBindingIndex(output_name); + size_t out_size = DimsUtils::getTensorSize(out_dims); + EXPECT_EQ(out_idx, buffers.size() - 1); + CHECKL(cudaMalloc(&buffers[out_idx], out_size * elt_size_bytes), *g_logger); + + // Create the context. + IExecutionContext *context = engine->createExecutionContext(); + EXPECT_NE(context, nullptr); + + // Run (finally). + auto host_start = std::chrono::high_resolution_clock::now(); + auto res = context->execute(batch_size, buffers.data()); + auto host_end = std::chrono::high_resolution_clock::now(); + EXPECT_TRUE(res); + auto host_elapsed_ms = std::chrono::duration<float, std::milli>(host_end - host_start).count(); + auto msg = std::string("Host execution time: ") + std::to_string(host_elapsed_ms); + g_logger->log(ILogger::Severity::kINFO, msg.c_str()); + + // Copy results back to host. + FloatVec out_h(out_size); + // Do FP16 -> FP32 of output if necessary. + if (data_type == DataType::kFLOAT) + { + auto out_h_p = const_cast<float*>(out_h.data()); + CHECKL(cudaMemcpy(out_h_p, buffers[out_idx], out_h.size() * sizeof(float), cudaMemcpyDeviceToHost), *g_logger); + } + else if (data_type == DataType::kHALF) + { + std::vector<uint16_t> out_h_16(out_size); + auto out_h_p = const_cast<uint16_t*>(out_h_16.data()); + CHECKL(cudaMemcpy(out_h_p, buffers[out_idx], out_h.size() * elt_size_bytes, cudaMemcpyDeviceToHost), *g_logger); + cv::Mat dst; + cv::convertFp16(cv::Mat(out_size, 1, CV_16S, out_h_p), dst); + std::copy((float*)dst.data, (float*)dst.data + out_size, out_h.begin()); + } + else + ADD_FAILURE(); + // Clean up.
+ for (size_t i = 0; i < buffers.size(); i++) + CHECKL(cudaFree(buffers[i]), *g_logger); + context->destroy(); + engine->destroy(); + + return out_h; +} + +// ----------------------------------------------------------------- +// Reads the binary file produced by the Python test_data_generator.py script. +// The format of the file: +// - number of dimensions (int32) +// - dimensions (int32[]) +// - data (float32[]) +// For example, a 2x3 tensor is stored as int32 values 2, 2, 3 followed by +// 6 float32 values. +// ----------------------------------------------------------------- +FloatVec readBinaryFile(const std::string& filename, Dims& dims) +{ + std::ifstream file(filename, std::ios::binary); + EXPECT_TRUE(file.is_open()); + + int32_t dim_count; + file.read(reinterpret_cast<char*>(&dim_count), sizeof(int32_t)); + EXPECT_GE(dim_count, 1); + EXPECT_LE(dim_count, sizeof(dims.d) / sizeof(dims.d[0])); + + dims.nbDims = dim_count; + file.read(reinterpret_cast<char*>(dims.d), dim_count * sizeof(int32_t)); + + FloatVec res(DimsUtils::getTensorSize(dims)); + file.read(reinterpret_cast<char*>(&res[0]), res.size() * sizeof(float)); + return res; +} + +// ----------------------------------------------------------------- +// ELU plugin tests. +// ----------------------------------------------------------------- +TEST(EluPluginTests, Basic) +{ + Dims in_dims; + Dims out_dims; + FloatVec in = readBinaryFile(g_data_dir + "elu_i_01.bin", in_dims); + FloatVec out = readBinaryFile(g_data_dir + "elu_o_01.bin", out_dims); + ASSERT_EQ(in_dims.nbDims, out_dims.nbDims); + ASSERT_EQ(in_dims.nbDims, 4); + ASSERT_EQ(in_dims.d[0], 1); + + auto factory = IPluginContainer::create(*g_logger); + auto actual = runPlugin(1, {{"input", in_dims, in}}, out_dims, + [&] { return factory->createEluPlugin(DataType::kFLOAT, "ELU"); }, + [] (INetworkDefinition*, ILayer* plugin, IPluginContainer&) { return plugin; }, + *factory); + + ASSERT_EQ(out.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_FLOAT_EQ(out[i], actual[i]) << "Vectors 'actual' and 'out' differ at index " << i; +} + +TEST(EluPluginTests, Input4DBatchSize2) +{ + Dims in_dims; + Dims out_dims; + FloatVec in = readBinaryFile(g_data_dir + "elu_i_02.bin", in_dims); + FloatVec out = readBinaryFile(g_data_dir + "elu_o_02.bin", out_dims); + ASSERT_EQ(in_dims.nbDims, out_dims.nbDims); + ASSERT_EQ(in_dims.nbDims, 5); + ASSERT_EQ(in_dims.d[0], 2); + + auto factory = IPluginContainer::create(*g_logger); + auto actual = runPlugin(2, {{"input", in_dims, in}}, out_dims, + [&] { return factory->createEluPlugin(DataType::kFLOAT, "ELU"); }, + [] (INetworkDefinition*, ILayer* plugin, IPluginContainer&) { return plugin; }, + *factory); + + ASSERT_EQ(out.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_FLOAT_EQ(out[i], actual[i]) << "Vectors 'actual' and 'out' differ at index " << i; +} + +// ----------------------------------------------------------------- +// Conv3DPlugin plugin tests. +// REVIEW alexeyk: consider converting all these tests to a data-driven test. +// ----------------------------------------------------------------- + +// Post processing for Conv3D layer.
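+// (A note on the transform: the reference tensors are generated by TensorFlow,
+// which orders the two leading non-batch axes differently from the plugin's
+// native output, so the {1, 0, 2, 3} permutation below swaps those two axes.
+// That is, element (n, a, b, h, w) of the plugin output is read back as
+// element (n, b, a, h, w) before being compared against the .bin files.)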
+ILayer* addConv3DPostProcessing(INetworkDefinition* network, ILayer* plugin, IPluginContainer& factory) +{ + auto transform = factory.createTransformPlugin({1, 0, 2, 3}, "Conv3DTransform"); + EXPECT_NE(transform, nullptr); + auto transform_in = plugin->getOutput(0); + auto transform_layer = network->addPlugin(&transform_in, 1, *transform); + EXPECT_NE(transform_layer, nullptr); + return transform_layer; +} + +TEST(Conv3DPluginTests, Basic) +{ + Dims x_dims; + Dims w_dims; + Dims y_dims; + FloatVec x = readBinaryFile(g_data_dir + "conv3d_01_x.bin", x_dims); + FloatVec w = readBinaryFile(g_data_dir + "conv3d_01_w.bin", w_dims); + FloatVec y = readBinaryFile(g_data_dir + "conv3d_01_y.bin", y_dims); + ASSERT_EQ(x_dims.nbDims, 5); + ASSERT_EQ(w_dims.nbDims, 5); + ASSERT_EQ(y_dims.nbDims, 5); + + auto factory = IPluginContainer::create(*g_logger); + auto actual = runPlugin(1, {{"x", x_dims, x}}, y_dims, + [&] + { + return factory->createConv3DPlugin(Conv3DType::kTensorFlow, + w_dims, {3, {1, 1, 1}}, + {3, {0, 0, 0}}, {3, {0, 0, 0}}, + Weights{DataType::kFLOAT, w.data(), (int64_t)w.size() }, + Weights{DataType::kFLOAT, nullptr, 0 }, + "Conv3D"); + }, + addConv3DPostProcessing, + *factory); + + ASSERT_EQ(y.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_FLOAT_EQ(y[i], actual[i]) << "Vectors 'actual' and 'y' differ at index " << i; +} + +TEST(Conv3DPluginTests, HWStridesAndPadWithMultiK) +{ + Dims x_dims; + Dims w_dims; + Dims y_dims; + FloatVec x = readBinaryFile(g_data_dir + "conv3d_02_x.bin", x_dims); + FloatVec w = readBinaryFile(g_data_dir + "conv3d_02_w.bin", w_dims); + FloatVec y = readBinaryFile(g_data_dir + "conv3d_02_y.bin", y_dims); + ASSERT_EQ(x_dims.nbDims, 5); + ASSERT_EQ(w_dims.nbDims, 5); + ASSERT_EQ(y_dims.nbDims, 5); + + auto factory = IPluginContainer::create(*g_logger); + auto actual = runPlugin(1, {{"x", x_dims, x}}, y_dims, + [&] + { + return factory->createConv3DPlugin(Conv3DType::kTensorFlow, + w_dims, {3, {1, 2, 2}}, + {3, {0, 1, 1}}, {3, {0, 1, 1}}, + Weights{DataType::kFLOAT, w.data(), (int64_t)w.size() }, + Weights{DataType::kFLOAT, nullptr, 0 }, + "Conv3D"); + }, + addConv3DPostProcessing, + *factory); + + ASSERT_EQ(y.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_NEAR(y[i], actual[i], 0.00001) << "Vectors 'actual' and 'y' differ at index " << i; +} + +TEST(Conv3DPluginTests, DHWStridesAndPadWithMultiK) +{ + Dims x_dims; + Dims w_dims; + Dims y_dims; + FloatVec x = readBinaryFile(g_data_dir + "conv3d_03_x.bin", x_dims); + FloatVec w = readBinaryFile(g_data_dir + "conv3d_03_w.bin", w_dims); + FloatVec y = readBinaryFile(g_data_dir + "conv3d_03_y.bin", y_dims); + ASSERT_EQ(x_dims.nbDims, 5); + ASSERT_EQ(w_dims.nbDims, 5); + ASSERT_EQ(y_dims.nbDims, 5); + + auto factory = IPluginContainer::create(*g_logger); + // In this test we have to manually pad input in D dimension. 
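+    // (The plugin accepts only a symmetric padding spec, while TensorFlow's
+    // 'SAME' padding can be asymmetric, so the missing slab is appended by hand.
+    // Using the usual convolution shape rule,
+    //   D_out = (D_in + pad_start + pad_end - K_d) / S_d + 1,
+    // one extra zero slab in D makes the plugin's effective padding match what
+    // TensorFlow used when producing the reference output.)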
+ x_dims.d[1] += 1; + FloatVec zero_pad(x_dims.d[2] * x_dims.d[3] * x_dims.d[4]); + x.insert(x.end(), zero_pad.cbegin(), zero_pad.cend()); + auto actual = runPlugin(1, {{"x", x_dims, x}}, y_dims, + [&] + { + return factory->createConv3DPlugin(Conv3DType::kTensorFlow, + w_dims, {3, {1, 2, 2}}, + {3, {0, 1, 1}}, {3, {0, 1, 1}}, + Weights{DataType::kFLOAT, w.data(), (int64_t)w.size() }, + Weights{DataType::kFLOAT, nullptr, 0 }, + "Conv3D"); + }, + addConv3DPostProcessing, + *factory); + + ASSERT_EQ(y.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_NEAR(y[i], actual[i], 0.00001) << "Vectors 'actual' and 'y' differ at index " << i; +} + +TEST(Conv3DPluginTests, UnitStridesAndPadSymDWithMultiK) +{ + Dims x_dims; + Dims w_dims; + Dims y_dims; + FloatVec x = readBinaryFile(g_data_dir + "conv3d_04_x.bin", x_dims); + FloatVec w = readBinaryFile(g_data_dir + "conv3d_04_w.bin", w_dims); + FloatVec y = readBinaryFile(g_data_dir + "conv3d_04_y.bin", y_dims); + ASSERT_EQ(x_dims.nbDims, 5); + ASSERT_EQ(w_dims.nbDims, 5); + ASSERT_EQ(y_dims.nbDims, 5); + + auto factory = IPluginContainer::create(*g_logger); + auto actual = runPlugin(1, {{"x", x_dims, x}}, y_dims, + [&] + { + return factory->createConv3DPlugin(Conv3DType::kTensorFlow, + w_dims, {3, {1, 1, 1}}, + {3, {1, 1, 1}}, {3, {1, 1, 1}}, + Weights{DataType::kFLOAT, w.data(), (int64_t)w.size() }, + Weights{DataType::kFLOAT, nullptr, 0 }, + "Conv3D"); + }, + addConv3DPostProcessing, + *factory); + + ASSERT_EQ(y.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_NEAR(y[i], actual[i], 0.0001) << "Vectors 'actual' and 'y' differ at index " << i; +} + +TEST(Conv3DPluginTests, DHWStridesAndPadAsymDWithMultiK) +{ + Dims x_dims; + Dims w_dims; + Dims y_dims; + FloatVec x = readBinaryFile(g_data_dir + "conv3d_05_x.bin", x_dims); + FloatVec w = readBinaryFile(g_data_dir + "conv3d_05_w.bin", w_dims); + FloatVec y = readBinaryFile(g_data_dir + "conv3d_05_y.bin", y_dims); + ASSERT_EQ(x_dims.nbDims, 5); + ASSERT_EQ(w_dims.nbDims, 5); + ASSERT_EQ(y_dims.nbDims, 5); + + auto factory = IPluginContainer::create(*g_logger); + // In this test we have to manually pad input in D dimension. 
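+    // (Same situation as in DHWStridesAndPadWithMultiK above: one zero slab is
+    // appended in D to emulate TensorFlow's asymmetric 'SAME' padding.)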
+ x_dims.d[1] += 1; + FloatVec zero_pad(x_dims.d[2] * x_dims.d[3] * x_dims.d[4]); + x.insert(x.end(), zero_pad.cbegin(), zero_pad.cend()); + auto actual = runPlugin(1, {{"x", x_dims, x}}, y_dims, + [&] + { + return factory->createConv3DPlugin(Conv3DType::kTensorFlow, + w_dims, {3, {2, 2, 2}}, + {3, {0, 1, 1}}, {3, {1, 1, 1}}, + Weights{DataType::kFLOAT, w.data(), (int64_t)w.size() }, + Weights{DataType::kFLOAT, nullptr, 0 }, + "Conv3D"); + }, + addConv3DPostProcessing, + *factory); + + ASSERT_EQ(y.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_NEAR(y[i], actual[i], 0.0001) << "Vectors 'actual' and 'y' differ at index " << i; +} + +TEST(Conv3DPluginTests, DHWStridesAndPadAsymDWithMultiKWithBiasAndElu) +{ + Dims x_dims; + Dims w_dims; + Dims b_dims; + Dims y_dims; + FloatVec x = readBinaryFile(g_data_dir + "conv3d_06_x.bin", x_dims); + FloatVec w = readBinaryFile(g_data_dir + "conv3d_06_w.bin", w_dims); + FloatVec b = readBinaryFile(g_data_dir + "conv3d_06_b.bin", b_dims); + FloatVec y = readBinaryFile(g_data_dir + "conv3d_06_y.bin", y_dims); + ASSERT_EQ(x_dims.nbDims, 5); + ASSERT_EQ(w_dims.nbDims, 5); + ASSERT_EQ(b_dims.nbDims, 1); + ASSERT_EQ(y_dims.nbDims, 5); + + auto data_type = DataType::kFLOAT; + auto factory = IPluginContainer::create(*g_logger); + // In this test we have to manually pad input in D dimension. + x_dims.d[1] += 1; + FloatVec zero_pad(x_dims.d[2] * x_dims.d[3] * x_dims.d[4]); + x.insert(x.end(), zero_pad.cbegin(), zero_pad.cend()); + auto actual = runPlugin(1, {{"x", x_dims, x}}, y_dims, + [&] + { + return factory->createConv3DPlugin(Conv3DType::kTensorFlow, + w_dims, {3, {2, 2, 2}}, + {3, {0, 1, 1}}, {3, {1, 1, 1}}, + Weights{DataType::kFLOAT, w.data(), (int64_t)w.size() }, + Weights{DataType::kFLOAT, b.data(), (int64_t)b.size() }, + "Conv3D"); + }, + [&] (INetworkDefinition* network, ILayer* plugin, IPluginContainer& f) + { + auto transform = addConv3DPostProcessing(network, plugin, f); + // Add ELU. 
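+    // (ELU is applied elementwise by the plugin: y = x for x > 0 and
+    // y = alpha * (exp(x) - 1) otherwise; the reference data presumably uses
+    // TensorFlow's default alpha = 1.)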
+ auto elu_plugin = f.createEluPlugin(data_type, "ELU"); + EXPECT_NE(elu_plugin, nullptr); + auto elu_plugin_in = transform->getOutput(0); + auto plugin_layer = network->addPlugin(&elu_plugin_in, 1, *elu_plugin); + EXPECT_NE(plugin_layer, nullptr); + return plugin_layer; + }, + *factory); + + ASSERT_EQ(y.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_NEAR(y[i], actual[i], 0.0001) << "Vectors 'actual' and 'y' differ at index " << i; +} + +TEST(Conv3DPluginTests, Multiple) +{ + Dims x_dims; + Dims w_dims; + Dims y_dims; + FloatVec x = readBinaryFile(g_data_dir + "conv3d_07_x.bin", x_dims); + FloatVec w = readBinaryFile(g_data_dir + "conv3d_07_w.bin", w_dims); + FloatVec y = readBinaryFile(g_data_dir + "conv3d_07_y.bin", y_dims); + ASSERT_EQ(x_dims.nbDims, 5); + ASSERT_EQ(w_dims.nbDims, 5); + ASSERT_EQ(y_dims.nbDims, 5); + + auto factory = IPluginContainer::create(*g_logger); + auto actual = runPlugin(1, {{"x", x_dims, x}}, y_dims, + [&] + { + return factory->createConv3DPlugin(Conv3DType::kTensorFlow, + w_dims, {3, {1, 1, 1}}, + {3, {1, 1, 1}}, {3, {1, 1, 1}}, + Weights{DataType::kFLOAT, w.data(), (int64_t)w.size() }, + Weights{DataType::kFLOAT, nullptr, 0 }, + "Conv3D_1"); + }, + [&] (INetworkDefinition* network, ILayer* plugin, IPluginContainer& f) + { + auto transform_1 = addConv3DPostProcessing(network, plugin, f); + auto pad_plugin = f.createPaddingPlugin({0, 0, 0, 0}, {1, 0, 0, 0}, "Pad_1"); + EXPECT_NE(pad_plugin, nullptr); + auto pad_plugin_in = transform_1->getOutput(0); + auto pad_plugin_layer = network->addPlugin(&pad_plugin_in, 1, *pad_plugin); + EXPECT_NE(pad_plugin_layer, nullptr); + // Add second Conv3D. + auto conv_plugin_2 = f.createConv3DPlugin(Conv3DType::kTensorFlow, + w_dims, DimsCHW{2, 2, 2}, + DimsCHW{0, 1, 1}, DimsCHW{0, 1, 1}, + Weights{DataType::kFLOAT, w.data(), (int64_t)w.size() }, + Weights{DataType::kFLOAT, nullptr, 0 }, + "Conv3D_2"); + EXPECT_NE(conv_plugin_2, nullptr); + auto conv_plugin_2_in = pad_plugin_layer->getOutput(0); + auto plugin_layer_2 = network->addPlugin(&conv_plugin_2_in, 1, *conv_plugin_2); + EXPECT_NE(plugin_layer_2, nullptr); + // Add transform for the second convo. + auto transform_2 = addConv3DPostProcessing(network, plugin_layer_2, f); + return transform_2; + }, + *factory); + + ASSERT_EQ(y.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_NEAR(y[i], actual[i], 0.0001) << "Vectors 'actual' and 'y' differ at index " << i; +} + +// ----------------------------------------------------------------- +// Conv3DTransposePlugin plugin tests. +// ----------------------------------------------------------------- + +// Post processing for Conv3DTranspose layer. 
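+// (For reference, these tests exercise the standard transposed-convolution
+// shape rule, D_out = S_d * (D_in - 1) + K_d - pad_start - pad_end, and
+// likewise for H and W; the expected output dims are passed to the plugin
+// explicitly and any extra padded slab is sliced away afterwards.)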
+ILayer* addConv3DTranPostProc(INetworkDefinition* network, ILayer* plugin, IPluginContainer& factory) +{ + auto transform = factory.createTransformPlugin({1, 0, 2, 3}, "Conv3DTransposeTransform"); + EXPECT_NE(transform, nullptr); + auto transform_in = plugin->getOutput(0); + auto transform_layer = network->addPlugin(&transform_in, 1, *transform); + EXPECT_NE(transform_layer, nullptr); + return transform_layer; +} + +ILayer* addConv3DTranSliceLayer(Dims dims, INetworkDefinition* network, ILayer* src_layer, IPluginContainer& factory) +{ + auto slice_plugin = factory.createSlicePlugin(dims, + {4, {0, 0, 0, 0}}, + {4, {dims.d[0] - 1, dims.d[1], dims.d[2], dims.d[3]}}, + "Slice"); + EXPECT_NE(slice_plugin, nullptr); + auto slice_plugin_in = src_layer->getOutput(0); + auto slice_plugin_layer = network->addPlugin(&slice_plugin_in, 1, *slice_plugin); + EXPECT_NE(slice_plugin_layer, nullptr); + return slice_plugin_layer; +} + +TEST(Conv3DTransposePluginTests, Basic) +{ + Dims y_dims; + Dims w_dims; + Dims x_dims; + FloatVec y = readBinaryFile(g_data_dir + "conv3d_tran_01_y.bin", y_dims); + FloatVec w = readBinaryFile(g_data_dir + "conv3d_tran_01_w.bin", w_dims); + FloatVec x = readBinaryFile(g_data_dir + "conv3d_tran_01_x.bin", x_dims); + ASSERT_EQ(y_dims.nbDims, 5); + ASSERT_EQ(w_dims.nbDims, 5); + ASSERT_EQ(x_dims.nbDims, 5); + + auto factory = IPluginContainer::create(*g_logger); + auto actual = runPlugin(1, {{"y", y_dims, y}}, x_dims, + [&] + { + return factory->createConv3DTransposePlugin( + Conv3DType::kTensorFlow, + w_dims, {4, {x_dims.d[1], x_dims.d[2], x_dims.d[3], x_dims.d[4]}}, + {3, {1, 1, 1}}, {3, {0, 0, 0}}, {3, {0, 0, 0}}, + Weights{DataType::kFLOAT, w.data(), (int64_t)w.size() }, + Weights{DataType::kFLOAT, nullptr, 0 }, + "Conv3DTranspose"); + }, + addConv3DTranPostProc, + *factory); + + ASSERT_EQ(x.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_FLOAT_EQ(x[i], actual[i]) << "Vectors 'x' and 'actual' differ at index " << i; +} + +TEST(Conv3DTransposePluginTests, HWStridesAndPadWithMultiK) +{ + Dims y_dims; + Dims w_dims; + Dims x_dims; + FloatVec y = readBinaryFile(g_data_dir + "conv3d_tran_02_y.bin", y_dims); + FloatVec w = readBinaryFile(g_data_dir + "conv3d_tran_02_w.bin", w_dims); + FloatVec x = readBinaryFile(g_data_dir + "conv3d_tran_02_x.bin", x_dims); + ASSERT_EQ(y_dims.nbDims, 5); + ASSERT_EQ(w_dims.nbDims, 5); + ASSERT_EQ(x_dims.nbDims, 5); + + auto factory = IPluginContainer::create(*g_logger); + auto actual = runPlugin(1, {{"y",y_dims, y}}, x_dims, + [&] + { + return factory->createConv3DTransposePlugin( + Conv3DType::kTensorFlow, + w_dims, {4, {x_dims.d[1], x_dims.d[2], x_dims.d[3], x_dims.d[4]}}, + {3, {1, 2, 2}}, {3, {0, 1, 1}}, {3, {0, 1, 1}}, + Weights{DataType::kFLOAT, w.data(), (int64_t)w.size() }, + Weights{DataType::kFLOAT, nullptr, 0 }, + "Conv3DTranspose"); + }, + addConv3DTranPostProc, + *factory); + + ASSERT_EQ(x.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_NEAR(x[i], actual[i], 0.0001) << "Vectors 'x' and 'actual' differ at index " << i; +} + +TEST(Conv3DTransposePluginTests, DHWStridesAndPadAsymDWithMultiK) +{ + Dims y_dims; + Dims w_dims; + Dims x_dims; + FloatVec y = readBinaryFile(g_data_dir + "conv3d_tran_03_y.bin", y_dims); + FloatVec w = readBinaryFile(g_data_dir + "conv3d_tran_03_w.bin", w_dims); + FloatVec x = readBinaryFile(g_data_dir + "conv3d_tran_03_x.bin", x_dims); + ASSERT_EQ(y_dims.nbDims, 5); + ASSERT_EQ(w_dims.nbDims, 5); + ASSERT_EQ(x_dims.nbDims, 5); + + auto factory = 
IPluginContainer::create(*g_logger); + // In this test we have to manually pad output by 1 in D dimension. + auto out_dims = DimsNCHW(x_dims.d[1], x_dims.d[2], x_dims.d[3], x_dims.d[4]); + out_dims.d[0] += 1; + auto actual = runPlugin(1, {{"y", y_dims, y}}, x_dims, + [&] + { + return factory->createConv3DTransposePlugin( + Conv3DType::kTensorFlow, + w_dims, out_dims, + {3, {2, 2, 2}}, {3, {0, 1, 1}}, {3, {0, 1, 1}}, + Weights{DataType::kFLOAT, w.data(), (int64_t)w.size() }, + Weights{DataType::kFLOAT, nullptr, 0 }, + "Conv3DTranspose"); + }, + [&] (INetworkDefinition* network, ILayer* plugin, IPluginContainer& f) + { + auto slice_plugin = f.createSlicePlugin(out_dims, + {4, {0, 0, 0, 0}}, + {4, {x_dims.d[1], x_dims.d[2], x_dims.d[3], x_dims.d[4]}}, + "Slice"); + EXPECT_NE(slice_plugin, nullptr); + auto slice_plugin_in = plugin->getOutput(0); + auto slice_plugin_layer = network->addPlugin(&slice_plugin_in, 1, *slice_plugin); + EXPECT_NE(slice_plugin_layer, nullptr); + return slice_plugin_layer; + }, + *factory); + + ASSERT_EQ(x.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_NEAR(x[i], actual[i], 0.0001) << "Vectors 'x' and 'actual' differ at index " << i; +} + +TEST(Conv3DTransposePluginTests, DHWStridesAndPadAsymDWithMultiKWithBiasAndElu) +{ + Dims y_dims; + Dims w_dims; + Dims x_dims; + Dims b_dims; + FloatVec y = readBinaryFile(g_data_dir + "conv3d_tran_04_y.bin", y_dims); + FloatVec w = readBinaryFile(g_data_dir + "conv3d_tran_04_w.bin", w_dims); + FloatVec b = readBinaryFile(g_data_dir + "conv3d_tran_04_b.bin", b_dims); + FloatVec x = readBinaryFile(g_data_dir + "conv3d_tran_04_x.bin", x_dims); + ASSERT_EQ(y_dims.nbDims, 5); + ASSERT_EQ(w_dims.nbDims, 5); + ASSERT_EQ(b_dims.nbDims, 1); + ASSERT_EQ(x_dims.nbDims, 5); + + auto data_type = DataType::kFLOAT; + auto factory = IPluginContainer::create(*g_logger); + // In this test we have to manually pad output by 1 in D dimension. + auto out_dims = DimsNCHW(x_dims.d[1], x_dims.d[2], x_dims.d[3], x_dims.d[4]); + out_dims.d[0] += 1; + auto actual = runPlugin(1, {{"y", y_dims, y}}, x_dims, + [&] + { + return factory->createConv3DTransposePlugin( + Conv3DType::kTensorFlow, + w_dims, out_dims, + {3, {2, 2, 2}}, {3, {0, 1, 1}}, {3, {0, 1, 1}}, + Weights{DataType::kFLOAT, w.data(), (int64_t)w.size() }, + Weights{DataType::kFLOAT, b.data(), (int64_t)b.size() }, + "Conv3DTranspose"); + }, + [&] (INetworkDefinition* network, ILayer* plugin, IPluginContainer& f) + { + auto slice_plugin = f.createSlicePlugin(out_dims, + {4, {0, 0, 0, 0}}, + {4, {x_dims.d[1], x_dims.d[2], x_dims.d[3], x_dims.d[4]}}, + "Slice"); + EXPECT_NE(slice_plugin, nullptr); + auto slice_plugin_in = plugin->getOutput(0); + auto slice_plugin_layer = network->addPlugin(&slice_plugin_in, 1, *slice_plugin); + EXPECT_NE(slice_plugin_layer, nullptr); + // Add ELU. 
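+    // (The slice runs before the activation, so ELU only touches the cropped
+    // volume; since ELU is elementwise, either order would give the same
+    // retained values, but this ordering avoids work on the padded slab.)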
+ auto elu_plugin = f.createEluPlugin(data_type, "ELU"); + EXPECT_NE(elu_plugin, nullptr); + auto elu_plugin_in = slice_plugin_layer->getOutput(0); + auto elu_layer = network->addPlugin(&elu_plugin_in, 1, *elu_plugin); + EXPECT_NE(elu_layer, nullptr); + return elu_layer; + }, + *factory); + + ASSERT_EQ(x.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_NEAR(x[i], actual[i], 0.0001) << "Vectors 'x' and 'actual' differ at index " << i; +} + +TEST(Conv3DTransposePluginTests, Multiple) +{ + Dims y_dims; + Dims w1_dims; + Dims w2_dims; + Dims x_dims; + FloatVec y = readBinaryFile(g_data_dir + "conv3d_tran_05_y.bin", y_dims); + FloatVec w1 = readBinaryFile(g_data_dir + "conv3d_tran_05_w1.bin", w1_dims); + FloatVec w2 = readBinaryFile(g_data_dir + "conv3d_tran_05_w2.bin", w2_dims); + FloatVec x = readBinaryFile(g_data_dir + "conv3d_tran_05_x.bin", x_dims); + ASSERT_EQ(y_dims.nbDims, 5); + ASSERT_EQ(w1_dims.nbDims, 5); + ASSERT_EQ(w2_dims.nbDims, 5); + ASSERT_EQ(x_dims.nbDims, 5); + + auto factory = IPluginContainer::create(*g_logger); + // In this test we have to manually pad output by 1 in D dimension. + auto out_dims1 = DimsNCHW(8 + 1, 8, 9, 9); // REVIEW alexeyk: hardcoded for now. + auto out_dims2 = DimsNCHW(x_dims.d[1], x_dims.d[2], x_dims.d[3], x_dims.d[4]); + out_dims2.d[0] += 1; + auto actual = runPlugin(1, {{"y", y_dims, y}}, x_dims, + [&] + { + return factory->createConv3DTransposePlugin( + Conv3DType::kTensorFlow, + w1_dims, out_dims1, + {3, {2, 2, 2}}, {3, {0, 1, 1}}, {3, {0, 1, 1}}, + Weights{DataType::kFLOAT, w1.data(), (int64_t)w1.size() }, + Weights{DataType::kFLOAT, nullptr, 0 }, + "Conv3DTranspose_1"); + }, + [&] (INetworkDefinition* network, ILayer* plugin, IPluginContainer& f) + { + // Slice for the first convo. + auto slice_1_plugin_layer = addConv3DTranSliceLayer(out_dims1, network, plugin, f); + auto transform_layer = addConv3DTranPostProc(network, slice_1_plugin_layer, f); + + // Second convo. + auto conv_tran_2_plugin = f.createConv3DTransposePlugin( + Conv3DType::kTensorFlow, + w2_dims, out_dims2, + {3, {2, 2, 2}}, {3, {0, 1, 1}}, {3, {0, 1, 1}}, + Weights{DataType::kFLOAT, w2.data(), (int64_t)w2.size() }, + Weights{DataType::kFLOAT, nullptr, 0 }, + "Conv3DTranspose_2"); + EXPECT_NE(conv_tran_2_plugin, nullptr); + auto conv_tran_2_plugin_in = transform_layer->getOutput(0); + auto conv_tran_2_plugin_layer = network->addPlugin(&conv_tran_2_plugin_in, 1, *conv_tran_2_plugin); + EXPECT_NE(conv_tran_2_plugin_layer, nullptr); + + // Slice for the second convo. + auto slice_2_plugin_layer = addConv3DTranSliceLayer(out_dims2, network, conv_tran_2_plugin_layer, f); + return slice_2_plugin_layer; + }, + *factory); + + ASSERT_EQ(x.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_NEAR(x[i], actual[i], 0.0001) << "Vectors 'x' and 'actual' differ at index " << i; +} + +// ----------------------------------------------------------------- +// CostVolumePlugin plugin tests. 
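+// The plugin builds a GC-Net-style matching cost volume: judging by the perf
+// test below (C = 32 in, 2C = 64 out), it concatenates, for each disparity d
+// in [0, max_disparity), the left feature map with the right feature map
+// shifted by d, turning two NCHW inputs into an N x max_disparity x 2C x H x W
+// volume; cost_vol_dims.d[1] passed to the factory is max_disparity.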
+// ----------------------------------------------------------------- + +TEST(CostVolumePluginTests, Basic) +{ + Dims left_dims; + Dims right_dims; + Dims cost_vol_dims; + FloatVec left = readBinaryFile(g_data_dir + "cost_vol_01_l.bin", left_dims); + FloatVec right = readBinaryFile(g_data_dir + "cost_vol_01_r.bin", right_dims); + FloatVec cost_vol = readBinaryFile(g_data_dir + "cost_vol_01_cv.bin", cost_vol_dims); + ASSERT_EQ(left_dims.nbDims, 4); + ASSERT_EQ(right_dims.nbDims, 4); + ASSERT_EQ(cost_vol_dims.nbDims, 5); + + auto factory = IPluginContainer::create(*g_logger); + auto actual = runPlugin(1, {{"left", left_dims, left}, {"right", right_dims, right}}, cost_vol_dims, + [&] + { + return factory->createCostVolumePlugin(cost_vol_dims.d[1], "CostVolume"); + }, + [] (INetworkDefinition*, ILayer* plugin, IPluginContainer&) { return plugin; }, + *factory); + + ASSERT_EQ(cost_vol.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_FLOAT_EQ(cost_vol[i], actual[i]) << "Vectors 'actual' and 'cost_vol' differ at index " << i; +} + +TEST(CostVolumePluginTests, Large) +{ + Dims left_dims; + Dims right_dims; + Dims cost_vol_dims; + FloatVec left = readBinaryFile(g_data_dir + "cost_vol_02_l.bin", left_dims); + FloatVec right = readBinaryFile(g_data_dir + "cost_vol_02_r.bin", right_dims); + FloatVec cost_vol = readBinaryFile(g_data_dir + "cost_vol_02_cv.bin", cost_vol_dims); + ASSERT_EQ(left_dims.nbDims, 4); + ASSERT_EQ(right_dims.nbDims, 4); + ASSERT_EQ(cost_vol_dims.nbDims, 5); + + auto factory = IPluginContainer::create(*g_logger); + auto actual = runPlugin(1, {{"left", left_dims, left}, {"right", right_dims, right}}, cost_vol_dims, + [&] + { + return factory->createCostVolumePlugin(cost_vol_dims.d[1], "CostVolume"); + }, + [] (INetworkDefinition*, ILayer* plugin, IPluginContainer&) { return plugin; }, + *factory); + + ASSERT_EQ(cost_vol.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_FLOAT_EQ(cost_vol[i], actual[i]) << "Vectors 'actual' and 'cost_vol' differ at index " << i; +} + +// Performance tests, should be run under nvprof. +// REVIEW alexeyk: add more details and script to run as well as parse nvprof results. +TEST(CostVolumePluginPerfTests, NVSmall) +{ + Dims in_dims{4, {1, 32, 161, 513}}; + Dims cv_dims{5, {1, 48, 64, 161, 513}}; + + // REVIEW alexeyk: populate with random values. + FloatVec left(DimsUtils::getTensorSize(in_dims)); + FloatVec right(DimsUtils::getTensorSize(in_dims)); + FloatVec cost_vol(DimsUtils::getTensorSize(cv_dims)); + + auto factory = IPluginContainer::create(*g_logger); + auto actual = runPlugin(1, {{"left", in_dims, left}, {"right", in_dims, right}}, cv_dims, + [&] + { + return factory->createCostVolumePlugin(cv_dims.d[1], "CostVolume"); + }, + [] (INetworkDefinition*, ILayer* plugin, IPluginContainer&) { return plugin; }, + *factory); + + ASSERT_EQ(cost_vol.size(), actual.size()); +} + +// ----------------------------------------------------------------- +// SoftargmaxPlugin plugin tests. 
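+// Softargmax collapses the 5D cost volume into a 4D disparity map; in the
+// standard GC-Net soft-argmin form this is
+//   d(h, w) = sum_d d * softmax_d(-C(d, h, w)),
+// which matches the rank change asserted below (5D input, 4D output).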
+// ----------------------------------------------------------------- + +TEST(SoftargmaxPluginTests, Basic) +{ + Dims x_dims; + Dims y_dims; + FloatVec x = readBinaryFile(g_data_dir + "softargmax_01_x.bin", x_dims); + FloatVec y = readBinaryFile(g_data_dir + "softargmax_01_y.bin", y_dims); + ASSERT_EQ(x_dims.nbDims, 5); + ASSERT_EQ(y_dims.nbDims, 4); + + auto factory = IPluginContainer::create(*g_logger); + auto actual = runPlugin(1, {{"x", x_dims, x}}, y_dims, + [&] + { + return factory->createSoftargmaxPlugin("Softargmax"); + }, + [] (INetworkDefinition*, ILayer* plugin, IPluginContainer&) { return plugin; }, + *factory); + + ASSERT_EQ(y.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_FLOAT_EQ(y[i], actual[i]) << "Vectors 'actual' and 'y' differ at index " << i; +} + +TEST(SoftargmaxPluginTests, BatchSize2) +{ + Dims x_dims; + Dims y_dims; + FloatVec x = readBinaryFile(g_data_dir + "softargmax_02_x.bin", x_dims); + FloatVec y = readBinaryFile(g_data_dir + "softargmax_02_y.bin", y_dims); + ASSERT_EQ(x_dims.nbDims, 5); + ASSERT_EQ(y_dims.nbDims, 4); + + auto factory = IPluginContainer::create(*g_logger); + auto actual = runPlugin(2, {{"x", x_dims, x}}, y_dims, + [&] + { + return factory->createSoftargmaxPlugin("Softargmax"); + }, + [] (INetworkDefinition*, ILayer* plugin, IPluginContainer&) { return plugin; }, + *factory); + + ASSERT_EQ(y.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_NEAR(y[i], actual[i], 0.00001) << "Vectors 'actual' and 'y' differ at index " << i; +} + +// ----------------------------------------------------------------- +// End of tests. +// ----------------------------------------------------------------- + +int main(int argc, char* argv[]) +{ + testing::InitGoogleTest(&argc, argv); + if (argc < 2) + { + std::cout << "Usage: nvstereo_tests <test data directory> [TRT log level: 0|1|2|3]" << std::endl; + return 1; + } + + // Set tests data directory. + g_data_dir = argv[1]; + assert(g_data_dir.size() > 0); + if (g_data_dir[g_data_dir.size() - 1] != '/') + g_data_dir += '/'; + + // Create logger. + auto max_log_level = ILogger::Severity::kWARNING; + if (argc > 2) + { + max_log_level = (ILogger::Severity)std::stoi(argv[2]); + assert((int)max_log_level < EnumMax<ILogger::Severity>()); + } + g_logger = std::make_unique<Logger>(max_log_level); + + //getchar(); + return RUN_ALL_TESTS(); +} diff --git a/tools/simulation/docker/Dockerfile.kinetic b/tools/simulation/docker/Dockerfile.kinetic index 1baf130..0e507bd 100644 --- a/tools/simulation/docker/Dockerfile.kinetic +++ b/tools/simulation/docker/Dockerfile.kinetic @@ -1,4 +1,4 @@ -FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04 +FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 # Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. # Full license terms provided in LICENSE.md file. @@ -138,7 +138,7 @@ RUN echo 'source /opt/ros/kinetic/setup.bash' >> ${HOME}/.bashrc # Install OpenCV with CUDA support.
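# (The checkout below moves OpenCV from 2.4.13.2 to 3.3.1; building from
# source, rather than using a prebuilt package, is what keeps the CUDA
# modules available inside the image.)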
WORKDIR ${HOME} RUN git clone https://github.com/opencv/opencv.git && cd opencv \ - && git checkout 2.4.13.2 \ + && git checkout 3.3.1 \ && mkdir build && cd build \ && cmake -D CMAKE_BUILD_TYPE=RELEASE \ -D CMAKE_INSTALL_PREFIX=/usr/local \ @@ -160,7 +160,8 @@ WORKDIR ${HOME} COPY ${TENSORRT_TAR_FILE} ${HOME} ENV TENSORRT_BASE_DIR /opt/tensorrt -ENV TENSORRT_DIR ${TENSORRT_BASE_DIR}/TensorRT-2.1.2 +ENV TENSORRT_VER 3.0.2 +ENV TENSORRT_DIR ${TENSORRT_BASE_DIR}/TensorRT-${TENSORRT_VER} RUN mkdir ${TENSORRT_BASE_DIR} \ && tar -xf ${TENSORRT_TAR_FILE} -C ${TENSORRT_BASE_DIR} \ diff --git a/tools/simulation/docker/build_redtail_image.sh b/tools/simulation/docker/build_redtail_image.sh index 834913b..e54e4d8 100755 --- a/tools/simulation/docker/build_redtail_image.sh +++ b/tools/simulation/docker/build_redtail_image.sh @@ -7,11 +7,20 @@ TENSORRT_TAR_FILE=$1 if [[ -z "${TENSORRT_TAR_FILE}" ]]; then echo "First argument is missing." echo "Usage : build_redtail_image.sh <full path to TensorRT tar file>" - echo "Example: build_redtail_image.sh /data/downloads/NVIDIA/TensorRT-2.1.2.x86_64.cuda-8.0-14-04.tar.bz2" + echo "Example: build_redtail_image.sh /data/downloads/NVIDIA/TensorRT-3.0.2.Ubuntu-16.04.3.x86_64.cuda-9.0.cudnn7.0.tar.gz" exit 1 fi +# Image tag suffix, e.g. v2. +IMAGE_TAG_SUFFIX=$2 +if [[ -n "${IMAGE_TAG_SUFFIX}" ]]; then + IMAGE_TAG_SUFFIX="-${IMAGE_TAG_SUFFIX}" +else + IMAGE_TAG_SUFFIX="-v2" +fi +echo "Using ${IMAGE_TAG_SUFFIX} image suffix." + # Copy the file to Docker context first. cp ${TENSORRT_TAR_FILE} . # Build the image. -docker build -t nvidia-redtail-sim:kinetic --build-arg TENSORRT_TAR_FILE=`basename ${TENSORRT_TAR_FILE}` -f Dockerfile.kinetic . +docker build -t nvidia-redtail-sim:kinetic${IMAGE_TAG_SUFFIX} --build-arg TENSORRT_TAR_FILE=`basename ${TENSORRT_TAR_FILE}` -f Dockerfile.kinetic . diff --git a/tools/simulation/docker/scripts/init_workspace.sh b/tools/simulation/docker/scripts/init_workspace.sh index bb33b88..1bcdefe 100755 --- a/tools/simulation/docker/scripts/init_workspace.sh +++ b/tools/simulation/docker/scripts/init_workspace.sh @@ -17,20 +17,9 @@ cd .. catkin_make source devel/setup.bash -# Install gscam ROS package and its dependencies. -cd ${HOME} -git clone https://github.com/ros-perception/image_common.git -ln -s ${HOME}/image_common/camera_calibration_parsers ${CATKIN_WS}/src/ -ln -s ${HOME}/image_common/camera_info_manager ${CATKIN_WS}/src/ -ln -s ${HOME}/image_common/image_transport ${CATKIN_WS}/src/ - +# Install gscam ROS package. cd ${HOME} git clone https://github.com/ros-drivers/gscam.git -cd gscam -# At present, master branch does not support GStreamer 1.0 so need to switch to gstreamer-1-0-support branch. -git checkout gstreamer-1-0-support -# Currently you get a build error in gscam (‘scoped_ptr’ in namespace ‘boost’ does not name a template type). -sed -i '9i\#include <boost/scoped_ptr.hpp>' ./include/gscam/gscam_nodelet.h # Create symlink to catkin workspace. ln -s ${HOME}/gscam ${CATKIN_WS}/src/ diff --git a/tools/simulation/run_redtail_docker.sh b/tools/simulation/run_redtail_docker.sh index 729b25c..a620c0a 100755 --- a/tools/simulation/run_redtail_docker.sh +++ b/tools/simulation/run_redtail_docker.sh @@ -5,7 +5,7 @@ REDTAIL_NAME=$1 if [[ -z "${REDTAIL_NAME}" ]]; then - REDTAIL_NAME=redtail-sim + REDTAIL_NAME=redtail-sim-v2 fi HOST_DATA_DIR=$2 @@ -25,7 +25,7 @@ REDTAIL_ID=`docker ps -aqf "name=^/${REDTAIL_NAME}$"` if [ -z "${REDTAIL_ID}" ]; then echo "Creating new redtail container."
xhost + - nvidia-docker run -it --privileged --network=host -v ${HOST_DATA_DIR}:${CONTAINER_DATA_DIR}:rw -v /tmp/.X11-unix:/tmp/.X11-unix -e DISPLAY=unix${DISPLAY} -p 14556:14556/udp --name=${REDTAIL_NAME} nvidia-redtail-sim:kinetic bash + nvidia-docker run -it --privileged --network=host -v ${HOST_DATA_DIR}:${CONTAINER_DATA_DIR}:rw -v /tmp/.X11-unix:/tmp/.X11-unix -e DISPLAY=unix${DISPLAY} -p 14556:14556/udp --name=${REDTAIL_NAME} nvidia-redtail-sim:kinetic-v2 bash else echo "Found redtail container: ${REDTAIL_ID}." # Check if the container is already running and start if necessary. diff --git a/tools/tensorflow/docker/build_redtail_tf_image.sh b/tools/tensorflow/docker/build_redtail_tf_image.sh new file mode 100755 index 0000000..0060b78 --- /dev/null +++ b/tools/tensorflow/docker/build_redtail_tf_image.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +# Full license terms provided in LICENSE.md file. + +# Image tag suffix, e.g. 1.5. +IMAGE_TAG_SUFFIX=$1 +if [[ -z "${IMAGE_TAG_SUFFIX}" ]]; then + IMAGE_TAG_SUFFIX="1.5" +fi +echo "Using ${IMAGE_TAG_SUFFIX} image suffix." + +# Build the image. +docker build -t nvidia-redtail-tf:${IMAGE_TAG_SUFFIX} -f tf1.5.Dockerfile . diff --git a/tools/tensorflow/docker/run_redtail_tf_docker.sh b/tools/tensorflow/docker/run_redtail_tf_docker.sh new file mode 100755 index 0000000..5468669 --- /dev/null +++ b/tools/tensorflow/docker/run_redtail_tf_docker.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +# Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +# Full license terms provided in LICENSE.md file. + +REDTAIL_TF_NAME=$1 +if [[ -z "${REDTAIL_TF_NAME}" ]]; then + REDTAIL_TF_NAME=nvidia-redtail-tf +fi + +HOST_DATA_DIR=$2 +if [[ -z "${HOST_DATA_DIR}" ]]; then + HOST_DATA_DIR=/data/ +fi + +CONTAINER_DATA_DIR=$3 +if [[ -z "${CONTAINER_DATA_DIR}" ]]; then + CONTAINER_DATA_DIR=/data/ +fi + +echo "Container name : ${REDTAIL_TF_NAME}" +echo "Host data dir : ${HOST_DATA_DIR}" +echo "Container data dir: ${CONTAINER_DATA_DIR}" +REDTAIL_ID=`docker ps -aqf "name=^/${REDTAIL_TF_NAME}$"` +if [ -z "${REDTAIL_ID}" ]; then + echo "Creating new redtail container." + xhost + + nvidia-docker run -it --privileged --network=host -v ${HOST_DATA_DIR}:${CONTAINER_DATA_DIR}:rw -v /tmp/.X11-unix:/tmp/.X11-unix -e DISPLAY=unix${DISPLAY} -p 8888:8888 -p 6006:6006 --name=${REDTAIL_TF_NAME} nvidia-redtail-tf:1.5 bash +else + echo "Found redtail container: ${REDTAIL_ID}." + # Check if the container is already running and start if necessary. + if [ -z `docker ps -qf "name=^/${REDTAIL_TF_NAME}$"` ]; then + xhost +local:${REDTAIL_ID} + echo "Starting and attaching to ${REDTAIL_TF_NAME} container..." + docker start ${REDTAIL_ID} + docker attach ${REDTAIL_ID} + else + echo "Found running ${REDTAIL_TF_NAME} container, attaching bash..." + docker exec -it ${REDTAIL_ID} bash + fi +fi + diff --git a/tools/tensorflow/docker/scripts/install_tensorflow.sh b/tools/tensorflow/docker/scripts/install_tensorflow.sh new file mode 100755 index 0000000..57ed5cd --- /dev/null +++ b/tools/tensorflow/docker/scripts/install_tensorflow.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +# Full license terms provided in LICENSE.md file. + +# Stop in case of any error.
+set -e + +wget https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp35-cp35m-linux_x86_64.whl + +conda create -n tensorflow python=3.5 anaconda +source activate tensorflow +conda install -y -c menpo opencv3 +pip install --ignore-installed --upgrade tensorflow_gpu-1.5.0-cp35-cp35m-linux_x86_64.whl +pip install plyfile + +echo "export LD_LIBRARY_PATH=\${LD_LIBRARY_PATH}:/usr/local/cuda/extras/CUPTI/lib64/" >> ${HOME}/.bashrc + +echo "source activate tensorflow" >> ${HOME}/.bashrc + diff --git a/tools/tensorflow/docker/tf1.5.Dockerfile b/tools/tensorflow/docker/tf1.5.Dockerfile new file mode 100644 index 0000000..898b5d9 --- /dev/null +++ b/tools/tensorflow/docker/tf1.5.Dockerfile @@ -0,0 +1,63 @@ +FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 + +# Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +# Full license terms provided in LICENSE.md file. + +# Build with: +# docker build -t nvidia-redtail-tf:1.5 -f tf1.5.Dockerfile . + +ENV HOME /root + +ARG TENSORRT_TAR_FILE + +WORKDIR ${HOME} + +RUN apt-get update && apt-get -y --no-install-recommends install software-properties-common \ + && add-apt-repository ppa:ubuntu-toolchain-r/test + +# REVIEW alexeyk: this fixes the problem with TF 1.5 built against cuDNN 7.0 while recent version of the image has 7.1. +# https://github.com/tensorflow/tensorflow/issues/17566 +RUN apt-get update && apt-get install -y --allow-downgrades --no-install-recommends libcudnn7=7.0.5.15-1+cuda9.0 libcudnn7-dev=7.0.5.15-1+cuda9.0 + +RUN apt-get update && apt-get -y --no-install-recommends install \ + curl \ + git \ + gosu \ + libgtk2.0-dev \ + libjpeg-dev \ + libpng-dev \ + iproute \ + iputils-ping \ + less \ + libasound2 \ + libx11-xcb-dev \ + libxss1 \ + mc \ + nano \ + net-tools \ + patch \ + pkg-config \ + protobuf-compiler \ + rsync \ + sudo \ + unzip \ + wget \ + zip \ + && apt-get -y autoremove \ + && apt-get clean autoclean \ + # cleanup + && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log} /tmp/* /var/tmp/* + +# Anaconda +WORKDIR ${HOME} +RUN wget --quiet https://repo.continuum.io/archive/Anaconda3-5.0.0.1-Linux-x86_64.sh \ + && chmod +x ./Anaconda3-5.0.0.1-Linux-x86_64.sh \ + && ./Anaconda3-5.0.0.1-Linux-x86_64.sh -b + +ENV PATH ${HOME}/anaconda3/bin:${PATH} + +# TensorFlow conda environment setup. +COPY ./scripts/install_tensorflow.sh ${HOME} +RUN ${HOME}/install_tensorflow.sh + +WORKDIR ${HOME}
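+
+# Typical usage with the helper scripts in this directory:
+#   ./build_redtail_tf_image.sh 1.5   # builds the nvidia-redtail-tf:1.5 image
+#   ./run_redtail_tf_docker.sh        # creates or attaches the container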