diff --git a/README.md b/README.md
index 203b8b5..f2baf60 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,8 @@ This project contains deep neural networks, computer vision and control code, ha
 The project's deep neural networks (DNNs) can be trained from scratch using publicly available data. A few [pre-trained DNNs](../blob/master/models/pretrained/) are also available as a part of this project. In case you want to train TrailNet DNN from scratch, follow the steps on [this page](./Models).
 
+The project also contains [Stereo DNN](../blob/master/stereoDNN/) models and runtime, which allow estimating depth from a stereo camera on NVIDIA platforms.
+
 ## References and Demos
 * [arXiv paper](https://arxiv.org/abs/1705.02550)
 * GTC 2017 talk: [slides](http://on-demand.gputechconf.com/gtc/2017/presentation/s7172-nikolai-smolyanskiy-autonomous-drone-navigation-with-deep-learning.pdf), [video](http://on-demand.gputechconf.com/gtc/2017/video/s7172-smolyanskiy-autonomous-drone-navigation-with-deep-learning%20(1).PNG.mp4)
@@ -14,8 +16,15 @@ The project's deep neural networks (DNNs) can be trained from scratch using publ
 * [Demo video showing generalization to ground vehicle control and other environments](https://www.youtube.com/watch?v=ZKF5N8xUxfw)
 
 # News
-* **2018-02-15**: added support for the TBS discovery platform.
-  * Step by step instructions on how to assemble the [TBS discovery drone](../../wiki/Skypad-TBS-Discovery-Setup).
+**GTC 2018**: if you will be at [GTC 2018](https://www.nvidia.com/en-us/gtc/) next week, you may be interested in attending our [Stereo DNN session](https://2018gputechconf.smarteventscloud.com/connect/sessionDetail.ww?SESSION_ID=152050). We'll be happy to chat about redtail in general and some of the interesting work that we've been doing.
+
+* **2018-03-22**: redtail 2.0.
+  * Added Stereo DNN models and inference library (TensorFlow/TensorRT). For more details, see the [README](../blob/master/stereoDNN/).
+  * Migrated to JetPack 3.2. This change brings the latest components, such as CUDA 9.0, cuDNN 7.0, TensorRT 3.0 and OpenCV 3.3, to the Jetson platform. Note that this is a breaking change.
+  * Added support for INT8 inference. This enables fast inference on devices that have a hardware implementation of INT8 instructions. More details are on [our wiki](../../wiki/ROS-Nodes#int8-inference).
+
+* **2018-02-15**: added support for the TBS Discovery platform.
+  * Step-by-step instructions on how to assemble the [TBS Discovery drone](../../wiki/Skypad-TBS-Discovery-Setup).
   * Instructions on how to attach and use a [ZED stereo camera](https://www.stereolabs.com/zed/).
   * Detailed instructions on how to calibrate, test and fly the drone.
 
diff --git a/ros/packages/caffe_ros/CMakeLists.txt b/ros/packages/caffe_ros/CMakeLists.txt
index 73c0e43..32c59d5 100644
--- a/ros/packages/caffe_ros/CMakeLists.txt
+++ b/ros/packages/caffe_ros/CMakeLists.txt
@@ -12,64 +12,13 @@ find_package(catkin REQUIRED COMPONENTS
 ## System dependencies are found with CMake's conventions
 #find_package(Boost REQUIRED COMPONENTS system)
 find_package(CUDA)
-# OpenCV 2.4.13 is default on Jetson. Might be required to install in simulation environment.
-find_package(OpenCV 2.4.13 REQUIRED)
-
-## Uncomment this if the package has a setup.py.
This macro ensures -## modules and global scripts declared therein get installed -## See http://ros.org/doc/api/catkin/html/user_guide/setup_dot_py.html -# catkin_python_setup() - -################################################ -## Declare ROS messages, services and actions ## -################################################ - -## To declare and build messages, services or actions from within this -## package, follow these steps: -## * Let MSG_DEP_SET be the set of packages whose message types you use in -## your messages/services/actions (e.g. std_msgs, actionlib_msgs, ...). -## * In the file package.xml: -## * add a build_depend tag for "message_generation" -## * add a build_depend and a run_depend tag for each package in MSG_DEP_SET -## * If MSG_DEP_SET isn't empty the following dependency has been pulled in -## but can be declared for certainty nonetheless: -## * add a run_depend tag for "message_runtime" -## * In this file (CMakeLists.txt): -## * add "message_generation" and every package in MSG_DEP_SET to -## find_package(catkin REQUIRED COMPONENTS ...) -## * add "message_runtime" and every package in MSG_DEP_SET to -## catkin_package(CATKIN_DEPENDS ...) -## * uncomment the add_*_files sections below as needed -## and list every .msg/.srv/.action file to be processed -## * uncomment the generate_messages entry below -## * add every package in MSG_DEP_SET to generate_messages(DEPENDENCIES ...) - -## Generate messages in the 'msg' folder -# add_message_files( -# FILES -# Message1.msg -# Message2.msg -# ) - -## Generate services in the 'srv' folder -# add_service_files( -# FILES -# Service1.srv -# Service2.srv -# ) - -## Generate actions in the 'action' folder -# add_action_files( -# FILES -# Action1.action -# Action2.action -# ) - -## Generate added messages and services with any dependencies listed here -# generate_messages( -# DEPENDENCIES -# std_msgs -# ) +# OpenCV 3.3.1 is default on Jetson starting JetPack 3.2. Might be required to install in simulation environment. +# PATHS is required as ROS Kinetic installs its own version of OpenCV 3.3.1 without CUDA support. +find_package(OpenCV 3.3.1 REQUIRED + CONFIG + PATHS /usr/local /usr + NO_DEFAULT_PATH +) ################################### ## catkin specific configuration ## @@ -95,8 +44,8 @@ set (CMAKE_CXX_STANDARD 14) set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") set( - CUDA_NVCC_FLAGS - ${CUDA_NVCC_FLAGS}; + CUDA_NVCC_FLAGS + ${CUDA_NVCC_FLAGS}; -O3 -gencode arch=compute_53,code=sm_53 ) @@ -108,11 +57,6 @@ include_directories( ${catkin_INCLUDE_DIRS} ) -## Declare a C++ library -# add_library(caffe_ros -# src/${PROJECT_NAME}/caffe_ros.cpp -# ) - ## Add cmake target dependencies of the library ## as an example, code may need to be generated before libraries ## either from message generation or dynamic reconfigure @@ -134,7 +78,6 @@ target_link_libraries(caffe_ros_node nvinfer opencv_core opencv_imgproc - opencv_gpu opencv_highgui ) diff --git a/ros/packages/caffe_ros/include/caffe_ros/int8_calibrator.h b/ros/packages/caffe_ros/include/caffe_ros/int8_calibrator.h new file mode 100644 index 0000000..dadeb9b --- /dev/null +++ b/ros/packages/caffe_ros/include/caffe_ros/int8_calibrator.h @@ -0,0 +1,40 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. 
+
+#ifndef CAFFE_ROS_INT8_CALIBRATOR_H
+#define CAFFE_ROS_INT8_CALIBRATOR_H
+
+#include <deque>
+#include <NvInfer.h>
+#include "internal_utils.h"
+
+namespace caffe_ros
+{
+class Int8EntropyCalibrator : public nvinfer1::IInt8EntropyCalibrator
+{
+public:
+    Int8EntropyCalibrator(ConstStr& src, ConstStr& calib_cache);
+    ~Int8EntropyCalibrator();
+
+    int  getBatchSize() const override { return 1; }
+    bool getBatch(void* bindings[], const char* names[], int nbBindings) override;
+
+    const void* readCalibrationCache(size_t& length) override;
+    void        writeCalibrationCache(const void* cache, size_t length) override;
+
+    void setInputDims(nvinfer1::DimsCHW dims);
+
+private:
+    std::string src_;
+    std::string calib_cache_;
+
+    std::deque<std::string> files_;
+
+    nvinfer1::DimsCHW dims_ = nvinfer1::DimsCHW(0, 0, 0);
+
+    float* img_d_ = nullptr;
+};
+
+}
+
+#endif
\ No newline at end of file
diff --git a/ros/packages/caffe_ros/include/caffe_ros/internal_utils.h b/ros/packages/caffe_ros/include/caffe_ros/internal_utils.h
new file mode 100644
index 0000000..96505d0
--- /dev/null
+++ b/ros/packages/caffe_ros/include/caffe_ros/internal_utils.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+// Full license terms provided in LICENSE.md file.
+
+#ifndef CAFFE_ROS_INTERNAL_UTILS_H
+#define CAFFE_ROS_INTERNAL_UTILS_H
+
+#include <opencv2/opencv.hpp>
+
+namespace caffe_ros
+{
+using ConstStr = const std::string;
+
+// Formats of the input layer. BGR is usually used by most of the frameworks that use OpenCV.
+enum class InputFormat
+{
+    BGR = 0,
+    RGB
+};
+
+// Performs image preprocessing (resizing, scaling, format conversion etc)
+// that is done before feeding the image into the network.
+cv::Mat preprocessImage(cv::Mat img, int dst_img_w, int dst_img_h, InputFormat inp_fmt, ConstStr& encoding,
+                        float inp_scale, float inp_shift);
+
+}
+
+#endif
diff --git a/ros/packages/caffe_ros/include/caffe_ros/tensor_net.h b/ros/packages/caffe_ros/include/caffe_ros/tensor_net.h
index f67cd68..441fcb7 100644
--- a/ros/packages/caffe_ros/include/caffe_ros/tensor_net.h
+++ b/ros/packages/caffe_ros/include/caffe_ros/tensor_net.h
@@ -7,7 +7,10 @@
 #include
 #include
 #include
-#include
+// REVIEW alexeyk: OpenCV that comes with JetPack 3.2 is compiled without CUDA support.
+// #include
+
+#include "int8_calibrator.h"
 
 namespace caffe_ros
 {
@@ -15,14 +18,12 @@ namespace caffe_ros
 class TensorNet
 {
 public:
-    using ConstStr = const std::string;
-
     TensorNet();
    virtual ~TensorNet();
 
-    void loadNetwork(ConstStr& prototxtPath, ConstStr& modelPath,
-                     ConstStr& inputBlob = "data", ConstStr& outputBlob = "prob",
-                     bool useFP16 = true, bool use_cached_model = true);
+    void loadNetwork(ConstStr& prototxt_path, ConstStr& model_path,
+                     ConstStr& input_blob, ConstStr& output_blob,
+                     nvinfer1::DataType data_type, bool use_cached_model);
 
     void forward(const unsigned char* input, size_t w, size_t h, size_t c, const std::string& encoding);
 
@@ -36,15 +37,15 @@ class TensorNet
 
     const float* getOutput() const { return out_h_; }
 
-    void setInputFormat(ConstStr& inputFormat)
+    void setInputFormat(ConstStr& input_format)
     {
-        if (inputFormat == "BGR")
+        if (input_format == "BGR")
             inp_fmt_ = InputFormat::BGR;
-        else if (inputFormat == "RGB")
+        else if (input_format == "RGB")
             inp_fmt_ = InputFormat::RGB;
         else
         {
-            ROS_FATAL("Input format %s is not supported. Supported formats: BGR and RGB", inputFormat.c_str());
+            ROS_FATAL("Input format %s is not supported.
Supported formats: BGR and RGB", input_format.c_str());
             ros::shutdown();
         }
     }
@@ -68,16 +69,12 @@ class TensorNet
         context_->setProfiler(on ? &s_profiler : nullptr);
     }
 
-protected:
+    void createInt8Calibrator(ConstStr& int8_calib_src, ConstStr& int8_calib_cache);
 
-    // Formats of the input layer. BGR is usually used by most of the frameworks that use OpenCV.
-    enum class InputFormat
-    {
-        BGR = 0,
-        RGB
-    };
+protected:
 
-    void profileModel(ConstStr& prototxtPath, ConstStr& modelPath, bool useFP16, ConstStr& outputBlob, std::ostream& model);
+    void profileModel(ConstStr& prototxt_path, ConstStr& model_path, nvinfer1::DataType data_type,
+                      ConstStr& input_blob, ConstStr& output_blob, std::ostream& model);
 
     class Logger : public nvinfer1::ILogger
     {
@@ -109,9 +106,7 @@ class TensorNet
     cv::Mat in_h_;
     cv::Mat in_final_h_;
 
-    // cv::gpu::GpuMat m_inOrigD;
-    // cv::gpu::GpuMat m_inD;
-    cv::gpu::GpuMat in_d_;
+    float* in_d_ = nullptr;
 
     float* out_h_ = nullptr;
     float* out_d_ = nullptr;
@@ -121,8 +116,13 @@ class TensorNet
     // This is a separate flag from ROS_DEBUG to enable only specific profiling
     // of data preparation and DNN feed forward.
     bool debug_mode_ = false;
+
+    std::unique_ptr<Int8EntropyCalibrator> int8_calib_;
 };
 
+nvinfer1::DataType parseDataType(const std::string& src);
+std::string        toString(nvinfer1::DataType src);
+
 }
 
 #endif
diff --git a/ros/packages/caffe_ros/launch/caffe_gscam.launch b/ros/packages/caffe_ros/launch/caffe_gscam.launch
index d3df810..d5084b2 100644
--- a/ros/packages/caffe_ros/launch/caffe_gscam.launch
+++ b/ros/packages/caffe_ros/launch/caffe_gscam.launch
@@ -3,7 +3,7 @@
-
+
@@ -29,7 +29,7 @@
-
+
diff --git a/ros/packages/caffe_ros/launch/caffe_gscam_h264.launch b/ros/packages/caffe_ros/launch/caffe_gscam_h264.launch
index 7e80e0e..519abb4 100644
--- a/ros/packages/caffe_ros/launch/caffe_gscam_h264.launch
+++ b/ros/packages/caffe_ros/launch/caffe_gscam_h264.launch
@@ -3,7 +3,7 @@
-
+
@@ -26,7 +26,7 @@
-
+
diff --git a/ros/packages/caffe_ros/launch/caffe_gscam_h265.launch b/ros/packages/caffe_ros/launch/caffe_gscam_h265.launch
index 698f697..fdf2adc 100644
--- a/ros/packages/caffe_ros/launch/caffe_gscam_h265.launch
+++ b/ros/packages/caffe_ros/launch/caffe_gscam_h265.launch
@@ -3,7 +3,7 @@
-
+
@@ -26,7 +26,7 @@
-
+
diff --git a/ros/packages/caffe_ros/launch/everything.launch b/ros/packages/caffe_ros/launch/everything.launch
index 1734190..425de0c 100644
--- a/ros/packages/caffe_ros/launch/everything.launch
+++ b/ros/packages/caffe_ros/launch/everything.launch
@@ -2,13 +2,13 @@
-
+
-
+
@@ -40,7 +40,7 @@
-
+
@@ -54,7 +54,7 @@
-
+
diff --git a/ros/packages/caffe_ros/launch/mavros_caffe.launch b/ros/packages/caffe_ros/launch/mavros_caffe.launch
index e39195c..ef6a36c 100644
--- a/ros/packages/caffe_ros/launch/mavros_caffe.launch
+++ b/ros/packages/caffe_ros/launch/mavros_caffe.launch
@@ -3,7 +3,7 @@
-
+
@@ -24,7 +24,7 @@
-
+
diff --git a/ros/packages/caffe_ros/src/caffe_ros.cpp b/ros/packages/caffe_ros/src/caffe_ros.cpp
index 0a81a7e..73b628d 100644
--- a/ros/packages/caffe_ros/src/caffe_ros.cpp
+++ b/ros/packages/caffe_ros/src/caffe_ros.cpp
@@ -23,6 +23,9 @@ CaffeRos::CaffeRos()
     std::string output_layer;
     std::string inp_fmt;
     std::string post_proc;
+    std::string data_type_s;
+    std::string int8_calib_src;
+    std::string int8_calib_cache;
     bool  use_FP16;
     float inp_scale;
     float inp_shift;
@@ -37,7 +40,14 @@ CaffeRos::CaffeRos()
     nh.param<std::string>("output_layer", output_layer, "prob");
     nh.param<std::string>("inp_fmt", inp_fmt, "BGR");
     nh.param<std::string>("post_proc", post_proc, "");
+    nh.param<std::string>("data_type", data_type_s, "fp16");
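+    // data_type selects inference precision: "fp32", "fp16" or "int8" (parsed by
+    // parseDataType); INT8 additionally requires the calibration parameters below.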
nh.param("int8_calib_src", int8_calib_src, ""); + nh.param("int8_calib_cache", int8_calib_cache, ""); + + // Backward compatibility: (use_FP16 == false) means use FP32. nh.param("use_fp16", use_FP16, true); + data_type_s = use_FP16 ? data_type_s : "fp32"; + nh.param("inp_scale", inp_scale, 1.0f); nh.param("inp_shift", inp_shift, 0.0f); nh.param("camera_queue_size", camera_queue_size, DEFAULT_CAMERA_QUEUE_SIZE); @@ -54,26 +64,37 @@ CaffeRos::CaffeRos() ROS_INFO("Input : %s", input_layer.c_str()); ROS_INFO("Output: %s", output_layer.c_str()); ROS_INFO("In Fmt: %s", inp_fmt.c_str()); - ROS_INFO("FP16 : %s", use_FP16 ? "yes" : "no"); + ROS_INFO("DType : %s", data_type_s.c_str()); ROS_INFO("Scale : %.4f", inp_scale); ROS_INFO("Shift : %.2f", inp_shift); ROS_INFO("Cam Q : %d", camera_queue_size); ROS_INFO("DNN Q : %d", dnn_queue_size); - ROS_INFO("Post P: %s", post_proc.c_str()); + ROS_INFO("Post P: %s", post_proc.empty() ? "none" : post_proc.c_str()); ROS_INFO("Obj T : %.2f", obj_det_threshold_); ROS_INFO("IOU T : %.2f", iou_threshold_); ROS_INFO("Rate : %.1f", max_rate_hz_); ROS_INFO("Debug : %s", debug_mode_ ? "yes" : "no"); + ROS_INFO("INT8 calib src : %s", int8_calib_src.c_str()); + ROS_INFO("INT8 calib cache: %s", int8_calib_cache.c_str()); + // + ROS_WARN("The use_FP16 parameter is deprecated though still supported. " + "Please use data_type instead as use_FP16 will be removed in future release."); setPostProcessing(post_proc); - net_.loadNetwork(prototxt_path, model_path, input_layer, output_layer, use_FP16, use_cached_model); + auto data_type = parseDataType(data_type_s); + + if (data_type == nvinfer1::DataType::kINT8) + net_.createInt8Calibrator(int8_calib_src, int8_calib_cache); + + net_.loadNetwork(prototxt_path, model_path, input_layer, output_layer, + data_type, use_cached_model); net_.setInputFormat(inp_fmt); net_.setScale(inp_scale); net_.setShift(inp_shift); if (debug_mode_) net_.showProfile(true); - + image_sub_ = nh.subscribe(camera_topic, camera_queue_size, &CaffeRos::imageCallback, this); output_pub_ = nh.advertise("network/output", dnn_queue_size); } @@ -116,7 +137,7 @@ sensor_msgs::Image::ConstPtr CaffeRos::computeOutputs() out_msg->header.stamp.sec = img.header.stamp.sec; out_msg->header.stamp.nsec = img.header.stamp.nsec; out_msg->header.frame_id = img.header.frame_id; - + // Use single precision multidimensional array to represent outputs. // This can be useful in case DNN output is multidimensional such as in segmentation networks. // Note that encoding may not be compatible with other ROS code that uses Image in case number of channels > 4. @@ -155,7 +176,7 @@ sensor_msgs::Image::ConstPtr CaffeRos::computeOutputs() } ROS_ASSERT(i == msg_data.size()); // Create message. - // YOLO output is represented as a matrix where each row is + // YOLO output is represented as a matrix where each row is // a predicted object vector of size 6: label, prob and 4 bounding box coordinates. out_msg->encoding = "32FC1"; out_msg->width = num_col; diff --git a/ros/packages/caffe_ros/src/int8_calibrator.cpp b/ros/packages/caffe_ros/src/int8_calibrator.cpp new file mode 100644 index 0000000..8691007 --- /dev/null +++ b/ros/packages/caffe_ros/src/int8_calibrator.cpp @@ -0,0 +1,121 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. 
+ +#include "caffe_ros/int8_calibrator.h" +#include +#include +#include +#include +#include + +namespace fs = boost::filesystem; + +namespace caffe_ros +{ + +Int8EntropyCalibrator::Int8EntropyCalibrator(ConstStr& src, ConstStr& calib_cache) + : src_(src), calib_cache_(calib_cache) +{ + if (!src_.empty()) + { + // If calibration source is not empty then it should be either directory + // or single image file. + if (!fs::is_directory(src_) && !fs::is_regular_file(src_)) + { + ROS_FATAL("INT8 calibrator: not supported source \"%s\". Use directory or regular file name.", src_.c_str()); + ros::shutdown(); + } + // Set calibration file cache if it's empty. + if (calib_cache_.empty()) + calib_cache_ = (fs::path(src_) / "_int8_calib.cache").string(); + // Create file list. + if (fs::is_directory(src_)) + { + for(const auto& entry: boost::make_iterator_range(fs::directory_iterator(src_), {})) + files_.push_back(entry.path().string()); + } + else + files_.push_back(src_); + } +} + +Int8EntropyCalibrator::~Int8EntropyCalibrator() +{ + if (img_d_ != nullptr) + cudaFree(img_d_); + img_d_ = nullptr; +} + +bool Int8EntropyCalibrator::getBatch(void* bindings[], const char* names[], int nbBindings) +{ + if (files_.size() == 0) + return false; + + auto file = files_.front(); + files_.pop_front(); + ROS_DEBUG("INT8 calibrator: using \"%s\" as a source.", file.c_str()); + auto img = cv::imread(file, cv::IMREAD_COLOR); + auto img_h = preprocessImage(img, dims_.w(), dims_.h(), InputFormat::BGR, "bgr8", 1, 0); + // Copy to the device. + ROS_ASSERT(img_h.isContinuous()); + ROS_ASSERT(nbBindings == 1); + ROS_ASSERT(std::string("data") == names[0]); + + size_t size = dims_.c() * dims_.h() * dims_.w() * sizeof(float); + if (img_d_ == nullptr) + { + if (cudaMalloc((void**)&img_d_, size) != cudaSuccess) + { + ROS_ERROR("INT8 calibrator: could not allocate %zu bytes on the device, error: %u.", size, cudaGetLastError()); + return false; + } + } + if (cudaMemcpy(img_d_, img_h.ptr(0), size, cudaMemcpyHostToDevice) != cudaSuccess) + { + ROS_FATAL("INT8 calibrator: could not copy data to device, error: %u.", cudaGetLastError()); + return false; + } + bindings[0] = img_d_; + return true; +} + +const void* Int8EntropyCalibrator::readCalibrationCache(size_t& length) +{ + // If calibration source is not empty then it takes precedence over cache + // as we assume user has requested a calibration. + if (!src_.empty() || !fs::exists(calib_cache_)) + return nullptr; + + ROS_INFO("Reading INT8 calibration cache from: %s", calib_cache_.c_str()); + // Open file and seek to the end. + std::ifstream file(calib_cache_, std::ios::binary | std::ios::ate); + if (!file.is_open()) + { + ROS_ERROR("Could not open file %s for reading, INT8 cache will not be used.", calib_cache_.c_str()); + return nullptr; + } + // Get file size and read the data. + length = file.tellg(); + file.seekg(0, std::ios_base::beg); + char* data = new char[length]; + file.read(data, length); + return data; +} + +void Int8EntropyCalibrator::writeCalibrationCache(const void* cache, size_t length) +{ + ROS_INFO("Writing INT8 calibration cache to: %s", calib_cache_.c_str()); + auto file = std::ofstream(calib_cache_, std::ios::binary); + file.write((const char*)cache, length); +} + +void Int8EntropyCalibrator::setInputDims(nvinfer1::DimsCHW dims) +{ + dims_ = dims; + // Free image device cache. 
+ if (img_d_ != nullptr) + cudaFree(img_d_); + img_d_ = nullptr; +} + +} \ No newline at end of file diff --git a/ros/packages/caffe_ros/src/tensor_net.cpp b/ros/packages/caffe_ros/src/tensor_net.cpp index 51755bc..d056c4d 100644 --- a/ros/packages/caffe_ros/src/tensor_net.cpp +++ b/ros/packages/caffe_ros/src/tensor_net.cpp @@ -5,24 +5,24 @@ #include #include #include -#include -#include -#include #include "caffe_ros/tensor_net.h" +#include namespace caffe_ros { +using namespace nvinfer1; + TensorNet::Logger TensorNet::s_log; TensorNet::Profiler TensorNet::s_profiler; -static nvinfer1::DimsCHW DimsToCHW(nvinfer1::Dims dims) +static DimsCHW DimsToCHW(Dims dims) { ROS_ASSERT(dims.nbDims == 3); - ROS_ASSERT(dims.type[0] == nvinfer1::DimensionType::kCHANNEL); - ROS_ASSERT(dims.type[1] == nvinfer1::DimensionType::kSPATIAL); - ROS_ASSERT(dims.type[2] == nvinfer1::DimensionType::kSPATIAL); - return nvinfer1::DimsCHW(dims.d[0], dims.d[1], dims.d[2]); + ROS_ASSERT(dims.type[0] == DimensionType::kCHANNEL); + ROS_ASSERT(dims.type[1] == DimensionType::kSPATIAL); + ROS_ASSERT(dims.type[2] == DimensionType::kSPATIAL); + return DimsCHW(dims.d[0], dims.d[1], dims.d[2]); } void TensorNet::Logger::log(Severity severity, const char *msg) @@ -57,40 +57,91 @@ TensorNet::TensorNet() TensorNet::~TensorNet() { + cudaError_t err = cudaSuccess; + if (in_d_ != nullptr) + { + err = cudaFree(in_d_); + if (err != cudaSuccess) + ROS_WARN("cudaFree returned %d", (int)err); + } + if (out_h_ != nullptr) { - cudaError_t err = cudaFreeHost(out_h_); + err = cudaFreeHost(out_h_); if (err != cudaSuccess) ROS_WARN("cudaFreeHost returned %d", (int)err); - out_h_ = nullptr; - out_d_ = nullptr; } + in_d_ = nullptr; + out_h_ = nullptr; + out_d_ = nullptr; } -void TensorNet::profileModel(ConstStr& prototxt_path, ConstStr& model_path, bool use_FP16, - ConstStr& output_blob, std::ostream& model) +void TensorNet::profileModel(ConstStr& prototxt_path, ConstStr& model_path, DataType data_type, + ConstStr& input_blob, ConstStr& output_blob, std::ostream& model) { - auto builder = nvinfer1::createInferBuilder(s_log); + auto builder = createInferBuilder(s_log); auto network = builder->createNetwork(); - builder->setMinFindIterations(3); // allow time for TX1 GPU to spin up + // Set TRT autotuner parameters. + builder->setMinFindIterations(3); builder->setAverageFindIterations(2); auto parser = nvcaffeparser1::createCaffeParser(); + + DataType model_data_type = DataType::kFLOAT; + // Check for FP16. bool has_fast_FP16 = builder->platformHasFastFp16(); ROS_INFO("Hardware support of fast FP16: %s.", has_fast_FP16 ? "yes" : "no"); - if (has_fast_FP16 && !use_FP16) - ROS_INFO("... however, the model will be loaded as FP32."); - - nvinfer1::DataType model_data_type = (has_fast_FP16 && use_FP16) ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT; - auto blob_finder = parser->parse(prototxt_path.c_str(), model_path.c_str(), *network, model_data_type); + if (has_fast_FP16) + { + if (data_type == DataType::kHALF) + { + model_data_type = DataType::kHALF; + builder->setHalf2Mode(true); + } + else + ROS_INFO("... however, FP16 will not be used for this model."); + } + // Check for Int8. + bool has_fast_int8 = builder->platformHasFastInt8(); + ROS_INFO("Hardware support of fast INT8: %s.", has_fast_int8 ? 
"yes" : "no"); + if (has_fast_int8) + { + if (data_type == DataType::kINT8) + { + ROS_ASSERT(int8_calib_ != nullptr); + model_data_type = DataType::kINT8; + builder->setInt8Mode(true); + builder->setInt8Calibrator(int8_calib_.get()); + } + else + ROS_INFO("... however, INT8 will not be used for this model."); + } + + ROS_INFO("Using %s model data type.", toString(model_data_type).c_str()); + // Note: for INT8 models parsing, data type must be set to FP32, see TRT docs. + auto blob_finder = parser->parse(prototxt_path.c_str(), model_path.c_str(), *network, + model_data_type == DataType::kINT8 ? DataType::kFLOAT : model_data_type); if (blob_finder == nullptr) { ROS_FATAL("Failed to parse network: %s, %s", prototxt_path.c_str(), model_path.c_str()); ros::shutdown(); } ROS_INFO("Loaded model from: %s, %s", prototxt_path.c_str(), model_path.c_str()); - + + // Need to set input dimensions for INT8 calibrator. + if (data_type == DataType::kINT8) + { + auto in_b = blob_finder->find(input_blob.c_str()); + if (in_b == nullptr) + { + ROS_FATAL("Could not find input blob: %s", input_blob.c_str()); + ros::shutdown(); + } + int8_calib_->setInputDims(DimsToCHW(in_b->getDimensions())); + } + + // Find output blob and mark it as a network output. auto out_b = blob_finder->find(output_blob.c_str()); if (out_b == nullptr) { @@ -105,8 +156,6 @@ void TensorNet::profileModel(ConstStr& prototxt_path, ConstStr& model_path, bool builder->setMaxBatchSize(1); builder->setMaxWorkspaceSize(16 * 1024 * 1024); - builder->setHalf2Mode(has_fast_FP16 && use_FP16); - ROS_INFO("Building CUDA engine..."); auto engine = builder->buildCudaEngine(*network); if (engine == nullptr) @@ -116,11 +165,11 @@ void TensorNet::profileModel(ConstStr& prototxt_path, ConstStr& model_path, bool } // Save model. - nvinfer1::IHostMemory* model_ptr = engine->serialize(); + IHostMemory* model_ptr = engine->serialize(); ROS_ASSERT(model_ptr != nullptr); model.write(reinterpret_cast(model_ptr->data()), model_ptr->size()); model_ptr->destroy(); - + ROS_INFO("Done building."); // Cleanup. 
@@ -131,10 +180,10 @@ void TensorNet::profileModel(ConstStr& prototxt_path, ConstStr& model_path, bool } void TensorNet::loadNetwork(ConstStr& prototxt_path, ConstStr& model_path, - ConstStr& input_blob, ConstStr& output_blob, - bool use_FP16, bool use_cached_model) + ConstStr& input_blob, ConstStr& output_blob, + DataType data_type, bool use_cached_model) { - infer_ = nvinfer1::createInferRuntime(s_log); + infer_ = createInferRuntime(s_log); if (infer_ == nullptr) { ROS_FATAL("Failed to create inference runtime."); @@ -143,7 +192,7 @@ void TensorNet::loadNetwork(ConstStr& prototxt_path, ConstStr& model_path, std::stringstream model; if (!use_cached_model) - profileModel(prototxt_path, model_path, use_FP16, output_blob, model); + profileModel(prototxt_path, model_path, data_type, input_blob, output_blob, model); else { std::string cached_model_path(model_path + ".cache"); @@ -156,7 +205,7 @@ void TensorNet::loadNetwork(ConstStr& prototxt_path, ConstStr& model_path, } else { - profileModel(prototxt_path, model_path, use_FP16, output_blob, model); + profileModel(prototxt_path, model_path, data_type, input_blob, output_blob, model); ROS_INFO("Saving cached model to: %s", cached_model_path.c_str()); std::ofstream cacheFile(cached_model_path, std::ios::binary); cacheFile << model.rdbuf(); @@ -184,19 +233,23 @@ void TensorNet::loadNetwork(ConstStr& prototxt_path, ConstStr& model_path, int iinp = engine_->getBindingIndex(input_blob.c_str()); in_dims_ = DimsToCHW(engine_->getBindingDimensions(iinp)); ROS_INFO("Input : (W:%4u, H:%4u, C:%4u).", in_dims_.w(), in_dims_.h(), in_dims_.c()); - //cv::gpu::ensureSizeIsEnough(in_dims_.h(), in_dims_.w(), CV_8UC3, in_d_); - in_d_ = cv::gpu::createContinuous(in_dims_.c(), in_dims_.w() * in_dims_.h(), CV_32FC1); - assert(in_d_.isContinuous()); - + size_t in_size_bytes = in_dims_.c() * in_dims_.h() * in_dims_.w() * sizeof(float); + // Allocate memory for the inputs. + if (cudaMalloc(&in_d_, in_size_bytes) != cudaSuccess) + { + ROS_FATAL("Could not allocate %zu bytes for the input, error: %u.", in_size_bytes, cudaGetLastError()); + ros::shutdown(); + } + int iout = engine_->getBindingIndex(output_blob.c_str()); out_dims_ = DimsToCHW(engine_->getBindingDimensions(iout)); ROS_INFO("Output: (W:%4u, H:%4u, C:%4u).", out_dims_.w(), out_dims_.h(), out_dims_.c()); // Allocate mapped memory for the outputs. - size_t outSizeBytes = out_dims_.w() * out_dims_.h() * out_dims_.c() * sizeof(float); - if (cudaHostAlloc(&out_h_, outSizeBytes, cudaHostAllocMapped) != cudaSuccess) + size_t out_size_bytes = out_dims_.w() * out_dims_.h() * out_dims_.c() * sizeof(float); + if (cudaHostAlloc(&out_h_, out_size_bytes, cudaHostAllocMapped) != cudaSuccess) { - ROS_FATAL("Could not allocate %zu bytes for the output, error: %u.", outSizeBytes, cudaGetLastError()); + ROS_FATAL("Could not allocate %zu bytes for the output, error: %u.", out_size_bytes, cudaGetLastError()); ros::shutdown(); } if (cudaHostGetDevicePointer(&out_d_, out_h_, 0) != cudaSuccess) @@ -218,71 +271,97 @@ void TensorNet::forward(const unsigned char* input, size_t w, size_t h, size_t c ros::Time start = ros::Time::now(); in_h_ = cv::Mat((int)h, (int)w, CV_8UC3, (void*)input); + in_final_h_ = preprocessImage(in_h_, in_dims_.w(), in_dims_.h(), inp_fmt_, encoding, inp_scale_, inp_shift_); + // Copy to the device. 
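+    // (cudaMemcpy below assumes a single contiguous host buffer; preprocessImage()
+    // returns the transposed CHW data as one continuous cv::Mat, hence the assert.)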
+ ROS_ASSERT(in_final_h_.isContinuous()); + if (cudaMemcpy(in_d_, in_final_h_.ptr(0), + in_final_h_.size().area() * in_final_h_.channels() * sizeof(float), cudaMemcpyHostToDevice) != cudaSuccess) + { + ROS_FATAL("Could not copy data to device, error: %u.", cudaGetLastError()); + ros::shutdown(); + } + + if (debug_mode_) + ROS_INFO("Preproc time: %.3f", (ros::Time::now() - start).toSec() * 1000); + + void* bufs[] = {in_d_, out_d_}; + context_->execute(1, bufs); + if (debug_mode_) + s_profiler.printLayerTimes(); + ROS_DEBUG("Forward out (first 3 values): [%.4f, %.4f, %.4f]", out_h_[0], out_h_[1], out_h_[2]); +} + +void TensorNet::createInt8Calibrator(ConstStr& int8_calib_src, ConstStr& int8_calib_cache) +{ + ROS_ASSERT(!int8_calib_src.empty() || !int8_calib_cache.empty()); + + if (!int8_calib_src.empty()) + ROS_INFO("INT8 calibration is requested. This may take some time."); + + int8_calib_ = std::make_unique(int8_calib_src, int8_calib_cache); +} + +cv::Mat preprocessImage(cv::Mat img, int dst_img_w, int dst_img_h, InputFormat inp_fmt, ConstStr& encoding, + float inp_scale, float inp_shift) +{ // Handle encodings. - if (inp_fmt_ == InputFormat::BGR) + if (inp_fmt == InputFormat::BGR) { // Convert image from RGB to BGR format used by OpenCV if needed. if (encoding == "rgb8") - cv::cvtColor(in_h_, in_h_, CV_RGB2BGR); + cv::cvtColor(img, img, CV_RGB2BGR); } - else if (inp_fmt_ == InputFormat::RGB) + else if (inp_fmt == InputFormat::RGB) { // Input image in OpenCV BGR, convert to RGB. if (encoding == "bgr8") - cv::cvtColor(in_h_, in_h_, CV_BGR2RGB); + cv::cvtColor(img, img, CV_BGR2RGB); } - //ROS_INFO("Dims: (%zu, %zu) -> (%zu, %zu)", w, h, (size_t)in_dims_.w(), (size_t)in_dims_.h()); + //ROS_INFO("Dims: (%zu, %zu) -> (%zu, %zu)", w, h, (size_t)dst_img_w, (size_t)dst_img_h); // Convert to floating point type. - in_h_.convertTo(in_h_, CV_32F); + img.convertTo(img, CV_32F); // Resize (anisotropically) to input layer size. - cv::resize(in_h_, in_h_, cv::Size(in_dims_.w(), in_dims_.h()), 0, 0, cv::INTER_CUBIC); + cv::resize(img, img, cv::Size(dst_img_w, dst_img_h), 0, 0, cv::INTER_CUBIC); // Scale if needed. - if (inp_scale_ != 1) - in_h_ *= inp_scale_; + if (inp_scale != 1) + img *= inp_scale; // Shift if needed. - if (inp_shift_ != 0) - in_h_ = in_h_ + inp_shift_; + if (inp_shift != 0) + img += inp_shift; // Transpose to get CHW format. - in_final_h_ = in_h_.reshape(1, in_dims_.w() * in_dims_.h()).t(); - // Copy to the device. - ROS_ASSERT(in_final_h_.isContinuous()); - ROS_ASSERT(in_d_.isContinuous()); - ROS_ASSERT(in_d_.size().area() * in_d_.channels() == in_final_h_.size().area() * in_final_h_.channels()); - if (cudaMemcpy(in_d_.ptr(0), in_final_h_.ptr(0), - in_d_.size().area() * in_d_.channels() * sizeof(float), cudaMemcpyHostToDevice) != cudaSuccess) + return img.reshape(1, dst_img_w * dst_img_h).t(); +} + +DataType parseDataType(const std::string& src) +{ + if (boost::iequals(src, "FP32")) + return DataType::kFLOAT; + if (boost::iequals(src, "FP16")) + return DataType::kHALF; + if (boost::iequals(src, "INT8")) + return DataType::kINT8; + else { - ROS_FATAL("Could not copy data to device, error: %u.", cudaGetLastError()); + ROS_FATAL("Invalid data type: %s. Supported data types: FP32, FP16, INT8.", src.c_str()); ros::shutdown(); + // Will not get here (well, should not). 
+ return (DataType)-1; } - - if (debug_mode_) - ROS_INFO("Preproc time: %.3f", (ros::Time::now() - start).toSec() * 1000); - - // GPU version: - // in_orig_d_.upload(cv::Mat((int)h, (int)w, CV_8UC3, (void*)input)); - // // Resize to input layer size. - // cv::gpu::resize(in_orig_d_, in_d_, cv::Size(in_dims_.w(), in_dims_.h()), 0, 0, cv::INTER_CUBIC); - // // Convert to floating point type. - // in_d_.convertTo(in_f_d_, CV_32FC3); - // // Subtract shift. - // // REVIEW alexeyk: should be configurable as some models already have this in prototxt. - // cv::gpu::subtract(in_f_d_, 128.0f, in_f_d_); - // // Transpose to get CHW format. - // ROS_DEBUG("in_f_d_: %zu", in_f_d_.elemSize()); - // cv::gpu::transpose(in_f_d_, in_f_d_); - - // cv::Mat cpuM; - // in_d_.download(cpuM); - // std::ofstream file("/home/ubuntu/tmp.raw", std::ios::binary); - // file.write(cpuM.ptr(0), std::streamsize(in_dims_.w() * in_dims_.h() * c)); - // file.close(); - - void* bufs[] = {in_d_.ptr(0), out_d_}; - context_->execute(1, bufs); - if (debug_mode_) - s_profiler.printLayerTimes(); - ROS_DEBUG("Forward out (first 3 values): [%.4f, %.4f, %.4f]", out_h_[0], out_h_[1], out_h_[2]); } +std::string toString(DataType src) +{ + switch (src) + { + case DataType::kFLOAT: + return "FP32"; + case DataType::kHALF: + return "FP16"; + case DataType::kINT8: + return "INT8"; + default: + return ""; + } } +} diff --git a/ros/packages/caffe_ros/tests/data/trailnet_int8_calib.cache b/ros/packages/caffe_ros/tests/data/trailnet_int8_calib.cache new file mode 100644 index 0000000..58c4cc1 --- /dev/null +++ b/ros/packages/caffe_ros/tests/data/trailnet_int8_calib.cache @@ -0,0 +1,54 @@ +1 +(Unnamed ITensor* 12): 3c66f92e +conv1: 3c04519f +pool1: 3c016718 +res1_1_1: 3c010a14 +(Unnamed ITensor* 21): 3ce8219b +(Unnamed ITensor* 23): 3d1474bf +(Unnamed ITensor* 31): 3d8316b5 +res2_1_1: 3c01cfc8 +softmax_t: 3be25d0c +(Unnamed ITensor* 56): 3e0c8ef5 +(Unnamed ITensor* 37): 3d76540e +(Unnamed ITensor* 14): 3cb3e96f +(Unnamed ITensor* 18): 3ca98b20 +res4_2_1: 3c023151 +res1_2: 3cbbbdf3 +(Unnamed ITensor* 80): 3dcdbb3f +fc3_t: 3d8e6f0d +(Unnamed ITensor* 33): 3d9aba8f +res2_1: 3c01a251 +(Unnamed ITensor* 42): 3dc7db29 +(Unnamed ITensor* 50): 3e30779d +(Unnamed ITensor* 59): 3e49a689 +sub_mean: 3b9e3ebb +data: 400fabe8 +(Unnamed ITensor* 46): 3daa87d1 +res2_1_2: 3d2c4b58 +(Unnamed ITensor* 40): 3db93d03 +res3_1: 3d5a4a5b +res2_2_1: 3c01ca05 +(Unnamed ITensor* 78): 3e4fb8a4 +(Unnamed ITensor* 61): 3e51b72a +(Unnamed ITensor* 52): 3e368ce7 +res3_1_2: 3db36048 +res1_2_1: 3c497c69 +fc3: 3e91f7f1 +(Unnamed ITensor* 69): 3e1a721f +res2_2: 3c016a8c +res4_1_2: 3de4c1d9 +res1_1: 3c66f92f +res3_2_1: 3d455a9e +softmax: 3c00e5d4 +res4_1: 3d1a681c +(Unnamed ITensor* 27): 3d3b70b6 +res3_2: 3d3b9f83 +res4_2: 3c023e26 +(Unnamed ITensor* 9): 3c7091cc +(Unnamed ITensor* 75): 3e09c85e +res3_1_1: 3c01b81e +pool_avg: 3c159346 +(Unnamed ITensor* 65): 3e27c4e6 +(Unnamed ITensor* 4): 3c81c69e +res4_1_1: 3d669c33 +(Unnamed ITensor* 71): 3e18ef86 diff --git a/ros/packages/caffe_ros/tests/tests.cpp b/ros/packages/caffe_ros/tests/tests.cpp index 0cf4853..84ec4bd 100644 --- a/ros/packages/caffe_ros/tests/tests.cpp +++ b/ros/packages/caffe_ros/tests/tests.cpp @@ -53,9 +53,9 @@ TEST(CaffeRosTests, TrailNetPredictions) ASSERT_TRUE(fs::exists(test_data_dir)); CaffeRosTestsCallback t; - auto dnn_sub = nh.subscribe("/trails_dnn/network/output", 1, + auto dnn_sub = nh.subscribe("/trailnet/dnn/network/output", 1, &CaffeRosTestsCallback::dnnCallback, &t); - const char* camera_topic = 
"/camera_trails/image_raw"; + const char* camera_topic = "/trailnet/camera/image_raw"; auto img_pub = nh.advertise(camera_topic, 1); // Test images and expected predictions. @@ -111,9 +111,9 @@ TEST(CaffeRosTests, TrailNetPredictionsBGR8) ASSERT_TRUE(fs::exists(test_data_dir)); CaffeRosTestsCallback t; - auto dnn_sub = nh.subscribe("/trails_dnn/network/output", 1, + auto dnn_sub = nh.subscribe("/trailnet/dnn/network/output", 1, &CaffeRosTestsCallback::dnnCallback, &t); - const char* camera_topic = "/camera_trails/image_raw"; + const char* camera_topic = "/trailnet/camera/image_raw"; auto img_pub = nh.advertise(camera_topic, 1); // Test images and expected predictions. @@ -169,9 +169,9 @@ TEST(CaffeRosTests, TrailNetPredictionsFP16) ASSERT_TRUE(fs::exists(test_data_dir)); CaffeRosTestsCallback t; - auto dnn_sub = nh.subscribe("/trails_dnn_fp16/network/output", 1, + auto dnn_sub = nh.subscribe("/trailnet/dnn_fp16/network/output", 1, &CaffeRosTestsCallback::dnnCallback, &t); - const char* camera_topic = "/camera_trails_fp16/image_raw"; + const char* camera_topic = "/trailnet/camera_fp16/image_raw"; auto img_pub = nh.advertise(camera_topic, 1); // Test images and expected predictions. @@ -220,6 +220,65 @@ TEST(CaffeRosTests, TrailNetPredictionsFP16) } } +TEST(CaffeRosTests, TrailNetPredictionsINT8) +{ + ros::NodeHandle nh("~"); + std::string test_data_dir; + nh.param("test_data_dir", test_data_dir, ""); + ASSERT_TRUE(fs::exists(test_data_dir)); + + CaffeRosTestsCallback t; + auto dnn_sub = nh.subscribe("/trailnet/dnn_int8/network/output", 1, + &CaffeRosTestsCallback::dnnCallback, &t); + const char* camera_topic = "/trailnet/camera_int8/image_raw"; + auto img_pub = nh.advertise(camera_topic, 1); + + // Test images and expected predictions. + auto images = std::vector{"rot_l.jpg", "rot_c.jpg", "rot_r.jpg", "tran_l.jpg", "tran_r.jpg"}; + float predictions[][6] = {{0.685, 0.211, 0.104, 0.493, 0.050, 0.466}, + {0.112, 0.794, 0.093, 0.541, 0.127, 0.332}, + {0.000, 0.027, 0.971, 0.095, 0.068, 0.836}, + {0.100, 0.896, 0.000, 0.521, 0.101, 0.377}, + {0.156, 0.285, 0.558, 0.074, 0.098, 0.827}}; + + // When running using rostest, current directory is $HOME/.ros + fs::path data_dir{test_data_dir}; + + for (size_t i = 0; i < images.size(); i++) + { + auto img_msg = readImage((data_dir / images[i]).string()); + // Use image index as a unique timestamp. + img_msg->header.stamp.sec = 0; + img_msg->header.stamp.nsec = (int)i; + + ros::Rate rate(1000); + // Wait until DNN processes the current messages. There might be multiple messages + // in the queue so make sure to select the right one based on current index. + while (ros::ok() && (t.dnn_out_ == nullptr || t.dnn_out_->header.stamp.nsec != i)) + { + img_pub.publish(img_msg); + ros::spinOnce(); + rate.sleep(); + } + + EXPECT_TRUE(t.dnn_out_ != nullptr); + auto dnn_out = *t.dnn_out_; + // The output should be 1x1x6 (HxWxC). + EXPECT_EQ(dnn_out.width, 1); + EXPECT_EQ(dnn_out.height, 1); + // float32, channels == 6. + EXPECT_EQ(dnn_out.encoding, "32FC6"); + + auto data = reinterpret_cast(dnn_out.data.data()); + for (int col = 0; col < 6; col++) + { + // Must use proper floating point comparison. + // REVIEW alexeyk: the tolerance has to be much higher in INT8. 
+ EXPECT_NEAR(data[col], predictions[i][col], 0.1f) << "Values are not equal at (" << i << ", " << col <<")"; + } + } +} + TEST(CaffeRosTests, YoloNetPredictions) { ros::NodeHandle nh("~"); @@ -228,9 +287,9 @@ TEST(CaffeRosTests, YoloNetPredictions) ASSERT_TRUE(fs::exists(test_data_dir)); CaffeRosTestsCallback t; - auto dnn_sub = nh.subscribe("/object_dnn/network/output", 1, + auto dnn_sub = nh.subscribe("/yolo/dnn/network/output", 1, &CaffeRosTestsCallback::dnnCallback, &t); - const char* camera_topic = "/camera_object/image_raw"; + const char* camera_topic = "/yolo/camera/image_raw"; auto img_pub = nh.advertise(camera_topic, 1); // Test images and expected predictions. @@ -281,9 +340,9 @@ TEST(CaffeRosTests, YoloNetPredictionsFP16) ASSERT_TRUE(fs::exists(test_data_dir)); CaffeRosTestsCallback t; - auto dnn_sub = nh.subscribe("/object_dnn_fp16/network/output", 1, + auto dnn_sub = nh.subscribe("/yolo/dnn_fp16/network/output", 1, &CaffeRosTestsCallback::dnnCallback, &t); - const char* camera_topic = "/camera_object_fp16/image_raw"; + const char* camera_topic = "/yolo/camera_fp16/image_raw"; auto img_pub = nh.advertise(camera_topic, 1); // Test images and expected predictions. diff --git a/ros/packages/caffe_ros/tests/tests_basic.launch b/ros/packages/caffe_ros/tests/tests_basic.launch index 0585426..c14c0e4 100644 --- a/ros/packages/caffe_ros/tests/tests_basic.launch +++ b/ros/packages/caffe_ros/tests/tests_basic.launch @@ -4,74 +4,99 @@ rostest caffe_ros tests_basic.launch Use parameters to specify values for the arguments, for example: + 1. Simple version: + +rostest caffe_ros tests_basic.launch test_data_dir:=/data/src/redtail/ros/packages/caffe_ros/tests/data \ + model_dir:=/data/src/redtail/models/pretrained + + 2. Advanced version: + export REDTAIL_TEST_DIR=/data/src/redtail/ros/packages/caffe_ros/tests/data export REDTAIL_MODEL_DIR=/data/src/redtail/models/pretrained -rostest caffe_ros tests_basic.launch test_data_dir:=$REDTAIL_TEST_DIR \ +rostest caffe_ros tests_basic.launch test_data_dir:=$REDTAIL_TEST_DIR model_dir:=$REDTAIL_MODEL_DIR \ trail_prototxt_path:=$REDTAIL_MODEL_DIR/TrailNet_SResNet-18.prototxt trail_model_path:=$REDTAIL_MODEL_DIR/TrailNet_SResNet-18.caffemodel \ object_prototxt_path:=$REDTAIL_MODEL_DIR/yolo-relu.prototxt object_model_path:=$REDTAIL_MODEL_DIR/yolo-relu.caffemodel --> - - + + + - - + + - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - diff --git a/ros/packages/px4_controller/launch/mavros_caffe_ctl.launch b/ros/packages/px4_controller/launch/mavros_caffe_ctl.launch index 0ded215..4b78e8f 100644 --- a/ros/packages/px4_controller/launch/mavros_caffe_ctl.launch +++ b/ros/packages/px4_controller/launch/mavros_caffe_ctl.launch @@ -3,7 +3,7 @@ - + @@ -29,7 +29,7 @@ - + diff --git a/ros/packages/redtail_debug/launch/trailnet_debug_zed.launch b/ros/packages/redtail_debug/launch/trailnet_debug_zed.launch index f8ae0a3..0b3530f 100644 --- a/ros/packages/redtail_debug/launch/trailnet_debug_zed.launch +++ b/ros/packages/redtail_debug/launch/trailnet_debug_zed.launch @@ -6,7 +6,7 @@ - + @@ -20,7 +20,7 @@ - + diff --git a/ros/packages/redtail_debug/launch/trailnet_debug_zed_gscam.launch b/ros/packages/redtail_debug/launch/trailnet_debug_zed_gscam.launch index e390112..8955ef0 100644 --- a/ros/packages/redtail_debug/launch/trailnet_debug_zed_gscam.launch +++ 
b/ros/packages/redtail_debug/launch/trailnet_debug_zed_gscam.launch @@ -7,7 +7,7 @@ - + @@ -35,7 +35,7 @@ - + diff --git a/ros/scripts/jetson_ros_install.sh b/ros/scripts/jetson_ros_install.sh index 6a820eb..5086def 100755 --- a/ros/scripts/jetson_ros_install.sh +++ b/ros/scripts/jetson_ros_install.sh @@ -90,20 +90,10 @@ echo "Installing dependencies..." sudo apt-get install -y libgstreamer1.0-dev gstreamer1.0-tools libgstreamer-plugins-base1.0-dev libgstreamer-plugins-good1.0-dev libyaml-cpp-dev cd $HOME -# REVIEW alexeyk: replace with sudo apt-get install -y ros-kinetic-camera-info-manager ros-kinetic-camera-calibration-parsers ros-kinetic-image-transport -if [ ! -d "$HOME/image_common" ]; then - echo "Cloning image_common sources..." - git clone https://github.com/ros-perception/image_common.git - # Create symlinks to catkin workspace. - ln -s $HOME/image_common/camera_calibration_parsers $CATKIN_WS/src/ - ln -s $HOME/image_common/camera_info_manager $CATKIN_WS/src/ - ln -s $HOME/image_common/image_transport $CATKIN_WS/src/ -else - echo "Updating image_common sources..." - cd image_common - git pull -fi +# Install gscam dependencies. +sudo apt-get install -y ros-kinetic-camera-info-manager ros-kinetic-camera-calibration-parsers ros-kinetic-image-transport +# Install gscam from sources rather than apt-get install as the latter installs a lot of redundant stuff. cd $HOME if [ ! -d "$HOME/gscam" ]; then echo "Cloning gscam sources..." @@ -121,14 +111,14 @@ echo "Building gscam package..." cd $CATKIN_WS catkin_make -DGSTREAMER_VERSION_1_x=On -# Installing caffe_ros ROS package and its dependencies. +# Installing redtail ROS packages and dependencies. echo "${green}Starting installation of caffe_ros and px4_controller ROS packages...${reset}" cd $HOME if [ ! -d "$HOME/redtail" ]; then - echo "Cloning caffe_ros sources..." + echo "Cloning redtail sources..." git clone https://github.com/NVIDIA-Jetson/redtail else - echo "Updating caffe_ros sources..." + echo "Updating redtail sources..." cd redtail git checkout master git pull @@ -142,15 +132,7 @@ fi echo "Installing dependencies..." cd $HOME -if [ ! -d "$HOME/angles" ]; then - git clone https://github.com/ros/angles.git - # Create symlink to catkin workspace. - ln -s $HOME/angles/angles $CATKIN_WS/src/ -else - echo "Updating angles sources..." - cd angles - git pull -fi +sudo apt-get install -y ros-kinetic-angles cd $CATKIN_WS echo "Building caffe_ros package..." diff --git a/stereoDNN/.gitignore b/stereoDNN/.gitignore new file mode 100644 index 0000000..565f820 --- /dev/null +++ b/stereoDNN/.gitignore @@ -0,0 +1,4 @@ +bin/ +build/ +build_rel/ +*.pyc \ No newline at end of file diff --git a/stereoDNN/CMakeLists.txt b/stereoDNN/CMakeLists.txt new file mode 100644 index 0000000..619a0f0 --- /dev/null +++ b/stereoDNN/CMakeLists.txt @@ -0,0 +1,25 @@ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Full license terms provided in LICENSE.md file. + +# Build with: +# Debug: cmake -DCMAKE_BUILD_TYPE=Debug .. 
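+#   Release: cmake -DCMAKE_BUILD_TYPE=Release .. (also the default when no build type is given)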
+
+cmake_minimum_required(VERSION 3.5)
+
+set(default_build_type "Release")
+
+if (NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE "Release")
+endif()
+
+if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
+    set(TARGET_SUFFIX "_debug")
+    message("Using ${TARGET_SUFFIX} suffix.")
+endif()
+
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/../bin")
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/../bin")
+
+add_subdirectory(lib)
+add_subdirectory(sample_app)
+add_subdirectory(tests)
diff --git a/stereoDNN/README.md b/stereoDNN/README.md
new file mode 100644
index 0000000..2c8a515
--- /dev/null
+++ b/stereoDNN/README.md
@@ -0,0 +1,136 @@
+# Stereo DNN TensorRT inference library
+The goal of this project is to enable inference for [NVIDIA Stereo DNN](https://arxiv.org/) TensorFlow models on Jetson, as well as on other platforms supported by the NVIDIA TensorRT library.
+
+This is a 2-step process:
+1. Convert the TensorFlow model to a TensorRT C++ API model. This step is performed only once for each model and can be done in any environment, such as the user's desktop.
+2. Use the TRT C++ API model in an application. Once the model is built, it can be used in any environment (e.g. Jetson) to perform inference.
+
+Note: TensorFlow is **not** required for the inference step. The library needs only basic components such as CUDA 9.0, cuDNN 7.0 and TensorRT 3.0, so it will run as-is on Jetson with JetPack 3.2.
+
+The library implements the following TensorRT plugins:
+* `conv3d`          : implementation of TensorFlow-compatible 3D convolution
+* `conv3d_transpose`: implementation of TensorFlow-compatible 3D transposed convolution (aka deconvolution)
+* `cost_volume`     : implementation of the cost volume computation used in Stereo DNN
+* `softargmax`      : implementation of the specific softargmax variant used in Stereo DNN
+* `elu`             : implementation of the ELU activation function
+* `transform`       : implementation of tensor transformation required for certain operations
+* `slice`           : implementation of tensor slicing required for certain operations
+* `pad`             : implementation of tensor padding required for certain operations
+
+Note that these plugins make certain assumptions that are valid in the case of Stereo DNN.
+The `slice` and `pad` plugins implement only the tiny piece of functionality required to run the inference.
+
+## Models
+There are several Stereo DNN models included in this package; the following table provides a brief comparison. `TF` stands for TensorFlow and `TRT` for our implementation based on TensorRT and cuDNN. All times are in milliseconds per image, averaged over 200 images.
+
+| Model     | Input size  | Titan Xp (TF) | Titan Xp (TRT) | Jetson TX2 (TRT) | D1 error (%) |
+| --------- | ----------- | --------------| -------------- | ---------------- | ------------ |
+| NVSmall   | 1025x321    | 800           | 450            | 7800             | 9.8          |
+| NVTiny    | 513x161     | 75            | 40             | 360              | 11.12        |
+| ResNet-18 | 1025x321    | 950           | 650            | 11000            | 3.4(*)       |
+
+Notes:
+* We could not run TensorFlow on Jetson with our models, so no measurements were done in this case.
+* D1 error for `NVSmall` and `NVTiny` was measured on 200 training images from the [KITTI 2015 stereo benchmark](http://www.cvlibs.net/datasets/kitti/eval_scene_flow.php?benchmark=stereo). This dataset was **not** used to train the models.
+* `*` - measured on the [KITTI 2015 stereo test set](http://www.cvlibs.net/datasets/kitti/eval_scene_flow.php?benchmark=stereo). Note that this model was fine-tuned on the 200 training images, so reporting its error on that dataset is not useful.
+
+## Converting TensorFlow model to TensorRT C++ API model
+To convert a TensorFlow model to the TensorRT C++ API, run the `./scripts/model_builder.py` script, which takes several named parameters:
+* model type
+* network name to use in the generated C++ code
+* path to the TensorFlow model
+* path to the resulting binary weights file
+* path to the generated C++ file.
+
+You can also optionally specify the model data type (fp32 or fp16, with fp32 being the default).
+
+Example:
+```sh
+cd ./scripts/
+python ./model_builder.py --model_type nvsmall --net_name NVSmall1025x321 --checkpoint_path=../models/NVSmall/TensorFlow/model-inference-1025x321-0 --weights_file=../models/NVSmall/TensorRT/trt_weights.bin --cpp_file=../sample_app/nvsmall_1025x321_net.cpp --data_type fp32
+```
+Currently the supported model types are `nvsmall` and `resnet18`. `NVTiny` is a slight variation of `NVSmall`, so it works with the `nvsmall` model type. Adding new model types should be relatively easy; `./scripts/model_nvsmall.py` or `./scripts/model_resnet18.py` can provide a good starting point.
+
+Note: TensorFlow v1.5 or later is required. We strongly recommend using our [TensorFlow Docker container](../tools/tensorflow/docker) as it contains all the components required to use Stereo DNN with TensorFlow.
+
+## Building inference code
+Once the TensorRT C++ model is created, it can be used in any TensorRT-enabled application. The static inference library `nvstereo_inference`, located at `./lib/`, contains the implementation of the TensorRT plugins required to run Stereo DNN. A sample application located at `./sample_app/` provides an example of library usage. To build the library, sample application and tests, run the following commands:
+
+```sh
+# Build debug:
+mkdir build
+cd ./build/
+cmake -DCMAKE_BUILD_TYPE=Debug ..
+make
+# Build release:
+cd ..
+mkdir build_rel
+cd ./build_rel/
+cmake -DCMAKE_BUILD_TYPE=Release ..
+make
+```
+
+If you get a CMake error saying that `GTest` was not found, do the following:
+```sh
+cd /usr/src/gtest
+cmake CMakeLists.txt
+make
+```
+and then try building the library again (you may need to use `sudo` depending on your environment).
+
+The build will place binary files in the `./bin/` directory.
+
+It's a good idea to run the tests first to make sure everything is working as expected:
+```sh
+./bin/nvstereo_tests_debug ./tests/data
+```
+All tests should pass (obviously). We recommend running the debug version first to make sure all asserts in the code are enabled.
+
+To run the sample application:
+```sh
+./bin/nvstereo_sample_app_debug nvsmall 513 161 ./models/NVTiny/TensorRT/trt_weights.bin ./sample_app/data/img_left.png ./sample_app/data/img_right.png ./bin/disp.bin
+```
+The app takes 8 parameters:
+* model type (`nvsmall` or `resnet18`)
+* dimensions of the image (width and height - must be equal to the dimensions of the network input)
+* path to the weights file created by the model builder script
+* 2 images, left and right (e.g. PNG files)
+* path to the output file; the app will create 2 files: binary and PNG
+* [optional] data type (fp32 or fp16). Note that the FP16 implementation in cuDNN is currently not optimized for 3D convolutions, so results might be worse than with FP32.
+
+We recommend running the debug version first to make sure all asserts in the code are enabled.
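+To sanity-check the output, you can load the binary disparity file back in Python. The sketch below is not part of the sample app; it assumes the `.bin` file contains raw `float32` disparity values in row-major HxW order matching the network input size (the PNG written alongside is easier to inspect visually):
+```python
+import numpy as np
+
+# Assumed layout: raw float32 disparities, row-major, height x width.
+w, h = 513, 161
+disp = np.fromfile('./bin/disp.bin', dtype=np.float32).reshape(h, w)
+print('Disparity range: [%.2f, %.2f]' % (disp.min(), disp.max()))
+```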
+
+The following scripts demonstrate how to properly read and pre-process images for the Stereo DNN:
+
+Using OpenCV (a C++ version is in the `sample_app` as well):
+```python
+import numpy as np
+import cv2
+
+# Using OpenCV
+img = cv2.imread('left.png')
+img = cv2.resize(img, (1025, 321), interpolation = cv2.INTER_AREA)
+# Convert to RGB and then CHW.
+img = np.transpose(img[:, :, ::-1], [2, 0, 1]).astype(np.float32)
+img /= 255.0
+print(img.shape)
+with open('left.bin', 'wb') as w:
+    img.reshape(-1).tofile(w)
+
+```
+
+Using TensorFlow:
+```python
+import numpy as np
+import tensorflow as tf
+
+img = tf.image.decode_png(tf.read_file('left.png'), dtype=tf.uint8)
+img = tf.image.convert_image_dtype(img, tf.float32)
+img = tf.image.resize_images(img, [321, 1025], tf.image.ResizeMethod.AREA)
+# Convert to CHW.
+img_res = np.transpose(img.eval(), [2, 0, 1])
+with open('left.bin', 'wb') as w:
+    img_res.reshape(-1).tofile(w)
+
+```
+
+Note that, due to different implementations of the resizing algorithm in TensorFlow and OpenCV, the results will not be byte-wise equal.
\ No newline at end of file
diff --git a/stereoDNN/lib/CMakeLists.txt b/stereoDNN/lib/CMakeLists.txt
new file mode 100644
index 0000000..4729cc0
--- /dev/null
+++ b/stereoDNN/lib/CMakeLists.txt
@@ -0,0 +1,30 @@
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Full license terms provided in LICENSE.md file.
+
+cmake_minimum_required(VERSION 3.5)
+
+set(PROJECT_NAME nvstereo_inference)
+project(${PROJECT_NAME})
+
+find_package(CUDA 9.0 REQUIRED)
+
+include_directories(${CUDA_INCLUDE_DIRS})
+
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
+
+# Set CUDA NVCC flags:
+# Enable C++14
+list(APPEND CUDA_NVCC_FLAGS -std=c++14)
+list(APPEND CUDA_NVCC_FLAGS -lineinfo -Xcompiler -Wall -Xcompiler -Wextra)
+
+# Generate both PTX and SASS for Jetson TX2 and use base Maxwell PTX for everything else.
+list(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_50,code=compute_50)
+list(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_60,code=sm_61)
+list(APPEND CUDA_NVCC_FLAGS -gencode arch=compute_62,code=sm_62)
+
+file(GLOB ${PROJECT_NAME}_sources ./*.cpp ./*.cu)
+set(PROJECT_SOURCES ${${PROJECT_NAME}_sources})
+
+set(TARGET_NAME ${PROJECT_NAME}${TARGET_SUFFIX})
+cuda_add_library(${TARGET_NAME} ${PROJECT_SOURCES} STATIC)
diff --git a/stereoDNN/lib/conv3d_plugin.cpp b/stereoDNN/lib/conv3d_plugin.cpp
new file mode 100644
index 0000000..ec3ad64
--- /dev/null
+++ b/stereoDNN/lib/conv3d_plugin.cpp
@@ -0,0 +1,376 @@
+// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+// Full license terms provided in LICENSE.md file.
+
+#include "internal_utils.h"
+#include <cassert>
+#include <cudnn.h>
+#include "conv_utils.h"
+
+namespace redtail { namespace tensorrt
+{
+
+using namespace nvinfer1;
+
+// -----------------------------------------------------------------
+// 3D convolution plugin.
+// For more information on how 3D convolution is implemented, see
+// the comments in conv_utils.h
+// -----------------------------------------------------------------
+class Conv3DPlugin: public IPlugin
+{
+public:
+    Conv3DPlugin(Conv3DType conv_type, Dims kernel_dims,
+                 Dims stride_dims, Dims pad_start_dims, Dims pad_end_dims,
+                 Weights kernel_weights, Weights bias_weights,
+                 ILogger& log, std::string name):
+        conv_type_(conv_type), w_dims_(kernel_dims),
+        stride_dims_(stride_dims), pad_start_dims_(pad_start_dims), pad_end_dims_(pad_end_dims),
+        kernel_weights_(kernel_weights), bias_weights_(bias_weights),
+        log_(log), name_(name)
+    {
+        // REVIEW alexeyk: TRT currently does not support FP16 data tensors so we
+        // use the weights tensor data type for all descriptors. In case weights
+        // are in FP16 we'll do the conversion on the fly. This should be changed
+        // when TRT adds full support for FP16.
+        // For FP16 we support only TRUE_HALF_CONFIG mode:
+        // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionForward
+        data_type_ = CUDNN_DATA_FLOAT;
+
+        // Expecting kernel to be a 5D tensor in KVCRS format.
+        assert(w_dims_.nbDims == 5);
+
+        // Expecting stride to be a 3D tensor in DHW format.
+        assert(stride_dims.nbDims == 3);
+
+        // Expecting padding to be 3D tensors in DHW format.
+        assert(pad_start_dims.nbDims == 3);
+        assert(pad_end_dims.nbDims == 3);
+        // Currently only symmetric padding is supported for the H,W dims.
+        assert(pad_start_dims_.d[1] == pad_end_dims_.d[1]);
+        assert(pad_start_dims_.d[2] == pad_end_dims_.d[2]);
+        // A special case (TF-compatible) of asymmetric padding is supported for the D dim.
+        assert(pad_start_dims_.d[0] == pad_end_dims_.d[0] || pad_start_dims_.d[0] == pad_end_dims_.d[0] - 1);
+
+        // TRT supports FP32/FP16 weights.
+        assert(kernel_weights_.type == DataType::kFLOAT || kernel_weights_.type == DataType::kHALF);
+        assert(kernel_weights_.count > 0 && kernel_weights_.values != nullptr);
+        // TRT supports FP32/FP16 weights.
+        assert(bias_weights_.type == DataType::kFLOAT || bias_weights_.type == DataType::kHALF);
+        assert((bias_weights_.count > 0 && bias_weights_.values != nullptr) ||
+               (bias_weights_.count == 0 && bias_weights_.values == nullptr));
+        // Assume same type for simplicity.
+        assert(bias_weights_.type == kernel_weights_.type);
+
+        weights_type_ = trtToCudnnDataType(kernel_weights_.type);
+    }
+
+    Conv3DPlugin(Conv3DPlugin&&) = delete;
+
+    int getNbOutputs() const override
+    {
+        return 1;
+    }
+
+    Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
+    {
+        assert(index == 0);
+        assert(nbInputDims == 1);
+        assert(inputs[0].nbDims == 4);
+
+        x_dims_ = DimsNCHW(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2], inputs[0].d[3]);
+
+        createDescriptors();
+        // Can use batch_size == 1 to set tensor descriptors initially.
+        // Set input descriptor.
+        ConvUtils::setConv3DTensorDescriptor(conv_type_, x_dims_, 1, weights_type_, x_desc_, log_);
+        // Set conv operation descriptors.
+        ConvUtils::setConv3DOperationDescriptors(conv_type_, w_dims_, stride_dims_, pad_start_dims_,
+                                                 weights_type_, w_desc_, c_desc_, log_);
+        // Compute output dims.
+        auto y_d = ConvUtils::getConv3DOutputDims(c_desc_, x_desc_, w_desc_, log_);
+        // Remove batch index dim.
+        y_dims_ = DimsNCHW(y_d.d[1], y_d.d[2], y_d.d[3], y_d.d[4]);
+        // Output tensor is always in cuDNN format.
+        ConvUtils::setConv3DTensorDescriptor(Conv3DType::kCuDnn, y_dims_, 1, weights_type_, y_desc_, log_);
+        // Set bias descriptor.
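+        // (The bias tensor holds one value per output feature map K; dims of
+        // {1, K, 1, 1, 1} let cudnnAddTensor broadcast it over D, H and W.)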
+ // REVIEW alexeyk: see the comment in tensorrt_model_builder.py re: the stride issue in Conv3D. + ConvUtils::setConv3DBiasDescriptor(Dims{5, {1, y_dims_.d[0], 1, 1, 1}}, weights_type_, b_desc_, log_); + + return y_dims_; + } + + void configure(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, int maxBatchSize) override + { + assert(isValid()); + assert(nbInputs == 1); + assert(nbOutputs == 1); + assert(DimsUtils::areEqual(inputDims[0], x_dims_)); + assert(DimsUtils::areEqual(outputDims[0], y_dims_)); + + max_batch_size_ = maxBatchSize; + // Update in/out descriptors and run auto-tuner to find best (fastest) algo. + ConvUtils::setConv3DTensorDescriptor(conv_type_, x_dims_, maxBatchSize, weights_type_, x_desc_, log_); + ConvUtils::setConv3DTensorDescriptor(Conv3DType::kCuDnn, y_dims_, maxBatchSize, weights_type_, y_desc_, log_); + findBestAlgo(); + + const size_t elt_size = getWeightsDataTypeSize(); + + // Need workspace for FP32 -> FP16 conversion. + if (isFP16()) + workspace_bytes_ += max_batch_size_ * std::max(DimsUtils::getTensorSize(x_dims_), DimsUtils::getTensorSize(y_dims_)) * elt_size; + + // Allocate memory and copy weights. + CHECK(cudaMalloc(&kernel_weights_d_, kernel_weights_.count * elt_size)); + CHECK(cudaMemcpy(kernel_weights_d_, kernel_weights_.values, + kernel_weights_.count * elt_size, cudaMemcpyHostToDevice)); + + if (bias_weights_.count > 0) + { + CHECK(cudaMalloc(&bias_weights_d_, bias_weights_.count * elt_size)); + CHECK(cudaMemcpy(bias_weights_d_, bias_weights_.values, + bias_weights_.count * elt_size, cudaMemcpyHostToDevice)); + + } + log_.log(ILogger::Severity::kINFO, (name_ + ": InDims : " + DimsUtils::toString(x_dims_)).c_str()); + log_.log(ILogger::Severity::kINFO, (name_ + ": OutDims : " + DimsUtils::toString(y_dims_)).c_str()); + } + + int initialize() override + { + assert(isValid()); + return 0; + } + + void terminate() override + { + assert(isValid()); + + if (c_desc_ != nullptr) + CHECK(cudnnDestroyConvolutionDescriptor(c_desc_)); + if (w_desc_ != nullptr) + CHECK(cudnnDestroyFilterDescriptor(w_desc_)); + if (x_desc_ != nullptr) + CHECK(cudnnDestroyTensorDescriptor(x_desc_)); + if (y_desc_ != nullptr) + CHECK(cudnnDestroyTensorDescriptor(y_desc_)); + if (b_desc_ != nullptr) + CHECK(cudnnDestroyTensorDescriptor(b_desc_)); + if (cudnn_ != nullptr) + CHECK(cudnnDestroy(cudnn_)); + + if (kernel_weights_d_ != nullptr) + CHECK(cudaFree(kernel_weights_d_)); + if (bias_weights_d_ != nullptr) + CHECK(cudaFree(bias_weights_d_)); + + c_desc_ = nullptr; + w_desc_ = nullptr; + x_desc_ = nullptr; + y_desc_ = nullptr; + b_desc_ = nullptr; + cudnn_ = nullptr; + + kernel_weights_d_ = nullptr; + bias_weights_d_ = nullptr; + + assert(!isValid()); + } + + size_t getWorkspaceSize(int maxBatchSize) const + { + assert(isValid()); + assert(max_batch_size_ == maxBatchSize); + + return workspace_bytes_; + } + + int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override + { + assert(isValid()); + // REVIEW alexeyk: for now assuming batch size always equals max batch size. + // That's pretty strict as it disables dynamic batch sizes but fine for now. + assert(batchSize == max_batch_size_); + + cudnnStatus_t status; + + CHECK(status = cudnnSetStream(cudnn_, stream)); + + size_t workspace_used_bytes = 0; + // Convert to FP16 first if needed. 
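+        // When weights are FP16, preprocessInput converts the FP32 input into the
+        // head of the workspace and reports the bytes used; the rest of the
+        // workspace is handed to cuDNN below.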
+ auto px = preprocessInput(batchSize, inputs[0], workspace, stream, workspace_used_bytes); + assert(px != nullptr); + assert(workspace_used_bytes <= workspace_bytes_); + + CHECK(status = cudnnConvolutionForward(cudnn_, &Consts::kOne, x_desc_, px, w_desc_, kernel_weights_d_, + c_desc_, best_algo_, + static_cast(workspace) + workspace_used_bytes, workspace_bytes_ - workspace_used_bytes, + &Consts::kZero, y_desc_, outputs[0])); + + if (bias_weights_.count > 0) + CHECK(status = cudnnAddTensor(cudnn_, &Consts::kOne, b_desc_, bias_weights_d_, &Consts::kOne, y_desc_, outputs[0])); + + // Convert back to FP32 if needed. + postprocessOutput(batchSize, outputs[0], workspace, stream); + + return status == CUDNN_STATUS_SUCCESS ? 0 : -1; + } + + size_t getSerializationSize() override + { + assert(isValid()); + return 0; + } + + void serialize(void* buffer) override + { + assert(isValid()); + // REVIEW alexeyk: implement. + assert(false); + } + +private: + bool isValid() const + { + return cudnn_ != nullptr; + } + + bool isFP16() const + { + return weights_type_ == CUDNN_DATA_HALF; + } + + size_t getWeightsDataTypeSize() const + { + return (isFP16() ? sizeof(uint16_t) : sizeof(float)); + } + + const void* preprocessInput(int batchSize, const void* x, void* workspace, cudaStream_t stream, size_t& workspace_used_bytes) + { + if (!isFP16()) + return x; + + assert(data_type_ == CUDNN_DATA_FLOAT); + + // Convert to FP16 using workspace. + size_t x_size = batchSize * DimsUtils::getTensorSize(x_dims_); + CHECK(CudaKernels::fp32Tofp16((const float*)x, (uint16_t*)workspace, x_size, stream)); + + workspace_used_bytes = x_size * sizeof(uint16_t); + return workspace; + } + + void postprocessOutput(int batchSize, void* y, void* workspace, cudaStream_t stream) + { + if (!isFP16()) + return; + + assert(data_type_ == CUDNN_DATA_FLOAT); + + size_t y_size = batchSize * DimsUtils::getTensorSize(y_dims_); + // Copy to workspace first. + CHECK(cudaMemcpyAsync(workspace, y, y_size * sizeof(uint16_t), cudaMemcpyDeviceToDevice, stream)); + // Convert to FP32 from workspace. + CHECK(CudaKernels::fp16Tofp32((const uint16_t*)workspace, (float*)y, y_size, stream)); + } + + void createDescriptors() + { + if (cudnn_ == nullptr) + CHECK(cudnnCreate(&cudnn_)); + if (x_desc_ == nullptr) + CHECK(cudnnCreateTensorDescriptor(&x_desc_)); + if (y_desc_ == nullptr) + CHECK(cudnnCreateTensorDescriptor(&y_desc_)); + if (w_desc_ == nullptr) + CHECK(cudnnCreateFilterDescriptor(&w_desc_)); + if (c_desc_ == nullptr) + CHECK(cudnnCreateConvolutionDescriptor(&c_desc_)); + if (b_desc_ == nullptr) + CHECK(cudnnCreateTensorDescriptor(&b_desc_)); + } + + void findBestAlgo() + { + // Let's hope cuDNN team will not come up with more than that number of algos (8 in cuDNN 7). + const int algo_count = 20; + int res_algo_count; + cudnnConvolutionFwdAlgoPerf_t algos[algo_count]; + CHECK(cudnnFindConvolutionForwardAlgorithm(cudnn_, x_desc_, w_desc_, c_desc_, y_desc_, + algo_count, &res_algo_count, algos)); + + assert(res_algo_count > 0); + assert(algos[0].status == CUDNN_STATUS_SUCCESS); + + // Best algo is the first. + best_algo_ = algos[0].algo; + workspace_bytes_ = algos[0].memory; + + // Log results. 
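+        // Each logged line reports the algo enum value, its measured time and the
+        // required workspace size, e.g. "1:      0.8ms,     4096B".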
+ log_.log(ILogger::Severity::kINFO, (name_ + ": --> Conv3D layer tuning results:").c_str()); + for (auto& a: algos) + { + if (a.status != CUDNN_STATUS_SUCCESS) + break; + std::ostringstream str; + str << a.algo << ": " << std::fixed << std::setw(8) << std::setprecision(1) << a.time << "ms, " + << std::fixed << std::setw(8) << a.memory << "B"; + log_.log(ILogger::Severity::kINFO, str.str().c_str()); + } + log_.log(ILogger::Severity::kINFO, (name_ + ": <-- Conv3D layer tuning results.").c_str()); + } + +private: + Conv3DType conv_type_; + cudnnDataType_t data_type_; + cudnnDataType_t weights_type_; + + // Using DimsNCHW to represent 3D convos input/output is an ugly workaround + // of TRT limitations which currently result in assert in the guts of TRT. + DimsNCHW x_dims_; + DimsNCHW y_dims_; + Dims w_dims_; + Dims stride_dims_; + Dims pad_start_dims_; + Dims pad_end_dims_; + + int max_batch_size_ = 0; + + // Kernel weights on the host. + Weights kernel_weights_; + // Kernel weights on the device. + float* kernel_weights_d_ = nullptr; + + // Bias weights on the host. + Weights bias_weights_; + // Bias weights on the device. + float* bias_weights_d_ = nullptr; + + cudnnHandle_t cudnn_ = nullptr; + cudnnTensorDescriptor_t x_desc_ = nullptr; + cudnnTensorDescriptor_t y_desc_ = nullptr; + cudnnFilterDescriptor_t w_desc_ = nullptr; + cudnnConvolutionDescriptor_t c_desc_ = nullptr; + cudnnTensorDescriptor_t b_desc_ = nullptr; + + cudnnConvolutionFwdAlgo_t best_algo_ = (cudnnConvolutionFwdAlgo_t)-1; + size_t workspace_bytes_ = 0; + + ILogger& log_; + std::string name_; +}; + +// Factory method. +IPlugin* PluginContainer::createConv3DPlugin(Conv3DType conv_type, Dims kernel_dims, + Dims stride_dims, Dims pad_start_dims, Dims pad_end_dims, + Weights kernel_weights, Weights bias_weights, + std::string name) +{ + std::lock_guard lock(lock_); + plugins_.push_back(new Conv3DPlugin(conv_type, kernel_dims, + stride_dims, pad_start_dims, pad_end_dims, + kernel_weights, bias_weights, + log_, name)); + return plugins_.back(); +} + +} } \ No newline at end of file diff --git a/stereoDNN/lib/conv3d_transpose_plugin.cpp b/stereoDNN/lib/conv3d_transpose_plugin.cpp new file mode 100644 index 0000000..55c50fe --- /dev/null +++ b/stereoDNN/lib/conv3d_transpose_plugin.cpp @@ -0,0 +1,413 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. + +#include "internal_utils.h" +#include +#include +#include +#include "conv_utils.h" + +namespace redtail { namespace tensorrt +{ + +using namespace nvinfer1; + +// ----------------------------------------------------------------- +// Transposed 3D convolution plugin. +// For dimension members naming we loosely follow the notation +// from cuDNN where: +// - transposed convolution input is denoted as dy. +// - transposed convolution output is denoted as dx. +// This makes a code more consistent as a convolution input is +// a transposed convolution output and vice versa. 
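+// For example, a forward convolution mapping x:(C, D, H, W) to y:(K, D/2, H/2, W/2)
+// with stride 2 corresponds to a transposed convolution mapping dy:(K, D/2, H/2, W/2)
+// back to dx:(C, D, H, W).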
+// ----------------------------------------------------------------- +class Conv3DTransposePlugin: public IPlugin +{ +public: + Conv3DTransposePlugin(Conv3DType conv_type, Dims kernel_dims, + Dims out_dims, Dims stride_dims, + Dims pad_start_dims, Dims pad_end_dims, + Weights kernel_weights, Weights bias_weights, + ILogger& log, std::string name): + conv_type_(conv_type), w_dims_(kernel_dims), + stride_dims_(stride_dims), pad_start_dims_(pad_start_dims), pad_end_dims_(pad_end_dims), + kernel_weights_(kernel_weights), bias_weights_(bias_weights), + log_(log), name_(name) + { + // REVIEW alexeyk: TRT currently does not support FP16 data tensors so we + // use weights tensor data type for all descriptors. In case weights + // are in FP16 we'll do the conversion on the fly. This should be changed + // when TRT adds full support for FP16. + // For FP16 we support only TRUE_HALF_CONFIG mode: + // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionForward + data_type_ = CUDNN_DATA_FLOAT; + + // Expecting kernel to be 5D tensor in KVCRS format. + assert(w_dims_.nbDims == 5); + + // Expecting stride to be 3D tensor in DHW format. + assert(stride_dims.nbDims == 3); + + // Expecting padding to be 3D tensors in DHW format. + assert(pad_start_dims.nbDims == 3); + assert(pad_end_dims.nbDims == 3); + // Currently only symmetric padding is supported for H,W dims. + assert(pad_start_dims_.d[1] == pad_end_dims_.d[1]); + assert(pad_start_dims_.d[2] == pad_end_dims_.d[2]); + // Special case (TF-compatible) of asymmetric padding is supported for D dim. + assert(pad_start_dims_.d[0] == pad_end_dims_.d[0] || pad_start_dims_.d[0] == pad_end_dims_.d[0] - 1); + + // Expecting output to be 4D tensor in CDHW format. + assert(out_dims.nbDims == 4); + x_dims_ = DimsNCHW(out_dims.d[0], out_dims.d[1], out_dims.d[2], out_dims.d[3]); + + b_dims_ = Dims{5, {1, 1, x_dims_.d[1], 1, 1}}; + + // TRT supprots FP32/FP16 weights. + assert(kernel_weights_.type == DataType::kFLOAT || kernel_weights_.type == DataType::kHALF); + assert(kernel_weights_.count > 0 && kernel_weights_.values != nullptr); + // TRT supprots FP32/FP16 weights. + assert(bias_weights_.type == DataType::kFLOAT || bias_weights_.type == DataType::kHALF); + assert((bias_weights_.count > 0 && bias_weights_.values != nullptr) || + (bias_weights_.count == 0 && bias_weights_.values == nullptr)); + // Assume same type for simplicity. + assert(bias_weights_.type == kernel_weights_.type); + + weights_type_ = trtToCudnnDataType(kernel_weights_.type); + } + + Conv3DTransposePlugin(Conv3DTransposePlugin&&) = delete; + + int getNbOutputs() const override + { + return 1; + } + + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override + { + assert(index == 0); + assert(nbInputDims == 1); + assert(inputs[0].nbDims == 4); + + y_dims_ = DimsNCHW(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2], inputs[0].d[3]); + + createDescriptors(); + // Can use batch_size == 1 to set tensor descriptors initially. + // Set input descriptor (i.e. convolution output). + // Note: convolution output tensor is always in cuDNN format. + ConvUtils::setConv3DTensorDescriptor(Conv3DType::kCuDnn, y_dims_, 1, weights_type_, y_desc_, log_); + // Set conv operation descriptors. + ConvUtils::setConv3DOperationDescriptors(conv_type_, w_dims_, stride_dims_, pad_start_dims_, + weights_type_, w_desc_, c_desc_, log_); + // Set output descriptors (i.e. convolution input). + // xo_desc_ is needed in bias addition operation. 
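+        // Two descriptors are kept for x: x_desc_ uses the conv_type_-specific layout
+        // (e.g. the TF-style reshaping), while xo_desc_ keeps the original cuDNN layout.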
+ ConvUtils::setConv3DTensorDescriptor(conv_type_, x_dims_, 1, weights_type_, x_desc_, log_); + ConvUtils::setConv3DTensorDescriptor(Conv3DType::kCuDnn, x_dims_, 1, weights_type_, xo_desc_, log_); + // Verify that dimensions match. + std::array out_dims; + CHECK(cudnnGetConvolutionNdForwardOutputDim(c_desc_, x_desc_, w_desc_, out_dims.size(), out_dims.data())); + assert(DimsUtils::areEqual(y_dims_, DimsNCHW{out_dims[1], out_dims[2], out_dims[3], out_dims[4]})); + // Set bias descriptor. + ConvUtils::setConv3DBiasDescriptor(b_dims_, weights_type_, b_desc_, log_); + + return x_dims_; + } + + void configure(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, int maxBatchSize) override + { + assert(isValid()); + assert(nbInputs == 1); + assert(nbOutputs == 1); + assert(DimsUtils::areEqual(inputDims[0], y_dims_)); + assert(DimsUtils::areEqual(outputDims[0], x_dims_)); + + max_batch_size_ = maxBatchSize; + // Update in/out descriptors and run auto-tuner to find best (fastest) algo. + ConvUtils::setConv3DTensorDescriptor(Conv3DType::kCuDnn, y_dims_, maxBatchSize, weights_type_, y_desc_, log_); + ConvUtils::setConv3DTensorDescriptor(conv_type_, x_dims_, maxBatchSize, weights_type_, x_desc_, log_); + ConvUtils::setConv3DTensorDescriptor(Conv3DType::kCuDnn, x_dims_, maxBatchSize, weights_type_, xo_desc_, log_); + findBestAlgo(); + + const size_t elt_size = getWeightsDataTypeSize(); + + // Need workspace for FP32 -> FP16 conversion. + if (isFP16()) + workspace_bytes_ += max_batch_size_ * std::max(DimsUtils::getTensorSize(x_dims_), DimsUtils::getTensorSize(y_dims_)) * elt_size; + + // Allocate memory and copy weights. + CHECK(cudaMalloc(&kernel_weights_d_, kernel_weights_.count * elt_size)); + CHECK(cudaMemcpy(kernel_weights_d_, kernel_weights_.values, + kernel_weights_.count * elt_size, cudaMemcpyHostToDevice)); + + if (bias_weights_.count > 0) + { + CHECK(cudaMalloc(&bias_weights_d_, bias_weights_.count * elt_size)); + CHECK(cudaMemcpy(bias_weights_d_, bias_weights_.values, + bias_weights_.count * elt_size, cudaMemcpyHostToDevice)); + + } + log_.log(ILogger::Severity::kINFO, (name_ + ": InDims : " + DimsUtils::toString(y_dims_)).c_str()); + log_.log(ILogger::Severity::kINFO, (name_ + ": OutDims : " + DimsUtils::toString(x_dims_)).c_str()); + } + + int initialize() override + { + assert(isValid()); + return 0; + } + + void terminate() override + { + assert(isValid()); + + if (c_desc_ != nullptr) + CHECK(cudnnDestroyConvolutionDescriptor(c_desc_)); + if (w_desc_ != nullptr) + CHECK(cudnnDestroyFilterDescriptor(w_desc_)); + if (x_desc_ != nullptr) + CHECK(cudnnDestroyTensorDescriptor(x_desc_)); + if (xo_desc_ != nullptr) + CHECK(cudnnDestroyTensorDescriptor(xo_desc_)); + if (y_desc_ != nullptr) + CHECK(cudnnDestroyTensorDescriptor(y_desc_)); + if (b_desc_ != nullptr) + CHECK(cudnnDestroyTensorDescriptor(b_desc_)); + if (cudnn_ != nullptr) + CHECK(cudnnDestroy(cudnn_)); + + if (kernel_weights_d_ != nullptr) + CHECK(cudaFree(kernel_weights_d_)); + if (bias_weights_d_ != nullptr) + CHECK(cudaFree(bias_weights_d_)); + + c_desc_ = nullptr; + w_desc_ = nullptr; + x_desc_ = nullptr; + xo_desc_ = nullptr; + y_desc_ = nullptr; + b_desc_ = nullptr; + cudnn_ = nullptr; + + kernel_weights_d_ = nullptr; + bias_weights_d_ = nullptr; + + assert(!isValid()); + } + + size_t getWorkspaceSize(int maxBatchSize) const + { + assert(isValid()); + assert(max_batch_size_ == maxBatchSize); + + return workspace_bytes_; + } + + int enqueue(int batchSize, const void*const * inputs, void** outputs, void* 
workspace, cudaStream_t stream) override + { + assert(isValid()); + // REVIEW alexeyk: for now assuming batch size always equals max batch size. + // That's pretty strict as it disables dynamic batch sizes but fine for now. + assert(batchSize == max_batch_size_); + + cudnnStatus_t status; + cudaError_t cu_status = cudaSuccess; + + CHECK(status = cudnnSetStream(cudnn_, stream)); + + size_t workspace_used_bytes = 0; + // Convert to FP16 first if needed. + auto py = preprocessInput(batchSize, inputs[0], workspace, stream, workspace_used_bytes); + assert(py != nullptr); + assert(workspace_used_bytes <= workspace_bytes_); + + CHECK(status = cudnnConvolutionBackwardData(cudnn_, &Consts::kOne, w_desc_, kernel_weights_d_, + y_desc_, inputs[0], c_desc_, best_algo_, + static_cast(workspace) + workspace_used_bytes, workspace_bytes_ - workspace_used_bytes, + &Consts::kZero, x_desc_, outputs[0])); + + if (bias_weights_.count > 0) + { + // REVIEW alexeyk: cuDNN limitation - only second outermost dimension is supported. + if (b_dims_.d[1] == x_dims_.d[0]) + CHECK(status = cudnnAddTensor(cudnn_, &Consts::kOne, b_desc_, bias_weights_d_, &Consts::kOne, x_desc_, outputs[0])); + else if (b_dims_.d[2] == x_dims_.d[1]) + CHECK(cu_status = CudaKernels::addDBiasTo3DConv(bias_weights_d_, b_dims_, static_cast(outputs[0]), x_dims_, stream)); + else + assert(false); + } + + // Convert back to FP32 if needed. + postprocessOutput(batchSize, outputs[0], workspace, stream); + + return (status == CUDNN_STATUS_SUCCESS && cu_status == cudaSuccess) ? 0 : -1; + } + + size_t getSerializationSize() override + { + assert(isValid()); + return 0; + } + + void serialize(void* buffer) override + { + assert(isValid()); + // REVIEW alexeyk: implement. + assert(false); + } + +private: + bool isValid() const + { + return cudnn_ != nullptr; + } + + bool isFP16() const + { + return weights_type_ == CUDNN_DATA_HALF; + } + + size_t getWeightsDataTypeSize() const + { + return (isFP16() ? sizeof(uint16_t) : sizeof(float)); + } + + const void* preprocessInput(int batchSize, const void* y, void* workspace, cudaStream_t stream, size_t& workspace_used_bytes) + { + if (!isFP16()) + return y; + + assert(data_type_ == CUDNN_DATA_FLOAT); + + // Convert to FP16 using workspace. + size_t y_size = batchSize * DimsUtils::getTensorSize(y_dims_); + CHECK(CudaKernels::fp32Tofp16((const float*)y, (uint16_t*)workspace, y_size, stream)); + + workspace_used_bytes = y_size * sizeof(uint16_t); + return workspace; + } + + void postprocessOutput(int batchSize, void* x, void* workspace, cudaStream_t stream) + { + if (!isFP16()) + return; + + assert(data_type_ == CUDNN_DATA_FLOAT); + + size_t x_size = batchSize * DimsUtils::getTensorSize(x_dims_); + // Copy to workspace first. + CHECK(cudaMemcpyAsync(workspace, x, x_size * sizeof(uint16_t), cudaMemcpyDeviceToDevice, stream)); + // Convert to FP32 from workspace. 
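+        // The copy above is required because the conversion cannot run in place:
+        // writing 4-byte floats would overwrite 2-byte halves that have not been read yet.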
+ CHECK(CudaKernels::fp16Tofp32((const uint16_t*)workspace, (float*)x, x_size, stream)); + } + + void createDescriptors() + { + if (cudnn_ == nullptr) + CHECK(cudnnCreate(&cudnn_)); + if (x_desc_ == nullptr) + CHECK(cudnnCreateTensorDescriptor(&x_desc_)); + if (xo_desc_ == nullptr) + CHECK(cudnnCreateTensorDescriptor(&xo_desc_)); + if (y_desc_ == nullptr) + CHECK(cudnnCreateTensorDescriptor(&y_desc_)); + if (w_desc_ == nullptr) + CHECK(cudnnCreateFilterDescriptor(&w_desc_)); + if (c_desc_ == nullptr) + CHECK(cudnnCreateConvolutionDescriptor(&c_desc_)); + if (b_desc_ == nullptr) + CHECK(cudnnCreateTensorDescriptor(&b_desc_)); + } + + void findBestAlgo() + { + // Let's hope cuDNN team will not come up with more than that number of algos (7 in cuDNN 7). + const int algo_count = 20; + int res_algo_count; + cudnnConvolutionBwdDataAlgoPerf_t algos[algo_count]; + // REVIEW alexeyk: cuDNN issue: it looks like cudnnConvolutionBackwardData hangs when + // CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 is chosen. Same problem with Ex version. +#if false + // CHECK(cudnnFindConvolutionBackwardDataAlgorithm(cudnn_, w_desc_, y_desc_, c_desc_, x_desc_, + // algo_count, &res_algo_count, algos)); +#else + CHECK(cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnn_, w_desc_, y_desc_, c_desc_, x_desc_, + algo_count, &res_algo_count, algos)); +#endif + assert(res_algo_count > 0); + assert(algos[0].status == CUDNN_STATUS_SUCCESS); + + // Best algo is the first. + best_algo_ = algos[0].algo; + workspace_bytes_ = algos[0].memory; + + // Log results. + log_.log(ILogger::Severity::kINFO, (name_ + ": --> Conv3DTranspose layer tuning results:").c_str()); + for (auto& a: algos) + { + if (a.status != CUDNN_STATUS_SUCCESS) + break; + std::ostringstream str; + str << a.algo << ": " << std::fixed << std::setw(8) << std::setprecision(1) << a.time << "ms, " + << std::fixed << std::setw(8) << a.memory << "B"; + log_.log(ILogger::Severity::kINFO, str.str().c_str()); + } + log_.log(ILogger::Severity::kINFO, (name_ + ": <-- Conv3DTranspose layer tuning results.").c_str()); + } + +private: + Conv3DType conv_type_; + cudnnDataType_t data_type_; + cudnnDataType_t weights_type_; + + // Using DimsNCHW to represent 3D convos input/output is an ugly workaround + // of TRT limitations which currently result in assert in the guts of TRT. + DimsNCHW x_dims_; + DimsNCHW y_dims_; + Dims w_dims_; + Dims stride_dims_; + Dims pad_start_dims_; + Dims pad_end_dims_; + Dims b_dims_; + + int max_batch_size_ = 0; + + // Kernel weights on the host. + Weights kernel_weights_; + // Kernel weights on the device. + float* kernel_weights_d_ = nullptr; + + // Bias weights on the host. + Weights bias_weights_; + // Bias weights on the device. + float* bias_weights_d_ = nullptr; + + cudnnHandle_t cudnn_ = nullptr; + cudnnTensorDescriptor_t x_desc_ = nullptr; + cudnnTensorDescriptor_t xo_desc_ = nullptr; // Original descriptor of x. + cudnnTensorDescriptor_t y_desc_ = nullptr; + cudnnFilterDescriptor_t w_desc_ = nullptr; + cudnnConvolutionDescriptor_t c_desc_ = nullptr; + cudnnTensorDescriptor_t b_desc_ = nullptr; + + cudnnConvolutionBwdDataAlgo_t best_algo_ = (cudnnConvolutionBwdDataAlgo_t)-1; + size_t workspace_bytes_ = 0; + + ILogger& log_; + std::string name_; +}; + +// Factory method. 
+IPlugin* PluginContainer::createConv3DTransposePlugin(Conv3DType conv_type, Dims kernel_dims, Dims out_dims, + Dims stride_dims, Dims pad_start_dims, Dims pad_end_dims, + Weights kernel_weights, Weights bias_weights, + std::string name) +{ + std::lock_guard lock(lock_); + plugins_.push_back(new Conv3DTransposePlugin(conv_type, kernel_dims, out_dims, + stride_dims, pad_start_dims, pad_end_dims, + kernel_weights, bias_weights, + log_, name)); + return plugins_.back(); +} + +} } \ No newline at end of file diff --git a/stereoDNN/lib/conv_utils.cpp b/stereoDNN/lib/conv_utils.cpp new file mode 100644 index 0000000..59f75d4 --- /dev/null +++ b/stereoDNN/lib/conv_utils.cpp @@ -0,0 +1,112 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. + +#include "conv_utils.h" +#include +#include +#include "internal_utils.h" + +namespace redtail { namespace tensorrt +{ + +using namespace nvinfer1; + +void ConvUtils::setConv3DTensorDescriptor(Conv3DType conv_type, Dims dims, int batch_size, + cudnnDataType_t data_type, cudnnTensorDescriptor_t& desc, + ILogger& log) +{ + assert(dims.nbDims == 4); + assert(batch_size > 0); + assert(desc != nullptr); + + int c = dims.d[0]; + int d = dims.d[1]; + int h = dims.d[2]; + int w = dims.d[3]; + + // Reshape input if needed. + if (conv_type == Conv3DType::kTensorFlow) + { + c = 1; + d = dims.d[0] * dims.d[1]; + } + + std::array full_dims = { batch_size, c, d, h, w }; + std::array strides = { c * d * h * w, + d * h * w, + h * w, + w, + 1}; + // Sanity check. + assert((int64_t)c * d * h * w == c * d * h * w); + + CHECKL(cudnnSetTensorNdDescriptor(desc, data_type, full_dims.size(), full_dims.data(), strides.data()), log); +} + +void ConvUtils::setConv3DOperationDescriptors(Conv3DType conv_type, Dims w_dims, Dims stride_dims, Dims pad_dims, + cudnnDataType_t data_type, + cudnnFilterDescriptor_t& w_desc, cudnnConvolutionDescriptor_t& c_desc, + ILogger& log) +{ + assert(w_dims.nbDims == 5); + assert(stride_dims.nbDims == 3); + assert(pad_dims.nbDims == 3); + assert(w_desc != nullptr); + assert(c_desc != nullptr); + + // Reshape/update if needed. + if (conv_type == Conv3DType::kTensorFlow) + { + int c = w_dims.d[1]; + int v = w_dims.d[2]; + + // Reshape filter. + w_dims.d[1] = 1; + w_dims.d[2] = c * v; + + // Update stride. + stride_dims.d[0] *= v; + + // Update padding. + pad_dims.d[0] *= v; + } + + // Set filter descriptor. + CHECKL(cudnnSetFilterNdDescriptor(w_desc, data_type, CUDNN_TENSOR_NCHW, w_dims.nbDims, w_dims.d), log); + + // Set convolution descriptor. + std::array dilation = { 1, 1, 1 }; + CHECKL(cudnnSetConvolutionNdDescriptor(c_desc, pad_dims.nbDims, pad_dims.d, stride_dims.d, dilation.data(), + CUDNN_CROSS_CORRELATION, data_type), log); +} + +Dims ConvUtils::getConv3DOutputDims(cudnnConvolutionDescriptor_t c_desc, cudnnTensorDescriptor_t x_desc, + cudnnFilterDescriptor_t w_desc, ILogger& log) +{ + assert(c_desc != nullptr); + assert(x_desc != nullptr); + assert(w_desc != nullptr); + + // 5D tensor in (N, K, D, H, W) format. + Dims y_dims; + y_dims.nbDims = 5; + CHECKL(cudnnGetConvolutionNdForwardOutputDim(c_desc, x_desc, w_desc, y_dims.nbDims, y_dims.d), log); + return y_dims; +} + +void ConvUtils::setConv3DBiasDescriptor(Dims dims, cudnnDataType_t data_type, + cudnnTensorDescriptor_t& desc, ILogger& log) +{ + assert(dims.nbDims == 5); + assert(desc != nullptr); + + // REVIEW alexeyk: see the comment in tensorrt_model_builder.py re: the stride issue in Conv3D. 
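+    // The strides below are the fully-packed strides of the 5D bias tensor, e.g.
+    // dims {1, K, 1, 1, 1} yield strides {K, 1, 1, 1, 1}.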
+    std::array<int, 5> strides = {dims.d[1] * dims.d[2] * dims.d[3] * dims.d[4],
+                                  dims.d[2] * dims.d[3] * dims.d[4],
+                                  dims.d[3] * dims.d[4],
+                                  dims.d[4],
+                                  1};
+    CHECKL(cudnnSetTensorNdDescriptor(desc, data_type, dims.nbDims, dims.d, strides.data()), log);
+}
+
+} }
\ No newline at end of file
diff --git a/stereoDNN/lib/conv_utils.h b/stereoDNN/lib/conv_utils.h
new file mode 100644
index 0000000..a479343
--- /dev/null
+++ b/stereoDNN/lib/conv_utils.h
@@ -0,0 +1,100 @@
+// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+// Full license terms provided in LICENSE.md file.
+
+#ifndef REDTAIL_CONV_UTILS_H
+#define REDTAIL_CONV_UTILS_H
+
+#include 
+#include 
+#include "redtail_tensorrt_plugins.h"
+
+namespace redtail { namespace tensorrt
+{
+
+using namespace nvinfer1;
+
+// -----------------------------------------------------------------
+// Contains various helper methods that set cuDNN convolution descriptors.
+//
+// Notes on the 3D convolution implementation in cuDNN:
+// 1. The convolution input is represented as a 4D tensor (ignoring the batch size dimension):
+//    (C, Di, Hi, Wi), where C is the number of 3D feature maps and Di, Hi, Wi are the
+//    depth, height and width of the input over which the convolution is performed.
+//    The convolution is performed by the filter (described in detail below) over the
+//    (Di, Hi, Wi) dimensions. This is similar to the 2D case, where the convolution
+//    is performed over the (Hi, Wi) dimensions of an input that has C feature maps.
+// 2. The convolution filter is represented as a 5D tensor (cuDNN-like notation):
+//    (K, C, V, R, S), where K is the number of output feature maps (as in 2D convolution),
+//    C must be equal to C of the input (as in 2D convolution), and V, R, S are the
+//    dimensions of the filter that go along the Di, Hi, Wi dimensions of the input.
+// 3. The convolution output is represented as a 4D tensor (ignoring the batch dimension):
+//    (K, Do, Ho, Wo), where K is the number of output feature maps (equal to the K dim
+//    of the filter), and Do, Ho, Wo are the dimensions of the output, computed
+//    according to the usual rules of convolution dimensions, taking into
+//    account padding and strides.
+//
+// This differs from the 3D convolution implementation in some other
+// DL toolkits; for example, the TensorFlow conv3d operator computes it differently.
+// REVIEW alexeyk: finish documenting Conv3DType.
+//
+// We also have to abuse the TensorRT DimsNCHW type in a few places to represent 3D convolutions,
+// as TRT (as of v3.0) does not support generic 4D/5D tensors - it fails with an assert
+// somewhere in the guts of TRT when using the generic Dims type.
+// -----------------------------------------------------------------
+class ConvUtils
+{
+public:
+    // -----------------------------------------------------------------
+    // Sets the descriptor for 3D convolution input/output.
+    // This method is used in:
+    // - Conv3DPlugin:
+    //   sets the plugin input descriptor, which is a convolution input.
+    //   sets the plugin output descriptor, which is a convolution output.
+    // - Conv3DTransposePlugin:
+    //   sets the plugin output descriptor, which is a transposed convolution output.
+    //   sets the plugin input descriptor, which is a transposed convolution input.
+    //
+    // dims      : tensor dimensions, must have rank 4.
+    // batch_size: batch size, must be positive.
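+    //
+    // For example, dims (C, Di, Hi, Wi) = (32, 17, 81, 257) with batch_size 1 produce
+    // a 5D cuDNN descriptor (1, 32, 17, 81, 257) for Conv3DType::kCuDnn, or
+    // (1, 1, 32*17, 81, 257) for Conv3DType::kTensorFlow (the TF-style reshape).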
+ // ----------------------------------------------------------------- + static void setConv3DTensorDescriptor(Conv3DType conv_type, Dims dims, int batch_size, + cudnnDataType_t data_type, cudnnTensorDescriptor_t& desc, + ILogger& log); + + // ----------------------------------------------------------------- + // Sets convolution filter and operation descriptors for 3D convolution. + // This method is used in: + // - Conv3DPlugin: + // - Conv3DTransposePlugin: + // + // w_dims: + // Filter dimensions, must have rank 5. The filter tensor is assumed to be in (K, C, V, R, S) format. + // pad_dims: + // Input padding dimensions, must have rank 3. The padding is for (Di, Hi, Wi) dimensions of the + // convolution input (transposed conovlution output). + // stride_dims: + // Filter stride dimensions, must have rank 3. The stride is for (Di, Hi, Wi) dimensions of the + // convolution input (transposed conovlution output). + // ----------------------------------------------------------------- + static void setConv3DOperationDescriptors(Conv3DType conv_type, Dims w_dims, Dims stride_dims, Dims pad_dims, + cudnnDataType_t data_type, + cudnnFilterDescriptor_t& w_desc, cudnnConvolutionDescriptor_t& c_desc, + ILogger& log); + + // ----------------------------------------------------------------- + // Returns convolution output dimensions given input, filter and + // convolution descriptors. + // ----------------------------------------------------------------- + static Dims getConv3DOutputDims(cudnnConvolutionDescriptor_t c_desc, cudnnTensorDescriptor_t x_desc, + cudnnFilterDescriptor_t w_desc, ILogger& log); + + static void setConv3DBiasDescriptor(Dims dims, cudnnDataType_t data_type, + cudnnTensorDescriptor_t& desc, ILogger& log); + +public: + ConvUtils(ConvUtils&&) = delete; +}; + +} } + +#endif \ No newline at end of file diff --git a/stereoDNN/lib/cost_volume_plugin.cpp b/stereoDNN/lib/cost_volume_plugin.cpp new file mode 100644 index 0000000..0a9fe0b --- /dev/null +++ b/stereoDNN/lib/cost_volume_plugin.cpp @@ -0,0 +1,118 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. + +#include "internal_utils.h" +#include + +namespace redtail { namespace tensorrt +{ + +using namespace nvinfer1; + +// ----------------------------------------------------------------- +// Cost volume plugin. +// ----------------------------------------------------------------- +class CostVolumePlugin: public IPlugin +{ +public: + CostVolumePlugin(int max_disparity, ILogger& log, std::string name): + max_disparity_(max_disparity), log_(log), name_(name) + { + assert(max_disparity_ > 0); + } + + CostVolumePlugin(CostVolumePlugin&&) = delete; + + int getNbOutputs() const override + { + return 1; + } + + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override + { + // Expecting 2 3D inputs, left and right, which are outputs of 2D convolutions + // and are of the same shape. 
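+        // The output volume is (max_disparity, 2*C, H, W): for each disparity plane d,
+        // the left features are stored together with the right features shifted by d
+        // pixels (zero-padded where the shift runs off the image).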
+ assert(nbInputDims == 2); + for (int i = 0; i < nbInputDims; i++) + { + assert(inputs[i].nbDims == 3); + assert(inputs[i].type[0] == DimensionType::kCHANNEL); + assert(inputs[i].type[1] == DimensionType::kSPATIAL); + assert(inputs[i].type[2] == DimensionType::kSPATIAL); + } + assert(DimsUtils::areEqual(inputs[0], inputs[1])); + + in_dims_ = inputs[0]; + out_dims_ = DimsNCHW(max_disparity_, 2 * in_dims_.d[0], in_dims_.d[1], in_dims_.d[2]); + return out_dims_; + } + + void configure(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, int maxBatchSize) override + { + assert(nbInputs == 2); + assert(nbOutputs == 1); + assert(DimsUtils::areEqual(inputDims[0], in_dims_)); + assert(DimsUtils::areEqual(inputDims[1], in_dims_)); + assert(DimsUtils::areEqual(outputDims[0], out_dims_)); + assert(maxBatchSize == 1); + + log_.log(ILogger::Severity::kINFO, (name_ + ": InDims(x2): " + DimsUtils::toString(in_dims_)).c_str()); + log_.log(ILogger::Severity::kINFO, (name_ + ": OutDims : " + DimsUtils::toString(out_dims_)).c_str()); + } + + int initialize() override + { + return 0; + } + + void terminate() override + { + } + + size_t getWorkspaceSize(int maxBatchSize) const + { + assert(maxBatchSize == 1); + return 0; + } + + int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override + { + assert(batchSize == 1); + + auto pleft = static_cast(inputs[0]); + auto pright = static_cast(inputs[1]); + auto pcost_vol = static_cast(outputs[0]); + + cudaError_t status; + CHECK(status = CudaKernels::computeCostVolume(pleft, pright, in_dims_, pcost_vol, out_dims_, stream)); + + return status; + } + + size_t getSerializationSize() override + { + return 0; + } + + void serialize(void* buffer) override + { + } + +private: + int max_disparity_; + Dims in_dims_; + DimsNCHW out_dims_; + + ILogger& log_; + std::string name_; +}; + +// Factory method. +IPlugin* PluginContainer::createCostVolumePlugin(int max_disparity, std::string name) +{ + std::lock_guard lock(lock_); + plugins_.push_back(new CostVolumePlugin(max_disparity, log_, name)); + return plugins_.back(); +} + +} } \ No newline at end of file diff --git a/stereoDNN/lib/elu_plugin.cpp b/stereoDNN/lib/elu_plugin.cpp new file mode 100644 index 0000000..762fc4e --- /dev/null +++ b/stereoDNN/lib/elu_plugin.cpp @@ -0,0 +1,163 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. + +#include "internal_utils.h" +#include + +namespace redtail { namespace tensorrt +{ + +using namespace nvinfer1; + +// ----------------------------------------------------------------- +// ELU activation function plugin. +// ----------------------------------------------------------------- +class EluPlugin: public IPlugin +{ +public: + EluPlugin(DataType data_type, ILogger& log, std::string name): + data_type_(trtToCudnnDataType(data_type)), log_(log), name_(name) + { + // REVIEW alexeyk: TRT currently does not support FP16 data tensors. + assert(data_type == DataType::kFLOAT); + } + + EluPlugin(EluPlugin&&) = delete; + + int getNbOutputs() const override + { + return 1; + } + + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override + { + assert(nbInputDims == 1); + in_dims_ = inputs[0]; + // No restrictions on input dims. 
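+        // ELU is applied elementwise, so the output shape is exactly the input shape.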
+ + return in_dims_; + } + + void configure(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, int maxBatchSize) override + { + assert(nbInputs == 1); + assert(nbOutputs == 1); + // Sanity check. + assert(DimsUtils::areEqual(in_dims_, inputDims[0])); + assert(DimsUtils::areEqual(in_dims_, outputDims[0])); + + cudnnStatus_t status; + + CHECK(status = cudnnCreate(&cudnn_)); + CHECK(status = cudnnCreateActivationDescriptor(&act_)); + CHECK(status = cudnnSetActivationDescriptor(act_, CUDNN_ACTIVATION_ELU, CUDNN_PROPAGATE_NAN, 1.0)); + CHECK(status = cudnnCreateTensorDescriptor(&t_desc_)); + + setTensorDescriptor(maxBatchSize); + + log_.log(ILogger::Severity::kINFO, (name_ + ": Dims: " + DimsUtils::toString(in_dims_)).c_str()); + } + + int initialize() override + { + return 0; + } + + void terminate() override + { + assert(isValid()); + + if (act_ != nullptr) + CHECK(cudnnDestroyActivationDescriptor(act_)); + if (t_desc_ != nullptr) + CHECK(cudnnDestroyTensorDescriptor(t_desc_)); + if (cudnn_ != nullptr) + CHECK(cudnnDestroy(cudnn_)); + act_ = nullptr; + t_desc_ = nullptr; + cudnn_ = nullptr; + + assert(!isValid()); + } + + size_t getWorkspaceSize(int maxBatchSize) const + { + return 0; + } + + int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override + { + assert(isValid()); + + cudnnStatus_t status; + + CHECK(status = cudnnSetStream(cudnn_, stream)); + if (batchSize != tensor_dims_.d[0]) + updateTensorDescriptor(batchSize); + CHECK(status = cudnnActivationForward(cudnn_, act_, &Consts::kOne, t_desc_, inputs[0], &Consts::kZero, t_desc_, outputs[0])); + + return status == CUDNN_STATUS_SUCCESS ? 0 : -1; + } + + size_t getSerializationSize() override + { + assert(isValid()); + return 0; + } + + void serialize(void* buffer) override + { + assert(isValid()); + } + +private: + void setTensorDescriptor(int batch_size) + { + assert(isValid()); + + tensor_dims_.nbDims = in_dims_.nbDims + 1; + tensor_dims_.d[0] = batch_size; + std::copy(in_dims_.d, in_dims_.d + in_dims_.nbDims, tensor_dims_.d + 1); + + tensor_strides_ = DimsUtils::getStrides(tensor_dims_); + + CHECK(cudnnSetTensorNdDescriptor(t_desc_, data_type_, tensor_dims_.nbDims, tensor_dims_.d, tensor_strides_.d)); + } + + // Updates tensor descriptor according to batch_size. + void updateTensorDescriptor(int batch_size) + { + // No other parameters require update. + tensor_dims_.d[0] = batch_size; + CHECK(cudnnSetTensorNdDescriptor(t_desc_, data_type_, tensor_dims_.nbDims, tensor_dims_.d, tensor_strides_.d)); + } + +private: + bool isValid() const + { + return cudnn_ != nullptr; + } + +private: + cudnnDataType_t data_type_; + cudnnHandle_t cudnn_ = nullptr; + cudnnActivationDescriptor_t act_ = nullptr; + cudnnTensorDescriptor_t t_desc_ = nullptr; + + Dims in_dims_; + Dims tensor_dims_; + Dims tensor_strides_; + + ILogger& log_; + std::string name_; +}; + +// Factory method. +IPlugin* PluginContainer::createEluPlugin(DataType data_type, std::string name) +{ + std::lock_guard lock(lock_); + plugins_.push_back(new EluPlugin(data_type, log_, name)); + return plugins_.back(); +} + +} } \ No newline at end of file diff --git a/stereoDNN/lib/internal_macros.h b/stereoDNN/lib/internal_macros.h new file mode 100644 index 0000000..30106e2 --- /dev/null +++ b/stereoDNN/lib/internal_macros.h @@ -0,0 +1,38 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. 
+ +#ifndef REDTAIL_INTERNAL_MACROS_H +#define REDTAIL_INTERNAL_MACROS_H + +// ----------------------------------------------------------------- +// CHECKL and CHECK macros check for status and log the message +// in case of status != 0. The difference between macros is CHECKL +// takes log argument explicitly while CHECK - assumes log_ variable +// is defined (used in class memebers). +// ----------------------------------------------------------------- +#undef CHECKL +#define CHECKL(status, log) do { \ + auto res = (status); \ + if ((int)res != 0) \ + redtail::tensorrt::reportError(res, __FILE__, __LINE__, __FUNCTION__, log); \ +} while(false) + +#undef CHECK +#define CHECK(status) CHECKL(status, log_) + +// ----------------------------------------------------------------- +// UNUSED is generic version of macro that allows to avoid unused +// variable warning. +// UNUSEDR disables such warning only in Release builds. +// ----------------------------------------------------------------- +#undef UNUSED +#define UNUSED(x) ((void)(x)) + +#undef UNUSEDR +#ifdef NDEBUG + #define UNUSEDR(x) ((void)(x)) +#else + #define UNUSEDR(x) +#endif + +#endif \ No newline at end of file diff --git a/stereoDNN/lib/internal_utils.cpp b/stereoDNN/lib/internal_utils.cpp new file mode 100644 index 0000000..8c2d232 --- /dev/null +++ b/stereoDNN/lib/internal_utils.cpp @@ -0,0 +1,236 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. + +#include "internal_utils.h" +#include +#include +#include +#include +#include + +namespace redtail { namespace tensorrt +{ + +const float Consts::kZero = 0.0f; +const float Consts::kOne = 1.0f; + +// ----------------------------------------------------------------- +// DimsUtils implementation. +// ----------------------------------------------------------------- +size_t DimsUtils::getTensorSize(Dims dims) +{ + assert(dims.nbDims >= 1); + size_t res = std::accumulate(dims.d, dims.d + dims.nbDims, (size_t)1, std::multiplies()); + // Dims.d is int, so overflow check. + assert(res == (size_t)std::accumulate(dims.d, dims.d + dims.nbDims, (int)1, std::multiplies())); + return res; +} + +Dims DimsUtils::getStrides(Dims dims) +{ + Dims strides; + strides.nbDims = dims.nbDims; + strides.d[strides.nbDims - 1] = 1; + for (int i = strides.nbDims - 2; i >= 0; i--) + { + strides.d[i] = strides.d[i + 1] * dims.d[i + 1]; + strides.type[i] = DimensionType::kSPATIAL; + } + return strides; +} + +// Equality check. The function checks for shape and size, +// but not for dimension types. +bool DimsUtils::areEqual(Dims d1, Dims d2) +{ + if (d1.nbDims != d2.nbDims) + return false; + return std::equal(d1.d, d1.d + d1.nbDims, d2.d); +} + +std::string DimsUtils::toString(Dims dims) +{ + std::ostringstream str; + str << "{"; + for (int i = 0; i < dims.nbDims; i++) + { + str << std::fixed << std::setw(4) << dims.d[i]; + if (i < dims.nbDims - 1) + str << ","; + } + str << "}"; + return str.str(); +} + +// ----------------------------------------------------------------- +// Conversion helpers. +// ----------------------------------------------------------------- +cudnnDataType_t trtToCudnnDataType(DataType trt_data_type) +{ + // REVIEW alexeyk: fp32/16 only for now. 
+ assert(trt_data_type == DataType::kFLOAT || trt_data_type == DataType::kHALF); + + if (trt_data_type == DataType::kFLOAT) + return CUDNN_DATA_FLOAT; + if (trt_data_type == DataType::kHALF) + return CUDNN_DATA_HALF; + return (cudnnDataType_t)-1; +} + +// ----------------------------------------------------------------- +// PluginContainer implementation. +// ----------------------------------------------------------------- + +std::unique_ptr IPluginContainer::create(ILogger& log) +{ + return std::make_unique(log); +} + +// ----------------------------------------------------------------- +// Plugins helper functions. +// ----------------------------------------------------------------- +ILayer* addElu(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + DataType data_type, const std::string& name) +{ + // Create plugin. + auto plugin = plugin_factory.createEluPlugin(data_type, name); + assert(plugin != nullptr); + // Add to the network. + ITensor* inputs[] = {&input}; + auto layer = network.addPlugin(inputs, 1, *plugin); + assert(layer != nullptr); + return layer; +} + +ILayer* addCostVolume(IPluginContainer& plugin_factory, INetworkDefinition& network, + ITensor& left_input, ITensor& right_input, int max_disparity, + const std::string& name) +{ + // Create plugin. + auto plugin = plugin_factory.createCostVolumePlugin(max_disparity, name); + assert(plugin != nullptr); + // Add to the network. + ITensor* inputs[] = {&left_input, &right_input}; + auto layer = network.addPlugin(inputs, 2, *plugin); + assert(layer != nullptr); + return layer; +} + +ILayer* addConv3D(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + Conv3DType conv_type, Dims kernel_dims, Dims stride_dims, + Dims pad_start_dims, Dims pad_end_dims, + Weights kernel_weights, Weights bias_weights, + const std::string& name) +{ + // Create plugin. + auto plugin = plugin_factory.createConv3DPlugin(conv_type, kernel_dims, + stride_dims, pad_start_dims, pad_end_dims, + kernel_weights, bias_weights, + name); + assert(plugin != nullptr); + // Add to the network. + ITensor* inputs[] = {&input}; + auto layer = network.addPlugin(inputs, 1, *plugin); + assert(layer != nullptr); + return layer; +} + +ILayer* addConv3DTranspose(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + Conv3DType conv_type, Dims kernel_dims, Dims out_dims, + Dims stride_dims, Dims pad_start_dims, Dims pad_end_dims, + Weights kernel_weights, Weights bias_weights, + const std::string& name) +{ + // Create plugin. + auto plugin = plugin_factory.createConv3DTransposePlugin(conv_type, kernel_dims, out_dims, + stride_dims, pad_start_dims, pad_end_dims, + kernel_weights, bias_weights, + name); + assert(plugin != nullptr); + // Add to the network. + ITensor* inputs[] = {&input}; + auto layer = network.addPlugin(inputs, 1, *plugin); + assert(layer != nullptr); + return layer; +} + +ILayer* addSlice(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + Dims dims, Dims slice_start, Dims slice_end, + const std::string& name) +{ + // Create plugin. + auto plugin = plugin_factory.createSlicePlugin(dims, slice_start, slice_end, name); + assert(plugin != nullptr); + // Add to the network. 
+ ITensor* inputs[] = {&input}; + auto layer = network.addPlugin(inputs, 1, *plugin); + assert(layer != nullptr); + return layer; +} + +ILayer* addTransform(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + Permutation permutation, + const std::string& name) +{ + // Create plugin. + auto plugin = plugin_factory.createTransformPlugin(permutation, name); + assert(plugin != nullptr); + // Add to the network. + ITensor* inputs[] = {&input}; + auto layer = network.addPlugin(inputs, 1, *plugin); + assert(layer != nullptr); + return layer; +} + +ILayer* addPad(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + DimsNCHW pad_start, DimsNCHW pad_end, + const std::string& name) +{ + // Create plugin. + auto plugin = plugin_factory.createPaddingPlugin(pad_start, pad_end, name); + assert(plugin != nullptr); + // Add to the network. + ITensor* inputs[] = {&input}; + auto layer = network.addPlugin(inputs, 1, *plugin); + assert(layer != nullptr); + return layer; +} + +ILayer* addSoftargmax(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + const std::string& name) +{ + // Create plugin. + auto plugin = plugin_factory.createSoftargmaxPlugin(name); + assert(plugin != nullptr); + // Add to the network. + ITensor* inputs[] = {&input}; + auto layer = network.addPlugin(inputs, 1, *plugin); + assert(layer != nullptr); + return layer; +} + +// ----------------------------------------------------------------- +// Error checking and reporting methods. +// ----------------------------------------------------------------- +void reportError(cudaError_t status, const char* file, int line, const char* func, ILogger& log) +{ + if (status == cudaSuccess) + return; + std::ostringstream str; + str << file << ":" << line << ": " << func << ": CUDA error " + << status << " (" << cudaGetErrorName(status) << ": " << cudaGetErrorString(status) << ")."; + log.log(ILogger::Severity::kERROR, str.str().c_str()); + assert(status == cudaSuccess); +} + +void reportError(cudnnStatus_t status, const char* file, int line, const char* func, ILogger& log) +{ + if (status == CUDNN_STATUS_SUCCESS) + return; + std::ostringstream str; + str << file << ":" << line << ": " << func << ": cuDNN error " << status << " (" << cudnnGetErrorString(status) << ")."; + log.log(ILogger::Severity::kERROR, str.str().c_str()); + assert(status == CUDNN_STATUS_SUCCESS); +} + +} } \ No newline at end of file diff --git a/stereoDNN/lib/internal_utils.h b/stereoDNN/lib/internal_utils.h new file mode 100644 index 0000000..d2742e0 --- /dev/null +++ b/stereoDNN/lib/internal_utils.h @@ -0,0 +1,147 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. + +#ifndef REDTAIL_INTERNAL_UTILS_H +#define REDTAIL_INTERNAL_UTILS_H + +#include +#include +#include +#include + +#include +#include +#include + +#include "internal_macros.h" +#include "redtail_tensorrt_plugins.h" + +namespace redtail { namespace tensorrt +{ + +using namespace nvinfer1; + +// ----------------------------------------------------------------- +// Constants. +// REVIEW alexeyk: float ony for now, refactor to support FP16/Int8. +// ----------------------------------------------------------------- +struct Consts +{ + static const float kZero; + static const float kOne; + +public: + Consts(Consts&&) = delete; +}; + +// ----------------------------------------------------------------- +// Various Dims helper functions. 
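+// For example, for dims {4, 8, 16}: getTensorSize returns 4*8*16 = 512 elements and
+// getStrides returns the fully-packed strides {128, 16, 1}.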
+// ----------------------------------------------------------------- +class DimsUtils +{ +public: + static size_t getTensorSize(Dims dims); + + static Dims getStrides(Dims dims); + + static bool areEqual(Dims d1, Dims d2); + + static std::string toString(Dims dims); + +public: + DimsUtils(DimsUtils&&) = delete; +}; + +// ----------------------------------------------------------------- +// Conversion helpers. +// ----------------------------------------------------------------- +cudnnDataType_t trtToCudnnDataType(DataType trt_data_type); + +// ----------------------------------------------------------------- +// CUDA kernels interface. +// ----------------------------------------------------------------- +class CudaKernels +{ +public: + template + static cudaError_t computeCostVolume(const T* left, const T* right, Dims in_dims, T* cost_vol, Dims out_dims, cudaStream_t stream); + + template + static cudaError_t addDBiasTo3DConv(const T* bias, Dims bias_dims, T* conv, Dims conv_dims, cudaStream_t stream); + + static cudaError_t fp32Tofp16(const float* src, uint16_t* dst, size_t size, cudaStream_t stream); + static cudaError_t fp16Tofp32(const uint16_t* src, float* dst, size_t size, cudaStream_t stream); + +public: + CudaKernels(CudaKernels&&) = delete; +}; + +// Template instantiation. +template<> +cudaError_t CudaKernels::computeCostVolume(const float*, const float*, Dims, float*, Dims, cudaStream_t); + +template<> +cudaError_t CudaKernels::addDBiasTo3DConv(const float*, Dims, float*, Dims, cudaStream_t); + +// ----------------------------------------------------------------- +// Simple implementation of IPluginContainer. +// The container keeps pointers to created plugins. +// ----------------------------------------------------------------- +class PluginContainer: public IPluginContainer +{ +public: + PluginContainer(ILogger& log): + log_(log) + { + } + + ~PluginContainer() noexcept = default; + + // Also disables copy ctor/assign ops. + PluginContainer(PluginContainer&&) = delete; + + // ELU plugin. + IPlugin* createEluPlugin(DataType data_type, std::string name) override; + + // Cost volume plugin. + IPlugin* createCostVolumePlugin(int max_disparity, std::string name) override; + + // 3D convolution. + IPlugin* createConv3DPlugin(Conv3DType conv_type, Dims kernel_dims, + Dims stride_dims, Dims pad_start_dims, Dims pad_end_dims, + Weights kernel_weights, Weights bias_weights, + std::string name) override; + + // Transposed 3D convolution. + IPlugin* createConv3DTransposePlugin(Conv3DType conv_type, Dims kernel_dims, Dims out_dims, + Dims stride_dims, Dims pad_start_dims, Dims pad_end_dims, + Weights kernel_weights, Weights bias_weights, + std::string name) override; + + // Transform. + IPlugin* createTransformPlugin(Permutation permutation, std::string name) override; + + // Padding. + IPlugin* createPaddingPlugin(DimsNCHW pad_start, DimsNCHW pad_end, + std::string name) override; + + IPlugin* createSlicePlugin(Dims dims, Dims slice_start, Dims slice_end, + std::string name) override; + + IPlugin* createSoftargmaxPlugin(std::string name) override; + +private: + // TensorRT IPlugin interface has protected dtor so cannot use unique_ptr + // or delete plugins in the container dtor. 
+ std::vector plugins_; + std::mutex lock_; + + ILogger& log_; +}; + +void reportError(cudaError_t status, const char* file, int line, const char* func, ILogger& log); +void reportError(cudnnStatus_t status, const char* file, int line, const char* func, ILogger& log); + +} } + +#endif \ No newline at end of file diff --git a/stereoDNN/lib/kernels.cu b/stereoDNN/lib/kernels.cu new file mode 100644 index 0000000..31dc60a --- /dev/null +++ b/stereoDNN/lib/kernels.cu @@ -0,0 +1,261 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. + +#include "internal_utils.h" +#include + +// Check async error. +// Sync and get kernel status in Debug builds. +#ifndef NDEBUG + #define SYNC_AND_CHECK_STREAM(stream) do { \ + cudaError_t status = cudaStreamSynchronize(stream); \ + if (status != cudaSuccess) \ + return status; \ +}while(false) +#else + #define SYNC_AND_CHECK_STREAM(stream) +#endif + +#define CHECKK(stream) do { \ + cudaError_t status = cudaGetLastError(); \ + if (status != cudaSuccess) \ + return status; \ + SYNC_AND_CHECK_STREAM(stream); \ +}while(false) + +namespace redtail { namespace tensorrt +{ + +using namespace nvinfer1; + +static const int kMaxGridSizeY = 65535; +static const int kMaxGridSizeZ = 65535; + +// ----------------------------------------------------------------- +// Helper function to get block count. +// ----------------------------------------------------------------- +static uint32_t getBlockCount(uint32_t total_size, uint32_t block_size) +{ + uint32_t res = (total_size + block_size - 1) / block_size; + assert(res > 0); + assert((size_t)res * block_size >= total_size); + return res; +} + +// REVIEW alexeyk: kernels are not optimized for now. + +// ----------------------------------------------------------------- +// Cost volume kernels. 
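+// The volume is built by two kernels: costVolumeCopyKernel replicates the left
+// features into every disparity plane, while costVolumeCopyPadKernel writes the
+// right features shifted by each plane's disparity, zeroing pixels where x < disparity.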
+// ----------------------------------------------------------------- +template +__global__ void costVolumeCopyKernel(const T* src, int32_t c, int32_t h, int32_t w, int32_t disp, T* dst) +{ + assert(src != nullptr); + assert(dst != nullptr); + + const uint32_t ix = blockIdx.x * blockDim.x + threadIdx.x; + const uint32_t iy = blockIdx.y * blockDim.y + threadIdx.y; + const uint32_t iz = blockIdx.z * blockDim.z + threadIdx.z; + if (ix >= w || iy >= h || iz >= c) + return; + const size_t isrc = iz * h * w + iy * w + ix; + const size_t stride = 2 * c * h * w; + T val = src[isrc]; + T* pdst = dst + isrc; + for (int32_t idst = 0; idst < disp; idst++) + { + *pdst = val; + pdst += stride; + } +} + +template +__global__ void costVolumeCopyPadKernel(const T* src, int32_t c, int32_t h, int32_t w, int32_t disp, T* dst) +{ + assert(src != nullptr); + assert(dst != nullptr); + + const uint32_t ix = blockIdx.x * blockDim.x + threadIdx.x; + const uint32_t iy = blockIdx.y * blockDim.y + threadIdx.y; + const uint32_t iz = blockIdx.z * blockDim.z + threadIdx.z; + if (ix >= w || iy >= h || iz >= c) + return; + const size_t isrc = iz * h * w + iy * w + ix; + size_t stride = c * h * w; + const size_t idst = isrc + stride; + stride *= 2; + + T* pdst = dst + idst; + for (int32_t pad = 0; pad < disp; pad++) + { + if (ix < pad) + *pdst = 0; + else + *pdst = src[isrc - pad]; + pdst += stride; + } +} + +template +__global__ void costVolumeKernel(const T* left, const T* right, int32_t c, int32_t h, int32_t w, int32_t disp, T* dst) +{ + assert(left != nullptr); + assert(right != nullptr); + assert(dst != nullptr); + + const uint32_t ix = blockIdx.x * blockDim.x + threadIdx.x; + const uint32_t iy = blockIdx.y * blockDim.y + threadIdx.y; + const uint32_t iz = blockIdx.z * blockDim.z + threadIdx.z; + if (ix >= w || iy >= h || iz >= c) + return; + + // Setup initial indices. + size_t stride = c * h * w; + // Left and right source is the same. + const size_t ileft = iz * h * w + iy * w + ix; + T* pdst_l = dst + ileft; + const size_t iright = ileft; + // Right destination is offset by 1 in c dimension. + T* pdst_r = dst + iright + stride; + // Final stride is 2 in c dimension. + stride *= 2; + + T val_l = left[ileft]; + for (int32_t pad = 0; pad < disp; pad++) + { + if (ix < pad) + *pdst_r = 0; + else + *pdst_r = right[iright - pad]; + *pdst_l = val_l; + pdst_l += stride; + pdst_r += stride; + } +} + +template<> +cudaError_t CudaKernels::computeCostVolume(const float* left, const float* right, Dims in_dims, float* cost_vol, Dims out_dims, cudaStream_t stream) +{ + assert(in_dims.nbDims == 3); + assert(out_dims.nbDims == 4); + + dim3 b_dim{16, 16, 1}; + dim3 g_dim; + g_dim.x = getBlockCount(in_dims.d[2], b_dim.x); + g_dim.y = getBlockCount(in_dims.d[1], b_dim.y); + g_dim.z = getBlockCount(in_dims.d[0], b_dim.z); + + // REVIEW alexeyk: using 2 kernels instead of one as it's not yet optimized so 2 kernels are faster. + // REVIEW alexeyk: optimize, see gld_efficiency,gst_efficiency,gld_transactions,gst_transactions. + // costVolumeKernel<<>>(left, right, in_dims.d[0], in_dims.d[1], in_dims.d[2], out_dims.d[0], + // cost_vol); + costVolumeCopyKernel<<>>(left, in_dims.d[0], in_dims.d[1], in_dims.d[2], out_dims.d[0], + cost_vol); + CHECKK(stream); + costVolumeCopyPadKernel<<>>(right, in_dims.d[0], in_dims.d[1], in_dims.d[2], out_dims.d[0], + cost_vol); + CHECKK(stream); + return cudaSuccess; +} + +// ----------------------------------------------------------------- +// Some convolution-related kernels. 
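+// addDBiasTo3DConvKernel adds a per-depth bias (one value per D slice) to a (C, D, H, W)
+// tensor. cudnnAddTensor can only broadcast over the second outermost dimension, so the
+// D-dimension case is handled by this custom kernel (see Conv3DTransposePlugin::enqueue).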
+// -----------------------------------------------------------------
+template<typename T>
+__global__ void addDBiasTo3DConvKernel(const T* bias, int32_t c, int32_t d, int32_t h, int32_t w, T* conv)
+{
+    assert(bias != nullptr);
+    assert(conv != nullptr);
+
+    const uint32_t ix = blockIdx.x * blockDim.x + threadIdx.x;
+    const uint32_t iy = blockIdx.y * blockDim.y + threadIdx.y;
+    const uint32_t iz = blockIdx.z * blockDim.z + threadIdx.z;
+    if (ix >= w || iy >= h || iz >= d * c)
+        return;
+
+    int32_t cur_d = iz % d;
+    const size_t idst = iz * h * w + iy * w + ix;
+
+    conv[idst] += bias[cur_d];
+}
+
+template<>
+cudaError_t CudaKernels::addDBiasTo3DConv(const float* bias, Dims bias_dims, float* conv, Dims conv_dims, cudaStream_t stream)
+{
+    assert(bias_dims.nbDims == 5);
+    assert(conv_dims.nbDims == 4);
+    // REVIEW alexeyk: minibatch size 1 for now.
+    assert(bias_dims.d[0] == 1);
+    assert(bias_dims.d[2] == conv_dims.d[1]);
+    UNUSEDR(bias_dims);
+
+    dim3 b_dim{16, 16, 1};
+    dim3 g_dim;
+    g_dim.x = getBlockCount(conv_dims.d[3], b_dim.x);
+    g_dim.y = getBlockCount(conv_dims.d[2], b_dim.y);
+    g_dim.z = getBlockCount(conv_dims.d[0] * conv_dims.d[1], b_dim.z);
+    // REVIEW alexeyk: no block striding for now.
+    assert(g_dim.y <= kMaxGridSizeY);
+    assert(g_dim.z <= kMaxGridSizeZ);
+    UNUSEDR(kMaxGridSizeY);
+    UNUSEDR(kMaxGridSizeZ);
+
+    addDBiasTo3DConvKernel<<<g_dim, b_dim, 0, stream>>>(bias, conv_dims.d[0], conv_dims.d[1], conv_dims.d[2], conv_dims.d[3], conv);
+    CHECKK(stream);
+
+    return cudaSuccess;
+}
+
+// -----------------------------------------------------------------
+// Conversion kernels.
+// -----------------------------------------------------------------
+__global__ void fp32Tofp16Kernel(const float* src, uint16_t* dst, size_t size)
+{
+    assert(src != nullptr);
+    assert(dst != nullptr);
+
+    const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= size)
+        return;
+
+    __half val(src[tid]);
+    dst[tid] = *(uint16_t*)&val;
+}
+
+cudaError_t CudaKernels::fp32Tofp16(const float* src, uint16_t* dst, size_t size, cudaStream_t stream)
+{
+    dim3 b_dim{256, 1, 1};
+    dim3 g_dim;
+    g_dim.x = getBlockCount(size, b_dim.x);
+
+    fp32Tofp16Kernel<<<g_dim, b_dim, 0, stream>>>(src, dst, size);
+    CHECKK(stream);
+
+    return cudaSuccess;
+}
+
+__global__ void fp16Tofp32Kernel(const uint16_t* src, float* dst, size_t size)
+{
+    assert(src != nullptr);
+    assert(dst != nullptr);
+
+    const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= size)
+        return;
+
+    dst[tid] = (float)(*(__half*)(src + tid));
+}
+
+cudaError_t CudaKernels::fp16Tofp32(const uint16_t* src, float* dst, size_t size, cudaStream_t stream)
+{
+    dim3 b_dim{256, 1, 1};
+    dim3 g_dim;
+    g_dim.x = getBlockCount(size, b_dim.x);
+
+    fp16Tofp32Kernel<<<g_dim, b_dim, 0, stream>>>(src, dst, size);
+    CHECKK(stream);
+
+    return cudaSuccess;
+}
+
+} }
\ No newline at end of file
diff --git a/stereoDNN/lib/padding_plugin.cpp b/stereoDNN/lib/padding_plugin.cpp
new file mode 100644
index 0000000..3101641
--- /dev/null
+++ b/stereoDNN/lib/padding_plugin.cpp
@@ -0,0 +1,125 @@
+// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+// Full license terms provided in LICENSE.md file.
+
+#include "internal_utils.h"
+#include <cassert>
+
+namespace redtail { namespace tensorrt
+{
+
+using namespace nvinfer1;
+
+// -----------------------------------------------------------------
+// Tensor padding plugin.
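+// Pads only the end of the outermost (D) dimension of a 4D tensor, which
+// keeps the implementation down to a single device-to-device copy plus a
+// memset of the tail region (see enqueue below).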
+// ----------------------------------------------------------------- +class PaddingPlugin: public IPlugin +{ +public: + PaddingPlugin(DimsNCHW pad_start, DimsNCHW pad_end, ILogger& log, std::string name): + pad_start_(pad_start), pad_end_(pad_end), log_(log), name_(name) + { + // Only D end padding is currently supported. + assert(pad_start_.n() == 0); + assert(pad_start_.c() == 0); + assert(pad_start_.h() == 0); + assert(pad_start_.w() == 0); + + assert(pad_end_.n() >= 0); + assert(pad_end_.c() == 0); + assert(pad_end_.h() == 0); + assert(pad_end_.w() == 0); + } + + PaddingPlugin(PaddingPlugin&&) = delete; + + int getNbOutputs() const override + { + return 1; + } + + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override + { + assert(nbInputDims == 1); + // Only NCHW format is supported for now, TRT does not work (assert) with generic Dims. + assert(inputs[0].nbDims == 4); + + in_dims_ = DimsNCHW(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2], inputs[0].d[3]); + out_dims_ = DimsNCHW(in_dims_.n() + pad_start_.n() + pad_end_.n(), + in_dims_.c() + pad_start_.c() + pad_end_.c(), + in_dims_.h() + pad_start_.h() + pad_end_.h(), + in_dims_.w() + pad_start_.w() + pad_end_.w()); + return out_dims_; + } + + void configure(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, int maxBatchSize) override + { + assert(nbInputs == 1); + assert(nbOutputs == 1); + assert(DimsUtils::areEqual(inputDims[0], in_dims_)); + assert(DimsUtils::areEqual(outputDims[0], out_dims_)); + + log_.log(ILogger::Severity::kINFO, (name_ + ": InDims : " + DimsUtils::toString(in_dims_)).c_str()); + log_.log(ILogger::Severity::kINFO, (name_ + ": OutDims: " + DimsUtils::toString(out_dims_)).c_str()); + } + + int initialize() override + { + return 0; + } + + void terminate() override + { + } + + size_t getWorkspaceSize(int maxBatchSize) const + { + return 0; + } + + int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override + { + // Copy original tensor. + size_t in_size_bytes = DimsUtils::getTensorSize(in_dims_) * sizeof(float); + CHECK(cudaMemcpyAsync(outputs[0], inputs[0], in_size_bytes, cudaMemcpyDeviceToDevice, stream)); + + // Zero out end D padding if needed. + size_t out_size_bytes = DimsUtils::getTensorSize(out_dims_) * sizeof(float); + if (out_size_bytes > in_size_bytes) + { + auto pdst = static_cast(outputs[0]) + in_size_bytes; + CHECK(cudaMemsetAsync(pdst, 0, out_size_bytes - in_size_bytes, stream)); + } + + return 0; + } + + size_t getSerializationSize() override + { + return 0; + } + + void serialize(void* buffer) override + { + } + +private: + +private: + DimsNCHW pad_start_; + DimsNCHW pad_end_; + DimsNCHW in_dims_; + DimsNCHW out_dims_; + + ILogger& log_; + std::string name_; +}; + +// Factory method. +IPlugin* PluginContainer::createPaddingPlugin(DimsNCHW pad_start, DimsNCHW pad_end, std::string name) +{ + std::lock_guard lock(lock_); + plugins_.push_back(new PaddingPlugin(pad_start, pad_end, log_, name)); + return plugins_.back(); +} + +} } \ No newline at end of file diff --git a/stereoDNN/lib/redtail_tensorrt_plugins.h b/stereoDNN/lib/redtail_tensorrt_plugins.h new file mode 100644 index 0000000..11deba1 --- /dev/null +++ b/stereoDNN/lib/redtail_tensorrt_plugins.h @@ -0,0 +1,105 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. 
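+//
+// Public interface of the stereo DNN TensorRT inference library: declares
+// the plugin factory and the helper functions used to add the custom
+// plugins to a TensorRT network definition.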
+
+#ifndef REDTAIL_TENSORRT_PLUGINS_H
+#define REDTAIL_TENSORRT_PLUGINS_H
+
+#include <memory>
+#include <string>
+
+namespace redtail { namespace tensorrt
+{
+
+using namespace nvinfer1;
+
+// -----------------------------------------------------------------
+// 3D convolution type used to setup convolution descriptors.
+// -----------------------------------------------------------------
+enum class Conv3DType
+{
+    kCuDnn      = 0,
+    kTensorFlow = 1
+};
+
+// -----------------------------------------------------------------
+// Plugin container/factory.
+// TensorRT does not manage plugins and requires a plugin lifetime
+// to be the same as any TRT engine.
+// Each plugin create* function returns naked pointer as expected by TRT,
+// with IPluginContainer managing the plugin's lifetime.
+// -----------------------------------------------------------------
+class IPluginContainer
+{
+public:
+    virtual ~IPluginContainer() = default;
+
+    // ELU plugin.
+    virtual IPlugin* createEluPlugin(DataType data_type, std::string name) = 0;
+
+    // Cost volume plugin.
+    virtual IPlugin* createCostVolumePlugin(int max_disparity, std::string name) = 0;
+
+    // 3D convolution.
+    virtual IPlugin* createConv3DPlugin(Conv3DType conv_type, Dims kernel_dims,
+                                        Dims stride_dims, Dims pad_start_dims, Dims pad_end_dims,
+                                        Weights kernel_weights, Weights bias_weights,
+                                        std::string name) = 0;
+
+    virtual IPlugin* createConv3DTransposePlugin(Conv3DType conv_type, Dims kernel_dims, Dims out_dims,
+                                                 Dims stride_dims, Dims pad_start_dims, Dims pad_end_dims,
+                                                 Weights kernel_weights, Weights bias_weights,
+                                                 std::string name) = 0;
+
+    virtual IPlugin* createTransformPlugin(Permutation permutation, std::string name) = 0;
+
+    virtual IPlugin* createPaddingPlugin(DimsNCHW pad_start, DimsNCHW pad_end,
+                                         std::string name) = 0;
+
+    virtual IPlugin* createSlicePlugin(Dims dims, Dims slice_start, Dims slice_end,
+                                       std::string name) = 0;
+
+    virtual IPlugin* createSoftargmaxPlugin(std::string name) = 0;
+
+    static std::unique_ptr<IPluginContainer> create(ILogger& log);
+};
+
+// -----------------------------------------------------------------
+// Plugins helper functions.
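+// Each helper creates the corresponding plugin through the factory and adds
+// it to the given network as a plugin layer, returning the new layer.
+// A typical call (sketch; `factory`, `network` and `input` are the caller's
+// objects) looks like:
+//
+//     ILayer* elu = addElu(factory, network, *input, DataType::kFLOAT, "elu1");
+//     assert(elu != nullptr);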
+// ----------------------------------------------------------------- +ILayer* addElu(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + DataType data_type, const std::string& name); + +ILayer* addCostVolume(IPluginContainer& plugin_factory, INetworkDefinition& network, + ITensor& left_input, ITensor& right_input, int max_disparity, + const std::string& name); + +ILayer* addConv3D(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + Conv3DType conv_type, Dims kernel_dims, Dims stride_dims, + Dims pad_start_dims, Dims pad_end_dims, + Weights kernel_weights, Weights bias_weights, + const std::string& name); + +ILayer* addConv3DTranspose(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + Conv3DType conv_type, Dims kernel_dims, Dims out_dims, + Dims stride_dims, Dims pad_start_dims, Dims pad_end_dims, + Weights kernel_weights, Weights bias_weights, + const std::string& name); + +ILayer* addSlice(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + Dims dims, Dims slice_start, Dims slice_end, + const std::string& name); + +ILayer* addTransform(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + Permutation permutation, + const std::string& name); + +ILayer* addPad(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + DimsNCHW pad_start, DimsNCHW pad_end, + const std::string& name); + +ILayer* addSoftargmax(IPluginContainer& plugin_factory, INetworkDefinition& network, ITensor& input, + const std::string& name); + +} } + +#endif // REDTAIL_TENSORRT_PLUGINS_H diff --git a/stereoDNN/lib/slice_plugin.cpp b/stereoDNN/lib/slice_plugin.cpp new file mode 100644 index 0000000..bbcfff9 --- /dev/null +++ b/stereoDNN/lib/slice_plugin.cpp @@ -0,0 +1,121 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. + +#include "internal_utils.h" +#include + +namespace redtail { namespace tensorrt +{ + +using namespace nvinfer1; + +// ----------------------------------------------------------------- +// Tensor slice plugin. +// REVIEW alexey: currently the only supported slicing is for the +// second outermost dimension of 5D tensor as this is all we need so far. +// ----------------------------------------------------------------- +class SlicePlugin: public IPlugin +{ +public: + SlicePlugin(Dims dims, Dims slice_start, Dims slice_end, ILogger& log, std::string name): + in_dims_(dims), slice_start_(slice_start), slice_end_(slice_end), + log_(log), name_(name) + { + assert(in_dims_.nbDims == 4); + assert(in_dims_.nbDims == slice_start_.nbDims); + assert(in_dims_.nbDims == slice_end_.nbDims); + // For now no negative indices or empty tensors either. + assert(0 <= slice_start_.d[0] && slice_start_.d[0] < slice_end_.d[0]); + assert(0 < slice_end_.d[0] && slice_end_.d[0] <= in_dims_.d[0]); + + for (int i = 1; i < in_dims_.nbDims; i++) + { + assert(slice_start_.d[i] == 0); + assert(slice_end_.d[i] == in_dims_.d[i]); + } + } + + SlicePlugin(SlicePlugin&&) = delete; + + int getNbOutputs() const override + { + return 1; + } + + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override + { + assert(nbInputDims == 1); + // Only NCHW format is supported for now. 
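+        // Only d[0] (the D dimension of the implicit-batch NDCHW tensor)
+        // may shrink; all remaining dimensions pass through unchanged.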
+ assert(inputs[0].nbDims == 4); + + out_dims_ = DimsNCHW(slice_end_.d[0] - slice_start_.d[0], inputs[0].d[1], inputs[0].d[2], inputs[0].d[3]); + return out_dims_; + } + + void configure(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, int maxBatchSize) override + { + assert(nbInputs == 1); + assert(nbOutputs == 1); + assert(DimsUtils::areEqual(inputDims[0], in_dims_)); + assert(DimsUtils::areEqual(outputDims[0], out_dims_)); + + log_.log(ILogger::Severity::kINFO, (name_ + ": InDims : " + DimsUtils::toString(in_dims_)).c_str()); + log_.log(ILogger::Severity::kINFO, (name_ + ": OutDims: " + DimsUtils::toString(out_dims_)).c_str()); + } + + int initialize() override + { + return 0; + } + + void terminate() override + { + } + + size_t getWorkspaceSize(int maxBatchSize) const + { + return 0; + } + + int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override + { + size_t d_dim_size = DimsUtils::getTensorSize(in_dims_) / in_dims_.d[0]; + assert(d_dim_size * in_dims_.d[0] == DimsUtils::getTensorSize(in_dims_)); + + auto psrc = static_cast(inputs[0]) + d_dim_size * slice_start_.d[0]; + size_t size = out_dims_.d[0] * d_dim_size; + + cudaError_t status; + CHECK(status = cudaMemcpyAsync(outputs[0], psrc, size * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + + return status; + } + + size_t getSerializationSize() override + { + return 0; + } + + void serialize(void* buffer) override + { + } + +private: + Dims in_dims_; + DimsNCHW out_dims_; + Dims slice_start_; + Dims slice_end_; + + ILogger& log_; + std::string name_; +}; + +// Factory method. +IPlugin* PluginContainer::createSlicePlugin(Dims dims, Dims slice_start, Dims slice_end, std::string name) +{ + std::lock_guard lock(lock_); + plugins_.push_back(new SlicePlugin(dims, slice_start, slice_end, log_, name)); + return plugins_.back(); +} + +} } \ No newline at end of file diff --git a/stereoDNN/lib/softargmax_plugin.cpp b/stereoDNN/lib/softargmax_plugin.cpp new file mode 100644 index 0000000..1dc0882 --- /dev/null +++ b/stereoDNN/lib/softargmax_plugin.cpp @@ -0,0 +1,236 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. + +#include "internal_utils.h" +#include +#include + +namespace redtail { namespace tensorrt +{ + +using namespace nvinfer1; + +// ----------------------------------------------------------------- +// Soft argmax plugin. +// ----------------------------------------------------------------- +class SoftargmaxPlugin: public IPlugin +{ +public: + SoftargmaxPlugin(ILogger& log, std::string name): + log_(log), name_(name) + { + // REVIEW alexeyk: FP32 only for now. + data_type_ = CUDNN_DATA_FLOAT; + } + + SoftargmaxPlugin(SoftargmaxPlugin&&) = delete; + + int getNbOutputs() const override + { + return 1; + } + + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override + { + assert(nbInputDims == 1); + // Input must be 5D tensor in NDCHW format (batch index implicit) and + // C dim must be equal to 1. 
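+        // Softargmax computes disp = sum_d d * softmax(-x)_d over the D
+        // dimension, i.e. a differentiable argmin over the cost volume,
+        // so the D dimension collapses and the output is a 1 x H x W map.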
+ assert(inputs[0].nbDims == 4); + assert(inputs[0].d[1] == 1); + + in_dims_ = {3, {inputs[0].d[0], inputs[0].d[2], inputs[0].d[3]}}; + out_dims_ = DimsCHW(1, in_dims_.d[1], in_dims_.d[2]); + + return out_dims_; + } + + void configure(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, int maxBatchSize) override + { + assert(nbInputs == 1); + assert(nbOutputs == 1); + assert(DimsUtils::areEqual(inputDims[0], Dims{4, {in_dims_.d[0], 1, in_dims_.d[1], in_dims_.d[2]}})); + assert(DimsUtils::areEqual(outputDims[0], out_dims_)); + + last_batch_size_ = maxBatchSize; + + createDescriptors(); + setTensorDescriptors(maxBatchSize); + + log_.log(ILogger::Severity::kINFO, (name_ + ": InDims : " + DimsUtils::toString(in_dims_)).c_str()); + log_.log(ILogger::Severity::kINFO, (name_ + ": OutDims: " + DimsUtils::toString(out_dims_)).c_str()); + + assert(isValid()); + } + + int initialize() override + { + assert(isValid()); + + // Create and copy indices. Not using workspace as amount of memory required + // for indices is tiny. + std::vector indices(in_dims_.d[0]); + std::iota(indices.begin(), indices.end(), 0); + size_t size_bytes = indices.size() * sizeof(float); + CHECK(cudaMalloc(&indices_d_, size_bytes)); + CHECK(cudaMemcpy(indices_d_, indices.data(), size_bytes, cudaMemcpyHostToDevice)); + + return 0; + } + + void terminate() override + { + assert(isValid()); + + if (in_desc_ != nullptr) + CHECK(cudnnDestroyTensorDescriptor(in_desc_)); + if (out_desc_ != nullptr) + CHECK(cudnnDestroyTensorDescriptor(out_desc_)); + if (idx_desc_ != nullptr) + CHECK(cudnnDestroyTensorDescriptor(idx_desc_)); + if (op_desc_ != nullptr) + CHECK(cudnnDestroyOpTensorDescriptor(op_desc_)); + if (sum_desc_ != nullptr) + CHECK(cudnnDestroyReduceTensorDescriptor(sum_desc_)); + if (cudnn_ != nullptr) + CHECK(cudnnDestroy(cudnn_)); + if (indices_d_ != nullptr) + CHECK(cudaFree(indices_d_)); + in_desc_ = nullptr; + out_desc_ = nullptr; + idx_desc_ = nullptr; + cudnn_ = nullptr; + indices_d_ = nullptr; + sum_desc_ = nullptr; + + assert(!isValid()); + } + + size_t getWorkspaceSize(int maxBatchSize) const + { + // Need a copy of the input. + return 2 * (DimsUtils::getTensorSize(in_dims_)) * maxBatchSize * sizeof(float); + } + + int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override + { + assert(isValid()); + + cudnnStatus_t status = CUDNN_STATUS_SUCCESS; + cudaError_t cu_status = cudaSuccess; + + CHECK(status = cudnnSetStream(cudnn_, stream)); + + if (batchSize != last_batch_size_) + setTensorDescriptors(batchSize); + + auto pdst = static_cast(workspace); + // Copy input to workspace. + size_t in_size = batchSize * DimsUtils::getTensorSize(in_dims_); + size_t in_size_bytes = in_size * sizeof(float); + CHECK(cu_status = cudaMemcpyAsync(pdst, inputs[0], in_size_bytes, cudaMemcpyDeviceToDevice, stream)); + + // Scale. + const float kMinusOne = -1.0f; + CHECK(status = cudnnScaleTensor(cudnn_, in_desc_, pdst, &kMinusOne)); + + // Do softmax over D dim. + CHECK(status = cudnnSoftmaxForward(cudnn_, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, &Consts::kOne, in_desc_, pdst, + &Consts::kZero, in_desc_, pdst)); + + // Multiply and sum-reduce over D dim. 
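+        // cudnnOpTensor multiplies every D plane by its index (the idx
+        // tensor is [1, D, 1, 1] and broadcasts over H and W); the
+        // reduction then sums over D, yielding the expected disparity.
+        // The second half of the workspace serves as reduction scratch.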
+ CHECK(status = cudnnOpTensor(cudnn_, op_desc_, &Consts::kOne, in_desc_, pdst, + &Consts::kOne, idx_desc_, indices_d_, + &Consts::kZero, in_desc_, pdst)); + CHECK(status = cudnnReduceTensor(cudnn_, sum_desc_, nullptr, 0, pdst + in_size, in_size_bytes, + &Consts::kOne, in_desc_, pdst, + &Consts::kZero, out_desc_, outputs[0])); + + return (status == CUDNN_STATUS_SUCCESS && cu_status == cudaSuccess) ? 0 : -1; + } + + size_t getSerializationSize() override + { + return 0; + } + + void serialize(void* buffer) override + { + } + +private: + bool isValid() const + { + return cudnn_ != nullptr; + } + + void createDescriptors() + { + if (cudnn_ == nullptr) + CHECK(cudnnCreate(&cudnn_)); + if (in_desc_ == nullptr) + CHECK(cudnnCreateTensorDescriptor(&in_desc_)); + if (out_desc_ == nullptr) + CHECK(cudnnCreateTensorDescriptor(&out_desc_)); + if (idx_desc_ == nullptr) + CHECK(cudnnCreateTensorDescriptor(&idx_desc_)); + if (op_desc_ == nullptr) + CHECK(cudnnCreateOpTensorDescriptor(&op_desc_)); + if (sum_desc_ == nullptr) + CHECK(cudnnCreateReduceTensorDescriptor(&sum_desc_)); + } + + void setTensorDescriptors(int batch_size) + { + // Set input tensor descriptor as NDHW (add batch dim and remove C dim). + auto tensor_dims = Dims{4, {batch_size, in_dims_.d[0], in_dims_.d[1], in_dims_.d[2]}}; + auto tensor_strides = DimsUtils::getStrides(tensor_dims); + CHECK(cudnnSetTensorNdDescriptor(in_desc_, data_type_, tensor_dims.nbDims, tensor_dims.d, tensor_strides.d)); + + // Set output tensor descriptor as NCHW where C == 1. + tensor_dims = Dims{4, {batch_size, out_dims_.d[0], out_dims_.d[1], out_dims_.d[2]}}; + tensor_strides = DimsUtils::getStrides(tensor_dims); + CHECK(cudnnSetTensorNdDescriptor(out_desc_, data_type_, tensor_dims.nbDims, tensor_dims.d, tensor_strides.d)); + + // Set index tensor. + tensor_dims = Dims{4, {1, in_dims_.d[0], 1, 1}}; + tensor_strides = DimsUtils::getStrides(tensor_dims); + CHECK(cudnnSetTensorNdDescriptor(idx_desc_, data_type_, tensor_dims.nbDims, tensor_dims.d, tensor_strides.d)); + + // Set tensor op for index multiplication. + CHECK(cudnnSetOpTensorDescriptor(op_desc_, CUDNN_OP_TENSOR_MUL, data_type_, CUDNN_PROPAGATE_NAN)); + + // Set reduce descriptor. + CHECK(cudnnSetReduceTensorDescriptor(sum_desc_, CUDNN_REDUCE_TENSOR_ADD, data_type_, CUDNN_PROPAGATE_NAN, + CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES)); + + last_batch_size_ = batch_size; + } + +private: + cudnnDataType_t data_type_; + cudnnHandle_t cudnn_ = nullptr; + cudnnTensorDescriptor_t in_desc_ = nullptr; + cudnnTensorDescriptor_t out_desc_ = nullptr; + cudnnTensorDescriptor_t idx_desc_ = nullptr; + cudnnOpTensorDescriptor_t op_desc_ = nullptr; + cudnnReduceTensorDescriptor_t sum_desc_ = nullptr; + + Dims in_dims_; + DimsCHW out_dims_; + int last_batch_size_ = 0; + + float* indices_d_ = nullptr; + + ILogger& log_; + std::string name_; +}; + +// Factory method. +IPlugin* PluginContainer::createSoftargmaxPlugin(std::string name) +{ + std::lock_guard lock(lock_); + plugins_.push_back(new SoftargmaxPlugin(log_, name)); + return plugins_.back(); +} + +} } \ No newline at end of file diff --git a/stereoDNN/lib/transform_plugin.cpp b/stereoDNN/lib/transform_plugin.cpp new file mode 100644 index 0000000..4963155 --- /dev/null +++ b/stereoDNN/lib/transform_plugin.cpp @@ -0,0 +1,192 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. 
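+//
+// Implements tensor transpose (dimension permutation) on top of
+// cudnnTransformTensor by giving the output descriptor permuted strides.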
+ +#include "internal_utils.h" +#include +#include + +namespace redtail { namespace tensorrt +{ + +using namespace nvinfer1; + +// ----------------------------------------------------------------- +// Tensor transform plugin. +// ----------------------------------------------------------------- +class TransformPlugin: public IPlugin +{ +public: + TransformPlugin(Permutation permutation, ILogger& log, std::string name): + permutation_(permutation), log_(log), name_(name) + { + } + + TransformPlugin(TransformPlugin&&) = delete; + + int getNbOutputs() const override + { + return 1; + } + + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override + { + assert(nbInputDims == 1); + // Only NCHW format is supported for now, TRT does not work (assert) with generic Dims. + assert(inputs[0].nbDims == 4); + + in_dims_ = DimsNCHW(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2], inputs[0].d[3]); + // Sanity check. + int actual_sum = std::accumulate(permutation_.order, permutation_.order + in_dims_.nbDims, 0, std::plus()); + int expected_sum = in_dims_.nbDims * (in_dims_.nbDims - 1) / 2; + assert(actual_sum == expected_sum); + UNUSEDR(actual_sum); + UNUSEDR(expected_sum); + // Transposed output dimensions. + for (int i = 0; i < in_dims_.nbDims; i++) + out_dims_.d[i] = in_dims_.d[permutation_.order[i]]; + return out_dims_; + } + + void configure(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, int maxBatchSize) override + { + assert(nbInputs == 1); + assert(nbOutputs == 1); + assert(DimsUtils::areEqual(inputDims[0], in_dims_)); + assert(DimsUtils::areEqual(outputDims[0], out_dims_)); + + createDescriptors(); + setTensorDescriptors(maxBatchSize); + + log_.log(ILogger::Severity::kINFO, (name_ + ": InDims : " + DimsUtils::toString(in_dims_)).c_str()); + log_.log(ILogger::Severity::kINFO, (name_ + ": OutDims: " + DimsUtils::toString(out_dims_)).c_str()); + + assert(isValid()); + } + + int initialize() override + { + assert(isValid()); + return 0; + } + + void terminate() override + { + assert(isValid()); + + if (in_desc_ != nullptr) + CHECK(cudnnDestroyTensorDescriptor(in_desc_)); + if (out_desc_ != nullptr) + CHECK(cudnnDestroyTensorDescriptor(out_desc_)); + if (cudnn_ != nullptr) + CHECK(cudnnDestroy(cudnn_)); + in_desc_ = nullptr; + out_desc_ = nullptr; + cudnn_ = nullptr; + + assert(!isValid()); + } + + size_t getWorkspaceSize(int maxBatchSize) const + { + return 0; + } + + int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override + { + assert(isValid()); + + cudnnStatus_t status; + + CHECK(status = cudnnSetStream(cudnn_, stream)); + + if (batchSize != tensor_dims_.d[0]) + setTensorDescriptors(batchSize); + + CHECK(status = cudnnTransformTensor(cudnn_, &Consts::kOne, in_desc_, inputs[0], &Consts::kZero, out_desc_, outputs[0])); + + return status == CUDNN_STATUS_SUCCESS ? 0 : -1; + } + + size_t getSerializationSize() override + { + return 0; + } + + void serialize(void* buffer) override + { + } + +private: + bool isValid() const + { + return cudnn_ != nullptr; + } + + void createDescriptors() + { + if (cudnn_ == nullptr) + CHECK(cudnnCreate(&cudnn_)); + if (in_desc_ == nullptr) + CHECK(cudnnCreateTensorDescriptor(&in_desc_)); + if (out_desc_ == nullptr) + CHECK(cudnnCreateTensorDescriptor(&out_desc_)); + } + + void setTensorDescriptors(int batch_size) + { + // cudnnTransformTensor requires dimensions of both tensors to be the same. + // The transform is represented by different strides. 
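+        // Example: with permutation {1, 0, 2, 3} on a DKHW input, the output
+        // descriptor keeps the input dimensions but swaps the strides of the
+        // first two axes, so element [d, k, h, w] is written to [k, d, h, w].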
+ + // Add batch dimension to final tensor and copy dims from input. + tensor_dims_.nbDims = in_dims_.nbDims + 1; + tensor_dims_.d[0] = batch_size; + std::copy(in_dims_.d, in_dims_.d + in_dims_.nbDims, tensor_dims_.d + 1); + + in_tensor_strides_ = DimsUtils::getStrides(tensor_dims_); + CHECK(cudnnSetTensorNdDescriptor(in_desc_, CUDNN_DATA_FLOAT, + tensor_dims_.nbDims, tensor_dims_.d, in_tensor_strides_.d)); + + // Compute output tensor strides. + // 1. Create output tensor dims with batch dimension. + Dims out_tensor_dims; + out_tensor_dims.nbDims = out_dims_.nbDims + 1; + out_tensor_dims.d[0] = batch_size; + std::copy(out_dims_.d, out_dims_.d + out_dims_.nbDims, out_tensor_dims.d + 1); + // 2. Compute the strides for the output tensor. We need a copy to do a transform later. + Dims out_strides = DimsUtils::getStrides(out_tensor_dims); + out_tensor_strides_ = out_strides; + // 3. As cudnnTransformTensor requires both dims to be the same, we need + // to change strides on the _output_ tensor to achieve desired transform effect. + for (int i = 1; i < out_strides.nbDims; i++) + out_tensor_strides_.d[i] = out_strides.d[permutation_.order[i - 1] + 1]; + CHECK(cudnnSetTensorNdDescriptor(out_desc_, CUDNN_DATA_FLOAT, + tensor_dims_.nbDims, tensor_dims_.d, out_tensor_strides_.d)); + + } + +private: + cudnnHandle_t cudnn_ = nullptr; + cudnnTensorDescriptor_t in_desc_ = nullptr; + cudnnTensorDescriptor_t out_desc_ = nullptr; + + Permutation permutation_; + DimsNCHW in_dims_; + DimsNCHW out_dims_; + + Dims tensor_dims_; + Dims in_tensor_strides_; + Dims out_tensor_strides_; + + ILogger& log_; + std::string name_; +}; + +// Factory method. +IPlugin* PluginContainer::createTransformPlugin(Permutation permutation, std::string name) +{ + std::lock_guard lock(lock_); + plugins_.push_back(new TransformPlugin(permutation, log_, name)); + return plugins_.back(); +} + +} } \ No newline at end of file diff --git a/stereoDNN/models/NVSmall/TensorFlow/model-inference-1025x321-0.data-00000-of-00001 b/stereoDNN/models/NVSmall/TensorFlow/model-inference-1025x321-0.data-00000-of-00001 new file mode 100644 index 0000000..1ebde5a Binary files /dev/null and b/stereoDNN/models/NVSmall/TensorFlow/model-inference-1025x321-0.data-00000-of-00001 differ diff --git a/stereoDNN/models/NVSmall/TensorFlow/model-inference-1025x321-0.index b/stereoDNN/models/NVSmall/TensorFlow/model-inference-1025x321-0.index new file mode 100644 index 0000000..6205fb9 Binary files /dev/null and b/stereoDNN/models/NVSmall/TensorFlow/model-inference-1025x321-0.index differ diff --git a/stereoDNN/models/NVSmall/TensorFlow/model-inference-1025x321-0.meta b/stereoDNN/models/NVSmall/TensorFlow/model-inference-1025x321-0.meta new file mode 100644 index 0000000..2b2c9fc Binary files /dev/null and b/stereoDNN/models/NVSmall/TensorFlow/model-inference-1025x321-0.meta differ diff --git a/stereoDNN/models/NVSmall/TensorRT/trt_weights.bin b/stereoDNN/models/NVSmall/TensorRT/trt_weights.bin new file mode 100644 index 0000000..5fee420 Binary files /dev/null and b/stereoDNN/models/NVSmall/TensorRT/trt_weights.bin differ diff --git a/stereoDNN/models/NVSmall/TensorRT/trt_weights_fp16.bin b/stereoDNN/models/NVSmall/TensorRT/trt_weights_fp16.bin new file mode 100644 index 0000000..a0807bd Binary files /dev/null and b/stereoDNN/models/NVSmall/TensorRT/trt_weights_fp16.bin differ diff --git a/stereoDNN/models/NVTiny/TensorFlow/model-inference-513x161-0.data-00000-of-00001 b/stereoDNN/models/NVTiny/TensorFlow/model-inference-513x161-0.data-00000-of-00001 new file mode 
100644 index 0000000..fd007a5 Binary files /dev/null and b/stereoDNN/models/NVTiny/TensorFlow/model-inference-513x161-0.data-00000-of-00001 differ diff --git a/stereoDNN/models/NVTiny/TensorFlow/model-inference-513x161-0.index b/stereoDNN/models/NVTiny/TensorFlow/model-inference-513x161-0.index new file mode 100644 index 0000000..5f868ca Binary files /dev/null and b/stereoDNN/models/NVTiny/TensorFlow/model-inference-513x161-0.index differ diff --git a/stereoDNN/models/NVTiny/TensorFlow/model-inference-513x161-0.meta b/stereoDNN/models/NVTiny/TensorFlow/model-inference-513x161-0.meta new file mode 100644 index 0000000..5cc0eb2 Binary files /dev/null and b/stereoDNN/models/NVTiny/TensorFlow/model-inference-513x161-0.meta differ diff --git a/stereoDNN/models/NVTiny/TensorRT/trt_weights.bin b/stereoDNN/models/NVTiny/TensorRT/trt_weights.bin new file mode 100644 index 0000000..3f95079 Binary files /dev/null and b/stereoDNN/models/NVTiny/TensorRT/trt_weights.bin differ diff --git a/stereoDNN/models/NVTiny/TensorRT/trt_weights_fp16.bin b/stereoDNN/models/NVTiny/TensorRT/trt_weights_fp16.bin new file mode 100644 index 0000000..4090a2a Binary files /dev/null and b/stereoDNN/models/NVTiny/TensorRT/trt_weights_fp16.bin differ diff --git a/stereoDNN/models/ResNet-18/TensorFlow/model-inference-1025x321-0.data-00000-of-00001 b/stereoDNN/models/ResNet-18/TensorFlow/model-inference-1025x321-0.data-00000-of-00001 new file mode 100644 index 0000000..b6224e9 Binary files /dev/null and b/stereoDNN/models/ResNet-18/TensorFlow/model-inference-1025x321-0.data-00000-of-00001 differ diff --git a/stereoDNN/models/ResNet-18/TensorFlow/model-inference-1025x321-0.index b/stereoDNN/models/ResNet-18/TensorFlow/model-inference-1025x321-0.index new file mode 100644 index 0000000..359528c Binary files /dev/null and b/stereoDNN/models/ResNet-18/TensorFlow/model-inference-1025x321-0.index differ diff --git a/stereoDNN/models/ResNet-18/TensorFlow/model-inference-1025x321-0.meta b/stereoDNN/models/ResNet-18/TensorFlow/model-inference-1025x321-0.meta new file mode 100644 index 0000000..81ebbce Binary files /dev/null and b/stereoDNN/models/ResNet-18/TensorFlow/model-inference-1025x321-0.meta differ diff --git a/stereoDNN/models/ResNet-18/TensorRT/trt_weights.bin b/stereoDNN/models/ResNet-18/TensorRT/trt_weights.bin new file mode 100644 index 0000000..e766aea Binary files /dev/null and b/stereoDNN/models/ResNet-18/TensorRT/trt_weights.bin differ diff --git a/stereoDNN/models/ResNet-18/TensorRT/trt_weights_fp16.bin b/stereoDNN/models/ResNet-18/TensorRT/trt_weights_fp16.bin new file mode 100644 index 0000000..c38612e Binary files /dev/null and b/stereoDNN/models/ResNet-18/TensorRT/trt_weights_fp16.bin differ diff --git a/stereoDNN/sample_app/CMakeLists.txt b/stereoDNN/sample_app/CMakeLists.txt new file mode 100644 index 0000000..3f1e320 --- /dev/null +++ b/stereoDNN/sample_app/CMakeLists.txt @@ -0,0 +1,32 @@ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Full license terms provided in LICENSE.md file. 
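+#
+# Builds the stereo DNN sample application and links it against the
+# nvstereo inference library, CUDA, cuDNN, TensorRT and OpenCV.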
+ +cmake_minimum_required(VERSION 3.5) + +set(PROJECT_NAME nvstereo_sample_app) +project(${PROJECT_NAME}) + +find_package(CUDA 9.0 REQUIRED) +find_package(OpenCV 3.3.1 REQUIRED) + +include_directories(${CUDA_INCLUDE_DIRS}) +include_directories(${CMAKE_SOURCE_DIR}/lib) + +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") + +file(GLOB ${PROJECT_NAME}_sources ./*.cpp) +set(PROJECT_SOURCES ${${PROJECT_NAME}_sources}) + +set(TARGET_NAME ${PROJECT_NAME}${TARGET_SUFFIX}) + +add_executable(${TARGET_NAME} ${PROJECT_SOURCES}) + +target_link_libraries(${TARGET_NAME} + nvstereo_inference${TARGET_SUFFIX} + ${CUDA_LIBRARIES} + cudnn + nvinfer + opencv_core + opencv_imgproc + opencv_highgui) diff --git a/stereoDNN/sample_app/data/img_left.bin b/stereoDNN/sample_app/data/img_left.bin new file mode 100644 index 0000000..e7a5d35 Binary files /dev/null and b/stereoDNN/sample_app/data/img_left.bin differ diff --git a/stereoDNN/sample_app/data/img_left.png b/stereoDNN/sample_app/data/img_left.png new file mode 100644 index 0000000..cf07f6d Binary files /dev/null and b/stereoDNN/sample_app/data/img_left.png differ diff --git a/stereoDNN/sample_app/data/img_right.bin b/stereoDNN/sample_app/data/img_right.bin new file mode 100644 index 0000000..c6aa131 Binary files /dev/null and b/stereoDNN/sample_app/data/img_right.bin differ diff --git a/stereoDNN/sample_app/data/img_right.png b/stereoDNN/sample_app/data/img_right.png new file mode 100644 index 0000000..f49813b Binary files /dev/null and b/stereoDNN/sample_app/data/img_right.png differ diff --git a/stereoDNN/sample_app/main.cpp b/stereoDNN/sample_app/main.cpp new file mode 100644 index 0000000..c3aa46b --- /dev/null +++ b/stereoDNN/sample_app/main.cpp @@ -0,0 +1,282 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. + +#include "NvInfer.h" +#include "NvCaffeParser.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#include "redtail_tensorrt_plugins.h" + +#define UNUSED(x) ((void)(x)) + +#define CHECK(status) do { \ + int res = (int)(status); \ + assert(res == 0); \ + UNUSED(res); \ +} while(false) + +using namespace nvinfer1; +using namespace redtail::tensorrt; + +namespace redtail { namespace tensorrt +{ +using weight_map = std::unordered_map; + +// NVSmall DNN: 1025x321 input, 96 max disparity. +INetworkDefinition* createNVSmall1025x321Network(IBuilder& builder, IPluginContainer& plugin_factory, + DimsCHW img_dims, const weight_map& weights, DataType data_type, ILogger& log); + +// Tiny version of NVSmall DNN: 513x161 input, 48 max disparity. +INetworkDefinition* createNVTiny513x161Network(IBuilder& builder, IPluginContainer& plugin_factory, + DimsCHW img_dims, const weight_map& weights, DataType data_type, + ILogger& log); + +// Baseline ResNet-18 DNN: 1025x321 input, 136 max disparity. +INetworkDefinition* createResNet18_1025x321Network(IBuilder& builder, IPluginContainer& plugin_factory, + DimsCHW img_dims, const weight_map& weights, DataType data_type, + ILogger& log); +} } + +class Logger : public nvinfer1::ILogger +{ + public: + void log(nvinfer1::ILogger::Severity severity, const char* msg) override + { + // Skip info (verbose) messages. 
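+        // The check below is left commented out so that all TRT messages,
+        // including kINFO, are printed; uncomment it to silence INFO output.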
+        // if (severity == Severity::kINFO)
+        //     return;
+
+        switch (severity)
+        {
+        case Severity::kINTERNAL_ERROR: std::cerr << "TRT INTERNAL_ERROR: "; break;
+        case Severity::kERROR:          std::cerr << "TRT ERROR: ";          break;
+        case Severity::kWARNING:        std::cerr << "TRT WARNING: ";        break;
+        case Severity::kINFO:           std::cerr << "TRT INFO: ";           break;
+        default:                        std::cerr << "TRT UNKNOWN: ";        break;
+        }
+        std::cerr << msg << std::endl;
+    }
+};
+
+static Logger gLogger;
+
+std::vector<float> readImgFile(const std::string& filename, int w, int h)
+{
+    auto img = cv::imread(filename);
+    assert(img.data != nullptr);
+    // 0. Convert to float.
+    img.convertTo(img, CV_32F);
+    // 1. Resize.
+    cv::resize(img, img, cv::Size(w, h), 0, 0, cv::INTER_AREA);
+    // 2. Convert BGR -> RGB.
+    cv::cvtColor(img, img, CV_BGR2RGB);
+    // 3. Convert HWC -> CHW.
+    cv::Mat res = img.reshape(1, w * h).t();
+    // 4. Scale.
+    res /= 255.0;
+    return std::vector<float>(res.ptr<float>(0), res.ptr<float>(0) + w * h * 3);
+}
+
+std::vector<float> readBinFile(const std::string& filename)
+{
+    std::ifstream input_file(filename, std::ios::binary | std::ios::ate);
+    assert(input_file.is_open());
+    size_t size = input_file.tellg();
+    input_file.seekg(0, std::ios_base::beg);
+    std::vector<float> data(size / sizeof(float));
+    input_file.read((char*)data.data(), size);
+    return data;
+}
+
+std::unordered_map<std::string, Weights> readWeights(const std::string& filename, DataType data_type)
+{
+    assert(data_type == DataType::kFLOAT || data_type == DataType::kHALF);
+
+    std::unordered_map<std::string, Weights> weights;
+    std::ifstream weights_file(filename, std::ios::binary);
+    assert(weights_file.is_open());
+    while (weights_file.peek() != std::ifstream::traits_type::eof())
+    {
+        std::string name;
+        uint32_t    count;
+        Weights     w {data_type, nullptr, 0};
+        std::getline(weights_file, name, '\0');
+        weights_file.read(reinterpret_cast<char*>(&count), sizeof(uint32_t));
+        w.count = count;
+        size_t el_size_bytes = data_type == DataType::kFLOAT ? 4 : 2;
+        auto p = new uint8_t[count * el_size_bytes];
+        weights_file.read(reinterpret_cast<char*>(p), count * el_size_bytes);
+        w.values = p;
+        assert(weights.find(name) == weights.cend());
+        weights[name] = w;
+    }
+    return weights;
+}
+
+int main(int argc, char** argv)
+{
+    if (argc < 8)
+    {
+        printf("\n"
+               "Usage  : nvstereo_sample_app[_debug] <model type> <width> <height> <weights file> <left image> <right image> <disparity output> [data_type]\n"
+               "where  : model type is the type of the DNN, supported are: nvsmall and resnet18\n"
+               "         width and height are dimensions of the network (e.g. 1025 321)\n"
+               "         weights file is the output of TensorRT model builder script\n"
+               "         left and right are images that will be scaled to <width> x <height>\n"
+               "         disparity output is the output of the network of size <width> x <height> (bin and PNG files are created)\n"
+               "         data type(optional) is the data type of the model: fp32 (default) or fp16\n"
+               "See /models directory for model files\n"
+               "Example: nvstereo_sample_app nvsmall 1025 321 trt_weights.bin img_left.png img_right.png out_disp.bin\n\n");
+        return 1;
+    }
+    //getchar();
+
+    auto model_type = std::string(argv[1]);
+    if (model_type != "nvsmall" && model_type != "resnet18")
+    {
+        printf("Invalid model type %s, supported: nvsmall, resnet18.\n", model_type.c_str());
+        exit(1);
+    }
+
+    DataType data_type = DataType::kFLOAT;
+    if (argc >= 9)
+    {
+        auto d_type = std::string(argv[8]);
+        if (d_type == "fp32")
+            data_type = DataType::kFLOAT;
+        else if (d_type == "fp16")
+            data_type = DataType::kHALF;
+        else
+        {
+            printf("Data type %s is not supported, supported types: fp32, fp16.\n", d_type.c_str());
+            exit(1);
+        }
+    }
+    printf("Using %s data type.\n", data_type == DataType::kFLOAT ?
"fp32" : "fp16"); + + // Read weights. + // Note: the weights object lifetime must be at least the same as engine. + auto weights = readWeights(argv[4], data_type); + printf("Loaded %zu weight sets.\n", weights.size()); + + //const int b = 1; + const int c = 3; + const int h = std::stoi(argv[3]); + const int w = std::stoi(argv[2]); + printf("Using [%d, %d](width, height) as network input dimensions.\n", w, h); + + // Read images. + auto img_left = readImgFile(argv[5], w, h); + //auto img_left = readBinFile(argv[5]); + assert(img_left.size() == (size_t)c * h * w); + auto img_right = readImgFile(argv[6], w, h); + //auto img_right = readBinFile(argv[6]); + assert(img_right.size() == (size_t)c * h * w); + + // Create builder and network. + IBuilder* builder = createInferBuilder(gLogger); + + // Note: the plugin_factory object lifetime must be at least the same as engine. + auto plugin_factory = IPluginContainer::create(gLogger); + // TRT v3 supports FP16 only for the weights (e.g. convolutions) but not the data so use float data type. + INetworkDefinition* network = nullptr; + if (model_type == "nvsmall") + { + if (w == 1025) + network = createNVSmall1025x321Network(*builder, *plugin_factory, DimsCHW { c, h, w }, weights, DataType::kFLOAT, gLogger); + else if (w == 513) + network = createNVTiny513x161Network( *builder, *plugin_factory, DimsCHW { c, h, w }, weights, DataType::kFLOAT, gLogger); + else + assert(false); + } + else if (model_type == "resnet18") + { + if (w == 1025) + network = createResNet18_1025x321Network(*builder, *plugin_factory, DimsCHW { c, h, w }, weights, DataType::kFLOAT, gLogger); + else + { + printf("ResNet-18 model supports only 1025x321 input image.\n"); + exit(1); + } + } + else + assert(false); + + builder->setMaxBatchSize(1); + size_t workspace_bytes = 1024 * 1024 * 1024; + builder->setMaxWorkspaceSize(workspace_bytes); + + builder->setHalf2Mode(data_type == DataType::kHALF); + // Build the network. + auto engine = builder->buildCudaEngine(*network); + network->destroy(); + + // REVIEW alexeyk: serialization is not yet supported. Need to implement IPluginFactory properly. + // IHostMemory *model_stream = engine->serialize(); + // engine->destroy(); + // builder->destroy(); + + // IRuntime* runtime = createInferRuntime(gLogger); + // engine = runtime->deserializeCudaEngine(model_stream->data(), model_stream->size(), nullptr); + // model_stream->destroy(); + + assert(engine->getNbBindings() == 3); + void* buffers[3]; + int in_idx_left = engine->getBindingIndex("left"); + assert(in_idx_left == 0); + int in_idx_right = engine->getBindingIndex("right"); + assert(in_idx_right == 1); + int out_idx = engine->getBindingIndex("disp"); + assert(out_idx == 2); + + IExecutionContext *context = engine->createExecutionContext(); + + std::vector output(h * w); + + // Allocate GPU memory and copy data. + CHECK(cudaMalloc(&buffers[in_idx_left], img_left.size() * sizeof(float))); + CHECK(cudaMalloc(&buffers[in_idx_right], img_right.size() * sizeof(float))); + CHECK(cudaMalloc(&buffers[out_idx], output.size() * sizeof(float))); + + CHECK(cudaMemcpy(buffers[in_idx_left], img_left.data(), img_left.size() * sizeof(float), cudaMemcpyHostToDevice)); + CHECK(cudaMemcpy(buffers[in_idx_right], img_right.data(), img_right.size() * sizeof(float), cudaMemcpyHostToDevice)); + + // Do the inference. 
+ auto host_start = std::chrono::high_resolution_clock::now(); + auto err = context->execute(1, buffers); + auto host_end = std::chrono::high_resolution_clock::now(); + assert(err); + UNUSED(err); + auto host_elapsed_ms = std::chrono::duration(host_end - host_start).count(); + printf("Host time: %.4fms\n", host_elapsed_ms); + + // Copy output back to host. + CHECK(cudaMemcpy(output.data(), buffers[out_idx], output.size() * sizeof(float), cudaMemcpyDeviceToHost)); + + // Write results. + // 1. As binary file. + auto res_file = std::ofstream(argv[7], std::ios::binary); + res_file.write((char*)output.data(), output.size() * sizeof(float)); + // 2. As PNG image. + auto img_f = cv::Mat(h, w, CV_32F, output.data()); + img_f *= 256; + cv::Mat img_u16; + img_f.convertTo(img_u16, CV_16U); + cv::imwrite(std::string(argv[7]) + ".png", img_u16); + + // Cleanup. + context->destroy(); + engine->destroy(); + for (auto b: buffers) + CHECK(cudaFree(b)); + + printf("Done\n"); + return 0; +} \ No newline at end of file diff --git a/stereoDNN/sample_app/nvsmall_1025x321_net.cpp b/stereoDNN/sample_app/nvsmall_1025x321_net.cpp new file mode 100644 index 0000000..26ba02a --- /dev/null +++ b/stereoDNN/sample_app/nvsmall_1025x321_net.cpp @@ -0,0 +1,369 @@ +// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. + +//------------------------------------------------------------------- +// !!! This file was automatically generated. Do not edit. !!! +//------------------------------------------------------------------- + +#include +#include +#include +#include +#include "redtail_tensorrt_plugins.h" + +namespace redtail { namespace tensorrt +{ + +using namespace nvinfer1; + +using weight_map = std::unordered_map; + +INetworkDefinition* createNVSmall1025x321Network(IBuilder& builder, IPluginContainer& plugin_factory, + DimsCHW img_dims, const weight_map& weights, DataType data_type, + ILogger& log) +{ + INetworkDefinition* network = builder.createNetwork(); + assert(network != nullptr); + // Input tensor. + auto left = network->addInput("left", DataType::kFLOAT, img_dims); + assert(left != nullptr); + + // Input tensor. + auto right = network->addInput("right", DataType::kFLOAT, img_dims); + assert(right != nullptr); + + // Scaling op. + auto left_scale = network->addScale(*left, ScaleMode::kUNIFORM, + weights.at("left_scale_shift"), weights.at("left_scale_scale"), weights.at("left_scale_power")); + assert(left_scale != nullptr); + + // Scaling op. + auto right_scale = network->addScale(*right, ScaleMode::kUNIFORM, + weights.at("right_scale_shift"), weights.at("right_scale_scale"), weights.at("right_scale_power")); + assert(right_scale != nullptr); + + // left_conv1 convolution op. + auto left_conv1 = network->addConvolution(*left_scale->getOutput(0), 32, DimsHW {5, 5}, + weights.at("left_conv1_k"), weights.at("left_conv1_b")); + assert(left_conv1 != nullptr); + left_conv1->setStride( DimsHW {2, 2}); + left_conv1->setPadding(DimsHW {2, 2}); + + // left_conv1_act ELU activation op. + auto left_conv1_act = addElu(plugin_factory, *network, *left_conv1->getOutput(0), data_type, "left_conv1_act"); + assert(left_conv1_act != nullptr); + + // right_conv1 convolution op. 
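+    // (The right tower mirrors the left one layer for layer; its weights
+    // are stored under separate "right_*" keys in the weight map.)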
+ auto right_conv1 = network->addConvolution(*right_scale->getOutput(0), 32, DimsHW {5, 5}, + weights.at("right_conv1_k"), weights.at("right_conv1_b")); + assert(right_conv1 != nullptr); + right_conv1->setStride( DimsHW {2, 2}); + right_conv1->setPadding(DimsHW {2, 2}); + + // right_conv1_act ELU activation op. + auto right_conv1_act = addElu(plugin_factory, *network, *right_conv1->getOutput(0), data_type, "right_conv1_act"); + assert(right_conv1_act != nullptr); + + // left_conv2 convolution op. + auto left_conv2 = network->addConvolution(*left_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_conv2_k"), weights.at("left_conv2_b")); + assert(left_conv2 != nullptr); + left_conv2->setStride( DimsHW {1, 1}); + left_conv2->setPadding(DimsHW {1, 1}); + + // left_conv2_act ELU activation op. + auto left_conv2_act = addElu(plugin_factory, *network, *left_conv2->getOutput(0), data_type, "left_conv2_act"); + assert(left_conv2_act != nullptr); + + // right_conv2 convolution op. + auto right_conv2 = network->addConvolution(*right_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_conv2_k"), weights.at("right_conv2_b")); + assert(right_conv2 != nullptr); + right_conv2->setStride( DimsHW {1, 1}); + right_conv2->setPadding(DimsHW {1, 1}); + + // right_conv2_act ELU activation op. + auto right_conv2_act = addElu(plugin_factory, *network, *right_conv2->getOutput(0), data_type, "right_conv2_act"); + assert(right_conv2_act != nullptr); + + // left_conv3 convolution op. + auto left_conv3 = network->addConvolution(*left_conv2_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_conv3_k"), weights.at("left_conv3_b")); + assert(left_conv3 != nullptr); + left_conv3->setStride( DimsHW {1, 1}); + left_conv3->setPadding(DimsHW {1, 1}); + + // left_conv3_act ELU activation op. + auto left_conv3_act = addElu(plugin_factory, *network, *left_conv3->getOutput(0), data_type, "left_conv3_act"); + assert(left_conv3_act != nullptr); + + // right_conv3 convolution op. + auto right_conv3 = network->addConvolution(*right_conv2_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_conv3_k"), weights.at("right_conv3_b")); + assert(right_conv3 != nullptr); + right_conv3->setStride( DimsHW {1, 1}); + right_conv3->setPadding(DimsHW {1, 1}); + + // right_conv3_act ELU activation op. + auto right_conv3_act = addElu(plugin_factory, *network, *right_conv3->getOutput(0), data_type, "right_conv3_act"); + assert(right_conv3_act != nullptr); + + // left_conv4 convolution op. + auto left_conv4 = network->addConvolution(*left_conv3_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_conv4_k"), weights.at("left_conv4_b")); + assert(left_conv4 != nullptr); + left_conv4->setStride( DimsHW {1, 1}); + left_conv4->setPadding(DimsHW {1, 1}); + + // left_conv4_act ELU activation op. + auto left_conv4_act = addElu(plugin_factory, *network, *left_conv4->getOutput(0), data_type, "left_conv4_act"); + assert(left_conv4_act != nullptr); + + // right_conv4 convolution op. + auto right_conv4 = network->addConvolution(*right_conv3_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_conv4_k"), weights.at("right_conv4_b")); + assert(right_conv4 != nullptr); + right_conv4->setStride( DimsHW {1, 1}); + right_conv4->setPadding(DimsHW {1, 1}); + + // right_conv4_act ELU activation op. + auto right_conv4_act = addElu(plugin_factory, *network, *right_conv4->getOutput(0), data_type, "right_conv4_act"); + assert(right_conv4_act != nullptr); + + // left_conv5 convolution op. 
+ auto left_conv5 = network->addConvolution(*left_conv4_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_conv5_k"), weights.at("left_conv5_b")); + assert(left_conv5 != nullptr); + left_conv5->setStride( DimsHW {1, 1}); + left_conv5->setPadding(DimsHW {1, 1}); + + // right_conv5 convolution op. + auto right_conv5 = network->addConvolution(*right_conv4_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_conv5_k"), weights.at("right_conv5_b")); + assert(right_conv5 != nullptr); + right_conv5->setStride( DimsHW {1, 1}); + right_conv5->setPadding(DimsHW {1, 1}); + + // cost_vol cost volume op. + auto cost_vol = addCostVolume(plugin_factory, *network, *left_conv5->getOutput(0), *right_conv5->getOutput(0), 48, "cost_vol"); + assert(cost_vol != nullptr); + + // conv3D_1 3D convolution op. + auto conv3D_1 = addConv3D(plugin_factory, *network, *cost_vol->getOutput(0), + Conv3DType::kTensorFlow, {5, {32, 3, 64, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_1_k"), weights.at("conv3D_1_b"), + "conv3D_1"); + assert(conv3D_1 != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_1_tran = addTransform(plugin_factory, *network, *conv3D_1->getOutput(0), {1, 0, 2, 3}, "conv3D_1_tran_transform"); + assert(conv3D_1_tran != nullptr); + + // conv3D_1_act ELU activation op. + auto conv3D_1_act = addElu(plugin_factory, *network, *conv3D_1_tran->getOutput(0), data_type, "conv3D_1_act"); + assert(conv3D_1_act != nullptr); + + // conv3D_2 3D convolution op. + auto conv3D_2 = addConv3D(plugin_factory, *network, *conv3D_1_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {32, 3, 32, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_2_k"), weights.at("conv3D_2_b"), + "conv3D_2"); + assert(conv3D_2 != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_2_tran = addTransform(plugin_factory, *network, *conv3D_2->getOutput(0), {1, 0, 2, 3}, "conv3D_2_tran_transform"); + assert(conv3D_2_tran != nullptr); + + // conv3D_2_act ELU activation op. + auto conv3D_2_act = addElu(plugin_factory, *network, *conv3D_2_tran->getOutput(0), data_type, "conv3D_2_act"); + assert(conv3D_2_act != nullptr); + + // conv3D_3ds_pad padding op. + auto conv3D_3ds_pad = addPad(plugin_factory, *network, *conv3D_2_act->getOutput(0), {0, 0, 0, 0}, {1, 0, 0, 0}, "conv3D_3ds_pad"); + assert(conv3D_3ds_pad != nullptr); + + // conv3D_3ds 3D convolution op. + auto conv3D_3ds = addConv3D(plugin_factory, *network, *conv3D_3ds_pad->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 32, 3, 3}}, + Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_3ds_k"), weights.at("conv3D_3ds_b"), + "conv3D_3ds"); + assert(conv3D_3ds != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_3ds_tran = addTransform(plugin_factory, *network, *conv3D_3ds->getOutput(0), {1, 0, 2, 3}, "conv3D_3ds_tran_transform"); + assert(conv3D_3ds_tran != nullptr); + + // conv3D_3ds_act ELU activation op. + auto conv3D_3ds_act = addElu(plugin_factory, *network, *conv3D_3ds_tran->getOutput(0), data_type, "conv3D_3ds_act"); + assert(conv3D_3ds_act != nullptr); + + // conv3D_4 3D convolution op. 
+ auto conv3D_4 = addConv3D(plugin_factory, *network, *conv3D_3ds_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_4_k"), weights.at("conv3D_4_b"), + "conv3D_4"); + assert(conv3D_4 != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_4_tran = addTransform(plugin_factory, *network, *conv3D_4->getOutput(0), {1, 0, 2, 3}, "conv3D_4_tran_transform"); + assert(conv3D_4_tran != nullptr); + + // conv3D_4_act ELU activation op. + auto conv3D_4_act = addElu(plugin_factory, *network, *conv3D_4_tran->getOutput(0), data_type, "conv3D_4_act"); + assert(conv3D_4_act != nullptr); + + // conv3D_5 3D convolution op. + auto conv3D_5 = addConv3D(plugin_factory, *network, *conv3D_4_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_5_k"), weights.at("conv3D_5_b"), + "conv3D_5"); + assert(conv3D_5 != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_5_tran = addTransform(plugin_factory, *network, *conv3D_5->getOutput(0), {1, 0, 2, 3}, "conv3D_5_tran_transform"); + assert(conv3D_5_tran != nullptr); + + // conv3D_5_act ELU activation op. + auto conv3D_5_act = addElu(plugin_factory, *network, *conv3D_5_tran->getOutput(0), data_type, "conv3D_5_act"); + assert(conv3D_5_act != nullptr); + + // conv3D_6ds_pad padding op. + auto conv3D_6ds_pad = addPad(plugin_factory, *network, *conv3D_5_act->getOutput(0), {0, 0, 0, 0}, {1, 0, 0, 0}, "conv3D_6ds_pad"); + assert(conv3D_6ds_pad != nullptr); + + // conv3D_6ds 3D convolution op. + auto conv3D_6ds = addConv3D(plugin_factory, *network, *conv3D_6ds_pad->getOutput(0), + Conv3DType::kTensorFlow, {5, {128, 3, 64, 3, 3}}, + Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_6ds_k"), weights.at("conv3D_6ds_b"), + "conv3D_6ds"); + assert(conv3D_6ds != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_6ds_tran = addTransform(plugin_factory, *network, *conv3D_6ds->getOutput(0), {1, 0, 2, 3}, "conv3D_6ds_tran_transform"); + assert(conv3D_6ds_tran != nullptr); + + // conv3D_6ds_act ELU activation op. + auto conv3D_6ds_act = addElu(plugin_factory, *network, *conv3D_6ds_tran->getOutput(0), data_type, "conv3D_6ds_act"); + assert(conv3D_6ds_act != nullptr); + + // conv3D_7 3D convolution op. + auto conv3D_7 = addConv3D(plugin_factory, *network, *conv3D_6ds_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {128, 3, 128, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_7_k"), weights.at("conv3D_7_b"), + "conv3D_7"); + assert(conv3D_7 != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_7_tran = addTransform(plugin_factory, *network, *conv3D_7->getOutput(0), {1, 0, 2, 3}, "conv3D_7_tran_transform"); + assert(conv3D_7_tran != nullptr); + + // conv3D_7_act ELU activation op. + auto conv3D_7_act = addElu(plugin_factory, *network, *conv3D_7_tran->getOutput(0), data_type, "conv3D_7_act"); + assert(conv3D_7_act != nullptr); + + // conv3D_8 3D convolution op. 
+ auto conv3D_8 = addConv3D(plugin_factory, *network, *conv3D_7_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {128, 3, 128, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_8_k"), weights.at("conv3D_8_b"), + "conv3D_8"); + assert(conv3D_8 != nullptr); + + // conv3D_8_act ELU activation op. + auto conv3D_8_act = addElu(plugin_factory, *network, *conv3D_8->getOutput(0), data_type, "conv3D_8_act"); + assert(conv3D_8_act != nullptr); + + // deconv3D_1 3D transposed convolution op. + Dims deconv3D_1_out_dims{4, {25, 64, 81, 257}}; + auto deconv3D_1 = addConv3DTranspose(plugin_factory, *network, *conv3D_8_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {128, 3, 64, 3, 3}}, deconv3D_1_out_dims, + Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {0, 1, 1}}, + weights.at("deconv3D_1_k"), weights.at("deconv3D_1_b"), + "deconv3D_1"); + assert(deconv3D_1 != nullptr); + + // deconv3D_1 output slice op. + auto deconv3D_1_slice_layer = addSlice(plugin_factory, *network, *deconv3D_1->getOutput(0), + deconv3D_1_out_dims, + {4, {0, 0, 0, 0}}, + {4, {deconv3D_1_out_dims.d[0] - 1, deconv3D_1_out_dims.d[1], deconv3D_1_out_dims.d[2], deconv3D_1_out_dims.d[3]}}, + "deconv3D_1_slice"); + assert(deconv3D_1_slice_layer != nullptr); + + // deconv3D_1_add_skip tensor add op. + auto deconv3D_1_add_skip = network->addElementWise(*(deconv3D_1_slice_layer->getOutput(0)), *(conv3D_5_act->getOutput(0)), ElementWiseOperation::kSUM); + assert(deconv3D_1_add_skip != nullptr); + + // deconv3D_1_act ELU activation op. + auto deconv3D_1_act = addElu(plugin_factory, *network, *deconv3D_1_add_skip->getOutput(0), data_type, "deconv3D_1_act"); + assert(deconv3D_1_act != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto deconv3D_1_transform = addTransform(plugin_factory, *network, *deconv3D_1_act->getOutput(0), {1, 0, 2, 3}, "deconv3D_1_transform_transform"); + assert(deconv3D_1_transform != nullptr); + + // deconv3D_2 3D transposed convolution op. + Dims deconv3D_2_out_dims{4, {49, 32, 161, 513}}; + auto deconv3D_2 = addConv3DTranspose(plugin_factory, *network, *deconv3D_1_transform->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 32, 3, 3}}, deconv3D_2_out_dims, + Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {0, 1, 1}}, + weights.at("deconv3D_2_k"), weights.at("deconv3D_2_b"), + "deconv3D_2"); + assert(deconv3D_2 != nullptr); + + // deconv3D_2 output slice op. + auto deconv3D_2_slice_layer = addSlice(plugin_factory, *network, *deconv3D_2->getOutput(0), + deconv3D_2_out_dims, + {4, {0, 0, 0, 0}}, + {4, {deconv3D_2_out_dims.d[0] - 1, deconv3D_2_out_dims.d[1], deconv3D_2_out_dims.d[2], deconv3D_2_out_dims.d[3]}}, + "deconv3D_2_slice"); + assert(deconv3D_2_slice_layer != nullptr); + + // deconv3D_2_add_skip tensor add op. + auto deconv3D_2_add_skip = network->addElementWise(*(deconv3D_2_slice_layer->getOutput(0)), *(conv3D_2_act->getOutput(0)), ElementWiseOperation::kSUM); + assert(deconv3D_2_add_skip != nullptr); + + // deconv3D_2_act ELU activation op. 
+    auto deconv3D_2_act = addElu(plugin_factory, *network, *deconv3D_2_add_skip->getOutput(0), data_type, "deconv3D_2_act");
+    assert(deconv3D_2_act != nullptr);
+
+    // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose
+    auto deconv3D_2_transform = addTransform(plugin_factory, *network, *deconv3D_2_act->getOutput(0), {1, 0, 2, 3}, "deconv3D_2_transform_transform");
+    assert(deconv3D_2_transform != nullptr);
+
+    // deconv3D_3 3D transposed convolution op.
+    Dims deconv3D_3_out_dims{4, {97, 1, 321, 1025}};
+    auto deconv3D_3 = addConv3DTranspose(plugin_factory, *network, *deconv3D_2_transform->getOutput(0),
+                                         Conv3DType::kTensorFlow, {5, {32, 3, 1, 3, 3}}, deconv3D_3_out_dims,
+                                         Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {0, 1, 1}},
+                                         weights.at("deconv3D_3_k"), weights.at("deconv3D_3_b"),
+                                         "deconv3D_3");
+    assert(deconv3D_3 != nullptr);
+
+    // deconv3D_3 output slice op.
+    auto deconv3D_3_slice_layer = addSlice(plugin_factory, *network, *deconv3D_3->getOutput(0),
+                                           deconv3D_3_out_dims,
+                                           {4, {0, 0, 0, 0}},
+                                           {4, {deconv3D_3_out_dims.d[0] - 1, deconv3D_3_out_dims.d[1], deconv3D_3_out_dims.d[2], deconv3D_3_out_dims.d[3]}},
+                                           "deconv3D_3_slice");
+    assert(deconv3D_3_slice_layer != nullptr);
+
+    // Softargmax.
+    auto disp = addSoftargmax(plugin_factory, *network, *deconv3D_3_slice_layer->getOutput(0), "disp_softargmax");
+    assert(disp != nullptr);
+
+    auto disp_out = disp->getOutput(0);
+    disp_out->setName("disp");
+    network->markOutput(*disp_out);
+
+    return network;
+}
+
+} } // namespace
diff --git a/stereoDNN/sample_app/nvtiny_513x161_net.cpp b/stereoDNN/sample_app/nvtiny_513x161_net.cpp
new file mode 100644
index 0000000..5ed7a10
--- /dev/null
+++ b/stereoDNN/sample_app/nvtiny_513x161_net.cpp
@@ -0,0 +1,369 @@
+// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+// Full license terms provided in LICENSE.md file.
+
+//-------------------------------------------------------------------
+// !!! This file was automatically generated. Do not edit. !!!
+//-------------------------------------------------------------------
+
+#include <NvInfer.h>
+#include <cassert>
+#include <unordered_map>
+#include "redtail_tensorrt_plugins.h"
+
+namespace redtail { namespace tensorrt
+{
+
+using namespace nvinfer1;
+
+using weight_map = std::unordered_map<std::string, Weights>;
+
+INetworkDefinition* createNVTiny513x161Network(IBuilder& builder, IPluginContainer& plugin_factory,
+                                               DimsCHW img_dims, const weight_map& weights, DataType data_type,
+                                               ILogger& log)
+{
+    INetworkDefinition* network = builder.createNetwork();
+    assert(network != nullptr);
+    // Input tensor.
+    auto left = network->addInput("left", DataType::kFLOAT, img_dims);
+    assert(left != nullptr);
+
+    // Input tensor.
+    auto right = network->addInput("right", DataType::kFLOAT, img_dims);
+    assert(right != nullptr);
+
+    // Scaling op.
+    auto left_scale = network->addScale(*left, ScaleMode::kUNIFORM,
+                                        weights.at("left_scale_shift"), weights.at("left_scale_scale"), weights.at("left_scale_power"));
+    assert(left_scale != nullptr);
+
+    // Scaling op.
+    auto right_scale = network->addScale(*right, ScaleMode::kUNIFORM,
+                                         weights.at("right_scale_shift"), weights.at("right_scale_scale"), weights.at("right_scale_power"));
+    assert(right_scale != nullptr);
+
+    // left_conv1 convolution op.
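// [Note] The two addScale layers above use TensorRT's uniform scale mode,
// which computes y = (x * scale + shift) ^ power element-wise; the generator
// stores the input-normalization constants in the weights file under the
// "*_scale_shift" / "*_scale_scale" / "*_scale_power" keys. A sketch of the
// equivalent per-element math, with illustrative values only (the trained
// weights file supplies the real ones):

#include <cmath>

float uniformScale(float x, float shift, float scale, float power)
{
    return std::pow(x * scale + shift, power);
}

// e.g. mapping 8-bit pixels into [0, 1] would be scale = 1.0f / 255.0f,
// shift = 0.0f, power = 1.0f.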
+ auto left_conv1 = network->addConvolution(*left_scale->getOutput(0), 32, DimsHW {5, 5}, + weights.at("left_conv1_k"), weights.at("left_conv1_b")); + assert(left_conv1 != nullptr); + left_conv1->setStride( DimsHW {2, 2}); + left_conv1->setPadding(DimsHW {2, 2}); + + // left_conv1_act ELU activation op. + auto left_conv1_act = addElu(plugin_factory, *network, *left_conv1->getOutput(0), data_type, "left_conv1_act"); + assert(left_conv1_act != nullptr); + + // right_conv1 convolution op. + auto right_conv1 = network->addConvolution(*right_scale->getOutput(0), 32, DimsHW {5, 5}, + weights.at("right_conv1_k"), weights.at("right_conv1_b")); + assert(right_conv1 != nullptr); + right_conv1->setStride( DimsHW {2, 2}); + right_conv1->setPadding(DimsHW {2, 2}); + + // right_conv1_act ELU activation op. + auto right_conv1_act = addElu(plugin_factory, *network, *right_conv1->getOutput(0), data_type, "right_conv1_act"); + assert(right_conv1_act != nullptr); + + // left_conv2 convolution op. + auto left_conv2 = network->addConvolution(*left_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_conv2_k"), weights.at("left_conv2_b")); + assert(left_conv2 != nullptr); + left_conv2->setStride( DimsHW {1, 1}); + left_conv2->setPadding(DimsHW {1, 1}); + + // left_conv2_act ELU activation op. + auto left_conv2_act = addElu(plugin_factory, *network, *left_conv2->getOutput(0), data_type, "left_conv2_act"); + assert(left_conv2_act != nullptr); + + // right_conv2 convolution op. + auto right_conv2 = network->addConvolution(*right_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_conv2_k"), weights.at("right_conv2_b")); + assert(right_conv2 != nullptr); + right_conv2->setStride( DimsHW {1, 1}); + right_conv2->setPadding(DimsHW {1, 1}); + + // right_conv2_act ELU activation op. + auto right_conv2_act = addElu(plugin_factory, *network, *right_conv2->getOutput(0), data_type, "right_conv2_act"); + assert(right_conv2_act != nullptr); + + // left_conv3 convolution op. + auto left_conv3 = network->addConvolution(*left_conv2_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_conv3_k"), weights.at("left_conv3_b")); + assert(left_conv3 != nullptr); + left_conv3->setStride( DimsHW {1, 1}); + left_conv3->setPadding(DimsHW {1, 1}); + + // left_conv3_act ELU activation op. + auto left_conv3_act = addElu(plugin_factory, *network, *left_conv3->getOutput(0), data_type, "left_conv3_act"); + assert(left_conv3_act != nullptr); + + // right_conv3 convolution op. + auto right_conv3 = network->addConvolution(*right_conv2_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_conv3_k"), weights.at("right_conv3_b")); + assert(right_conv3 != nullptr); + right_conv3->setStride( DimsHW {1, 1}); + right_conv3->setPadding(DimsHW {1, 1}); + + // right_conv3_act ELU activation op. + auto right_conv3_act = addElu(plugin_factory, *network, *right_conv3->getOutput(0), data_type, "right_conv3_act"); + assert(right_conv3_act != nullptr); + + // left_conv4 convolution op. + auto left_conv4 = network->addConvolution(*left_conv3_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_conv4_k"), weights.at("left_conv4_b")); + assert(left_conv4 != nullptr); + left_conv4->setStride( DimsHW {1, 1}); + left_conv4->setPadding(DimsHW {1, 1}); + + // left_conv4_act ELU activation op. + auto left_conv4_act = addElu(plugin_factory, *network, *left_conv4->getOutput(0), data_type, "left_conv4_act"); + assert(left_conv4_act != nullptr); + + // right_conv4 convolution op. 
+ auto right_conv4 = network->addConvolution(*right_conv3_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_conv4_k"), weights.at("right_conv4_b")); + assert(right_conv4 != nullptr); + right_conv4->setStride( DimsHW {1, 1}); + right_conv4->setPadding(DimsHW {1, 1}); + + // right_conv4_act ELU activation op. + auto right_conv4_act = addElu(plugin_factory, *network, *right_conv4->getOutput(0), data_type, "right_conv4_act"); + assert(right_conv4_act != nullptr); + + // left_conv5 convolution op. + auto left_conv5 = network->addConvolution(*left_conv4_act->getOutput(0), 8, DimsHW {3, 3}, + weights.at("left_conv5_k"), weights.at("left_conv5_b")); + assert(left_conv5 != nullptr); + left_conv5->setStride( DimsHW {1, 1}); + left_conv5->setPadding(DimsHW {1, 1}); + + // right_conv5 convolution op. + auto right_conv5 = network->addConvolution(*right_conv4_act->getOutput(0), 8, DimsHW {3, 3}, + weights.at("right_conv5_k"), weights.at("right_conv5_b")); + assert(right_conv5 != nullptr); + right_conv5->setStride( DimsHW {1, 1}); + right_conv5->setPadding(DimsHW {1, 1}); + + // cost_vol cost volume op. + auto cost_vol = addCostVolume(plugin_factory, *network, *left_conv5->getOutput(0), *right_conv5->getOutput(0), 24, "cost_vol"); + assert(cost_vol != nullptr); + + // conv3D_1 3D convolution op. + auto conv3D_1 = addConv3D(plugin_factory, *network, *cost_vol->getOutput(0), + Conv3DType::kTensorFlow, {5, {16, 3, 16, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_1_k"), weights.at("conv3D_1_b"), + "conv3D_1"); + assert(conv3D_1 != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_1_tran = addTransform(plugin_factory, *network, *conv3D_1->getOutput(0), {1, 0, 2, 3}, "conv3D_1_tran_transform"); + assert(conv3D_1_tran != nullptr); + + // conv3D_1_act ELU activation op. + auto conv3D_1_act = addElu(plugin_factory, *network, *conv3D_1_tran->getOutput(0), data_type, "conv3D_1_act"); + assert(conv3D_1_act != nullptr); + + // conv3D_2 3D convolution op. + auto conv3D_2 = addConv3D(plugin_factory, *network, *conv3D_1_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {16, 3, 16, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_2_k"), weights.at("conv3D_2_b"), + "conv3D_2"); + assert(conv3D_2 != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_2_tran = addTransform(plugin_factory, *network, *conv3D_2->getOutput(0), {1, 0, 2, 3}, "conv3D_2_tran_transform"); + assert(conv3D_2_tran != nullptr); + + // conv3D_2_act ELU activation op. + auto conv3D_2_act = addElu(plugin_factory, *network, *conv3D_2_tran->getOutput(0), data_type, "conv3D_2_act"); + assert(conv3D_2_act != nullptr); + + // conv3D_3ds_pad padding op. + auto conv3D_3ds_pad = addPad(plugin_factory, *network, *conv3D_2_act->getOutput(0), {0, 0, 0, 0}, {1, 0, 0, 0}, "conv3D_3ds_pad"); + assert(conv3D_3ds_pad != nullptr); + + // conv3D_3ds 3D convolution op. 
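// [Note] addCostVolume above is the hand-off from 2D to 3D processing: for
// each of the 24 candidate disparities it pairs the left feature map with the
// right feature map shifted by that disparity. The channel counts suggest a
// concatenation-style (GC-Net-like) volume: conv5 produces 8 maps per side and
// conv3D_1's kernel consumes 16 input channels. A naive CPU sketch of that
// indexing (hypothetical, for intuition only; the CostVolume plugin does this
// on the GPU):

// left/right: C x H x W feature maps; out: D x 2C x H x W.
void costVolumeNaive(const float* left, const float* right, float* out,
                     int C, int H, int W, int D)
{
    for (int d = 0; d < D; ++d)
        for (int c = 0; c < 2 * C; ++c)
            for (int y = 0; y < H; ++y)
                for (int x = 0; x < W; ++x)
                {
                    float v = 0.0f;
                    if (c < C)               // left features, unshifted
                        v = left[(c * H + y) * W + x];
                    else if (x - d >= 0)     // right features shifted by d
                        v = right[((c - C) * H + y) * W + (x - d)];
                    out[((d * 2 * C + c) * H + y) * W + x] = v;
                }
}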
+ auto conv3D_3ds = addConv3D(plugin_factory, *network, *conv3D_3ds_pad->getOutput(0), + Conv3DType::kTensorFlow, {5, {32, 3, 16, 3, 3}}, + Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_3ds_k"), weights.at("conv3D_3ds_b"), + "conv3D_3ds"); + assert(conv3D_3ds != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_3ds_tran = addTransform(plugin_factory, *network, *conv3D_3ds->getOutput(0), {1, 0, 2, 3}, "conv3D_3ds_tran_transform"); + assert(conv3D_3ds_tran != nullptr); + + // conv3D_3ds_act ELU activation op. + auto conv3D_3ds_act = addElu(plugin_factory, *network, *conv3D_3ds_tran->getOutput(0), data_type, "conv3D_3ds_act"); + assert(conv3D_3ds_act != nullptr); + + // conv3D_4 3D convolution op. + auto conv3D_4 = addConv3D(plugin_factory, *network, *conv3D_3ds_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {32, 3, 32, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_4_k"), weights.at("conv3D_4_b"), + "conv3D_4"); + assert(conv3D_4 != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_4_tran = addTransform(plugin_factory, *network, *conv3D_4->getOutput(0), {1, 0, 2, 3}, "conv3D_4_tran_transform"); + assert(conv3D_4_tran != nullptr); + + // conv3D_4_act ELU activation op. + auto conv3D_4_act = addElu(plugin_factory, *network, *conv3D_4_tran->getOutput(0), data_type, "conv3D_4_act"); + assert(conv3D_4_act != nullptr); + + // conv3D_5 3D convolution op. + auto conv3D_5 = addConv3D(plugin_factory, *network, *conv3D_4_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {32, 3, 32, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_5_k"), weights.at("conv3D_5_b"), + "conv3D_5"); + assert(conv3D_5 != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_5_tran = addTransform(plugin_factory, *network, *conv3D_5->getOutput(0), {1, 0, 2, 3}, "conv3D_5_tran_transform"); + assert(conv3D_5_tran != nullptr); + + // conv3D_5_act ELU activation op. + auto conv3D_5_act = addElu(plugin_factory, *network, *conv3D_5_tran->getOutput(0), data_type, "conv3D_5_act"); + assert(conv3D_5_act != nullptr); + + // conv3D_6ds_pad padding op. + auto conv3D_6ds_pad = addPad(plugin_factory, *network, *conv3D_5_act->getOutput(0), {0, 0, 0, 0}, {1, 0, 0, 0}, "conv3D_6ds_pad"); + assert(conv3D_6ds_pad != nullptr); + + // conv3D_6ds 3D convolution op. + auto conv3D_6ds = addConv3D(plugin_factory, *network, *conv3D_6ds_pad->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 32, 3, 3}}, + Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_6ds_k"), weights.at("conv3D_6ds_b"), + "conv3D_6ds"); + assert(conv3D_6ds != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_6ds_tran = addTransform(plugin_factory, *network, *conv3D_6ds->getOutput(0), {1, 0, 2, 3}, "conv3D_6ds_tran_transform"); + assert(conv3D_6ds_tran != nullptr); + + // conv3D_6ds_act ELU activation op. + auto conv3D_6ds_act = addElu(plugin_factory, *network, *conv3D_6ds_tran->getOutput(0), data_type, "conv3D_6ds_act"); + assert(conv3D_6ds_act != nullptr); + + // conv3D_7 3D convolution op. 
+ auto conv3D_7 = addConv3D(plugin_factory, *network, *conv3D_6ds_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_7_k"), weights.at("conv3D_7_b"), + "conv3D_7"); + assert(conv3D_7 != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_7_tran = addTransform(plugin_factory, *network, *conv3D_7->getOutput(0), {1, 0, 2, 3}, "conv3D_7_tran_transform"); + assert(conv3D_7_tran != nullptr); + + // conv3D_7_act ELU activation op. + auto conv3D_7_act = addElu(plugin_factory, *network, *conv3D_7_tran->getOutput(0), data_type, "conv3D_7_act"); + assert(conv3D_7_act != nullptr); + + // conv3D_8 3D convolution op. + auto conv3D_8 = addConv3D(plugin_factory, *network, *conv3D_7_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_8_k"), weights.at("conv3D_8_b"), + "conv3D_8"); + assert(conv3D_8 != nullptr); + + // conv3D_8_act ELU activation op. + auto conv3D_8_act = addElu(plugin_factory, *network, *conv3D_8->getOutput(0), data_type, "conv3D_8_act"); + assert(conv3D_8_act != nullptr); + + // deconv3D_1 3D transposed convolution op. + Dims deconv3D_1_out_dims{4, {13, 32, 41, 129}}; + auto deconv3D_1 = addConv3DTranspose(plugin_factory, *network, *conv3D_8_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 32, 3, 3}}, deconv3D_1_out_dims, + Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {0, 1, 1}}, + weights.at("deconv3D_1_k"), weights.at("deconv3D_1_b"), + "deconv3D_1"); + assert(deconv3D_1 != nullptr); + + // deconv3D_1 output slice op. + auto deconv3D_1_slice_layer = addSlice(plugin_factory, *network, *deconv3D_1->getOutput(0), + deconv3D_1_out_dims, + {4, {0, 0, 0, 0}}, + {4, {deconv3D_1_out_dims.d[0] - 1, deconv3D_1_out_dims.d[1], deconv3D_1_out_dims.d[2], deconv3D_1_out_dims.d[3]}}, + "deconv3D_1_slice"); + assert(deconv3D_1_slice_layer != nullptr); + + // deconv3D_1_add_skip tensor add op. + auto deconv3D_1_add_skip = network->addElementWise(*(deconv3D_1_slice_layer->getOutput(0)), *(conv3D_5_act->getOutput(0)), ElementWiseOperation::kSUM); + assert(deconv3D_1_add_skip != nullptr); + + // deconv3D_1_act ELU activation op. + auto deconv3D_1_act = addElu(plugin_factory, *network, *deconv3D_1_add_skip->getOutput(0), data_type, "deconv3D_1_act"); + assert(deconv3D_1_act != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto deconv3D_1_transform = addTransform(plugin_factory, *network, *deconv3D_1_act->getOutput(0), {1, 0, 2, 3}, "deconv3D_1_transform_transform"); + assert(deconv3D_1_transform != nullptr); + + // deconv3D_2 3D transposed convolution op. + Dims deconv3D_2_out_dims{4, {25, 16, 81, 257}}; + auto deconv3D_2 = addConv3DTranspose(plugin_factory, *network, *deconv3D_1_transform->getOutput(0), + Conv3DType::kTensorFlow, {5, {32, 3, 16, 3, 3}}, deconv3D_2_out_dims, + Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {0, 1, 1}}, + weights.at("deconv3D_2_k"), weights.at("deconv3D_2_b"), + "deconv3D_2"); + assert(deconv3D_2 != nullptr); + + // deconv3D_2 output slice op. 
+    auto deconv3D_2_slice_layer = addSlice(plugin_factory, *network, *deconv3D_2->getOutput(0),
+                                           deconv3D_2_out_dims,
+                                           {4, {0, 0, 0, 0}},
+                                           {4, {deconv3D_2_out_dims.d[0] - 1, deconv3D_2_out_dims.d[1], deconv3D_2_out_dims.d[2], deconv3D_2_out_dims.d[3]}},
+                                           "deconv3D_2_slice");
+    assert(deconv3D_2_slice_layer != nullptr);
+
+    // deconv3D_2_add_skip tensor add op.
+    auto deconv3D_2_add_skip = network->addElementWise(*(deconv3D_2_slice_layer->getOutput(0)), *(conv3D_2_act->getOutput(0)), ElementWiseOperation::kSUM);
+    assert(deconv3D_2_add_skip != nullptr);
+
+    // deconv3D_2_act ELU activation op.
+    auto deconv3D_2_act = addElu(plugin_factory, *network, *deconv3D_2_add_skip->getOutput(0), data_type, "deconv3D_2_act");
+    assert(deconv3D_2_act != nullptr);
+
+    // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose
+    auto deconv3D_2_transform = addTransform(plugin_factory, *network, *deconv3D_2_act->getOutput(0), {1, 0, 2, 3}, "deconv3D_2_transform_transform");
+    assert(deconv3D_2_transform != nullptr);
+
+    // deconv3D_3 3D transposed convolution op.
+    Dims deconv3D_3_out_dims{4, {49, 1, 161, 513}};
+    auto deconv3D_3 = addConv3DTranspose(plugin_factory, *network, *deconv3D_2_transform->getOutput(0),
+                                         Conv3DType::kTensorFlow, {5, {16, 3, 1, 3, 3}}, deconv3D_3_out_dims,
+                                         Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {0, 1, 1}},
+                                         weights.at("deconv3D_3_k"), weights.at("deconv3D_3_b"),
+                                         "deconv3D_3");
+    assert(deconv3D_3 != nullptr);
+
+    // deconv3D_3 output slice op.
+    auto deconv3D_3_slice_layer = addSlice(plugin_factory, *network, *deconv3D_3->getOutput(0),
+                                           deconv3D_3_out_dims,
+                                           {4, {0, 0, 0, 0}},
+                                           {4, {deconv3D_3_out_dims.d[0] - 1, deconv3D_3_out_dims.d[1], deconv3D_3_out_dims.d[2], deconv3D_3_out_dims.d[3]}},
+                                           "deconv3D_3_slice");
+    assert(deconv3D_3_slice_layer != nullptr);
+
+    // Softargmax.
+    auto disp = addSoftargmax(plugin_factory, *network, *deconv3D_3_slice_layer->getOutput(0), "disp_softargmax");
+    assert(disp != nullptr);
+
+    auto disp_out = disp->getOutput(0);
+    disp_out->setName("disp");
+    network->markOutput(*disp_out);
+
+    return network;
+}
+
+} } // namespace
diff --git a/stereoDNN/sample_app/resnet18_1025x321_net.cpp b/stereoDNN/sample_app/resnet18_1025x321_net.cpp
new file mode 100644
index 0000000..426a5fc
--- /dev/null
+++ b/stereoDNN/sample_app/resnet18_1025x321_net.cpp
@@ -0,0 +1,880 @@
+// Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+// Full license terms provided in LICENSE.md file.
+
+//-------------------------------------------------------------------
+// !!! This file was automatically generated. Do not edit. !!!
+//-------------------------------------------------------------------
+
+#include <NvInfer.h>
+#include <cassert>
+#include <unordered_map>
+#include "redtail_tensorrt_plugins.h"
+
+namespace redtail { namespace tensorrt
+{
+
+using namespace nvinfer1;
+
+using weight_map = std::unordered_map<std::string, Weights>;
+
+INetworkDefinition* createResNet18_1025x321Network(IBuilder& builder, IPluginContainer& plugin_factory,
+                                                   DimsCHW img_dims, const weight_map& weights, DataType data_type,
+                                                   ILogger& log)
+{
+    INetworkDefinition* network = builder.createNetwork();
+    assert(network != nullptr);
+    // Input tensor.
+    auto left = network->addInput("left", DataType::kFLOAT, img_dims);
+    assert(left != nullptr);
+
+    // Input tensor.
+    auto right = network->addInput("right", DataType::kFLOAT, img_dims);
+    assert(right != nullptr);
+
+    // Scaling op.
+ auto left_scale = network->addScale(*left, ScaleMode::kUNIFORM, + weights.at("left_scale_shift"), weights.at("left_scale_scale"), weights.at("left_scale_power")); + assert(left_scale != nullptr); + + // Scaling op. + auto right_scale = network->addScale(*right, ScaleMode::kUNIFORM, + weights.at("right_scale_shift"), weights.at("right_scale_scale"), weights.at("right_scale_power")); + assert(right_scale != nullptr); + + // left_conv1 convolution op. + auto left_conv1 = network->addConvolution(*left_scale->getOutput(0), 32, DimsHW {5, 5}, + weights.at("left_conv1_k"), weights.at("left_conv1_b")); + assert(left_conv1 != nullptr); + left_conv1->setStride( DimsHW {2, 2}); + left_conv1->setPadding(DimsHW {2, 2}); + + // left_conv1_act ELU activation op. + auto left_conv1_act = addElu(plugin_factory, *network, *left_conv1->getOutput(0), data_type, "left_conv1_act"); + assert(left_conv1_act != nullptr); + + // right_conv1 convolution op. + auto right_conv1 = network->addConvolution(*right_scale->getOutput(0), 32, DimsHW {5, 5}, + weights.at("right_conv1_k"), weights.at("right_conv1_b")); + assert(right_conv1 != nullptr); + right_conv1->setStride( DimsHW {2, 2}); + right_conv1->setPadding(DimsHW {2, 2}); + + // right_conv1_act ELU activation op. + auto right_conv1_act = addElu(plugin_factory, *network, *right_conv1->getOutput(0), data_type, "right_conv1_act"); + assert(right_conv1_act != nullptr); + + // left_resblock1_conv1 convolution op. + auto left_resblock1_conv1 = network->addConvolution(*left_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock1_conv1_k"), weights.at("left_resblock1_conv1_b")); + assert(left_resblock1_conv1 != nullptr); + left_resblock1_conv1->setStride( DimsHW {1, 1}); + left_resblock1_conv1->setPadding(DimsHW {1, 1}); + + // left_resblock1_conv1_act ELU activation op. + auto left_resblock1_conv1_act = addElu(plugin_factory, *network, *left_resblock1_conv1->getOutput(0), data_type, "left_resblock1_conv1_act"); + assert(left_resblock1_conv1_act != nullptr); + + // left_resblock1_conv2 convolution op. + auto left_resblock1_conv2 = network->addConvolution(*left_resblock1_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock1_conv2_k"), weights.at("left_resblock1_conv2_b")); + assert(left_resblock1_conv2 != nullptr); + left_resblock1_conv2->setStride( DimsHW {1, 1}); + left_resblock1_conv2->setPadding(DimsHW {1, 1}); + + // left_resblock1_conv2_add tensor add op. + auto left_resblock1_conv2_add = network->addElementWise(*(left_resblock1_conv2->getOutput(0)), *(left_conv1_act->getOutput(0)), ElementWiseOperation::kSUM); + assert(left_resblock1_conv2_add != nullptr); + + // left_resblock1_conv2_add_act ELU activation op. + auto left_resblock1_conv2_add_act = addElu(plugin_factory, *network, *left_resblock1_conv2_add->getOutput(0), data_type, "left_resblock1_conv2_add_act"); + assert(left_resblock1_conv2_add_act != nullptr); + + // right_resblock1_conv1 convolution op. + auto right_resblock1_conv1 = network->addConvolution(*right_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock1_conv1_k"), weights.at("right_resblock1_conv1_b")); + assert(right_resblock1_conv1 != nullptr); + right_resblock1_conv1->setStride( DimsHW {1, 1}); + right_resblock1_conv1->setPadding(DimsHW {1, 1}); + + // right_resblock1_conv1_act ELU activation op. 
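// [Note] left_resblock1 above is the template for all sixteen residual blocks
// in this file (resblock1 through resblock8 on each side): conv(3x3) -> ELU ->
// conv(3x3) -> element-wise add of the block input -> ELU, all stride 1 /
// pad 1, so the skip shapes match by construction. A hypothetical factored
// form (not part of this diff; assumes the surrounding addElu wrapper and
// weight_map typedef):

static ITensor* addResBlock(IPluginContainer& plugin_factory, INetworkDefinition& network,
                            ITensor& input, int num_maps, const weight_map& weights,
                            const std::string& name, DataType data_type)
{
    auto conv1 = network.addConvolution(input, num_maps, DimsHW{3, 3},
                                        weights.at(name + "_conv1_k"), weights.at(name + "_conv1_b"));
    assert(conv1 != nullptr);
    conv1->setStride( DimsHW{1, 1});
    conv1->setPadding(DimsHW{1, 1});
    auto act1 = addElu(plugin_factory, network, *conv1->getOutput(0), data_type, name + "_conv1_act");
    assert(act1 != nullptr);

    auto conv2 = network.addConvolution(*act1->getOutput(0), num_maps, DimsHW{3, 3},
                                        weights.at(name + "_conv2_k"), weights.at(name + "_conv2_b"));
    assert(conv2 != nullptr);
    conv2->setStride( DimsHW{1, 1});
    conv2->setPadding(DimsHW{1, 1});

    auto add = network.addElementWise(*conv2->getOutput(0), input, ElementWiseOperation::kSUM);
    assert(add != nullptr);

    auto act2 = addElu(plugin_factory, network, *add->getOutput(0), data_type, name + "_conv2_add_act");
    assert(act2 != nullptr);
    return act2->getOutput(0);
}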
+ auto right_resblock1_conv1_act = addElu(plugin_factory, *network, *right_resblock1_conv1->getOutput(0), data_type, "right_resblock1_conv1_act"); + assert(right_resblock1_conv1_act != nullptr); + + // right_resblock1_conv2 convolution op. + auto right_resblock1_conv2 = network->addConvolution(*right_resblock1_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock1_conv2_k"), weights.at("right_resblock1_conv2_b")); + assert(right_resblock1_conv2 != nullptr); + right_resblock1_conv2->setStride( DimsHW {1, 1}); + right_resblock1_conv2->setPadding(DimsHW {1, 1}); + + // right_resblock1_conv2_add tensor add op. + auto right_resblock1_conv2_add = network->addElementWise(*(right_resblock1_conv2->getOutput(0)), *(right_conv1_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(right_resblock1_conv2_add != nullptr); + + // right_resblock1_conv2_add_act ELU activation op. + auto right_resblock1_conv2_add_act = addElu(plugin_factory, *network, *right_resblock1_conv2_add->getOutput(0), data_type, "right_resblock1_conv2_add_act"); + assert(right_resblock1_conv2_add_act != nullptr); + + // left_resblock2_conv1 convolution op. + auto left_resblock2_conv1 = network->addConvolution(*left_resblock1_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock2_conv1_k"), weights.at("left_resblock2_conv1_b")); + assert(left_resblock2_conv1 != nullptr); + left_resblock2_conv1->setStride( DimsHW {1, 1}); + left_resblock2_conv1->setPadding(DimsHW {1, 1}); + + // left_resblock2_conv1_act ELU activation op. + auto left_resblock2_conv1_act = addElu(plugin_factory, *network, *left_resblock2_conv1->getOutput(0), data_type, "left_resblock2_conv1_act"); + assert(left_resblock2_conv1_act != nullptr); + + // left_resblock2_conv2 convolution op. + auto left_resblock2_conv2 = network->addConvolution(*left_resblock2_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock2_conv2_k"), weights.at("left_resblock2_conv2_b")); + assert(left_resblock2_conv2 != nullptr); + left_resblock2_conv2->setStride( DimsHW {1, 1}); + left_resblock2_conv2->setPadding(DimsHW {1, 1}); + + // left_resblock2_conv2_add tensor add op. + auto left_resblock2_conv2_add = network->addElementWise(*(left_resblock2_conv2->getOutput(0)), *(left_resblock1_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(left_resblock2_conv2_add != nullptr); + + // left_resblock2_conv2_add_act ELU activation op. + auto left_resblock2_conv2_add_act = addElu(plugin_factory, *network, *left_resblock2_conv2_add->getOutput(0), data_type, "left_resblock2_conv2_add_act"); + assert(left_resblock2_conv2_add_act != nullptr); + + // right_resblock2_conv1 convolution op. + auto right_resblock2_conv1 = network->addConvolution(*right_resblock1_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock2_conv1_k"), weights.at("right_resblock2_conv1_b")); + assert(right_resblock2_conv1 != nullptr); + right_resblock2_conv1->setStride( DimsHW {1, 1}); + right_resblock2_conv1->setPadding(DimsHW {1, 1}); + + // right_resblock2_conv1_act ELU activation op. + auto right_resblock2_conv1_act = addElu(plugin_factory, *network, *right_resblock2_conv1->getOutput(0), data_type, "right_resblock2_conv1_act"); + assert(right_resblock2_conv1_act != nullptr); + + // right_resblock2_conv2 convolution op. 
+ auto right_resblock2_conv2 = network->addConvolution(*right_resblock2_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock2_conv2_k"), weights.at("right_resblock2_conv2_b")); + assert(right_resblock2_conv2 != nullptr); + right_resblock2_conv2->setStride( DimsHW {1, 1}); + right_resblock2_conv2->setPadding(DimsHW {1, 1}); + + // right_resblock2_conv2_add tensor add op. + auto right_resblock2_conv2_add = network->addElementWise(*(right_resblock2_conv2->getOutput(0)), *(right_resblock1_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(right_resblock2_conv2_add != nullptr); + + // right_resblock2_conv2_add_act ELU activation op. + auto right_resblock2_conv2_add_act = addElu(plugin_factory, *network, *right_resblock2_conv2_add->getOutput(0), data_type, "right_resblock2_conv2_add_act"); + assert(right_resblock2_conv2_add_act != nullptr); + + // left_resblock3_conv1 convolution op. + auto left_resblock3_conv1 = network->addConvolution(*left_resblock2_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock3_conv1_k"), weights.at("left_resblock3_conv1_b")); + assert(left_resblock3_conv1 != nullptr); + left_resblock3_conv1->setStride( DimsHW {1, 1}); + left_resblock3_conv1->setPadding(DimsHW {1, 1}); + + // left_resblock3_conv1_act ELU activation op. + auto left_resblock3_conv1_act = addElu(plugin_factory, *network, *left_resblock3_conv1->getOutput(0), data_type, "left_resblock3_conv1_act"); + assert(left_resblock3_conv1_act != nullptr); + + // left_resblock3_conv2 convolution op. + auto left_resblock3_conv2 = network->addConvolution(*left_resblock3_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock3_conv2_k"), weights.at("left_resblock3_conv2_b")); + assert(left_resblock3_conv2 != nullptr); + left_resblock3_conv2->setStride( DimsHW {1, 1}); + left_resblock3_conv2->setPadding(DimsHW {1, 1}); + + // left_resblock3_conv2_add tensor add op. + auto left_resblock3_conv2_add = network->addElementWise(*(left_resblock3_conv2->getOutput(0)), *(left_resblock2_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(left_resblock3_conv2_add != nullptr); + + // left_resblock3_conv2_add_act ELU activation op. + auto left_resblock3_conv2_add_act = addElu(plugin_factory, *network, *left_resblock3_conv2_add->getOutput(0), data_type, "left_resblock3_conv2_add_act"); + assert(left_resblock3_conv2_add_act != nullptr); + + // right_resblock3_conv1 convolution op. + auto right_resblock3_conv1 = network->addConvolution(*right_resblock2_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock3_conv1_k"), weights.at("right_resblock3_conv1_b")); + assert(right_resblock3_conv1 != nullptr); + right_resblock3_conv1->setStride( DimsHW {1, 1}); + right_resblock3_conv1->setPadding(DimsHW {1, 1}); + + // right_resblock3_conv1_act ELU activation op. + auto right_resblock3_conv1_act = addElu(plugin_factory, *network, *right_resblock3_conv1->getOutput(0), data_type, "right_resblock3_conv1_act"); + assert(right_resblock3_conv1_act != nullptr); + + // right_resblock3_conv2 convolution op. + auto right_resblock3_conv2 = network->addConvolution(*right_resblock3_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock3_conv2_k"), weights.at("right_resblock3_conv2_b")); + assert(right_resblock3_conv2 != nullptr); + right_resblock3_conv2->setStride( DimsHW {1, 1}); + right_resblock3_conv2->setPadding(DimsHW {1, 1}); + + // right_resblock3_conv2_add tensor add op. 
+ auto right_resblock3_conv2_add = network->addElementWise(*(right_resblock3_conv2->getOutput(0)), *(right_resblock2_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(right_resblock3_conv2_add != nullptr); + + // right_resblock3_conv2_add_act ELU activation op. + auto right_resblock3_conv2_add_act = addElu(plugin_factory, *network, *right_resblock3_conv2_add->getOutput(0), data_type, "right_resblock3_conv2_add_act"); + assert(right_resblock3_conv2_add_act != nullptr); + + // left_resblock4_conv1 convolution op. + auto left_resblock4_conv1 = network->addConvolution(*left_resblock3_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock4_conv1_k"), weights.at("left_resblock4_conv1_b")); + assert(left_resblock4_conv1 != nullptr); + left_resblock4_conv1->setStride( DimsHW {1, 1}); + left_resblock4_conv1->setPadding(DimsHW {1, 1}); + + // left_resblock4_conv1_act ELU activation op. + auto left_resblock4_conv1_act = addElu(plugin_factory, *network, *left_resblock4_conv1->getOutput(0), data_type, "left_resblock4_conv1_act"); + assert(left_resblock4_conv1_act != nullptr); + + // left_resblock4_conv2 convolution op. + auto left_resblock4_conv2 = network->addConvolution(*left_resblock4_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock4_conv2_k"), weights.at("left_resblock4_conv2_b")); + assert(left_resblock4_conv2 != nullptr); + left_resblock4_conv2->setStride( DimsHW {1, 1}); + left_resblock4_conv2->setPadding(DimsHW {1, 1}); + + // left_resblock4_conv2_add tensor add op. + auto left_resblock4_conv2_add = network->addElementWise(*(left_resblock4_conv2->getOutput(0)), *(left_resblock3_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(left_resblock4_conv2_add != nullptr); + + // left_resblock4_conv2_add_act ELU activation op. + auto left_resblock4_conv2_add_act = addElu(plugin_factory, *network, *left_resblock4_conv2_add->getOutput(0), data_type, "left_resblock4_conv2_add_act"); + assert(left_resblock4_conv2_add_act != nullptr); + + // right_resblock4_conv1 convolution op. + auto right_resblock4_conv1 = network->addConvolution(*right_resblock3_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock4_conv1_k"), weights.at("right_resblock4_conv1_b")); + assert(right_resblock4_conv1 != nullptr); + right_resblock4_conv1->setStride( DimsHW {1, 1}); + right_resblock4_conv1->setPadding(DimsHW {1, 1}); + + // right_resblock4_conv1_act ELU activation op. + auto right_resblock4_conv1_act = addElu(plugin_factory, *network, *right_resblock4_conv1->getOutput(0), data_type, "right_resblock4_conv1_act"); + assert(right_resblock4_conv1_act != nullptr); + + // right_resblock4_conv2 convolution op. + auto right_resblock4_conv2 = network->addConvolution(*right_resblock4_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock4_conv2_k"), weights.at("right_resblock4_conv2_b")); + assert(right_resblock4_conv2 != nullptr); + right_resblock4_conv2->setStride( DimsHW {1, 1}); + right_resblock4_conv2->setPadding(DimsHW {1, 1}); + + // right_resblock4_conv2_add tensor add op. + auto right_resblock4_conv2_add = network->addElementWise(*(right_resblock4_conv2->getOutput(0)), *(right_resblock3_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(right_resblock4_conv2_add != nullptr); + + // right_resblock4_conv2_add_act ELU activation op. 
+ auto right_resblock4_conv2_add_act = addElu(plugin_factory, *network, *right_resblock4_conv2_add->getOutput(0), data_type, "right_resblock4_conv2_add_act"); + assert(right_resblock4_conv2_add_act != nullptr); + + // left_resblock5_conv1 convolution op. + auto left_resblock5_conv1 = network->addConvolution(*left_resblock4_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock5_conv1_k"), weights.at("left_resblock5_conv1_b")); + assert(left_resblock5_conv1 != nullptr); + left_resblock5_conv1->setStride( DimsHW {1, 1}); + left_resblock5_conv1->setPadding(DimsHW {1, 1}); + + // left_resblock5_conv1_act ELU activation op. + auto left_resblock5_conv1_act = addElu(plugin_factory, *network, *left_resblock5_conv1->getOutput(0), data_type, "left_resblock5_conv1_act"); + assert(left_resblock5_conv1_act != nullptr); + + // left_resblock5_conv2 convolution op. + auto left_resblock5_conv2 = network->addConvolution(*left_resblock5_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock5_conv2_k"), weights.at("left_resblock5_conv2_b")); + assert(left_resblock5_conv2 != nullptr); + left_resblock5_conv2->setStride( DimsHW {1, 1}); + left_resblock5_conv2->setPadding(DimsHW {1, 1}); + + // left_resblock5_conv2_add tensor add op. + auto left_resblock5_conv2_add = network->addElementWise(*(left_resblock5_conv2->getOutput(0)), *(left_resblock4_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(left_resblock5_conv2_add != nullptr); + + // left_resblock5_conv2_add_act ELU activation op. + auto left_resblock5_conv2_add_act = addElu(plugin_factory, *network, *left_resblock5_conv2_add->getOutput(0), data_type, "left_resblock5_conv2_add_act"); + assert(left_resblock5_conv2_add_act != nullptr); + + // right_resblock5_conv1 convolution op. + auto right_resblock5_conv1 = network->addConvolution(*right_resblock4_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock5_conv1_k"), weights.at("right_resblock5_conv1_b")); + assert(right_resblock5_conv1 != nullptr); + right_resblock5_conv1->setStride( DimsHW {1, 1}); + right_resblock5_conv1->setPadding(DimsHW {1, 1}); + + // right_resblock5_conv1_act ELU activation op. + auto right_resblock5_conv1_act = addElu(plugin_factory, *network, *right_resblock5_conv1->getOutput(0), data_type, "right_resblock5_conv1_act"); + assert(right_resblock5_conv1_act != nullptr); + + // right_resblock5_conv2 convolution op. + auto right_resblock5_conv2 = network->addConvolution(*right_resblock5_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock5_conv2_k"), weights.at("right_resblock5_conv2_b")); + assert(right_resblock5_conv2 != nullptr); + right_resblock5_conv2->setStride( DimsHW {1, 1}); + right_resblock5_conv2->setPadding(DimsHW {1, 1}); + + // right_resblock5_conv2_add tensor add op. + auto right_resblock5_conv2_add = network->addElementWise(*(right_resblock5_conv2->getOutput(0)), *(right_resblock4_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(right_resblock5_conv2_add != nullptr); + + // right_resblock5_conv2_add_act ELU activation op. + auto right_resblock5_conv2_add_act = addElu(plugin_factory, *network, *right_resblock5_conv2_add->getOutput(0), data_type, "right_resblock5_conv2_add_act"); + assert(right_resblock5_conv2_add_act != nullptr); + + // left_resblock6_conv1 convolution op. 
+ auto left_resblock6_conv1 = network->addConvolution(*left_resblock5_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock6_conv1_k"), weights.at("left_resblock6_conv1_b")); + assert(left_resblock6_conv1 != nullptr); + left_resblock6_conv1->setStride( DimsHW {1, 1}); + left_resblock6_conv1->setPadding(DimsHW {1, 1}); + + // left_resblock6_conv1_act ELU activation op. + auto left_resblock6_conv1_act = addElu(plugin_factory, *network, *left_resblock6_conv1->getOutput(0), data_type, "left_resblock6_conv1_act"); + assert(left_resblock6_conv1_act != nullptr); + + // left_resblock6_conv2 convolution op. + auto left_resblock6_conv2 = network->addConvolution(*left_resblock6_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock6_conv2_k"), weights.at("left_resblock6_conv2_b")); + assert(left_resblock6_conv2 != nullptr); + left_resblock6_conv2->setStride( DimsHW {1, 1}); + left_resblock6_conv2->setPadding(DimsHW {1, 1}); + + // left_resblock6_conv2_add tensor add op. + auto left_resblock6_conv2_add = network->addElementWise(*(left_resblock6_conv2->getOutput(0)), *(left_resblock5_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(left_resblock6_conv2_add != nullptr); + + // left_resblock6_conv2_add_act ELU activation op. + auto left_resblock6_conv2_add_act = addElu(plugin_factory, *network, *left_resblock6_conv2_add->getOutput(0), data_type, "left_resblock6_conv2_add_act"); + assert(left_resblock6_conv2_add_act != nullptr); + + // right_resblock6_conv1 convolution op. + auto right_resblock6_conv1 = network->addConvolution(*right_resblock5_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock6_conv1_k"), weights.at("right_resblock6_conv1_b")); + assert(right_resblock6_conv1 != nullptr); + right_resblock6_conv1->setStride( DimsHW {1, 1}); + right_resblock6_conv1->setPadding(DimsHW {1, 1}); + + // right_resblock6_conv1_act ELU activation op. + auto right_resblock6_conv1_act = addElu(plugin_factory, *network, *right_resblock6_conv1->getOutput(0), data_type, "right_resblock6_conv1_act"); + assert(right_resblock6_conv1_act != nullptr); + + // right_resblock6_conv2 convolution op. + auto right_resblock6_conv2 = network->addConvolution(*right_resblock6_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock6_conv2_k"), weights.at("right_resblock6_conv2_b")); + assert(right_resblock6_conv2 != nullptr); + right_resblock6_conv2->setStride( DimsHW {1, 1}); + right_resblock6_conv2->setPadding(DimsHW {1, 1}); + + // right_resblock6_conv2_add tensor add op. + auto right_resblock6_conv2_add = network->addElementWise(*(right_resblock6_conv2->getOutput(0)), *(right_resblock5_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(right_resblock6_conv2_add != nullptr); + + // right_resblock6_conv2_add_act ELU activation op. + auto right_resblock6_conv2_add_act = addElu(plugin_factory, *network, *right_resblock6_conv2_add->getOutput(0), data_type, "right_resblock6_conv2_add_act"); + assert(right_resblock6_conv2_add_act != nullptr); + + // left_resblock7_conv1 convolution op. + auto left_resblock7_conv1 = network->addConvolution(*left_resblock6_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock7_conv1_k"), weights.at("left_resblock7_conv1_b")); + assert(left_resblock7_conv1 != nullptr); + left_resblock7_conv1->setStride( DimsHW {1, 1}); + left_resblock7_conv1->setPadding(DimsHW {1, 1}); + + // left_resblock7_conv1_act ELU activation op. 
+ auto left_resblock7_conv1_act = addElu(plugin_factory, *network, *left_resblock7_conv1->getOutput(0), data_type, "left_resblock7_conv1_act"); + assert(left_resblock7_conv1_act != nullptr); + + // left_resblock7_conv2 convolution op. + auto left_resblock7_conv2 = network->addConvolution(*left_resblock7_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock7_conv2_k"), weights.at("left_resblock7_conv2_b")); + assert(left_resblock7_conv2 != nullptr); + left_resblock7_conv2->setStride( DimsHW {1, 1}); + left_resblock7_conv2->setPadding(DimsHW {1, 1}); + + // left_resblock7_conv2_add tensor add op. + auto left_resblock7_conv2_add = network->addElementWise(*(left_resblock7_conv2->getOutput(0)), *(left_resblock6_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(left_resblock7_conv2_add != nullptr); + + // left_resblock7_conv2_add_act ELU activation op. + auto left_resblock7_conv2_add_act = addElu(plugin_factory, *network, *left_resblock7_conv2_add->getOutput(0), data_type, "left_resblock7_conv2_add_act"); + assert(left_resblock7_conv2_add_act != nullptr); + + // right_resblock7_conv1 convolution op. + auto right_resblock7_conv1 = network->addConvolution(*right_resblock6_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock7_conv1_k"), weights.at("right_resblock7_conv1_b")); + assert(right_resblock7_conv1 != nullptr); + right_resblock7_conv1->setStride( DimsHW {1, 1}); + right_resblock7_conv1->setPadding(DimsHW {1, 1}); + + // right_resblock7_conv1_act ELU activation op. + auto right_resblock7_conv1_act = addElu(plugin_factory, *network, *right_resblock7_conv1->getOutput(0), data_type, "right_resblock7_conv1_act"); + assert(right_resblock7_conv1_act != nullptr); + + // right_resblock7_conv2 convolution op. + auto right_resblock7_conv2 = network->addConvolution(*right_resblock7_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock7_conv2_k"), weights.at("right_resblock7_conv2_b")); + assert(right_resblock7_conv2 != nullptr); + right_resblock7_conv2->setStride( DimsHW {1, 1}); + right_resblock7_conv2->setPadding(DimsHW {1, 1}); + + // right_resblock7_conv2_add tensor add op. + auto right_resblock7_conv2_add = network->addElementWise(*(right_resblock7_conv2->getOutput(0)), *(right_resblock6_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(right_resblock7_conv2_add != nullptr); + + // right_resblock7_conv2_add_act ELU activation op. + auto right_resblock7_conv2_add_act = addElu(plugin_factory, *network, *right_resblock7_conv2_add->getOutput(0), data_type, "right_resblock7_conv2_add_act"); + assert(right_resblock7_conv2_add_act != nullptr); + + // left_resblock8_conv1 convolution op. + auto left_resblock8_conv1 = network->addConvolution(*left_resblock7_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock8_conv1_k"), weights.at("left_resblock8_conv1_b")); + assert(left_resblock8_conv1 != nullptr); + left_resblock8_conv1->setStride( DimsHW {1, 1}); + left_resblock8_conv1->setPadding(DimsHW {1, 1}); + + // left_resblock8_conv1_act ELU activation op. + auto left_resblock8_conv1_act = addElu(plugin_factory, *network, *left_resblock8_conv1->getOutput(0), data_type, "left_resblock8_conv1_act"); + assert(left_resblock8_conv1_act != nullptr); + + // left_resblock8_conv2 convolution op. 
+ auto left_resblock8_conv2 = network->addConvolution(*left_resblock8_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_resblock8_conv2_k"), weights.at("left_resblock8_conv2_b")); + assert(left_resblock8_conv2 != nullptr); + left_resblock8_conv2->setStride( DimsHW {1, 1}); + left_resblock8_conv2->setPadding(DimsHW {1, 1}); + + // left_resblock8_conv2_add tensor add op. + auto left_resblock8_conv2_add = network->addElementWise(*(left_resblock8_conv2->getOutput(0)), *(left_resblock7_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(left_resblock8_conv2_add != nullptr); + + // left_resblock8_conv2_add_act ELU activation op. + auto left_resblock8_conv2_add_act = addElu(plugin_factory, *network, *left_resblock8_conv2_add->getOutput(0), data_type, "left_resblock8_conv2_add_act"); + assert(left_resblock8_conv2_add_act != nullptr); + + // right_resblock8_conv1 convolution op. + auto right_resblock8_conv1 = network->addConvolution(*right_resblock7_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock8_conv1_k"), weights.at("right_resblock8_conv1_b")); + assert(right_resblock8_conv1 != nullptr); + right_resblock8_conv1->setStride( DimsHW {1, 1}); + right_resblock8_conv1->setPadding(DimsHW {1, 1}); + + // right_resblock8_conv1_act ELU activation op. + auto right_resblock8_conv1_act = addElu(plugin_factory, *network, *right_resblock8_conv1->getOutput(0), data_type, "right_resblock8_conv1_act"); + assert(right_resblock8_conv1_act != nullptr); + + // right_resblock8_conv2 convolution op. + auto right_resblock8_conv2 = network->addConvolution(*right_resblock8_conv1_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_resblock8_conv2_k"), weights.at("right_resblock8_conv2_b")); + assert(right_resblock8_conv2 != nullptr); + right_resblock8_conv2->setStride( DimsHW {1, 1}); + right_resblock8_conv2->setPadding(DimsHW {1, 1}); + + // right_resblock8_conv2_add tensor add op. + auto right_resblock8_conv2_add = network->addElementWise(*(right_resblock8_conv2->getOutput(0)), *(right_resblock7_conv2_add_act->getOutput(0)), + ElementWiseOperation::kSUM); + assert(right_resblock8_conv2_add != nullptr); + + // right_resblock8_conv2_add_act ELU activation op. + auto right_resblock8_conv2_add_act = addElu(plugin_factory, *network, *right_resblock8_conv2_add->getOutput(0), data_type, "right_resblock8_conv2_add_act"); + assert(right_resblock8_conv2_add_act != nullptr); + + // left_encoder2D_out convolution op. + auto left_encoder2D_out = network->addConvolution(*left_resblock8_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("left_encoder2D_out_k"), weights.at("left_encoder2D_out_b")); + assert(left_encoder2D_out != nullptr); + left_encoder2D_out->setStride( DimsHW {1, 1}); + left_encoder2D_out->setPadding(DimsHW {1, 1}); + + // right_encoder2D_out convolution op. + auto right_encoder2D_out = network->addConvolution(*right_resblock8_conv2_add_act->getOutput(0), 32, DimsHW {3, 3}, + weights.at("right_encoder2D_out_k"), weights.at("right_encoder2D_out_b")); + assert(right_encoder2D_out != nullptr); + right_encoder2D_out->setStride( DimsHW {1, 1}); + right_encoder2D_out->setPadding(DimsHW {1, 1}); + + // cost_vol cost volume op. + auto cost_vol = addCostVolume(plugin_factory, *network, *left_encoder2D_out->getOutput(0), *right_encoder2D_out->getOutput(0), 68, "cost_vol"); + assert(cost_vol != nullptr); + + // conv3D_1a 3D convolution op. 
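// [Note] The cost volume here uses 68 disparity levels versus 24 in
// nvtiny_513x161 above. Both volumes are built right after the single
// stride-2 conv1 (the residual blocks are all stride 1), i.e. at half input
// resolution, so each level appears to span two full-resolution pixels of
// disparity. Back-of-envelope, under that assumption:

constexpr int kCostVolLevels = 68;  // addCostVolume argument above
constexpr int kDownsample    = 2;   // conv1 stride
constexpr int kFullResDisp   = kCostVolLevels * kDownsample;  // ~136 px at 1025 px width
// nvtiny_513x161 by the same reasoning: 24 * 2 = ~48 px at 513 px width.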
+ auto conv3D_1a = addConv3D(plugin_factory, *network, *cost_vol->getOutput(0), + Conv3DType::kTensorFlow, {5, {32, 3, 64, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_1a_k"), weights.at("conv3D_1a_b"), + "conv3D_1a"); + assert(conv3D_1a != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_1a_tran = addTransform(plugin_factory, *network, *conv3D_1a->getOutput(0), {1, 0, 2, 3}, "conv3D_1a_tran_transform"); + assert(conv3D_1a_tran != nullptr); + + // conv3D_1a_act ELU activation op. + auto conv3D_1a_act = addElu(plugin_factory, *network, *conv3D_1a_tran->getOutput(0), data_type, "conv3D_1a_act"); + assert(conv3D_1a_act != nullptr); + + // conv3D_1b 3D convolution op. + auto conv3D_1b = addConv3D(plugin_factory, *network, *conv3D_1a_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {32, 3, 32, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_1b_k"), weights.at("conv3D_1b_b"), + "conv3D_1b"); + assert(conv3D_1b != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_1b_tran = addTransform(plugin_factory, *network, *conv3D_1b->getOutput(0), {1, 0, 2, 3}, "conv3D_1b_tran_transform"); + assert(conv3D_1b_tran != nullptr); + + // conv3D_1b_act ELU activation op. + auto conv3D_1b_act = addElu(plugin_factory, *network, *conv3D_1b_tran->getOutput(0), data_type, "conv3D_1b_act"); + assert(conv3D_1b_act != nullptr); + + // conv3D_1ds_pad padding op. + auto conv3D_1ds_pad = addPad(plugin_factory, *network, *conv3D_1b_act->getOutput(0), {0, 0, 0, 0}, {1, 0, 0, 0}, "conv3D_1ds_pad"); + assert(conv3D_1ds_pad != nullptr); + + // conv3D_1ds 3D convolution op. + auto conv3D_1ds = addConv3D(plugin_factory, *network, *conv3D_1ds_pad->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 32, 3, 3}}, + Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_1ds_k"), weights.at("conv3D_1ds_b"), + "conv3D_1ds"); + assert(conv3D_1ds != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_1ds_tran = addTransform(plugin_factory, *network, *conv3D_1ds->getOutput(0), {1, 0, 2, 3}, "conv3D_1ds_tran_transform"); + assert(conv3D_1ds_tran != nullptr); + + // conv3D_1ds_act ELU activation op. + auto conv3D_1ds_act = addElu(plugin_factory, *network, *conv3D_1ds_tran->getOutput(0), data_type, "conv3D_1ds_act"); + assert(conv3D_1ds_act != nullptr); + + // conv3D_2a 3D convolution op. + auto conv3D_2a = addConv3D(plugin_factory, *network, *conv3D_1ds_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_2a_k"), weights.at("conv3D_2a_b"), + "conv3D_2a"); + assert(conv3D_2a != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_2a_tran = addTransform(plugin_factory, *network, *conv3D_2a->getOutput(0), {1, 0, 2, 3}, "conv3D_2a_tran_transform"); + assert(conv3D_2a_tran != nullptr); + + // conv3D_2a_act ELU activation op. + auto conv3D_2a_act = addElu(plugin_factory, *network, *conv3D_2a_tran->getOutput(0), data_type, "conv3D_2a_act"); + assert(conv3D_2a_act != nullptr); + + // conv3D_2b 3D convolution op. 
+ auto conv3D_2b = addConv3D(plugin_factory, *network, *conv3D_2a_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_2b_k"), weights.at("conv3D_2b_b"), + "conv3D_2b"); + assert(conv3D_2b != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_2b_tran = addTransform(plugin_factory, *network, *conv3D_2b->getOutput(0), {1, 0, 2, 3}, "conv3D_2b_tran_transform"); + assert(conv3D_2b_tran != nullptr); + + // conv3D_2b_act ELU activation op. + auto conv3D_2b_act = addElu(plugin_factory, *network, *conv3D_2b_tran->getOutput(0), data_type, "conv3D_2b_act"); + assert(conv3D_2b_act != nullptr); + + // conv3D_2ds_pad padding op. + auto conv3D_2ds_pad = addPad(plugin_factory, *network, *conv3D_2b_act->getOutput(0), {0, 0, 0, 0}, {1, 0, 0, 0}, "conv3D_2ds_pad"); + assert(conv3D_2ds_pad != nullptr); + + // conv3D_2ds 3D convolution op. + auto conv3D_2ds = addConv3D(plugin_factory, *network, *conv3D_2ds_pad->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, + Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_2ds_k"), weights.at("conv3D_2ds_b"), + "conv3D_2ds"); + assert(conv3D_2ds != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_2ds_tran = addTransform(plugin_factory, *network, *conv3D_2ds->getOutput(0), {1, 0, 2, 3}, "conv3D_2ds_tran_transform"); + assert(conv3D_2ds_tran != nullptr); + + // conv3D_2ds_act ELU activation op. + auto conv3D_2ds_act = addElu(plugin_factory, *network, *conv3D_2ds_tran->getOutput(0), data_type, "conv3D_2ds_act"); + assert(conv3D_2ds_act != nullptr); + + // conv3D_3a 3D convolution op. + auto conv3D_3a = addConv3D(plugin_factory, *network, *conv3D_2ds_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_3a_k"), weights.at("conv3D_3a_b"), + "conv3D_3a"); + assert(conv3D_3a != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_3a_tran = addTransform(plugin_factory, *network, *conv3D_3a->getOutput(0), {1, 0, 2, 3}, "conv3D_3a_tran_transform"); + assert(conv3D_3a_tran != nullptr); + + // conv3D_3a_act ELU activation op. + auto conv3D_3a_act = addElu(plugin_factory, *network, *conv3D_3a_tran->getOutput(0), data_type, "conv3D_3a_act"); + assert(conv3D_3a_act != nullptr); + + // conv3D_3b 3D convolution op. + auto conv3D_3b = addConv3D(plugin_factory, *network, *conv3D_3a_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_3b_k"), weights.at("conv3D_3b_b"), + "conv3D_3b"); + assert(conv3D_3b != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_3b_tran = addTransform(plugin_factory, *network, *conv3D_3b->getOutput(0), {1, 0, 2, 3}, "conv3D_3b_tran_transform"); + assert(conv3D_3b_tran != nullptr); + + // conv3D_3b_act ELU activation op. + auto conv3D_3b_act = addElu(plugin_factory, *network, *conv3D_3b_tran->getOutput(0), data_type, "conv3D_3b_act"); + assert(conv3D_3b_act != nullptr); + + // conv3D_3ds_pad padding op. 
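// [Note] Each downsampling stage in this 3D encoder pads one extra plane onto
// the disparity axis (the {1, 0, 0, 0} post-padding in the addPad calls, as in
// conv3D_1ds_pad and conv3D_2ds_pad above and conv3D_3ds_pad below) ahead of
// its stride-2 convolution. Assuming the standard relation
//   out = (in + total_pad - kernel) / stride + 1   (integer division),
// the extra plane makes the stride-2 conv land exactly on half the plane
// count, which the decoder's skip connections rely on. Working backward from
// the deconv output dims later in this file, the D axis runs
// 68 -> 34 -> 17 -> 9 -> 5 (a sketch, not part of this diff):

#include <cassert>

int convOutExtent(int in, int total_pad, int kernel, int stride)
{
    return (in + total_pad - kernel) / stride + 1;
}

int main()
{
    assert(convOutExtent(68 + 1, 0, 3, 2) == 34); // conv3D_1ds: addPad plane, no conv D-padding
    assert(convOutExtent(34 + 1, 0, 3, 2) == 17); // conv3D_2ds
    assert(convOutExtent(17 + 1, 2, 3, 2) == 9);  // conv3D_3ds: conv D-padding 1 on both sides
    assert(convOutExtent(9 + 1, 2, 3, 2) == 5);   // conv3D_4ds
    return 0;
}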
+ auto conv3D_3ds_pad = addPad(plugin_factory, *network, *conv3D_3b_act->getOutput(0), {0, 0, 0, 0}, {1, 0, 0, 0}, "conv3D_3ds_pad"); + assert(conv3D_3ds_pad != nullptr); + + // conv3D_3ds 3D convolution op. + auto conv3D_3ds = addConv3D(plugin_factory, *network, *conv3D_3ds_pad->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, + Dims{3, {2, 2, 2}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_3ds_k"), weights.at("conv3D_3ds_b"), + "conv3D_3ds"); + assert(conv3D_3ds != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_3ds_tran = addTransform(plugin_factory, *network, *conv3D_3ds->getOutput(0), {1, 0, 2, 3}, "conv3D_3ds_tran_transform"); + assert(conv3D_3ds_tran != nullptr); + + // conv3D_3ds_act ELU activation op. + auto conv3D_3ds_act = addElu(plugin_factory, *network, *conv3D_3ds_tran->getOutput(0), data_type, "conv3D_3ds_act"); + assert(conv3D_3ds_act != nullptr); + + // conv3D_4a 3D convolution op. + auto conv3D_4a = addConv3D(plugin_factory, *network, *conv3D_3ds_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_4a_k"), weights.at("conv3D_4a_b"), + "conv3D_4a"); + assert(conv3D_4a != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_4a_tran = addTransform(plugin_factory, *network, *conv3D_4a->getOutput(0), {1, 0, 2, 3}, "conv3D_4a_tran_transform"); + assert(conv3D_4a_tran != nullptr); + + // conv3D_4a_act ELU activation op. + auto conv3D_4a_act = addElu(plugin_factory, *network, *conv3D_4a_tran->getOutput(0), data_type, "conv3D_4a_act"); + assert(conv3D_4a_act != nullptr); + + // conv3D_4b 3D convolution op. + auto conv3D_4b = addConv3D(plugin_factory, *network, *conv3D_4a_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_4b_k"), weights.at("conv3D_4b_b"), + "conv3D_4b"); + assert(conv3D_4b != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_4b_tran = addTransform(plugin_factory, *network, *conv3D_4b->getOutput(0), {1, 0, 2, 3}, "conv3D_4b_tran_transform"); + assert(conv3D_4b_tran != nullptr); + + // conv3D_4b_act ELU activation op. + auto conv3D_4b_act = addElu(plugin_factory, *network, *conv3D_4b_tran->getOutput(0), data_type, "conv3D_4b_act"); + assert(conv3D_4b_act != nullptr); + + // conv3D_4ds_pad padding op. + auto conv3D_4ds_pad = addPad(plugin_factory, *network, *conv3D_4b_act->getOutput(0), {0, 0, 0, 0}, {1, 0, 0, 0}, "conv3D_4ds_pad"); + assert(conv3D_4ds_pad != nullptr); + + // conv3D_4ds 3D convolution op. + auto conv3D_4ds = addConv3D(plugin_factory, *network, *conv3D_4ds_pad->getOutput(0), + Conv3DType::kTensorFlow, {5, {128, 3, 64, 3, 3}}, + Dims{3, {2, 2, 2}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_4ds_k"), weights.at("conv3D_4ds_b"), + "conv3D_4ds"); + assert(conv3D_4ds != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_4ds_tran = addTransform(plugin_factory, *network, *conv3D_4ds->getOutput(0), {1, 0, 2, 3}, "conv3D_4ds_tran_transform"); + assert(conv3D_4ds_tran != nullptr); + + // conv3D_4ds_act ELU activation op. 
+ auto conv3D_4ds_act = addElu(plugin_factory, *network, *conv3D_4ds_tran->getOutput(0), data_type, "conv3D_4ds_act"); + assert(conv3D_4ds_act != nullptr); + + // conv3D_5a 3D convolution op. + auto conv3D_5a = addConv3D(plugin_factory, *network, *conv3D_4ds_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {128, 3, 128, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_5a_k"), weights.at("conv3D_5a_b"), + "conv3D_5a"); + assert(conv3D_5a != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto conv3D_5a_tran = addTransform(plugin_factory, *network, *conv3D_5a->getOutput(0), {1, 0, 2, 3}, "conv3D_5a_tran_transform"); + assert(conv3D_5a_tran != nullptr); + + // conv3D_5a_act ELU activation op. + auto conv3D_5a_act = addElu(plugin_factory, *network, *conv3D_5a_tran->getOutput(0), data_type, "conv3D_5a_act"); + assert(conv3D_5a_act != nullptr); + + // conv3D_5b 3D convolution op. + auto conv3D_5b = addConv3D(plugin_factory, *network, *conv3D_5a_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {128, 3, 128, 3, 3}}, + Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("conv3D_5b_k"), weights.at("conv3D_5b_b"), + "conv3D_5b"); + assert(conv3D_5b != nullptr); + + // conv3D_5b_act ELU activation op. + auto conv3D_5b_act = addElu(plugin_factory, *network, *conv3D_5b->getOutput(0), data_type, "conv3D_5b_act"); + assert(conv3D_5b_act != nullptr); + + // deconv3D_1 3D transposed convolution op. + Dims deconv3D_1_out_dims{4, {9, 64, 21, 65}}; + auto deconv3D_1 = addConv3DTranspose(plugin_factory, *network, *conv3D_5b_act->getOutput(0), + Conv3DType::kTensorFlow, {5, {128, 3, 64, 3, 3}}, deconv3D_1_out_dims, + Dims{3, {2, 2, 2}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("deconv3D_1_k"), weights.at("deconv3D_1_b"), + "deconv3D_1"); + assert(deconv3D_1 != nullptr); + + // deconv3D_1_add_skip tensor add op. + auto deconv3D_1_add_skip = network->addElementWise(*(deconv3D_1->getOutput(0)), *(conv3D_4b_act->getOutput(0)), ElementWiseOperation::kSUM); + assert(deconv3D_1_add_skip != nullptr); + + // deconv3D_1_act ELU activation op. + auto deconv3D_1_act = addElu(plugin_factory, *network, *deconv3D_1_add_skip->getOutput(0), data_type, "deconv3D_1_act"); + assert(deconv3D_1_act != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto deconv3D_1_transform = addTransform(plugin_factory, *network, *deconv3D_1_act->getOutput(0), {1, 0, 2, 3}, "deconv3D_1_transform_transform"); + assert(deconv3D_1_transform != nullptr); + + // deconv3D_2 3D transposed convolution op. + Dims deconv3D_2_out_dims{4, {17, 64, 41, 129}}; + auto deconv3D_2 = addConv3DTranspose(plugin_factory, *network, *deconv3D_1_transform->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, deconv3D_2_out_dims, + Dims{3, {2, 2, 2}}, Dims{3, {1, 1, 1}}, Dims{3, {1, 1, 1}}, + weights.at("deconv3D_2_k"), weights.at("deconv3D_2_b"), + "deconv3D_2"); + assert(deconv3D_2 != nullptr); + + // deconv3D_2_add_skip tensor add op. + auto deconv3D_2_add_skip = network->addElementWise(*(deconv3D_2->getOutput(0)), *(conv3D_3b_act->getOutput(0)), ElementWiseOperation::kSUM); + assert(deconv3D_2_add_skip != nullptr); + + // deconv3D_2_act ELU activation op. 
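+    // (As with deconv3D_1 above, each decoder stage sums the transposed
+    //  convolution output with the matching encoder activation before the ELU.)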
+ auto deconv3D_2_act = addElu(plugin_factory, *network, *deconv3D_2_add_skip->getOutput(0), data_type, "deconv3D_2_act"); + assert(deconv3D_2_act != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto deconv3D_2_transform = addTransform(plugin_factory, *network, *deconv3D_2_act->getOutput(0), {1, 0, 2, 3}, "deconv3D_2_transform_transform"); + assert(deconv3D_2_transform != nullptr); + + // deconv3D_3 3D transposed convolution op. + Dims deconv3D_3_out_dims{4, {35, 64, 81, 257}}; + auto deconv3D_3 = addConv3DTranspose(plugin_factory, *network, *deconv3D_2_transform->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 64, 3, 3}}, deconv3D_3_out_dims, + Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {0, 1, 1}}, + weights.at("deconv3D_3_k"), weights.at("deconv3D_3_b"), + "deconv3D_3"); + assert(deconv3D_3 != nullptr); + + // deconv3D_3 output slice op. + auto deconv3D_3_slice_layer = addSlice(plugin_factory, *network, *deconv3D_3->getOutput(0), + deconv3D_3_out_dims, + {4, {0, 0, 0, 0}}, + {4, {deconv3D_3_out_dims.d[0] - 1, deconv3D_3_out_dims.d[1], deconv3D_3_out_dims.d[2], deconv3D_3_out_dims.d[3]}}, + "deconv3D_3_slice"); + assert(deconv3D_3_slice_layer != nullptr); + + // deconv3D_3_add_skip tensor add op. + auto deconv3D_3_add_skip = network->addElementWise(*(deconv3D_3_slice_layer->getOutput(0)), *(conv3D_2b_act->getOutput(0)), ElementWiseOperation::kSUM); + assert(deconv3D_3_add_skip != nullptr); + + // deconv3D_3_act ELU activation op. + auto deconv3D_3_act = addElu(plugin_factory, *network, *deconv3D_3_add_skip->getOutput(0), data_type, "deconv3D_3_act"); + assert(deconv3D_3_act != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto deconv3D_3_transform = addTransform(plugin_factory, *network, *deconv3D_3_act->getOutput(0), {1, 0, 2, 3}, "deconv3D_3_transform_transform"); + assert(deconv3D_3_transform != nullptr); + + // deconv3D_4 3D transposed convolution op. + Dims deconv3D_4_out_dims{4, {69, 32, 161, 513}}; + auto deconv3D_4 = addConv3DTranspose(plugin_factory, *network, *deconv3D_3_transform->getOutput(0), + Conv3DType::kTensorFlow, {5, {64, 3, 32, 3, 3}}, deconv3D_4_out_dims, + Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {0, 1, 1}}, + weights.at("deconv3D_4_k"), weights.at("deconv3D_4_b"), + "deconv3D_4"); + assert(deconv3D_4 != nullptr); + + // deconv3D_4 output slice op. + auto deconv3D_4_slice_layer = addSlice(plugin_factory, *network, *deconv3D_4->getOutput(0), + deconv3D_4_out_dims, + {4, {0, 0, 0, 0}}, + {4, {deconv3D_4_out_dims.d[0] - 1, deconv3D_4_out_dims.d[1], deconv3D_4_out_dims.d[2], deconv3D_4_out_dims.d[3]}}, + "deconv3D_4_slice"); + assert(deconv3D_4_slice_layer != nullptr); + + // deconv3D_4_add_skip tensor add op. + auto deconv3D_4_add_skip = network->addElementWise(*(deconv3D_4_slice_layer->getOutput(0)), *(conv3D_1b_act->getOutput(0)), ElementWiseOperation::kSUM); + assert(deconv3D_4_add_skip != nullptr); + + // deconv3D_4_act ELU activation op. + auto deconv3D_4_act = addElu(plugin_factory, *network, *deconv3D_4_add_skip->getOutput(0), data_type, "deconv3D_4_act"); + assert(deconv3D_4_act != nullptr); + + // Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose + auto deconv3D_4_transform = addTransform(plugin_factory, *network, *deconv3D_4_act->getOutput(0), {1, 0, 2, 3}, "deconv3D_4_transform_transform"); + assert(deconv3D_4_transform != nullptr); + + // deconv3D_5 3D transposed convolution op. 
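+    // (This final stage reduces to a single channel; its sliced output feeds
+    //  the softargmax that produces the disparity map.)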
+ Dims deconv3D_5_out_dims{4, {137, 1, 321, 1025}}; + auto deconv3D_5 = addConv3DTranspose(plugin_factory, *network, *deconv3D_4_transform->getOutput(0), + Conv3DType::kTensorFlow, {5, {32, 3, 1, 3, 3}}, deconv3D_5_out_dims, + Dims{3, {2, 2, 2}}, Dims{3, {0, 1, 1}}, Dims{3, {0, 1, 1}}, + weights.at("deconv3D_5_k"), weights.at("deconv3D_5_b"), + "deconv3D_5"); + assert(deconv3D_5 != nullptr); + + // deconv3D_5 output slice op. + auto deconv3D_5_slice_layer = addSlice(plugin_factory, *network, *deconv3D_5->getOutput(0), + deconv3D_5_out_dims, + {4, {0, 0, 0, 0}}, + {4, {deconv3D_5_out_dims.d[0] - 1, deconv3D_5_out_dims.d[1], deconv3D_5_out_dims.d[2], deconv3D_5_out_dims.d[3]}}, + "deconv3D_5_slice"); + assert(deconv3D_5_slice_layer != nullptr); + + // Softargmax. + auto disp = addSoftargmax(plugin_factory, *network, *deconv3D_5_slice_layer->getOutput(0), "disp_softargmax"); + assert(disp != nullptr); + + auto disp_out = disp->getOutput(0); + disp_out->setName("disp"); + network->markOutput(*disp_out); + + return network; +} + +} } // namespace diff --git a/stereoDNN/scripts/data_converters.py b/stereoDNN/scripts/data_converters.py new file mode 100644 index 0000000..3aa28f1 --- /dev/null +++ b/stereoDNN/scripts/data_converters.py @@ -0,0 +1,59 @@ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Full license terms provided in LICENSE.md file. + +""" +Contains data conversion routines used in other scripts. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +def nhwc_to_nchw(src): + """Converts tensor from NHWC to NCHW format. Usually + used for converting input/output tensors. + NHWC format is commonly used by TensorFlow + while NCHW - TensorRT/cuDNN.""" + assert len(src.shape) == 4, "Expected 4D tensor." + return np.transpose(src, [0, 3, 1, 2]) + +def rsck_to_kcrs(src): + """Converts tensor from RSCK to KCRS format. Usually + used for converting convolution filter tensors. + cuDNN notation is used for dimensions where RS are + spatial dimensions (height and width), C - number + of input channels and K - number of output channels. + """ + assert len(src.shape) == 4, "Expected 4D tensor." + return np.transpose(src, [3, 2, 0, 1]) + +def ndhwc_to_ndchw(src): + """Converts tensor from NDHWC to NDCHW format. Usually + used for converting input/output tensors. + NDHWC format is commonly used by TensorFlow + while NDCHW - TensorRT/cuDNN.""" + assert len(src.shape) == 5, "Expected 5D tensor." + return np.transpose(src, [0, 1, 4, 2, 3]) + +def ndhwc_to_ncdhw(src): + """Converts tensor from NDHWC to NCDHW format. Usually + used for converting input/output tensors. + NDHWC format is commonly used by TensorFlow + while NCDHW - TensorRT/cuDNN. This particular format + is used to convert input of 3D convolution from TensorFlow + to cuDNN format.""" + assert len(src.shape) == 5, "Expected 5D tensor." + return np.transpose(src, [0, 4, 1, 2, 3]) + +def vrsck_to_kvcrs(src): + """Converts tensor from VRSCK to KVCRS format. Usually + used for converting convolution filter tensors. + cuDNN notation is used for dimensions where RS are + spatial dimensions (height and width), V - depth + dimension, C - number of input channels + and K - number of output channels. + """ + assert len(src.shape) == 5, "Expected 5D tensor." 
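+    # For example, a hypothetical kernel of shape [3, 5, 5, 16, 32]
+    # (V=3, R=S=5, C=16, K=32) comes out as [32, 3, 16, 5, 5] (KVCRS):
+    #   w = np.zeros([3, 5, 5, 16, 32])
+    #   vrsck_to_kvcrs(w).shape == (32, 3, 16, 5, 5)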
+    return np.transpose(src, [4, 0, 3, 1, 2])
+
diff --git a/stereoDNN/scripts/model_builder.py b/stereoDNN/scripts/model_builder.py
new file mode 100644
index 0000000..7e49d44
--- /dev/null
+++ b/stereoDNN/scripts/model_builder.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Full license terms provided in LICENSE.md file.
+
+"""
+Generates TensorRT C++ API code from TensorFlow model.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+
+import warnings
+# Ignore 'FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated' warning.
+with warnings.catch_warnings():
+    warnings.filterwarnings("ignore", category=FutureWarning, module='h5py')
+    import tensorflow as tf
+
+import tensorrt_model_builder
+import model_nvsmall
+import model_resnet18
+
+def check_model_type(src):
+    if src == 'nvsmall' or src == 'resnet18':
+        return src
+    else:
+        raise argparse.ArgumentTypeError('Invalid model type {}. Supported: nvsmall, resnet18'.format(src))
+
+def check_data_type(src):
+    if src == 'fp32' or src == 'fp16':
+        return src
+    else:
+        raise argparse.ArgumentTypeError('Invalid data type {}. Supported: fp32, fp16'.format(src))
+
+parser = argparse.ArgumentParser(description='Stereo DNN TensorRT C++ code generator')
+
+parser.add_argument('--model_type', type=check_model_type, help='model type, supported: nvsmall, resnet18', required=True)
+parser.add_argument('--net_name', type=str, help='network name to use in C++ code generation', required=True)
+parser.add_argument('--checkpoint_path', type=str, help='path to checkpoint file (without extension)', required=True)
+parser.add_argument('--weights_file', type=str, help='path to generated weights file', required=True)
+parser.add_argument('--cpp_file', type=str, help='path to generated TensorRT C++ model file', required=True)
+parser.add_argument('--data_type', type=check_data_type, help='model data type, supported: fp32, fp16', default='fp32')
+
+args = parser.parse_args()
+
+def read_model(model_path, session):
+    print('Reading model...')
+    saver = tf.train.import_meta_graph(model_path + '.meta')
+    print('Loaded graph meta.')
+    saver.restore(session, model_path)
+    print('Loaded weights.')
+    print('Done reading model.')
+
+def main():
+    config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
+    config.gpu_options.allow_growth = True
+    sess = tf.InteractiveSession(config=config)
+
+    model = read_model(args.checkpoint_path, sess)
+
+    with open(args.weights_file, 'wb') as weights_w:
+        with open(args.cpp_file, 'w') as cpp_w:
+            builder = tensorrt_model_builder.TrtModelBuilder(model, args.net_name, cpp_w, weights_w, args.data_type)
+            if args.model_type == 'nvsmall':
+                model_nvsmall.create(builder)
+            elif args.model_type == 'resnet18':
+                model_resnet18.create(builder)
+            else:
+                # Should never happen given the argparse validation above.
+                assert False, 'Not supported.'
+    print('Done.')
+
+if __name__ == '__main__':
+    main()
diff --git a/stereoDNN/scripts/model_nvsmall.py b/stereoDNN/scripts/model_nvsmall.py
new file mode 100644
index 0000000..cf559ea
--- /dev/null
+++ b/stereoDNN/scripts/model_nvsmall.py
@@ -0,0 +1,68 @@
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Full license terms provided in LICENSE.md file.
+
+"""
+Generates TensorRT code for NVSmall model.
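+The create() function below emits the 2D feature encoder for both cameras,
+the cost volume, the 3D encoder/decoder and the final softargmax, in order.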
+""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +def create(builder): + def write_2d_encoder(): + sides = ['left', 'right'] + builder.write_input(sides[0]) + builder.write_input(sides[1]) + layer_inp = [sides[0] + '_scale', sides[1] + '_scale'] + builder.write_scale(sides[0], layer_inp[0]) + builder.write_scale(sides[1], layer_inp[1]) + for l in ['conv1', 'conv2', 'conv3', 'conv4']: + for i in range(len(sides)): + cur = '{}_{}'.format(sides[i], l) + builder.write_2d_convolution(layer_inp[i], cur, os.path.join('model/encoder2D', l)) + builder.write_elu(cur, cur + '_act') + layer_inp[i] = cur + '_act' + left = builder.write_2d_convolution(layer_inp[0], sides[0] + '_conv5', 'model/encoder2D/conv5') + right = builder.write_2d_convolution(layer_inp[1], sides[1] + '_conv5', 'model/encoder2D/conv5') + return left, right + + def write_3d_encoder(input): + input = 'cost_vol' + for l in ['conv3D_1', 'conv3D_2', 'conv3D_3ds', 'conv3D_4', 'conv3D_5', 'conv3D_6ds', 'conv3D_7', 'conv3D_8']: + # Pad after each D stride == 2. + if l in ['conv3D_3ds', 'conv3D_6ds']: + input = builder.write_conv3d_pad(input, l + '_pad') + input = builder.write_3d_convolution(input, l, 'model/encoder3D') + # No transpose for conv3D_8 as it goes directly to decoder. + if l != 'conv3D_8': + input = builder.write_conv3d_transform(input, l + '_tran') + input = builder.write_elu(input, l + '_act') + return input + + def write_3d_decoder(input): + # deconv3D_1 + cur = builder.write_3d_convolution_transpose(input, 'deconv3D_1', 'model/decoder3D') + cur = builder.write_add_tensors(cur, 'conv3D_5_act', 'deconv3D_1_add_skip') + cur = builder.write_elu(cur, 'deconv3D_1_act') + # deconv3D_2 + cur = builder.write_conv3d_transform(cur, 'deconv3D_1_transform') + cur = builder.write_3d_convolution_transpose(cur, 'deconv3D_2', 'model/decoder3D') + cur = builder.write_add_tensors(cur, 'conv3D_2_act', 'deconv3D_2_add_skip') + cur = builder.write_elu(cur, 'deconv3D_2_act') + # deconv3D_3 + cur = builder.write_conv3d_transform(cur, 'deconv3D_2_transform') + cur = builder.write_3d_convolution_transpose(cur, 'deconv3D_3', 'model/decoder3D') + return cur + + builder.write_header() + builder.do_indent() + left, right = write_2d_encoder() + cur = builder.write_cost_vol(left, right, 'cost_vol', 'model/cost_vol/cost_volume_left') + cur = write_3d_encoder(cur) + cur = write_3d_decoder(cur) + # Softargmax + cur = builder.write_softargmax(cur, 'disp') + builder.write_output(cur) + builder.write_footer() diff --git a/stereoDNN/scripts/model_resnet18.py b/stereoDNN/scripts/model_resnet18.py new file mode 100644 index 0000000..42b93b6 --- /dev/null +++ b/stereoDNN/scripts/model_resnet18.py @@ -0,0 +1,82 @@ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Full license terms provided in LICENSE.md file. + +""" +Generates TensorRT code for NVSmall model. 
+""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +def create(builder): + def write_residual_block(input, name, path): + cur = builder.write_2d_convolution(input, name + '_conv1', os.path.join(path, 'res_conv1')) + cur = builder.write_elu(cur, cur + '_act') + cur = builder.write_2d_convolution(cur, name + '_conv2', os.path.join(path, 'res_conv2')) + cur = builder.write_add_tensors(cur, input, cur + '_add') + cur = builder.write_elu(cur, cur + '_act') + return cur + + def write_2d_encoder(): + sides = ['left', 'right'] + builder.write_input(sides[0]) + builder.write_input(sides[1]) + layer_inp = [sides[0] + '_scale', sides[1] + '_scale'] + builder.write_scale(sides[0], layer_inp[0]) + builder.write_scale(sides[1], layer_inp[1]) + # conv1 + for i in range(len(sides)): + cur = builder.write_2d_convolution(layer_inp[i], '{}_{}'.format(sides[i], 'conv1'), os.path.join('model/encoder2D', 'conv1')) + cur = builder.write_elu(cur, cur + '_act') + layer_inp[i] = cur + # resblock 1 - 8 + for l in ['resblock1', 'resblock2', 'resblock3', 'resblock4', 'resblock5', 'resblock6', 'resblock7', 'resblock8']: + for i in range(len(sides)): + cur = '{}_{}'.format(sides[i], l) + layer_inp[i] = write_residual_block(layer_inp[i], cur, os.path.join('model/encoder2D', l)) + # encoder2D_out + left = builder.write_2d_convolution(layer_inp[0], sides[0] + '_encoder2D_out', 'model/encoder2D/encoder2D_out') + right = builder.write_2d_convolution(layer_inp[1], sides[1] + '_encoder2D_out', 'model/encoder2D/encoder2D_out') + return left, right + + def write_3d_encoder(input): + input = 'cost_vol' + for l in ['conv3D_1a', 'conv3D_1b', 'conv3D_1ds', + 'conv3D_2a', 'conv3D_2b', 'conv3D_2ds', + 'conv3D_3a', 'conv3D_3b', 'conv3D_3ds', + 'conv3D_4a', 'conv3D_4b', 'conv3D_4ds', + 'conv3D_5a', 'conv3D_5b']: + # Pad after each D stride == 2. + if l in ['conv3D_1ds', 'conv3D_2ds', 'conv3D_3ds', 'conv3D_4ds']: + input = builder.write_conv3d_pad(input, l + '_pad') + input = builder.write_3d_convolution(input, l, 'model/encoder3D') + # No transpose for conv3D_5b as it goes directly to decoder. + if l != 'conv3D_5b': + input = builder.write_conv3d_transform(input, l + '_tran') + input = builder.write_elu(input, l + '_act') + return input + + def write_3d_decoder(input): + cur = input + for i in range(1, 5): + l = 'deconv3D_{}'.format(i) + cur = builder.write_3d_convolution_transpose(cur, l, 'model/decoder3D') + cur = builder.write_add_tensors(cur, 'conv3D_{}b_act'.format(5 - i), l + '_add_skip') + cur = builder.write_elu(cur, l + '_act') + cur = builder.write_conv3d_transform(cur, l + '_transform') + # deconv3D_5 + cur = builder.write_3d_convolution_transpose(cur, 'deconv3D_5', 'model/decoder3D') + return cur + + builder.write_header() + builder.do_indent() + left, right = write_2d_encoder() + cur = builder.write_cost_vol(left, right, 'cost_vol', 'model/cost_vol/cost_volume_left') + cur = write_3d_encoder(cur) + cur = write_3d_decoder(cur) + # Softargmax + cur = builder.write_softargmax(cur, 'disp') + builder.write_output(cur) + builder.write_footer() diff --git a/stereoDNN/scripts/tensorrt_model_builder.py b/stereoDNN/scripts/tensorrt_model_builder.py new file mode 100644 index 0000000..b7da802 --- /dev/null +++ b/stereoDNN/scripts/tensorrt_model_builder.py @@ -0,0 +1,462 @@ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Full license terms provided in LICENSE.md file. + +""" +Generates TensorRT C++ API code from TensorFlow model. 
+""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import os +import textwrap as tw +import time +import struct +import sys + +import warnings +# Ignore 'FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated' warning. +with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning, module='h5py') + import tensorflow as tf + +from data_converters import * + +class TrtModelBuilder(object): + def __init__(self, model, net_name, code_writer, weights_writer, data_type): + self.default_indent = 4 + self.cur_indent = 0 + self.max_line_width = 160 + self.indent = tw.TextWrapper(initial_indent=' '*self.cur_indent, + subsequent_indent=' '*(self.cur_indent + self.default_indent), + width=self.max_line_width, break_long_words=False) + self.model = model + self.net_name = net_name + self.code_writer = code_writer + self.weights_writer = weights_writer + self.data_type = data_type + + def _indent_lines(self, src): + src = src.split('\n') + for i in range(len(src)): + src[i] = self.indent.fill(src[i]) + return '\n'.join(src) + + def do_indent(self): + self.cur_indent = self.cur_indent + self.default_indent + self.indent.initial_indent = ' '*self.cur_indent + + def _write_weights(self, name, src): + # Write name as null-terminated string. + self.weights_writer.write(struct.pack('%ds'%len(name), name.encode())) + self.weights_writer.write(struct.pack('B', 0)) + # Weight count and data. + src_flat = np.reshape(src, -1) + src_flat = src_flat.astype(np.float16) if self.data_type == 'fp16' else src_flat.astype(np.float32) + self.weights_writer.write(struct.pack(' +#include +#include +#include +#include "redtail_tensorrt_plugins.h" + +namespace redtail {{ namespace tensorrt +{{ + +using namespace nvinfer1; + +using weight_map = std::unordered_map; + +INetworkDefinition* create{0}Network(IBuilder& builder, IPluginContainer& plugin_factory, + DimsCHW img_dims, const weight_map& weights, DataType data_type, + ILogger& log) +{{ + INetworkDefinition* network = builder.createNetwork(); + assert(network != nullptr); +""" + self.code_writer.write(code.format(self.net_name)) + + def write_footer(self): + code = """\ + return network; +} + +} } // namespace +""" + self.code_writer.write(code) + + def write_input(self, name): + code = """\ +// Input tensor. +auto {0} = network->addInput("{0}", DataType::kFLOAT, img_dims); +assert({0} != nullptr); + +""" + code = code.format(name) + self.code_writer.write(self._indent_lines(code)) + + def write_output(self, output): + code = """\ +auto {0}_out = {0}->getOutput(0); +{0}_out->setName("{0}"); +network->markOutput(*{0}_out); + + """ + self.code_writer.write(self._indent_lines(code.format(output))) + + def write_scale(self, input, name): + # Write code. + code = """\ +// Scaling op. +auto {1} = network->addScale(*{0}, ScaleMode::kUNIFORM, + weights.at("{1}_shift"), weights.at("{1}_scale"), weights.at("{1}_power")); +assert({1} != nullptr); + +""" + code = code.format(input, name) + self.code_writer.write(self._indent_lines(code)) + # REVIEW alexeyk: default for now. 
+        self._write_weights(name + '_shift', [0.0])
+        self._write_weights(name + '_scale', [1.0])
+        self._write_weights(name + '_power', [1.0])
+
+    # TensorFlow padding computation, taken from:
+    # https://www.tensorflow.org/api_guides/python/nn#Convolution
+    def _compute_tf_padding(self, in_dim, kern_dim, stride_dim):
+        if in_dim % stride_dim == 0:
+            pad_along = max(kern_dim - stride_dim, 0)
+        else:
+            pad_along = max(kern_dim - (in_dim % stride_dim), 0)
+        pad_start = pad_along // 2
+        pad_end = pad_along - pad_start
+        return pad_start, pad_end
+
+    def write_2d_convolution(self, input, name, op_path):
+        # Convolution code.
+        conv_code = """\
+// {1} convolution op.
+auto {1} = network->addConvolution(*{0}->getOutput(0), {2}, DimsHW {{{3}, {4}}},
+                                   weights.at("{1}_k"), weights.at("{1}_b"));
+assert({1} != nullptr);
+{1}->setStride( DimsHW {{{5}, {6}}});
+{1}->setPadding(DimsHW {{{7}, {8}}});
+
+"""
+        # Padding code.
+        pad_code = """\
+// {1} padding op.
+auto {1}_pad = network->addPadding(*{0}->getOutput(0), DimsHW {{{2}, {3}}}, DimsHW {{{4}, {5}}});
+assert({1}_pad != nullptr);
+
+"""
+        g = tf.get_default_graph()
+        assert g is not None, "No default graph set"
+        # Get and check convolution operation.
+        conv_op = g.get_operation_by_name(os.path.join(op_path, 'Conv2D'))
+        # Assuming input/output of 2D convolution are in NHWC format.
+        assert conv_op.type == 'Conv2D', 'Expected Conv2D operation but got {}.'.format(conv_op.type)
+        assert conv_op.get_attr('data_format').decode() == 'NHWC', 'Only NHWC format is currently supported for 2D convolutions.'
+        assert len(conv_op.inputs) == 2, 'Convolution expected to have 2 inputs.'
+        assert len(conv_op.outputs) == 1, 'Convolution expected to have only one output.'
+        # Get and check bias operation.
+        bias_op = g.get_operation_by_name(os.path.join(op_path, 'BiasAdd'))
+        assert bias_op.type == 'BiasAdd', 'Expected BiasAdd operation but got {}.'.format(bias_op.type)
+        assert bias_op.get_attr('data_format').decode() == 'NHWC', 'Only NHWC format is currently supported for 2D convolution biases.'
+        assert len(bias_op.inputs) == 2, 'BiasAdd expected to have 2 inputs.'
+        assert len(bias_op.outputs) == 1, 'BiasAdd expected to have only one output.'
+
+        # Get weights, stride and padding.
+        kernel_weights = conv_op.inputs[1].eval()
+        kernel_shape = kernel_weights.shape
+        assert len(kernel_shape) == 4, 'Convolution kernel weights tensor expected to have rank 4.'
+        # Weights in TF are in RSCK format (in TensorRT speak).
+        kh, kw, kc, kk = kernel_shape
+        strides = conv_op.get_attr('strides')
+        assert len(strides) == 4, 'Convolution strides tensor expected to have length 4.'
+        # Compute padding.
+        in_shape = conv_op.inputs[0].shape.as_list()
+        out_shape = conv_op.outputs[0].shape.as_list()
+        pad_top, pad_bottom = self._compute_tf_padding(in_shape[1], kh, strides[1])
+        pad_left, pad_right = self._compute_tf_padding(in_shape[2], kw, strides[2])
+        # If padding is symmetrical, use TensorRT convolution padding;
+        # otherwise, explicitly pad the input.
+        trt_conv_pad_h = 0
+        trt_conv_pad_w = 0
+        code = ''
+        conv_input = input
+        if pad_top == pad_bottom and pad_left == pad_right:
+            trt_conv_pad_h = pad_top
+            trt_conv_pad_w = pad_left
+        else:
+            assert False, 'Not supported at the moment due to bug (#3199) in TRT.'
+            p_b = pad_bottom - pad_top
+            p_r = pad_right - pad_left
+            assert 0 <= p_b and p_b <= 1, 'Bottom pad should not be greater than top pad by more than 1.'
+            assert 0 <= p_r and p_r <= 1, 'Right pad should not be greater than left pad by more than 1.'
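+            # Worked example of the padding arithmetic above: in_dim=8, kern=3,
+            # stride=2 gives pad_along = max(3 - 2, 0) = 1, i.e. pad_start=0 and
+            # pad_end=1 (asymmetrical, landing here); in_dim=9, kern=3, stride=2
+            # gives pad_along=2, i.e. pad_start=pad_end=1 (symmetrical path above).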
+            # Write padding layer.
+            code = pad_code.format(input, name, pad_top, pad_left, pad_bottom, pad_right)
+            conv_input = name + '_pad'
+        # Write code.
+        code += conv_code.format(conv_input, name, kk, kh, kw, strides[1], strides[2], trt_conv_pad_h, trt_conv_pad_w)
+        self.code_writer.write(self._indent_lines(code))
+        # TRT requires kernel weights to be in KCRS format while TensorFlow uses RSCK.
+        kernel_weights = rsck_to_kcrs(kernel_weights)
+        # Write kernel weights.
+        self._write_weights(name + '_k', kernel_weights)
+        # Write bias weights.
+        bias_weights = bias_op.inputs[1].eval()
+        bias_shape = bias_weights.shape
+        assert len(bias_shape) == 1, 'Convolution bias weights tensor expected to have rank 1.'
+        assert bias_shape[0] == kernel_shape[3], 'Convolution bias size does not match convolution output channels.'
+        self._write_weights(name + '_b', bias_weights)
+        return name
+
+    def write_3d_convolution(self, input, name, op_path):
+        # Write code.
+        code = """\
+// {1} 3D convolution op.
+auto {1} = addConv3D(plugin_factory, *network, *{0}->getOutput(0),
+                     Conv3DType::kTensorFlow, {{5, {{{k_dims}}}}},
+                     Dims{{3, {{{stride_dims}}}}}, Dims{{3, {{{pad_start_dims}}}}}, Dims{{3, {{{pad_end_dims}}}}},
+                     weights.at("{1}_k"), weights.at("{1}_b"),
+                     "{1}");
+assert({1} != nullptr);
+
+"""
+        g = tf.get_default_graph()
+        assert g is not None, "No default graph set"
+        # Get and check convolution operation.
+        conv_op = g.get_operation_by_name(os.path.join(op_path, os.path.join(name, 'Conv3D')))
+        # Assuming input/output of 3D convolution are in NDHWC format.
+        assert conv_op.type == 'Conv3D', 'Expected Conv3D operation but got {}.'.format(conv_op.type)
+        assert conv_op.get_attr('data_format').decode() == 'NDHWC', 'Only NDHWC format is currently supported for 3D convolutions.'
+        assert len(conv_op.inputs) == 2, 'Convolution expected to have 2 inputs.'
+        assert len(conv_op.outputs) == 1, 'Convolution expected to have only one output.'
+        # Get and check bias operation.
+        bias_op = g.get_operation_by_name(os.path.join(op_path, os.path.join(name, 'BiasAdd')))
+        assert bias_op.type == 'BiasAdd', 'Expected BiasAdd operation but got {}.'.format(bias_op.type)
+        # REVIEW alexeyk: is this a bug in our model code? Should the bias tensor be NDCHW dim as well?
+        assert bias_op.get_attr('data_format').decode() == 'NHWC', 'Only NHWC format is currently supported for 3D convolution biases.'
+        assert len(bias_op.inputs) == 2, 'BiasAdd expected to have 2 inputs.'
+        assert len(bias_op.outputs) == 1, 'BiasAdd expected to have only one output.'
+
+        # Get weights, stride and padding.
+        kernel_weights = conv_op.inputs[1].eval()
+        kernel_shape = kernel_weights.shape
+        assert len(kernel_shape) == 5, '3D convolution kernel weights tensor expected to have rank 5.'
+        # Weights in TF are in VRSCK format (in TensorRT speak).
+        kd, kh, kw, kc, kk = kernel_shape
+        strides = conv_op.get_attr('strides')
+        assert len(strides) == 5, '3D convolution strides tensor expected to have length 5.'
+        # Compute padding.
+        in_shape = conv_op.inputs[0].shape.as_list()
+        out_shape = conv_op.outputs[0].shape.as_list()
+        pad_c_start, pad_c_end = self._compute_tf_padding(in_shape[1], kd, strides[1])
+        pad_h_start, pad_h_end = self._compute_tf_padding(in_shape[2], kh, strides[2])
+        pad_w_start, pad_w_end = self._compute_tf_padding(in_shape[3], kw, strides[3])
+        # cuDNN limitations...
+        assert pad_h_start == pad_h_end, 'Only symmetrical padding is currently supported for H dimension.'
+        assert pad_w_start == pad_w_end, 'Only symmetrical padding is currently supported for W dimension.'
+        # REVIEW alexeyk: C padding is done by padding plugin now, not by conv3d plugin.
+        p_c = pad_c_end - pad_c_start
+        assert 0 <= p_c and p_c <= 1, 'Depth end pad should not be greater than start pad by more than 1.'
+
+        code = code.format(input, name,
+                           k_dims ='{}, {}, {}, {}, {}'.format(kk, kd, kc, kh, kw),
+                           stride_dims ='{}, {}, {}'.format(strides[1], strides[2], strides[3]),
+                           pad_start_dims ='{}, {}, {}'.format(pad_c_start, pad_h_start, pad_w_start),
+                           pad_end_dims ='{}, {}, {}'.format(pad_c_end, pad_h_end, pad_w_end))
+        self.code_writer.write(self._indent_lines(code))
+        # TRT requires kernel weights to be in KVCRS format while TensorFlow uses VRSCK.
+        kernel_weights = vrsck_to_kvcrs(kernel_weights)
+        # Write kernel weights.
+        self._write_weights(name + '_k', kernel_weights)
+        # Write bias weights.
+        bias_weights = bias_op.inputs[1].eval()
+        bias_shape = bias_weights.shape
+        assert len(bias_shape) == 1, '3D convolution bias weights tensor expected to have rank 1.'
+        # REVIEW alexeyk: should really assert against D?
+        assert bias_shape[0] == kernel_shape[4], '3D convolution bias size does not match convolution output channels.'
+        self._write_weights(name + '_b', bias_weights)
+        return name
+
+    def write_3d_convolution_transpose(self, input, name, op_path):
+        # Write code.
+        code = """\
+// {1} 3D transposed convolution op.
+Dims {1}_out_dims{{4, {{{out_dims}}}}};
+auto {1} = addConv3DTranspose(plugin_factory, *network, *{0}->getOutput(0),
+                              Conv3DType::kTensorFlow, {{5, {{{k_dims}}}}}, {1}_out_dims,
+                              Dims{{3, {{{stride_dims}}}}}, Dims{{3, {{{pad_start_dims}}}}}, Dims{{3, {{{pad_end_dims}}}}},
+                              weights.at("{1}_k"), weights.at("{1}_b"),
+                              "{1}");
+assert({1} != nullptr);
+
+"""
+        slice_code = """\
+// {0} output slice op.
+auto {0}_slice_layer = addSlice(plugin_factory, *network, *{0}->getOutput(0),
+                                {0}_out_dims,
+                                {{4, {{0, 0, 0, 0}}}},
+                                {{4, {{{0}_out_dims.d[0] - 1, {0}_out_dims.d[1], {0}_out_dims.d[2], {0}_out_dims.d[3]}}}},
+                                "{0}_slice");
+assert({0}_slice_layer != nullptr);
+
+"""
+        g = tf.get_default_graph()
+        assert g is not None, "No default graph set"
+        # Get and check transposed convolution operation.
+        conv_op = g.get_operation_by_name(os.path.join(op_path, os.path.join(name, 'conv3d_transpose')))
+        # Assuming input/output of 3D transposed convolution are in NDHWC format.
+        assert conv_op.type == 'Conv3DBackpropInputV2', 'Expected Conv3DBackpropInputV2 operation but got {}.'.format(conv_op.type)
+        assert conv_op.get_attr('data_format').decode() == 'NDHWC', 'Only NDHWC format is currently supported for 3D transposed convolutions.'
+        assert len(conv_op.inputs) == 3, 'Transposed convolution expected to have 3 inputs.'
+        assert len(conv_op.outputs) == 1, 'Transposed convolution expected to have only one output.'
+        # Get and check bias operation.
+        bias_op = g.get_operation_by_name(os.path.join(op_path, os.path.join(name, 'BiasAdd')))
+        assert bias_op.type == 'BiasAdd', 'Expected BiasAdd operation but got {}.'.format(bias_op.type)
+        # REVIEW alexeyk: is this a bug in our model code? Should the bias tensor be NDCHW dim as well?
+        assert bias_op.get_attr('data_format').decode() == 'NHWC', 'Only NHWC format is currently supported for 3D transposed convolution biases.'
+        assert len(bias_op.inputs) == 2, 'BiasAdd expected to have 2 inputs.'
+        assert len(bias_op.outputs) == 1, 'BiasAdd expected to have only one output.'
+
+        # Get weights, stride and padding.
+ kernel_weights = conv_op.inputs[1].eval() + kernel_shape = kernel_weights.shape + assert len(kernel_shape) == 5, '3D transposed convolution kernel weights tensor expected to have rank 5.' + # Weights in TF are in VRSCK format. + kd, kh, kw, kc, kk = kernel_shape + strides = conv_op.get_attr('strides') + assert len(strides) == 5, '3D transposed convolution strides tensor expected to have length 5.' + # Compute padding. + # Note that padding is with respect to convolution input, that is transposed convolution output. + conv_out_shape = conv_op.inputs[0].shape.as_list() + conv_in_shape = np.array(conv_op.outputs[0].shape.as_list()) + pad_c_start, pad_c_end = self._compute_tf_padding(conv_in_shape[1], kd, strides[1]) + pad_h_start, pad_h_end = self._compute_tf_padding(conv_in_shape[2], kh, strides[2]) + pad_w_start, pad_w_end = self._compute_tf_padding(conv_in_shape[3], kw, strides[3]) + # cuDNN limitations... + assert pad_h_start == pad_h_end, 'Only symmetrical padding is currently supported for H dimension.' + assert pad_w_start == pad_w_end, 'Only symmetrical padding is currently supported for W dimension.' + p_c = pad_c_end - pad_c_start + assert 0 <= p_c and p_c <= 1, 'Depth end pad should not be greater than start pad by more than 1.' + # This is a special case for 3D transposed convolution: + # Do not pad in C dimension in cuDNN but rather increase corresponding output (i.e. convo input) dimension. + # In such configuration, this transposed convolution should be followed by slicing plugin. + if pad_c_end != pad_c_start: + conv_in_shape[1] += 1 + pad_c_start = 0 + pad_c_end = 0 + code = code.format(input, name, + k_dims ='{}, {}, {}, {}, {}'.format(kk, kd, kc, kh, kw), + out_dims ='{}, {}, {}, {}'.format(*(conv_in_shape[1:])[[0, 3, 1, 2]]), + stride_dims ='{}, {}, {}'.format(strides[1], strides[2], strides[3]), + pad_start_dims ='{}, {}, {}'.format(pad_c_start, pad_h_start, pad_w_start), + pad_end_dims ='{}, {}, {}'.format(pad_c_end, pad_h_end, pad_w_end)) + self.code_writer.write(self._indent_lines(code)) + out_name = name + # Write slice layer in case of asymmetric C padding. + if p_c != 0: + self.code_writer.write(self._indent_lines(slice_code.format(name))) + out_name += '_slice_layer' + # TRT requires kernel weights to be in KVCRS format while TensorFlow uses VRSCK. + kernel_weights = vrsck_to_kvcrs(kernel_weights) + # Write kernel weights. + self._write_weights(name + '_k', kernel_weights) + # Write bias weights. + bias_weights = bias_op.inputs[1].eval() + bias_shape = bias_weights.shape + assert len(bias_shape) == 1, '3D transposed convolution bias weights tensor expected to have rank 1.' + # REVIEW alexeyk: should really assert against D? + assert bias_shape[0] == kernel_shape[3], '3D transposed convolution bias size does not match convolution output channels.' + self._write_weights(name + '_b', bias_weights) + return out_name + + def write_elu(self, input, name): + # Write code. + code = """\ +// {1} ELU activation op. +auto {1} = addElu(plugin_factory, *network, *{0}->getOutput(0), data_type, "{1}"); +assert({1} != nullptr); + +""" + code = code.format(input, name) + self.code_writer.write(self._indent_lines(code)) + return name + + def write_cost_vol(self, left_input, right_input, name, op_path): + # Write code. + code = """\ +// {2} cost volume op. 
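+// (Concatenates left features with disparity-shifted right features; the
+// output has max_disp disparity levels and twice the input channel count.)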
+auto {2} = addCostVolume(plugin_factory, *network, *{0}->getOutput(0), *{1}->getOutput(0), {3}, "{2}");
+assert({2} != nullptr);
+
+"""
+        g = tf.get_default_graph()
+        assert g is not None, "No default graph set"
+        last_op = g.get_operation_by_name(os.path.join(op_path, 'concat'))
+        out_shape = last_op.outputs[0].shape
+        assert len(out_shape) == 5, 'Cost volume output tensor expected to have rank 5 but got {}.'.format(len(out_shape))
+        # Max disparity is the second outermost dimension in the output.
+        max_disp = out_shape[1]
+        code = code.format(left_input, right_input, name, max_disp)
+        self.code_writer.write(self._indent_lines(code))
+        return name
+
+    def write_conv3d_pad(self, input, name):
+        # Write code.
+        code = """\
+// {1} padding op.
+auto {1} = addPad(plugin_factory, *network, *{0}->getOutput(0), {{0, 0, 0, 0}}, {{1, 0, 0, 0}}, "{1}");
+assert({1} != nullptr);
+
+"""
+        code = code.format(input, name)
+        self.code_writer.write(self._indent_lines(code))
+        return name
+
+    def write_conv3d_transform(self, input, name):
+        code = """\
+// Transpose output: KDHW -> DKHW for conv3d and DKHW -> KDHW for conv3d_transpose
+auto {1} = addTransform(plugin_factory, *network, *{0}->getOutput(0), {{1, 0, 2, 3}}, "{1}_transform");
+assert({1} != nullptr);
+
+"""
+        code = code.format(input, name)
+        self.code_writer.write(self._indent_lines(code))
+        return name
+
+    def write_add_tensors(self, t1, t2, name):
+        # Write code.
+        code = """\
+// {2} tensor add op.
+auto {2} = network->addElementWise(*({0}->getOutput(0)), *({1}->getOutput(0)), ElementWiseOperation::kSUM);
+assert({2} != nullptr);
+
+"""
+        code = code.format(t1, t2, name)
+        self.code_writer.write(self._indent_lines(code))
+        return name
+
+    def write_softargmax(self, input, name):
+        code = """\
+// Softargmax.
+auto {1} = addSoftargmax(plugin_factory, *network, *{0}->getOutput(0), "{1}_softargmax");
+assert({1} != nullptr);
+
+"""
+        code = code.format(input, name)
+        self.code_writer.write(self._indent_lines(code))
+        return name
diff --git a/stereoDNN/scripts/test_data_generator.py b/stereoDNN/scripts/test_data_generator.py
new file mode 100644
index 0000000..bcf4948
--- /dev/null
+++ b/stereoDNN/scripts/test_data_generator.py
@@ -0,0 +1,311 @@
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Full license terms provided in LICENSE.md file.
+
+"""
+Generates data for nvstereo_inference library tests.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import numpy as np
+import os
+import struct
+import time
+
+import warnings
+# Ignore 'FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated' warning.
+with warnings.catch_warnings():
+    warnings.filterwarnings("ignore", category=FutureWarning, module='h5py')
+    import tensorflow as tf
+
+from data_converters import *
+
+parser = argparse.ArgumentParser(description='Stereo DNN TensorRT C++ test data generator')
+
+parser.add_argument('--data_dir', type=str, help='directory path to store generated test data', required=True)
+
+args = parser.parse_args()
+
+def get_rand(shape):
+    return np.random.randn(*shape).astype(np.float32)
+
+def write_bin(src, filename):
+    with open(filename, 'wb') as w:
+        w.write(struct.pack('<%df' % src.size, *np.reshape(src, -1)))
+
+def create_conv3d_plugin_data(data_dir):
+    print("---")
+    print("Creating data for Conv3D plugin...")
+    print("---")
+    # Testing D padding and strides with 2 subsequent convolutions.
+    # Note: similar to conv3D_3ds->conv3D_4 blocks in NVSmall.
+    np.random.seed(1)
+    x = get_rand([1, 8, 9, 9, 3]) # NDHWC
+    w = get_rand([3, 3, 3, 3, 3]) # VRSCK
+    s_1 = [1, 1, 1, 1, 1]
+    s_2 = [1, 2, 2, 2, 1]
+    # Second convo has non-symmetrical D padding.
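+    # (D=8, kernel 3, stride 2: pad_along = max(3 - 2, 0) = 1, so the 'SAME'
+    #  padding is (0, 1) along D, which the plugin must reproduce.)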
+ y_1 = tf.nn.conv3d(x, w, s_1, padding='SAME') + y_2 = tf.nn.conv3d(y_1, w, s_2, padding='SAME').eval() + write_bin(ndhwc_to_ndchw(x), os.path.join(data_dir, 'conv3d_07_x.bin')) + write_bin(vrsck_to_kvcrs(w), os.path.join(data_dir, 'conv3d_07_w.bin')) + write_bin(ndhwc_to_ndchw(y_2), os.path.join(data_dir, 'conv3d_07_y.bin')) + +def create_conv3d_tran_plugin_data(data_dir): + print("---") + print("Creating data for Conv3dTranspose plugin...") + print("---") + # The most basic test: single input/feature map, no padding/unit strides. + np.random.seed(1) + y = get_rand([1, 1, 1, 1, 1]) # NDHWK + w = get_rand([1, 3, 3, 3, 1]) # VRSCK + s = [1, 1, 1, 1, 1] + x_shape = [1, 1, 3, 3, 3] # NDHWC + x = tf.nn.conv3d_transpose(y, w, output_shape=x_shape, strides=s, padding='VALID').eval() + write_bin(ndhwc_to_ndchw(y), os.path.join(data_dir, 'conv3d_tran_01_y.bin')) + write_bin(vrsck_to_kvcrs(w), os.path.join(data_dir, 'conv3d_tran_01_w.bin')) + write_bin(ndhwc_to_ndchw(x), os.path.join(data_dir, 'conv3d_tran_01_x.bin')) + + # Testing HW padding and strides (D == 1, K == 1). + np.random.seed(1) + y = get_rand([1, 1, 3, 3, 1]) # NDHWK + w = get_rand([1, 3, 3, 3, 1]) # VRSCK + s = [1, 1, 2, 2, 1] + x_shape = [1, 1, 5, 5, 3] # NDHWC + x = tf.nn.conv3d_transpose(y, w, output_shape=x_shape, strides=s, padding='SAME').eval() + write_bin(ndhwc_to_ndchw(y), os.path.join(data_dir, 'conv3d_tran_02_y.bin')) + write_bin(vrsck_to_kvcrs(w), os.path.join(data_dir, 'conv3d_tran_02_w.bin')) + write_bin(ndhwc_to_ndchw(x), os.path.join(data_dir, 'conv3d_tran_02_x.bin')) + + # Testing DHW padding and strides. + # Note: similar to deconv3D_1 convo in NVSmall. + np.random.seed(1) + y = get_rand([1, 4, 5, 5, 8]) # NDHWK + w = get_rand([3, 3, 3, 4, 8]) # VRSCK + s = [1, 2, 2, 2, 1] + x_shape = [1, 8, 9, 9, 4] # NDHWC + x = tf.nn.conv3d_transpose(y, w, output_shape=x_shape, strides=s, padding='SAME').eval() + write_bin(ndhwc_to_ncdhw(y), os.path.join(data_dir, 'conv3d_tran_03_y.bin')) + write_bin(vrsck_to_kvcrs(w), os.path.join(data_dir, 'conv3d_tran_03_w.bin')) + write_bin(ndhwc_to_ndchw(x), os.path.join(data_dir, 'conv3d_tran_03_x.bin')) + + # Testing DHW padding and strides with bias and ELU. + # Note: similar to deconv3D_1 convo in NVSmall. + np.random.seed(1) + y = get_rand([1, 4, 5, 5, 8]) # NDHWK + w = get_rand([3, 3, 3, 4, 8]) # VRSCK + b = get_rand([4]) # C + s = [1, 2, 2, 2, 1] + x_shape = [1, 8, 9, 9, 4] # NDHWC + x = tf.nn.conv3d_transpose(y, w, output_shape=x_shape, strides=s, padding='SAME') + x_b = tf.nn.bias_add(x, b) + x_a = tf.nn.elu(x_b).eval() + write_bin(ndhwc_to_ncdhw(y), os.path.join(data_dir, 'conv3d_tran_04_y.bin')) + write_bin(vrsck_to_kvcrs(w), os.path.join(data_dir, 'conv3d_tran_04_w.bin')) + write_bin(ndhwc_to_ndchw(x_a), os.path.join(data_dir, 'conv3d_tran_04_x.bin')) + write_bin(b, os.path.join(data_dir, 'conv3d_tran_04_b.bin')) + + # Testing DHW padding and strides in 2 subsequent deconvolutions. + # Note: similar to deconv3D_1->deconv3D_2 convo in NVSmall. + np.random.seed(1) + # First convo^T. + y = get_rand([1, 4, 5, 5, 16]) # NDHWK + w1 = get_rand([3, 3, 3, 8, 16]) # VRSCK + s1 = [1, 2, 2, 2, 1] + x1_shape = [1, 8, 9, 9, 8] # NDHWC + # Second convo^T. 
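+    # (Shapes grow through the chain: D 4->8->16 and H/W 5->9->17, i.e.
+    #  in*stride and in*stride-1 respectively for these 'SAME' transposes.)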
+    w2 = get_rand([3, 3, 3, 4, 8]) # VRSCK
+    s2 = [1, 2, 2, 2, 1]
+    x2_shape = [1, 16, 17, 17, 4]  # NDHWC
+    x1 = tf.nn.conv3d_transpose(y, w1, output_shape=x1_shape, strides=s1, padding='SAME')
+    x2 = tf.nn.conv3d_transpose(x1, w2, output_shape=x2_shape, strides=s2, padding='SAME').eval()
+    write_bin(ndhwc_to_ncdhw(y), os.path.join(data_dir, 'conv3d_tran_05_y.bin'))
+    write_bin(vrsck_to_kvcrs(w1), os.path.join(data_dir, 'conv3d_tran_05_w1.bin'))
+    write_bin(vrsck_to_kvcrs(w2), os.path.join(data_dir, 'conv3d_tran_05_w2.bin'))
+    write_bin(ndhwc_to_ndchw(x2), os.path.join(data_dir, 'conv3d_tran_05_x.bin'))
+
+def create_cost_volume_data(data_dir):
+    def cost_volume(left, right, max_disp):
+        height = int(left.shape[1])
+        width = int(left.shape[2])
+        depth = int(left.shape[3])
+
+        right_padded = tf.pad(right, [[0, 0], [0, 0], [max_disp - 1, 0], [0, 0]], "CONSTANT")
+        right_disp = tf.extract_image_patches(right_padded, [1, height, width, 1], [1, 1, 1, 1], [1, 1, 1, 1], padding="VALID")
+        right_disp = tf.squeeze(right_disp, axis=1)
+        disp_dim = int(right_disp.shape[1])
+        right_disp = tf.reshape(right_disp, [-1, disp_dim, height, width, depth])
+        right_disp = tf.reverse(right_disp, [1])
+
+        left_disp = tf.expand_dims(left, axis=1)
+        left_disp = tf.tile(left_disp, [1, disp_dim, 1, 1, 1])
+
+        cost_volume = tf.concat([left_disp, right_disp], axis=4)
+
+        return cost_volume
+
+    print("---")
+    print("Creating data for CostVolume plugin...")
+    print("---")
+
+    # Basic test: 6x6x4 input, max_disp == 2, output is 2x6x6x8.
+    np.random.seed(1)
+    in_shape = [1, 6, 6, 4] # NHWC
+    left = get_rand(in_shape)
+    right = get_rand(in_shape)
+    max_disp = 2
+    cost_vol = cost_volume(left, right, max_disp).eval()
+    write_bin(nhwc_to_nchw(left), os.path.join(data_dir, 'cost_vol_01_l.bin'))
+    write_bin(nhwc_to_nchw(right), os.path.join(data_dir, 'cost_vol_01_r.bin'))
+    write_bin(ndhwc_to_ndchw(cost_vol), os.path.join(data_dir, 'cost_vol_01_cv.bin'))
+
+    # More realistic test: 32x33x8 input, max_disp == 12, output is 12x32x33x16.
+    np.random.seed(1)
+    in_shape = [1, 32, 33, 8] # NHWC
+    #in_shape = [1, 128, 257, 32] # NHWC
+    left = get_rand(in_shape)
+    right = get_rand(in_shape)
+    max_disp = 12
+    #max_disp = 24
+    cost_vol = cost_volume(left, right, max_disp).eval()
+    write_bin(nhwc_to_nchw(left), os.path.join(data_dir, 'cost_vol_02_l.bin'))
+    write_bin(nhwc_to_nchw(right), os.path.join(data_dir, 'cost_vol_02_r.bin'))
+    write_bin(ndhwc_to_ndchw(cost_vol), os.path.join(data_dir, 'cost_vol_02_cv.bin'))
+
+def create_softargmax_data(data_dir):
+    def softargmax(volume):
+        input_depth = int(volume.shape[1])
+        index_steps = tf.constant(np.reshape(np.array(range(input_depth)), (1, input_depth, 1, 1, 1)), dtype=tf.float32)
+        prob_volume = tf.nn.softmax(tf.multiply(volume, -1.0), dim=1)
+        softargmax_result = tf.reduce_sum(tf.multiply(prob_volume, index_steps), axis=1)
+
+        return softargmax_result
+
+    print("---")
+    print("Creating data for Softargmax plugin...")
+    print("---")
+
+    # Basic test: 4x5x7x1 input, output is 5x7x1.
+    np.random.seed(1)
+    x = get_rand([1, 4, 5, 7, 1]) # NDHWC
+    y = softargmax(x).eval()
+    write_bin(ndhwc_to_ndchw(x), os.path.join(data_dir, 'softargmax_01_x.bin'))
+    write_bin(nhwc_to_nchw(y), os.path.join(data_dir, 'softargmax_01_y.bin'))
+
+    # Large test: 2x12x33x65x1 input, output is 2x33x65x1.
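+    # (Softargmax weights each disparity index by softmax(-cost), so the result
+    #  is the expected disparity per pixel and the D axis collapses away.)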
+ np.random.seed(1) + x = get_rand([2, 12, 33, 65, 1]) # NDHWC + y = softargmax(x).eval() + write_bin(ndhwc_to_ndchw(x), os.path.join(data_dir, 'softargmax_02_x.bin')) + write_bin(nhwc_to_nchw(y), os.path.join(data_dir, 'softargmax_02_y.bin')) + +def main(): + config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True) + config.gpu_options.allow_growth = True + sess = tf.InteractiveSession(config=config) + + create_elu_plugin_data(args.data_dir) + create_conv3d_plugin_data(args.data_dir) + create_conv3d_tran_plugin_data(args.data_dir) + create_cost_volume_data(args.data_dir) + create_softargmax_data(args.data_dir) + + print("Done.") + +if __name__ == '__main__': + main() diff --git a/stereoDNN/tests/CMakeLists.txt b/stereoDNN/tests/CMakeLists.txt new file mode 100644 index 0000000..37bd2db --- /dev/null +++ b/stereoDNN/tests/CMakeLists.txt @@ -0,0 +1,33 @@ +cmake_minimum_required(VERSION 3.5) + +set(PROJECT_NAME nvstereo_tests) +project(${PROJECT_NAME}) + +find_package(CUDA 9.0 REQUIRED) +find_package(OpenCV 3.3.1 REQUIRED) + +set(GTEST_ROOT "/usr/src/gtest") +find_package(GTest REQUIRED) + +include_directories(${CUDA_INCLUDE_DIRS}) +include_directories(${GTEST_INCLUDE_DIR}) +include_directories(${CMAKE_SOURCE_DIR}/lib) + +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") + +file(GLOB ${PROJECT_NAME}_sources ./*.cpp) +set(PROJECT_SOURCES ${${PROJECT_NAME}_sources}) + +set(TARGET_NAME ${PROJECT_NAME}${TARGET_SUFFIX}) + +add_executable(${TARGET_NAME} ${PROJECT_SOURCES}) + +target_link_libraries(${TARGET_NAME} + nvstereo_inference${TARGET_SUFFIX} + ${CUDA_LIBRARIES} + cudnn + nvinfer + ${GTEST_LIBRARIES} + pthread + opencv_core) diff --git a/stereoDNN/tests/data/conv3d_01_w.bin b/stereoDNN/tests/data/conv3d_01_w.bin new file mode 100644 index 0000000..ef006ca Binary files /dev/null and b/stereoDNN/tests/data/conv3d_01_w.bin differ diff --git a/stereoDNN/tests/data/conv3d_01_x.bin b/stereoDNN/tests/data/conv3d_01_x.bin new file mode 100644 index 0000000..deb71fe Binary files /dev/null and b/stereoDNN/tests/data/conv3d_01_x.bin differ diff --git a/stereoDNN/tests/data/conv3d_01_y.bin b/stereoDNN/tests/data/conv3d_01_y.bin new file mode 100644 index 0000000..223950d Binary files /dev/null and b/stereoDNN/tests/data/conv3d_01_y.bin differ diff --git a/stereoDNN/tests/data/conv3d_02_w.bin b/stereoDNN/tests/data/conv3d_02_w.bin new file mode 100644 index 0000000..9d5fe0f Binary files /dev/null and b/stereoDNN/tests/data/conv3d_02_w.bin differ diff --git a/stereoDNN/tests/data/conv3d_02_x.bin b/stereoDNN/tests/data/conv3d_02_x.bin new file mode 100644 index 0000000..6ec89cd Binary files /dev/null and b/stereoDNN/tests/data/conv3d_02_x.bin differ diff --git a/stereoDNN/tests/data/conv3d_02_y.bin b/stereoDNN/tests/data/conv3d_02_y.bin new file mode 100644 index 0000000..9629d27 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_02_y.bin differ diff --git a/stereoDNN/tests/data/conv3d_03_w.bin b/stereoDNN/tests/data/conv3d_03_w.bin new file mode 100644 index 0000000..88c9fad Binary files /dev/null and b/stereoDNN/tests/data/conv3d_03_w.bin differ diff --git a/stereoDNN/tests/data/conv3d_03_x.bin b/stereoDNN/tests/data/conv3d_03_x.bin new file mode 100644 index 0000000..95ae1ec Binary files /dev/null and b/stereoDNN/tests/data/conv3d_03_x.bin differ diff --git a/stereoDNN/tests/data/conv3d_03_y.bin b/stereoDNN/tests/data/conv3d_03_y.bin new file mode 100644 index 0000000..61cab77 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_03_y.bin differ 
diff --git a/stereoDNN/tests/data/conv3d_04_w.bin b/stereoDNN/tests/data/conv3d_04_w.bin new file mode 100644 index 0000000..04464ff Binary files /dev/null and b/stereoDNN/tests/data/conv3d_04_w.bin differ diff --git a/stereoDNN/tests/data/conv3d_04_x.bin b/stereoDNN/tests/data/conv3d_04_x.bin new file mode 100644 index 0000000..6b565e0 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_04_x.bin differ diff --git a/stereoDNN/tests/data/conv3d_04_y.bin b/stereoDNN/tests/data/conv3d_04_y.bin new file mode 100644 index 0000000..51b3bc3 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_04_y.bin differ diff --git a/stereoDNN/tests/data/conv3d_05_w.bin b/stereoDNN/tests/data/conv3d_05_w.bin new file mode 100644 index 0000000..04464ff Binary files /dev/null and b/stereoDNN/tests/data/conv3d_05_w.bin differ diff --git a/stereoDNN/tests/data/conv3d_05_x.bin b/stereoDNN/tests/data/conv3d_05_x.bin new file mode 100644 index 0000000..6b565e0 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_05_x.bin differ diff --git a/stereoDNN/tests/data/conv3d_05_y.bin b/stereoDNN/tests/data/conv3d_05_y.bin new file mode 100644 index 0000000..68ae000 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_05_y.bin differ diff --git a/stereoDNN/tests/data/conv3d_06_b.bin b/stereoDNN/tests/data/conv3d_06_b.bin new file mode 100644 index 0000000..9eba91c Binary files /dev/null and b/stereoDNN/tests/data/conv3d_06_b.bin differ diff --git a/stereoDNN/tests/data/conv3d_06_w.bin b/stereoDNN/tests/data/conv3d_06_w.bin new file mode 100644 index 0000000..f281e5f Binary files /dev/null and b/stereoDNN/tests/data/conv3d_06_w.bin differ diff --git a/stereoDNN/tests/data/conv3d_06_x.bin b/stereoDNN/tests/data/conv3d_06_x.bin new file mode 100644 index 0000000..6b565e0 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_06_x.bin differ diff --git a/stereoDNN/tests/data/conv3d_06_y.bin b/stereoDNN/tests/data/conv3d_06_y.bin new file mode 100644 index 0000000..d94cdd0 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_06_y.bin differ diff --git a/stereoDNN/tests/data/conv3d_07_w.bin b/stereoDNN/tests/data/conv3d_07_w.bin new file mode 100644 index 0000000..78711c2 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_07_w.bin differ diff --git a/stereoDNN/tests/data/conv3d_07_x.bin b/stereoDNN/tests/data/conv3d_07_x.bin new file mode 100644 index 0000000..6b565e0 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_07_x.bin differ diff --git a/stereoDNN/tests/data/conv3d_07_y.bin b/stereoDNN/tests/data/conv3d_07_y.bin new file mode 100644 index 0000000..f2b3793 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_07_y.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_01_w.bin b/stereoDNN/tests/data/conv3d_tran_01_w.bin new file mode 100644 index 0000000..53dad4a Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_01_w.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_01_x.bin b/stereoDNN/tests/data/conv3d_tran_01_x.bin new file mode 100644 index 0000000..88f9fd8 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_01_x.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_01_y.bin b/stereoDNN/tests/data/conv3d_tran_01_y.bin new file mode 100644 index 0000000..3100966 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_01_y.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_02_w.bin b/stereoDNN/tests/data/conv3d_tran_02_w.bin new file mode 100644 index 0000000..e7deb6e Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_02_w.bin 
differ diff --git a/stereoDNN/tests/data/conv3d_tran_02_x.bin b/stereoDNN/tests/data/conv3d_tran_02_x.bin new file mode 100644 index 0000000..c92034e Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_02_x.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_02_y.bin b/stereoDNN/tests/data/conv3d_tran_02_y.bin new file mode 100644 index 0000000..a0e96b9 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_02_y.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_03_w.bin b/stereoDNN/tests/data/conv3d_tran_03_w.bin new file mode 100644 index 0000000..0be9855 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_03_w.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_03_x.bin b/stereoDNN/tests/data/conv3d_tran_03_x.bin new file mode 100644 index 0000000..10ec61f Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_03_x.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_03_y.bin b/stereoDNN/tests/data/conv3d_tran_03_y.bin new file mode 100644 index 0000000..cf0bea3 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_03_y.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_04_b.bin b/stereoDNN/tests/data/conv3d_tran_04_b.bin new file mode 100644 index 0000000..bd752ea Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_04_b.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_04_w.bin b/stereoDNN/tests/data/conv3d_tran_04_w.bin new file mode 100644 index 0000000..0be9855 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_04_w.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_04_x.bin b/stereoDNN/tests/data/conv3d_tran_04_x.bin new file mode 100644 index 0000000..2dfad65 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_04_x.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_04_y.bin b/stereoDNN/tests/data/conv3d_tran_04_y.bin new file mode 100644 index 0000000..cf0bea3 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_04_y.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_05_w1.bin b/stereoDNN/tests/data/conv3d_tran_05_w1.bin new file mode 100644 index 0000000..c6e49dd Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_05_w1.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_05_w2.bin b/stereoDNN/tests/data/conv3d_tran_05_w2.bin new file mode 100644 index 0000000..84cd269 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_05_w2.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_05_x.bin b/stereoDNN/tests/data/conv3d_tran_05_x.bin new file mode 100644 index 0000000..ef1f2d0 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_05_x.bin differ diff --git a/stereoDNN/tests/data/conv3d_tran_05_y.bin b/stereoDNN/tests/data/conv3d_tran_05_y.bin new file mode 100644 index 0000000..ea06255 Binary files /dev/null and b/stereoDNN/tests/data/conv3d_tran_05_y.bin differ diff --git a/stereoDNN/tests/data/cost_vol_01_cv.bin b/stereoDNN/tests/data/cost_vol_01_cv.bin new file mode 100644 index 0000000..a1543e5 Binary files /dev/null and b/stereoDNN/tests/data/cost_vol_01_cv.bin differ diff --git a/stereoDNN/tests/data/cost_vol_01_l.bin b/stereoDNN/tests/data/cost_vol_01_l.bin new file mode 100644 index 0000000..f20a15b Binary files /dev/null and b/stereoDNN/tests/data/cost_vol_01_l.bin differ diff --git a/stereoDNN/tests/data/cost_vol_01_r.bin b/stereoDNN/tests/data/cost_vol_01_r.bin new file mode 100644 index 0000000..421fadf Binary files /dev/null and b/stereoDNN/tests/data/cost_vol_01_r.bin differ diff --git 
a/stereoDNN/tests/data/cost_vol_02_cv.bin b/stereoDNN/tests/data/cost_vol_02_cv.bin new file mode 100644 index 0000000..dd4cfe8 Binary files /dev/null and b/stereoDNN/tests/data/cost_vol_02_cv.bin differ diff --git a/stereoDNN/tests/data/cost_vol_02_l.bin b/stereoDNN/tests/data/cost_vol_02_l.bin new file mode 100644 index 0000000..b21dca5 Binary files /dev/null and b/stereoDNN/tests/data/cost_vol_02_l.bin differ diff --git a/stereoDNN/tests/data/cost_vol_02_r.bin b/stereoDNN/tests/data/cost_vol_02_r.bin new file mode 100644 index 0000000..f194f58 Binary files /dev/null and b/stereoDNN/tests/data/cost_vol_02_r.bin differ diff --git a/stereoDNN/tests/data/elu_i_01.bin b/stereoDNN/tests/data/elu_i_01.bin new file mode 100644 index 0000000..b933285 Binary files /dev/null and b/stereoDNN/tests/data/elu_i_01.bin differ diff --git a/stereoDNN/tests/data/elu_i_02.bin b/stereoDNN/tests/data/elu_i_02.bin new file mode 100644 index 0000000..53e745c Binary files /dev/null and b/stereoDNN/tests/data/elu_i_02.bin differ diff --git a/stereoDNN/tests/data/elu_o_01.bin b/stereoDNN/tests/data/elu_o_01.bin new file mode 100644 index 0000000..58c4fe7 Binary files /dev/null and b/stereoDNN/tests/data/elu_o_01.bin differ diff --git a/stereoDNN/tests/data/elu_o_02.bin b/stereoDNN/tests/data/elu_o_02.bin new file mode 100644 index 0000000..f034c43 Binary files /dev/null and b/stereoDNN/tests/data/elu_o_02.bin differ diff --git a/stereoDNN/tests/data/softargmax_01_x.bin b/stereoDNN/tests/data/softargmax_01_x.bin new file mode 100644 index 0000000..b62ba78 Binary files /dev/null and b/stereoDNN/tests/data/softargmax_01_x.bin differ diff --git a/stereoDNN/tests/data/softargmax_01_y.bin b/stereoDNN/tests/data/softargmax_01_y.bin new file mode 100644 index 0000000..8914737 Binary files /dev/null and b/stereoDNN/tests/data/softargmax_01_y.bin differ diff --git a/stereoDNN/tests/data/softargmax_02_x.bin b/stereoDNN/tests/data/softargmax_02_x.bin new file mode 100644 index 0000000..1523e7c Binary files /dev/null and b/stereoDNN/tests/data/softargmax_02_x.bin differ diff --git a/stereoDNN/tests/data/softargmax_02_y.bin b/stereoDNN/tests/data/softargmax_02_y.bin new file mode 100644 index 0000000..2d75a09 Binary files /dev/null and b/stereoDNN/tests/data/softargmax_02_y.bin differ diff --git a/stereoDNN/tests/tests_main.cpp b/stereoDNN/tests/tests_main.cpp new file mode 100644 index 0000000..4a54fcf --- /dev/null +++ b/stereoDNN/tests/tests_main.cpp @@ -0,0 +1,988 @@ +// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Full license terms provided in LICENSE.md file. + +#include <chrono> +#include <fstream> +#include <iostream> +#include <memory> +#include <string> +#include <vector> + +#include <gtest/gtest.h> +#include <opencv2/opencv.hpp> + +#include "redtail_tensorrt_plugins.h" +#include "internal_utils.h" + +using namespace nvinfer1; +using namespace redtail::tensorrt; + +using FloatVec = std::vector<float>; + +// ----------------------------------------------------------------- +// TensorRT logger.
+// ----------------------------------------------------------------- +class Logger : public nvinfer1::ILogger +{ +public: + Logger(ILogger::Severity max_log_level) : + max_log_level_(max_log_level) + { + } + + void log(nvinfer1::ILogger::Severity severity, const char* msg) override + { + if (severity > max_log_level_) + return; + + switch (severity) + { + case Severity::kINTERNAL_ERROR: std::cerr << "TRT INTERNAL_ERROR: "; break; + case Severity::kERROR: std::cerr << "TRT ERROR: "; break; + case Severity::kWARNING: std::cerr << "TRT WARNING: "; break; + case Severity::kINFO: std::cerr << "TRT INFO: "; break; + default: std::cerr << "TRT UNKNOWN: "; break; + } + std::cerr << msg << std::endl; + } + +private: + Severity max_log_level_; +}; + +static std::unique_ptr<Logger> g_logger; + +std::string g_data_dir(""); + +// ----------------------------------------------------------------- +// Helper struct to store network input information. +// ----------------------------------------------------------------- +struct NetInput +{ + std::string name; + Dims dims; + FloatVec data; +}; + +// ----------------------------------------------------------------- +// Main test driver function. Creates all required TensorRT components +// to run a TRT plugin. The plugin is created by the provided factory. +// Returns the output of the plugin. +// ----------------------------------------------------------------- +template<typename FactoryOp, typename PostProcOp> +FloatVec runPlugin(int batch_size, const std::vector<NetInput>& inputs, Dims out_dims, + FactoryOp factory_op, PostProcOp post_proc_op, + IPluginContainer& factory, DataType data_type = DataType::kFLOAT) +{ + // REVIEW alexeyk: assuming single output for now. + const char* output_name = "output"; + + IBuilder* builder = createInferBuilder(*g_logger); + // Note: must use EXPECT_* as ASSERT_ contains return statement. + EXPECT_NE(builder, nullptr); + + INetworkDefinition* network = builder->createNetwork(); + EXPECT_NE(network, nullptr); + + // Add inputs. + std::vector<ITensor*> plugin_inputs(inputs.size()); + for (size_t i = 0; i < inputs.size(); i++) + { + // Must have at least rank 2 and same batch size. + EXPECT_GT(inputs[i].dims.nbDims, 1); + EXPECT_EQ(batch_size, inputs[i].dims.d[0]); + // Get input dims without batch index dim. + Dims in_dims; + in_dims.nbDims = inputs[i].dims.nbDims - 1; + std::copy(inputs[i].dims.d + 1, inputs[i].dims.d + inputs[i].dims.nbDims, in_dims.d); + // addInput currently supports only 1D-3D input. + if (in_dims.nbDims <= 3) + { + auto input = network->addInput(inputs[i].name.c_str(), data_type, + DimsCHW(in_dims.d[0], in_dims.d[1], in_dims.d[2])); + EXPECT_NE(input, nullptr); + plugin_inputs[i] = input; + } + else if (in_dims.nbDims == 4) + { + // Create input with flattened dims. + EXPECT_EQ(DimsUtils::getTensorSize(in_dims), (int)DimsUtils::getTensorSize(in_dims)); + DimsCHW flat_dims{1, 1, (int)DimsUtils::getTensorSize(in_dims)}; + auto input = network->addInput(inputs[i].name.c_str(), data_type, flat_dims); + EXPECT_NE(input, nullptr); + // Add reshape layer to restore original dims. + auto shuf = network->addShuffle(*input); + EXPECT_NE(shuf, nullptr); + shuf->setReshapeDimensions(DimsNCHW(in_dims.d[0], in_dims.d[1], in_dims.d[2], in_dims.d[3])); + plugin_inputs[i] = shuf->getOutput(0); + } + else + assert(false); // TRT does not support input tensors with rank > 4. + } + + // Create plugin. The factory method can create additional layers/plugins. + IPlugin* plugin = factory_op(); + EXPECT_NE(plugin, nullptr); + + // Add the plugin to the network.
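+    // (addPlugin takes the array of input tensors, the number of inputs and the
+    // plugin instance; the layer it returns exposes the plugin's outputs, which
+    // is a single output tensor in all of the tests below.)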
+ auto plugin_layer = network->addPlugin(&plugin_inputs[0], inputs.size(), *plugin); + EXPECT_NE(plugin_layer, nullptr); + + ILayer* out_layer = post_proc_op(network, plugin_layer, factory); + + // Setup network output. + auto out_layer_out = out_layer->getOutput(0); + EXPECT_NE(out_layer_out, nullptr); + out_layer_out->setName(output_name); + network->markOutput(*out_layer_out); + + // Build the engine. + builder->setMaxBatchSize(batch_size); + // "ought to be enough for anybody." + builder->setMaxWorkspaceSize(1024 * 1024 * 1024); + + builder->setHalf2Mode(data_type == DataType::kHALF); + + auto engine = builder->buildCudaEngine(*network); + EXPECT_NE(engine, nullptr); + // Network and builder can be destroyed right after network is built. + // This follows the behavior in real (non-test) code. + builder->destroy(); + network->destroy(); + + // Setup input and output buffers. + std::vector<void*> buffers(inputs.size() + 1); + EXPECT_EQ(engine->getNbBindings(), buffers.size()); + + size_t elt_size_bytes = data_type == DataType::kHALF ? 2 : 4; + + for (size_t i = 0; i < inputs.size(); i++) + { + // Expecting binding indices to match inputs. + EXPECT_EQ(engine->getBindingIndex(inputs[i].name.c_str()), i); + // Allocate and copy. + CHECKL(cudaMalloc(&buffers[i], inputs[i].data.size() * elt_size_bytes), *g_logger); + // Do FP32 -> FP16 of input if necessary. + if (data_type == DataType::kFLOAT) + CHECKL(cudaMemcpy(buffers[i], inputs[i].data.data(), inputs[i].data.size() * elt_size_bytes, cudaMemcpyHostToDevice), *g_logger); + else + { + cv::Mat dst; + cv::convertFp16(cv::Mat(inputs[i].data), dst); + CHECKL(cudaMemcpy(buffers[i], dst.data, inputs[i].data.size() * elt_size_bytes, cudaMemcpyHostToDevice), *g_logger); + } + } + + int out_idx = engine->getBindingIndex(output_name); + size_t out_size = DimsUtils::getTensorSize(out_dims); + EXPECT_EQ(out_idx, buffers.size() - 1); + CHECKL(cudaMalloc(&buffers[out_idx], out_size * elt_size_bytes), *g_logger); + + // Create the context. + IExecutionContext *context = engine->createExecutionContext(); + EXPECT_NE(context, nullptr); + + // Run (finally). + auto host_start = std::chrono::high_resolution_clock::now(); + auto res = context->execute(batch_size, buffers.data()); + auto host_end = std::chrono::high_resolution_clock::now(); + EXPECT_TRUE(res); + auto host_elapsed_ms = std::chrono::duration<float, std::milli>(host_end - host_start).count(); + auto msg = std::string("Host execution time: ") + std::to_string(host_elapsed_ms); + g_logger->log(ILogger::Severity::kINFO, msg.c_str()); + + // Copy results back to host. + FloatVec out_h(out_size); + // Do FP16 -> FP32 of output if necessary. + if (data_type == DataType::kFLOAT) + { + auto out_h_p = const_cast<float*>(out_h.data()); + CHECKL(cudaMemcpy(out_h_p, buffers[out_idx], out_h.size() * sizeof(float), cudaMemcpyDeviceToHost), *g_logger); + } + else if (data_type == DataType::kHALF) + { + std::vector<uint16_t> out_h_16(out_size); + auto out_h_p = const_cast<uint16_t*>(out_h_16.data()); + CHECKL(cudaMemcpy(out_h_p, buffers[out_idx], out_h.size() * elt_size_bytes, cudaMemcpyDeviceToHost), *g_logger); + cv::Mat dst; + cv::convertFp16(cv::Mat(out_size, 1, CV_16S, out_h_p), dst); + std::copy((float*)dst.data, (float*)dst.data + out_size, out_h.begin()); + } + else + ADD_FAILURE(); + // Clean up.
+ for (size_t i = 0; i < buffers.size(); i++) + CHECKL(cudaFree(buffers[i]), *g_logger); + context->destroy(); + engine->destroy(); + + return out_h; +} + +// ----------------------------------------------------------------- +// Reads the binary file produced by the Python test_data_generator.py script. +// The format of the file: +// - number of dimensions (int32) +// - dimensions (int32[]) +// - data (float32[]) +// For example, a 2x3 tensor is stored as int32 values 2, 2, 3 followed by +// 6 float32 values. +// ----------------------------------------------------------------- +FloatVec readBinaryFile(const std::string& filename, Dims& dims) +{ + std::ifstream file(filename, std::ios::binary); + EXPECT_TRUE(file.is_open()); + + int32_t dim_count; + file.read(reinterpret_cast<char*>(&dim_count), sizeof(int32_t)); + EXPECT_GE(dim_count, 1); + EXPECT_LE(dim_count, sizeof(dims.d) / sizeof(dims.d[0])); + + dims.nbDims = dim_count; + file.read(reinterpret_cast<char*>(dims.d), dim_count * sizeof(int32_t)); + + FloatVec res(DimsUtils::getTensorSize(dims)); + file.read(reinterpret_cast<char*>(&res[0]), res.size() * sizeof(float)); + return res; +} + +// ----------------------------------------------------------------- +// ELU plugin tests. +// ----------------------------------------------------------------- +TEST(EluPluginTests, Basic) +{ + Dims in_dims; + Dims out_dims; + FloatVec in = readBinaryFile(g_data_dir + "elu_i_01.bin", in_dims); + FloatVec out = readBinaryFile(g_data_dir + "elu_o_01.bin", out_dims); + ASSERT_EQ(in_dims.nbDims, out_dims.nbDims); + ASSERT_EQ(in_dims.nbDims, 4); + ASSERT_EQ(in_dims.d[0], 1); + + auto factory = IPluginContainer::create(*g_logger); + auto actual = runPlugin(1, {{"input", in_dims, in}}, out_dims, + [&] { return factory->createEluPlugin(DataType::kFLOAT, "ELU"); }, + [] (INetworkDefinition*, ILayer* plugin, IPluginContainer&) { return plugin; }, + *factory); + + ASSERT_EQ(out.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_FLOAT_EQ(out[i], actual[i]) << "Vectors 'actual' and 'out' differ at index " << i; +} + +TEST(EluPluginTests, Input4DBatchSize2) +{ + Dims in_dims; + Dims out_dims; + FloatVec in = readBinaryFile(g_data_dir + "elu_i_02.bin", in_dims); + FloatVec out = readBinaryFile(g_data_dir + "elu_o_02.bin", out_dims); + ASSERT_EQ(in_dims.nbDims, out_dims.nbDims); + ASSERT_EQ(in_dims.nbDims, 5); + ASSERT_EQ(in_dims.d[0], 2); + + auto factory = IPluginContainer::create(*g_logger); + auto actual = runPlugin(2, {{"input", in_dims, in}}, out_dims, + [&] { return factory->createEluPlugin(DataType::kFLOAT, "ELU"); }, + [] (INetworkDefinition*, ILayer* plugin, IPluginContainer&) { return plugin; }, + *factory); + + ASSERT_EQ(out.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_FLOAT_EQ(out[i], actual[i]) << "Vectors 'actual' and 'out' differ at index " << i; +} + +// ----------------------------------------------------------------- +// Conv3DPlugin plugin tests. +// REVIEW alexeyk: consider converting all these tests to a data-driven test. +// ----------------------------------------------------------------- + +// Post processing for Conv3D layer.
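+// (A note on the transform: the reference tensors are generated by TensorFlow,
+// which orders the two leading non-batch axes differently from the plugin's
+// native output, so the {1, 0, 2, 3} permutation below swaps those two axes.
+// That is, element (n, a, b, h, w) of the plugin output is read back as
+// element (n, b, a, h, w) before being compared against the .bin files.)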
+ILayer* addConv3DPostProcessing(INetworkDefinition* network, ILayer* plugin, IPluginContainer& factory) +{ + auto transform = factory.createTransformPlugin({1, 0, 2, 3}, "Conv3DTransform"); + EXPECT_NE(transform, nullptr); + auto transform_in = plugin->getOutput(0); + auto transform_layer = network->addPlugin(&transform_in, 1, *transform); + EXPECT_NE(transform_layer, nullptr); + return transform_layer; +} + +TEST(Conv3DPluginTests, Basic) +{ + Dims x_dims; + Dims w_dims; + Dims y_dims; + FloatVec x = readBinaryFile(g_data_dir + "conv3d_01_x.bin", x_dims); + FloatVec w = readBinaryFile(g_data_dir + "conv3d_01_w.bin", w_dims); + FloatVec y = readBinaryFile(g_data_dir + "conv3d_01_y.bin", y_dims); + ASSERT_EQ(x_dims.nbDims, 5); + ASSERT_EQ(w_dims.nbDims, 5); + ASSERT_EQ(y_dims.nbDims, 5); + + auto factory = IPluginContainer::create(*g_logger); + auto actual = runPlugin(1, {{"x", x_dims, x}}, y_dims, + [&] + { + return factory->createConv3DPlugin(Conv3DType::kTensorFlow, + w_dims, {3, {1, 1, 1}}, + {3, {0, 0, 0}}, {3, {0, 0, 0}}, + Weights{DataType::kFLOAT, w.data(), (int64_t)w.size() }, + Weights{DataType::kFLOAT, nullptr, 0 }, + "Conv3D"); + }, + addConv3DPostProcessing, + *factory); + + ASSERT_EQ(y.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_FLOAT_EQ(y[i], actual[i]) << "Vectors 'actual' and 'y' differ at index " << i; +} + +TEST(Conv3DPluginTests, HWStridesAndPadWithMultiK) +{ + Dims x_dims; + Dims w_dims; + Dims y_dims; + FloatVec x = readBinaryFile(g_data_dir + "conv3d_02_x.bin", x_dims); + FloatVec w = readBinaryFile(g_data_dir + "conv3d_02_w.bin", w_dims); + FloatVec y = readBinaryFile(g_data_dir + "conv3d_02_y.bin", y_dims); + ASSERT_EQ(x_dims.nbDims, 5); + ASSERT_EQ(w_dims.nbDims, 5); + ASSERT_EQ(y_dims.nbDims, 5); + + auto factory = IPluginContainer::create(*g_logger); + auto actual = runPlugin(1, {{"x", x_dims, x}}, y_dims, + [&] + { + return factory->createConv3DPlugin(Conv3DType::kTensorFlow, + w_dims, {3, {1, 2, 2}}, + {3, {0, 1, 1}}, {3, {0, 1, 1}}, + Weights{DataType::kFLOAT, w.data(), (int64_t)w.size() }, + Weights{DataType::kFLOAT, nullptr, 0 }, + "Conv3D"); + }, + addConv3DPostProcessing, + *factory); + + ASSERT_EQ(y.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_NEAR(y[i], actual[i], 0.00001) << "Vectors 'actual' and 'y' differ at index " << i; +} + +TEST(Conv3DPluginTests, DHWStridesAndPadWithMultiK) +{ + Dims x_dims; + Dims w_dims; + Dims y_dims; + FloatVec x = readBinaryFile(g_data_dir + "conv3d_03_x.bin", x_dims); + FloatVec w = readBinaryFile(g_data_dir + "conv3d_03_w.bin", w_dims); + FloatVec y = readBinaryFile(g_data_dir + "conv3d_03_y.bin", y_dims); + ASSERT_EQ(x_dims.nbDims, 5); + ASSERT_EQ(w_dims.nbDims, 5); + ASSERT_EQ(y_dims.nbDims, 5); + + auto factory = IPluginContainer::create(*g_logger); + // In this test we have to manually pad input in D dimension. 
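+    // (The plugin accepts only a symmetric padding spec, while TensorFlow's
+    // 'SAME' padding can be asymmetric, so the missing slab is appended by hand.
+    // Using the usual convolution shape rule,
+    //   D_out = (D_in + pad_start + pad_end - K_d) / S_d + 1,
+    // one extra zero slab in D makes the plugin's effective padding match what
+    // TensorFlow used when producing the reference output.)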
+ x_dims.d[1] += 1; + FloatVec zero_pad(x_dims.d[2] * x_dims.d[3] * x_dims.d[4]); + x.insert(x.end(), zero_pad.cbegin(), zero_pad.cend()); + auto actual = runPlugin(1, {{"x", x_dims, x}}, y_dims, + [&] + { + return factory->createConv3DPlugin(Conv3DType::kTensorFlow, + w_dims, {3, {1, 2, 2}}, + {3, {0, 1, 1}}, {3, {0, 1, 1}}, + Weights{DataType::kFLOAT, w.data(), (int64_t)w.size() }, + Weights{DataType::kFLOAT, nullptr, 0 }, + "Conv3D"); + }, + addConv3DPostProcessing, + *factory); + + ASSERT_EQ(y.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_NEAR(y[i], actual[i], 0.00001) << "Vectors 'actual' and 'y' differ at index " << i; +} + +TEST(Conv3DPluginTests, UnitStridesAndPadSymDWithMultiK) +{ + Dims x_dims; + Dims w_dims; + Dims y_dims; + FloatVec x = readBinaryFile(g_data_dir + "conv3d_04_x.bin", x_dims); + FloatVec w = readBinaryFile(g_data_dir + "conv3d_04_w.bin", w_dims); + FloatVec y = readBinaryFile(g_data_dir + "conv3d_04_y.bin", y_dims); + ASSERT_EQ(x_dims.nbDims, 5); + ASSERT_EQ(w_dims.nbDims, 5); + ASSERT_EQ(y_dims.nbDims, 5); + + auto factory = IPluginContainer::create(*g_logger); + auto actual = runPlugin(1, {{"x", x_dims, x}}, y_dims, + [&] + { + return factory->createConv3DPlugin(Conv3DType::kTensorFlow, + w_dims, {3, {1, 1, 1}}, + {3, {1, 1, 1}}, {3, {1, 1, 1}}, + Weights{DataType::kFLOAT, w.data(), (int64_t)w.size() }, + Weights{DataType::kFLOAT, nullptr, 0 }, + "Conv3D"); + }, + addConv3DPostProcessing, + *factory); + + ASSERT_EQ(y.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_NEAR(y[i], actual[i], 0.0001) << "Vectors 'actual' and 'y' differ at index " << i; +} + +TEST(Conv3DPluginTests, DHWStridesAndPadAsymDWithMultiK) +{ + Dims x_dims; + Dims w_dims; + Dims y_dims; + FloatVec x = readBinaryFile(g_data_dir + "conv3d_05_x.bin", x_dims); + FloatVec w = readBinaryFile(g_data_dir + "conv3d_05_w.bin", w_dims); + FloatVec y = readBinaryFile(g_data_dir + "conv3d_05_y.bin", y_dims); + ASSERT_EQ(x_dims.nbDims, 5); + ASSERT_EQ(w_dims.nbDims, 5); + ASSERT_EQ(y_dims.nbDims, 5); + + auto factory = IPluginContainer::create(*g_logger); + // In this test we have to manually pad input in D dimension. 
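+    // (Same situation as in DHWStridesAndPadWithMultiK above: one zero slab is
+    // appended in D to emulate TensorFlow's asymmetric 'SAME' padding.)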
+ x_dims.d[1] += 1; + FloatVec zero_pad(x_dims.d[2] * x_dims.d[3] * x_dims.d[4]); + x.insert(x.end(), zero_pad.cbegin(), zero_pad.cend()); + auto actual = runPlugin(1, {{"x", x_dims, x}}, y_dims, + [&] + { + return factory->createConv3DPlugin(Conv3DType::kTensorFlow, + w_dims, {3, {2, 2, 2}}, + {3, {0, 1, 1}}, {3, {1, 1, 1}}, + Weights{DataType::kFLOAT, w.data(), (int64_t)w.size() }, + Weights{DataType::kFLOAT, nullptr, 0 }, + "Conv3D"); + }, + addConv3DPostProcessing, + *factory); + + ASSERT_EQ(y.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_NEAR(y[i], actual[i], 0.0001) << "Vectors 'actual' and 'y' differ at index " << i; +} + +TEST(Conv3DPluginTests, DHWStridesAndPadAsymDWithMultiKWithBiasAndElu) +{ + Dims x_dims; + Dims w_dims; + Dims b_dims; + Dims y_dims; + FloatVec x = readBinaryFile(g_data_dir + "conv3d_06_x.bin", x_dims); + FloatVec w = readBinaryFile(g_data_dir + "conv3d_06_w.bin", w_dims); + FloatVec b = readBinaryFile(g_data_dir + "conv3d_06_b.bin", b_dims); + FloatVec y = readBinaryFile(g_data_dir + "conv3d_06_y.bin", y_dims); + ASSERT_EQ(x_dims.nbDims, 5); + ASSERT_EQ(w_dims.nbDims, 5); + ASSERT_EQ(b_dims.nbDims, 1); + ASSERT_EQ(y_dims.nbDims, 5); + + auto data_type = DataType::kFLOAT; + auto factory = IPluginContainer::create(*g_logger); + // In this test we have to manually pad input in D dimension. + x_dims.d[1] += 1; + FloatVec zero_pad(x_dims.d[2] * x_dims.d[3] * x_dims.d[4]); + x.insert(x.end(), zero_pad.cbegin(), zero_pad.cend()); + auto actual = runPlugin(1, {{"x", x_dims, x}}, y_dims, + [&] + { + return factory->createConv3DPlugin(Conv3DType::kTensorFlow, + w_dims, {3, {2, 2, 2}}, + {3, {0, 1, 1}}, {3, {1, 1, 1}}, + Weights{DataType::kFLOAT, w.data(), (int64_t)w.size() }, + Weights{DataType::kFLOAT, b.data(), (int64_t)b.size() }, + "Conv3D"); + }, + [&] (INetworkDefinition* network, ILayer* plugin, IPluginContainer& f) + { + auto transform = addConv3DPostProcessing(network, plugin, f); + // Add ELU. 
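+    // (ELU is applied elementwise by the plugin: y = x for x > 0 and
+    // y = alpha * (exp(x) - 1) otherwise; the reference data presumably uses
+    // TensorFlow's default alpha = 1.)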
+ auto elu_plugin = f.createEluPlugin(data_type, "ELU"); + EXPECT_NE(elu_plugin, nullptr); + auto elu_plugin_in = transform->getOutput(0); + auto plugin_layer = network->addPlugin(&elu_plugin_in, 1, *elu_plugin); + EXPECT_NE(plugin_layer, nullptr); + return plugin_layer; + }, + *factory); + + ASSERT_EQ(y.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_NEAR(y[i], actual[i], 0.0001) << "Vectors 'actual' and 'y' differ at index " << i; +} + +TEST(Conv3DPluginTests, Multiple) +{ + Dims x_dims; + Dims w_dims; + Dims y_dims; + FloatVec x = readBinaryFile(g_data_dir + "conv3d_07_x.bin", x_dims); + FloatVec w = readBinaryFile(g_data_dir + "conv3d_07_w.bin", w_dims); + FloatVec y = readBinaryFile(g_data_dir + "conv3d_07_y.bin", y_dims); + ASSERT_EQ(x_dims.nbDims, 5); + ASSERT_EQ(w_dims.nbDims, 5); + ASSERT_EQ(y_dims.nbDims, 5); + + auto factory = IPluginContainer::create(*g_logger); + auto actual = runPlugin(1, {{"x", x_dims, x}}, y_dims, + [&] + { + return factory->createConv3DPlugin(Conv3DType::kTensorFlow, + w_dims, {3, {1, 1, 1}}, + {3, {1, 1, 1}}, {3, {1, 1, 1}}, + Weights{DataType::kFLOAT, w.data(), (int64_t)w.size() }, + Weights{DataType::kFLOAT, nullptr, 0 }, + "Conv3D_1"); + }, + [&] (INetworkDefinition* network, ILayer* plugin, IPluginContainer& f) + { + auto transform_1 = addConv3DPostProcessing(network, plugin, f); + auto pad_plugin = f.createPaddingPlugin({0, 0, 0, 0}, {1, 0, 0, 0}, "Pad_1"); + EXPECT_NE(pad_plugin, nullptr); + auto pad_plugin_in = transform_1->getOutput(0); + auto pad_plugin_layer = network->addPlugin(&pad_plugin_in, 1, *pad_plugin); + EXPECT_NE(pad_plugin_layer, nullptr); + // Add second Conv3D. + auto conv_plugin_2 = f.createConv3DPlugin(Conv3DType::kTensorFlow, + w_dims, DimsCHW{2, 2, 2}, + DimsCHW{0, 1, 1}, DimsCHW{0, 1, 1}, + Weights{DataType::kFLOAT, w.data(), (int64_t)w.size() }, + Weights{DataType::kFLOAT, nullptr, 0 }, + "Conv3D_2"); + EXPECT_NE(conv_plugin_2, nullptr); + auto conv_plugin_2_in = pad_plugin_layer->getOutput(0); + auto plugin_layer_2 = network->addPlugin(&conv_plugin_2_in, 1, *conv_plugin_2); + EXPECT_NE(plugin_layer_2, nullptr); + // Add transform for the second convo. + auto transform_2 = addConv3DPostProcessing(network, plugin_layer_2, f); + return transform_2; + }, + *factory); + + ASSERT_EQ(y.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_NEAR(y[i], actual[i], 0.0001) << "Vectors 'actual' and 'y' differ at index " << i; +} + +// ----------------------------------------------------------------- +// Conv3DTransposePlugin plugin tests. +// ----------------------------------------------------------------- + +// Post processing for Conv3DTranspose layer. 
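+// (For reference, these tests exercise the standard transposed-convolution
+// shape rule, D_out = S_d * (D_in - 1) + K_d - pad_start - pad_end, and
+// likewise for H and W; the expected output dims are passed to the plugin
+// explicitly and any extra padded slab is sliced away afterwards.)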
+ILayer* addConv3DTranPostProc(INetworkDefinition* network, ILayer* plugin, IPluginContainer& factory) +{ + auto transform = factory.createTransformPlugin({1, 0, 2, 3}, "Conv3DTransposeTransform"); + EXPECT_NE(transform, nullptr); + auto transform_in = plugin->getOutput(0); + auto transform_layer = network->addPlugin(&transform_in, 1, *transform); + EXPECT_NE(transform_layer, nullptr); + return transform_layer; +} + +ILayer* addConv3DTranSliceLayer(Dims dims, INetworkDefinition* network, ILayer* src_layer, IPluginContainer& factory) +{ + auto slice_plugin = factory.createSlicePlugin(dims, + {4, {0, 0, 0, 0}}, + {4, {dims.d[0] - 1, dims.d[1], dims.d[2], dims.d[3]}}, + "Slice"); + EXPECT_NE(slice_plugin, nullptr); + auto slice_plugin_in = src_layer->getOutput(0); + auto slice_plugin_layer = network->addPlugin(&slice_plugin_in, 1, *slice_plugin); + EXPECT_NE(slice_plugin_layer, nullptr); + return slice_plugin_layer; +} + +TEST(Conv3DTransposePluginTests, Basic) +{ + Dims y_dims; + Dims w_dims; + Dims x_dims; + FloatVec y = readBinaryFile(g_data_dir + "conv3d_tran_01_y.bin", y_dims); + FloatVec w = readBinaryFile(g_data_dir + "conv3d_tran_01_w.bin", w_dims); + FloatVec x = readBinaryFile(g_data_dir + "conv3d_tran_01_x.bin", x_dims); + ASSERT_EQ(y_dims.nbDims, 5); + ASSERT_EQ(w_dims.nbDims, 5); + ASSERT_EQ(x_dims.nbDims, 5); + + auto factory = IPluginContainer::create(*g_logger); + auto actual = runPlugin(1, {{"y", y_dims, y}}, x_dims, + [&] + { + return factory->createConv3DTransposePlugin( + Conv3DType::kTensorFlow, + w_dims, {4, {x_dims.d[1], x_dims.d[2], x_dims.d[3], x_dims.d[4]}}, + {3, {1, 1, 1}}, {3, {0, 0, 0}}, {3, {0, 0, 0}}, + Weights{DataType::kFLOAT, w.data(), (int64_t)w.size() }, + Weights{DataType::kFLOAT, nullptr, 0 }, + "Conv3DTranspose"); + }, + addConv3DTranPostProc, + *factory); + + ASSERT_EQ(x.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_FLOAT_EQ(x[i], actual[i]) << "Vectors 'x' and 'actual' differ at index " << i; +} + +TEST(Conv3DTransposePluginTests, HWStridesAndPadWithMultiK) +{ + Dims y_dims; + Dims w_dims; + Dims x_dims; + FloatVec y = readBinaryFile(g_data_dir + "conv3d_tran_02_y.bin", y_dims); + FloatVec w = readBinaryFile(g_data_dir + "conv3d_tran_02_w.bin", w_dims); + FloatVec x = readBinaryFile(g_data_dir + "conv3d_tran_02_x.bin", x_dims); + ASSERT_EQ(y_dims.nbDims, 5); + ASSERT_EQ(w_dims.nbDims, 5); + ASSERT_EQ(x_dims.nbDims, 5); + + auto factory = IPluginContainer::create(*g_logger); + auto actual = runPlugin(1, {{"y",y_dims, y}}, x_dims, + [&] + { + return factory->createConv3DTransposePlugin( + Conv3DType::kTensorFlow, + w_dims, {4, {x_dims.d[1], x_dims.d[2], x_dims.d[3], x_dims.d[4]}}, + {3, {1, 2, 2}}, {3, {0, 1, 1}}, {3, {0, 1, 1}}, + Weights{DataType::kFLOAT, w.data(), (int64_t)w.size() }, + Weights{DataType::kFLOAT, nullptr, 0 }, + "Conv3DTranspose"); + }, + addConv3DTranPostProc, + *factory); + + ASSERT_EQ(x.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_NEAR(x[i], actual[i], 0.0001) << "Vectors 'x' and 'actual' differ at index " << i; +} + +TEST(Conv3DTransposePluginTests, DHWStridesAndPadAsymDWithMultiK) +{ + Dims y_dims; + Dims w_dims; + Dims x_dims; + FloatVec y = readBinaryFile(g_data_dir + "conv3d_tran_03_y.bin", y_dims); + FloatVec w = readBinaryFile(g_data_dir + "conv3d_tran_03_w.bin", w_dims); + FloatVec x = readBinaryFile(g_data_dir + "conv3d_tran_03_x.bin", x_dims); + ASSERT_EQ(y_dims.nbDims, 5); + ASSERT_EQ(w_dims.nbDims, 5); + ASSERT_EQ(x_dims.nbDims, 5); + + auto factory = 
IPluginContainer::create(*g_logger); + // In this test we have to manually pad output by 1 in D dimension. + auto out_dims = DimsNCHW(x_dims.d[1], x_dims.d[2], x_dims.d[3], x_dims.d[4]); + out_dims.d[0] += 1; + auto actual = runPlugin(1, {{"y", y_dims, y}}, x_dims, + [&] + { + return factory->createConv3DTransposePlugin( + Conv3DType::kTensorFlow, + w_dims, out_dims, + {3, {2, 2, 2}}, {3, {0, 1, 1}}, {3, {0, 1, 1}}, + Weights{DataType::kFLOAT, w.data(), (int64_t)w.size() }, + Weights{DataType::kFLOAT, nullptr, 0 }, + "Conv3DTranspose"); + }, + [&] (INetworkDefinition* network, ILayer* plugin, IPluginContainer& f) + { + auto slice_plugin = f.createSlicePlugin(out_dims, + {4, {0, 0, 0, 0}}, + {4, {x_dims.d[1], x_dims.d[2], x_dims.d[3], x_dims.d[4]}}, + "Slice"); + EXPECT_NE(slice_plugin, nullptr); + auto slice_plugin_in = plugin->getOutput(0); + auto slice_plugin_layer = network->addPlugin(&slice_plugin_in, 1, *slice_plugin); + EXPECT_NE(slice_plugin_layer, nullptr); + return slice_plugin_layer; + }, + *factory); + + ASSERT_EQ(x.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_NEAR(x[i], actual[i], 0.0001) << "Vectors 'x' and 'actual' differ at index " << i; +} + +TEST(Conv3DTransposePluginTests, DHWStridesAndPadAsymDWithMultiKWithBiasAndElu) +{ + Dims y_dims; + Dims w_dims; + Dims x_dims; + Dims b_dims; + FloatVec y = readBinaryFile(g_data_dir + "conv3d_tran_04_y.bin", y_dims); + FloatVec w = readBinaryFile(g_data_dir + "conv3d_tran_04_w.bin", w_dims); + FloatVec b = readBinaryFile(g_data_dir + "conv3d_tran_04_b.bin", b_dims); + FloatVec x = readBinaryFile(g_data_dir + "conv3d_tran_04_x.bin", x_dims); + ASSERT_EQ(y_dims.nbDims, 5); + ASSERT_EQ(w_dims.nbDims, 5); + ASSERT_EQ(b_dims.nbDims, 1); + ASSERT_EQ(x_dims.nbDims, 5); + + auto data_type = DataType::kFLOAT; + auto factory = IPluginContainer::create(*g_logger); + // In this test we have to manually pad output by 1 in D dimension. + auto out_dims = DimsNCHW(x_dims.d[1], x_dims.d[2], x_dims.d[3], x_dims.d[4]); + out_dims.d[0] += 1; + auto actual = runPlugin(1, {{"y", y_dims, y}}, x_dims, + [&] + { + return factory->createConv3DTransposePlugin( + Conv3DType::kTensorFlow, + w_dims, out_dims, + {3, {2, 2, 2}}, {3, {0, 1, 1}}, {3, {0, 1, 1}}, + Weights{DataType::kFLOAT, w.data(), (int64_t)w.size() }, + Weights{DataType::kFLOAT, b.data(), (int64_t)b.size() }, + "Conv3DTranspose"); + }, + [&] (INetworkDefinition* network, ILayer* plugin, IPluginContainer& f) + { + auto slice_plugin = f.createSlicePlugin(out_dims, + {4, {0, 0, 0, 0}}, + {4, {x_dims.d[1], x_dims.d[2], x_dims.d[3], x_dims.d[4]}}, + "Slice"); + EXPECT_NE(slice_plugin, nullptr); + auto slice_plugin_in = plugin->getOutput(0); + auto slice_plugin_layer = network->addPlugin(&slice_plugin_in, 1, *slice_plugin); + EXPECT_NE(slice_plugin_layer, nullptr); + // Add ELU. 
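+    // (The slice runs before the activation, so ELU only touches the cropped
+    // volume; since ELU is elementwise, either order would give the same
+    // retained values, but this ordering avoids work on the padded slab.)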
+ auto elu_plugin = f.createEluPlugin(data_type, "ELU"); + EXPECT_NE(elu_plugin, nullptr); + auto elu_plugin_in = slice_plugin_layer->getOutput(0); + auto elu_layer = network->addPlugin(&elu_plugin_in, 1, *elu_plugin); + EXPECT_NE(elu_layer, nullptr); + return elu_layer; + }, + *factory); + + ASSERT_EQ(x.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_NEAR(x[i], actual[i], 0.0001) << "Vectors 'x' and 'actual' differ at index " << i; +} + +TEST(Conv3DTransposePluginTests, Multiple) +{ + Dims y_dims; + Dims w1_dims; + Dims w2_dims; + Dims x_dims; + FloatVec y = readBinaryFile(g_data_dir + "conv3d_tran_05_y.bin", y_dims); + FloatVec w1 = readBinaryFile(g_data_dir + "conv3d_tran_05_w1.bin", w1_dims); + FloatVec w2 = readBinaryFile(g_data_dir + "conv3d_tran_05_w2.bin", w2_dims); + FloatVec x = readBinaryFile(g_data_dir + "conv3d_tran_05_x.bin", x_dims); + ASSERT_EQ(y_dims.nbDims, 5); + ASSERT_EQ(w1_dims.nbDims, 5); + ASSERT_EQ(w2_dims.nbDims, 5); + ASSERT_EQ(x_dims.nbDims, 5); + + auto factory = IPluginContainer::create(*g_logger); + // In this test we have to manually pad output by 1 in D dimension. + auto out_dims1 = DimsNCHW(8 + 1, 8, 9, 9); // REVIEW alexeyk: hardcoded for now. + auto out_dims2 = DimsNCHW(x_dims.d[1], x_dims.d[2], x_dims.d[3], x_dims.d[4]); + out_dims2.d[0] += 1; + auto actual = runPlugin(1, {{"y", y_dims, y}}, x_dims, + [&] + { + return factory->createConv3DTransposePlugin( + Conv3DType::kTensorFlow, + w1_dims, out_dims1, + {3, {2, 2, 2}}, {3, {0, 1, 1}}, {3, {0, 1, 1}}, + Weights{DataType::kFLOAT, w1.data(), (int64_t)w1.size() }, + Weights{DataType::kFLOAT, nullptr, 0 }, + "Conv3DTranspose_1"); + }, + [&] (INetworkDefinition* network, ILayer* plugin, IPluginContainer& f) + { + // Slice for the first convo. + auto slice_1_plugin_layer = addConv3DTranSliceLayer(out_dims1, network, plugin, f); + auto transform_layer = addConv3DTranPostProc(network, slice_1_plugin_layer, f); + + // Second convo. + auto conv_tran_2_plugin = f.createConv3DTransposePlugin( + Conv3DType::kTensorFlow, + w2_dims, out_dims2, + {3, {2, 2, 2}}, {3, {0, 1, 1}}, {3, {0, 1, 1}}, + Weights{DataType::kFLOAT, w2.data(), (int64_t)w2.size() }, + Weights{DataType::kFLOAT, nullptr, 0 }, + "Conv3DTranspose_2"); + EXPECT_NE(conv_tran_2_plugin, nullptr); + auto conv_tran_2_plugin_in = transform_layer->getOutput(0); + auto conv_tran_2_plugin_layer = network->addPlugin(&conv_tran_2_plugin_in, 1, *conv_tran_2_plugin); + EXPECT_NE(conv_tran_2_plugin_layer, nullptr); + + // Slice for the second convo. + auto slice_2_plugin_layer = addConv3DTranSliceLayer(out_dims2, network, conv_tran_2_plugin_layer, f); + return slice_2_plugin_layer; + }, + *factory); + + ASSERT_EQ(x.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_NEAR(x[i], actual[i], 0.0001) << "Vectors 'x' and 'actual' differ at index " << i; +} + +// ----------------------------------------------------------------- +// CostVolumePlugin plugin tests. 
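+// The plugin builds a GC-Net-style matching cost volume: judging by the perf
+// test below (C = 32 in, 2C = 64 out), it concatenates, for each disparity d
+// in [0, max_disparity), the left feature map with the right feature map
+// shifted by d, turning two NCHW inputs into an N x max_disparity x 2C x H x W
+// volume; cost_vol_dims.d[1] passed to the factory is max_disparity.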
+// ----------------------------------------------------------------- + +TEST(CostVolumePluginTests, Basic) +{ + Dims left_dims; + Dims right_dims; + Dims cost_vol_dims; + FloatVec left = readBinaryFile(g_data_dir + "cost_vol_01_l.bin", left_dims); + FloatVec right = readBinaryFile(g_data_dir + "cost_vol_01_r.bin", right_dims); + FloatVec cost_vol = readBinaryFile(g_data_dir + "cost_vol_01_cv.bin", cost_vol_dims); + ASSERT_EQ(left_dims.nbDims, 4); + ASSERT_EQ(right_dims.nbDims, 4); + ASSERT_EQ(cost_vol_dims.nbDims, 5); + + auto factory = IPluginContainer::create(*g_logger); + auto actual = runPlugin(1, {{"left", left_dims, left}, {"right", right_dims, right}}, cost_vol_dims, + [&] + { + return factory->createCostVolumePlugin(cost_vol_dims.d[1], "CostVolume"); + }, + [] (INetworkDefinition*, ILayer* plugin, IPluginContainer&) { return plugin; }, + *factory); + + ASSERT_EQ(cost_vol.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_FLOAT_EQ(cost_vol[i], actual[i]) << "Vectors 'actual' and 'cost_vol' differ at index " << i; +} + +TEST(CostVolumePluginTests, Large) +{ + Dims left_dims; + Dims right_dims; + Dims cost_vol_dims; + FloatVec left = readBinaryFile(g_data_dir + "cost_vol_02_l.bin", left_dims); + FloatVec right = readBinaryFile(g_data_dir + "cost_vol_02_r.bin", right_dims); + FloatVec cost_vol = readBinaryFile(g_data_dir + "cost_vol_02_cv.bin", cost_vol_dims); + ASSERT_EQ(left_dims.nbDims, 4); + ASSERT_EQ(right_dims.nbDims, 4); + ASSERT_EQ(cost_vol_dims.nbDims, 5); + + auto factory = IPluginContainer::create(*g_logger); + auto actual = runPlugin(1, {{"left", left_dims, left}, {"right", right_dims, right}}, cost_vol_dims, + [&] + { + return factory->createCostVolumePlugin(cost_vol_dims.d[1], "CostVolume"); + }, + [] (INetworkDefinition*, ILayer* plugin, IPluginContainer&) { return plugin; }, + *factory); + + ASSERT_EQ(cost_vol.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_FLOAT_EQ(cost_vol[i], actual[i]) << "Vectors 'actual' and 'cost_vol' differ at index " << i; +} + +// Performance tests, should be run under nvprof. +// REVIEW alexeyk: add more details and script to run as well as parse nvprof results. +TEST(CostVolumePluginPerfTests, NVSmall) +{ + Dims in_dims{4, {1, 32, 161, 513}}; + Dims cv_dims{5, {1, 48, 64, 161, 513}}; + + // REVIEW alexeyk: populate with random values. + FloatVec left(DimsUtils::getTensorSize(in_dims)); + FloatVec right(DimsUtils::getTensorSize(in_dims)); + FloatVec cost_vol(DimsUtils::getTensorSize(cv_dims)); + + auto factory = IPluginContainer::create(*g_logger); + auto actual = runPlugin(1, {{"left", in_dims, left}, {"right", in_dims, right}}, cv_dims, + [&] + { + return factory->createCostVolumePlugin(cv_dims.d[1], "CostVolume"); + }, + [] (INetworkDefinition*, ILayer* plugin, IPluginContainer&) { return plugin; }, + *factory); + + ASSERT_EQ(cost_vol.size(), actual.size()); +} + +// ----------------------------------------------------------------- +// SoftargmaxPlugin plugin tests. 
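+// Softargmax collapses the 5D cost volume into a 4D disparity map; in the
+// standard GC-Net soft-argmin form this is
+//   d(h, w) = sum_d d * softmax_d(-C(d, h, w)),
+// which matches the rank change asserted below (5D input, 4D output).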
+// ----------------------------------------------------------------- + +TEST(SoftargmaxPluginTests, Basic) +{ + Dims x_dims; + Dims y_dims; + FloatVec x = readBinaryFile(g_data_dir + "softargmax_01_x.bin", x_dims); + FloatVec y = readBinaryFile(g_data_dir + "softargmax_01_y.bin", y_dims); + ASSERT_EQ(x_dims.nbDims, 5); + ASSERT_EQ(y_dims.nbDims, 4); + + auto factory = IPluginContainer::create(*g_logger); + auto actual = runPlugin(1, {{"x", x_dims, x}}, y_dims, + [&] + { + return factory->createSoftargmaxPlugin("Softargmax"); + }, + [] (INetworkDefinition*, ILayer* plugin, IPluginContainer&) { return plugin; }, + *factory); + + ASSERT_EQ(y.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_FLOAT_EQ(y[i], actual[i]) << "Vectors 'actual' and 'y' differ at index " << i; +} + +TEST(SoftargmaxPluginTests, BatchSize2) +{ + Dims x_dims; + Dims y_dims; + FloatVec x = readBinaryFile(g_data_dir + "softargmax_02_x.bin", x_dims); + FloatVec y = readBinaryFile(g_data_dir + "softargmax_02_y.bin", y_dims); + ASSERT_EQ(x_dims.nbDims, 5); + ASSERT_EQ(y_dims.nbDims, 4); + + auto factory = IPluginContainer::create(*g_logger); + auto actual = runPlugin(2, {{"x", x_dims, x}}, y_dims, + [&] + { + return factory->createSoftargmaxPlugin("Softargmax"); + }, + [] (INetworkDefinition*, ILayer* plugin, IPluginContainer&) { return plugin; }, + *factory); + + ASSERT_EQ(y.size(), actual.size()); + for (size_t i = 0; i < actual.size(); i++) + EXPECT_NEAR(y[i], actual[i], 0.00001) << "Vectors 'actual' and 'y' differ at index " << i; +} + +// ----------------------------------------------------------------- +// End of tests. +// ----------------------------------------------------------------- + +int main(int argc, char* argv[]) +{ + testing::InitGoogleTest(&argc, argv); + if (argc < 2) + { + std::cout << "Usage: nvstereo_tests <test data directory> [TRT log level: 0|1|2|3]" << std::endl; + return 1; + } + + // Set tests data directory. + g_data_dir = argv[1]; + assert(g_data_dir.size() > 0); + if (g_data_dir[g_data_dir.size() - 1] != '/') + g_data_dir += '/'; + + // Create logger. + auto max_log_level = ILogger::Severity::kWARNING; + if (argc > 2) + { + max_log_level = (ILogger::Severity)std::stoi(argv[2]); + assert((int)max_log_level < EnumMax<ILogger::Severity>()); + } + g_logger = std::make_unique<Logger>(max_log_level); + + //getchar(); + return RUN_ALL_TESTS(); +} diff --git a/tools/simulation/docker/Dockerfile.kinetic b/tools/simulation/docker/Dockerfile.kinetic index 1baf130..0e507bd 100644 --- a/tools/simulation/docker/Dockerfile.kinetic +++ b/tools/simulation/docker/Dockerfile.kinetic @@ -1,4 +1,4 @@ -FROM nvidia/cuda:8.0-cudnn6-devel-ubuntu16.04 +FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 # Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. # Full license terms provided in LICENSE.md file. @@ -138,7 +138,7 @@ RUN echo 'source /opt/ros/kinetic/setup.bash' >> ${HOME}/.bashrc # Install OpenCV with CUDA support.
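# (The checkout below moves OpenCV from 2.4.13.2 to 3.3.1; building from
# source, rather than using a prebuilt package, is what keeps the CUDA
# modules available inside the image.)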
WORKDIR ${HOME} RUN git clone https://github.com/opencv/opencv.git && cd opencv \ - && git checkout 2.4.13.2 \ + && git checkout 3.3.1 \ && mkdir build && cd build \ && cmake -D CMAKE_BUILD_TYPE=RELEASE \ -D CMAKE_INSTALL_PREFIX=/usr/local \ @@ -160,7 +160,8 @@ WORKDIR ${HOME} COPY ${TENSORRT_TAR_FILE} ${HOME} ENV TENSORRT_BASE_DIR /opt/tensorrt -ENV TENSORRT_DIR ${TENSORRT_BASE_DIR}/TensorRT-2.1.2 +ENV TENSORRT_VER 3.0.2 +ENV TENSORRT_DIR ${TENSORRT_BASE_DIR}/TensorRT-${TENSORRT_VER} RUN mkdir ${TENSORRT_BASE_DIR} \ && tar -xf ${TENSORRT_TAR_FILE} -C ${TENSORRT_BASE_DIR} \ diff --git a/tools/simulation/docker/build_redtail_image.sh b/tools/simulation/docker/build_redtail_image.sh index 834913b..e54e4d8 100755 --- a/tools/simulation/docker/build_redtail_image.sh +++ b/tools/simulation/docker/build_redtail_image.sh @@ -7,11 +7,20 @@ TENSORRT_TAR_FILE=$1 if [[ -z "${TENSORRT_TAR_FILE}" ]]; then echo "First argument is missing." echo "Usage : build_redtail_image.sh <full path to TensorRT tar file>" - echo "Example: build_redtail_image.sh /data/downloads/NVIDIA/TensorRT-2.1.2.x86_64.cuda-8.0-14-04.tar.bz2" + echo "Example: build_redtail_image.sh /data/downloads/NVIDIA/TensorRT-3.0.2.Ubuntu-16.04.3.x86_64.cuda-9.0.cudnn7.0.tar.gz" exit 1 fi +# Image tag suffix, e.g. v2. +IMAGE_TAG_SUFFIX=$2 +if [[ -n "${IMAGE_TAG_SUFFIX}" ]]; then + IMAGE_TAG_SUFFIX="-${IMAGE_TAG_SUFFIX}" +else + IMAGE_TAG_SUFFIX="-v2" +fi +echo "Using ${IMAGE_TAG_SUFFIX} image suffix." + # Copy the file to Docker context first. cp ${TENSORRT_TAR_FILE} . # Build the image. -docker build -t nvidia-redtail-sim:kinetic --build-arg TENSORRT_TAR_FILE=`basename ${TENSORRT_TAR_FILE}` -f Dockerfile.kinetic . +docker build -t nvidia-redtail-sim:kinetic${IMAGE_TAG_SUFFIX} --build-arg TENSORRT_TAR_FILE=`basename ${TENSORRT_TAR_FILE}` -f Dockerfile.kinetic . diff --git a/tools/simulation/docker/scripts/init_workspace.sh b/tools/simulation/docker/scripts/init_workspace.sh index bb33b88..1bcdefe 100755 --- a/tools/simulation/docker/scripts/init_workspace.sh +++ b/tools/simulation/docker/scripts/init_workspace.sh @@ -17,20 +17,9 @@ cd .. catkin_make source devel/setup.bash -# Install gscam ROS package and its dependencies. -cd ${HOME} -git clone https://github.com/ros-perception/image_common.git -ln -s ${HOME}/image_common/camera_calibration_parsers ${CATKIN_WS}/src/ -ln -s ${HOME}/image_common/camera_info_manager ${CATKIN_WS}/src/ -ln -s ${HOME}/image_common/image_transport ${CATKIN_WS}/src/ - +# Install gscam ROS package. cd ${HOME} git clone https://github.com/ros-drivers/gscam.git -cd gscam -# At present, master branch does not support GStreamer 1.0 so need to switch to gstreamer-1-0-support branch. -git checkout gstreamer-1-0-support -# Currently you get a build error in gscam (‘scoped_ptr’ in namespace ‘boost’ does not name a template type). -sed -i '9i\#include <boost/scoped_ptr.hpp>' ./include/gscam/gscam_nodelet.h # Create symlink to catkin workspace. ln -s ${HOME}/gscam ${CATKIN_WS}/src/ diff --git a/tools/simulation/run_redtail_docker.sh b/tools/simulation/run_redtail_docker.sh index 729b25c..a620c0a 100755 --- a/tools/simulation/run_redtail_docker.sh +++ b/tools/simulation/run_redtail_docker.sh @@ -5,7 +5,7 @@ REDTAIL_NAME=$1 if [[ -z "${REDTAIL_NAME}" ]]; then - REDTAIL_NAME=redtail-sim + REDTAIL_NAME=redtail-sim-v2 fi HOST_DATA_DIR=$2 @@ -25,7 +25,7 @@ REDTAIL_ID=`docker ps -aqf "name=^/${REDTAIL_NAME}$"` if [ -z "${REDTAIL_ID}" ]; then echo "Creating new redtail container."
xhost + - nvidia-docker run -it --privileged --network=host -v ${HOST_DATA_DIR}:${CONTAINER_DATA_DIR}:rw -v /tmp/.X11-unix:/tmp/.X11-unix -e DISPLAY=unix${DISPLAY} -p 14556:14556/udp --name=${REDTAIL_NAME} nvidia-redtail-sim:kinetic bash + nvidia-docker run -it --privileged --network=host -v ${HOST_DATA_DIR}:${CONTAINER_DATA_DIR}:rw -v /tmp/.X11-unix:/tmp/.X11-unix -e DISPLAY=unix${DISPLAY} -p 14556:14556/udp --name=${REDTAIL_NAME} nvidia-redtail-sim:kinetic-v2 bash else echo "Found redtail container: ${REDTAIL_ID}." # Check if the container is already running and start if necessary. diff --git a/tools/tensorflow/docker/build_redtail_tf_image.sh b/tools/tensorflow/docker/build_redtail_tf_image.sh new file mode 100755 index 0000000..0060b78 --- /dev/null +++ b/tools/tensorflow/docker/build_redtail_tf_image.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +# Full license terms provided in LICENSE.md file. + +# Image tag suffix, e.g. 1.5. +IMAGE_TAG_SUFFIX=$1 +if [[ -z "${IMAGE_TAG_SUFFIX}" ]]; then + IMAGE_TAG_SUFFIX="1.5" +fi +echo "Using ${IMAGE_TAG_SUFFIX} image suffix." + +# Build the image. +docker build -t nvidia-redtail-tf:${IMAGE_TAG_SUFFIX} -f tf1.5.Dockerfile . diff --git a/tools/tensorflow/docker/run_redtail_tf_docker.sh b/tools/tensorflow/docker/run_redtail_tf_docker.sh new file mode 100755 index 0000000..5468669 --- /dev/null +++ b/tools/tensorflow/docker/run_redtail_tf_docker.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +# Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +# Full license terms provided in LICENSE.md file. + +REDTAIL_TF_NAME=$1 +if [[ -z "${REDTAIL_TF_NAME}" ]]; then + REDTAIL_TF_NAME=nvidia-redtail-tf +fi + +HOST_DATA_DIR=$2 +if [[ -z "${HOST_DATA_DIR}" ]]; then + HOST_DATA_DIR=/data/ +fi + +CONTAINER_DATA_DIR=$3 +if [[ -z "${CONTAINER_DATA_DIR}" ]]; then + CONTAINER_DATA_DIR=/data/ +fi + +echo "Container name : ${REDTAIL_TF_NAME}" +echo "Host data dir : ${HOST_DATA_DIR}" +echo "Container data dir: ${CONTAINER_DATA_DIR}" +REDTAIL_ID=`docker ps -aqf "name=^/${REDTAIL_TF_NAME}$"` +if [ -z "${REDTAIL_ID}" ]; then + echo "Creating new redtail container." + xhost + + nvidia-docker run -it --privileged --network=host -v ${HOST_DATA_DIR}:${CONTAINER_DATA_DIR}:rw -v /tmp/.X11-unix:/tmp/.X11-unix -e DISPLAY=unix${DISPLAY} -p 8888:8888 -p 6006:6006 --name=${REDTAIL_TF_NAME} nvidia-redtail-tf:1.5 bash +else + echo "Found redtail container: ${REDTAIL_ID}." + # Check if the container is already running and start if necessary. + if [ -z `docker ps -qf "name=^/${REDTAIL_TF_NAME}$"` ]; then + xhost +local:${REDTAIL_ID} + echo "Starting and attaching to ${REDTAIL_TF_NAME} container..." + docker start ${REDTAIL_ID} + docker attach ${REDTAIL_ID} + else + echo "Found running ${REDTAIL_TF_NAME} container, attaching bash..." + docker exec -it ${REDTAIL_ID} bash + fi +fi + diff --git a/tools/tensorflow/docker/scripts/install_tensorflow.sh b/tools/tensorflow/docker/scripts/install_tensorflow.sh new file mode 100755 index 0000000..57ed5cd --- /dev/null +++ b/tools/tensorflow/docker/scripts/install_tensorflow.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +# Full license terms provided in LICENSE.md file. + +# Stop in case of any error.
+set -e + +wget https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.5.0-cp35-cp35m-linux_x86_64.whl + +conda create -n tensorflow python=3.5 anaconda +source activate tensorflow +conda install -y -c menpo opencv3 +pip install --ignore-installed --upgrade tensorflow_gpu-1.5.0-cp35-cp35m-linux_x86_64.whl +pip install plyfile + +echo "export LD_LIBRARY_PATH=\${LD_LIBRARY_PATH}:/usr/local/cuda/extras/CUPTI/lib64/" >> ${HOME}/.bashrc + +echo "source activate tensorflow" >> ${HOME}/.bashrc + diff --git a/tools/tensorflow/docker/tf1.5.Dockerfile b/tools/tensorflow/docker/tf1.5.Dockerfile new file mode 100644 index 0000000..898b5d9 --- /dev/null +++ b/tools/tensorflow/docker/tf1.5.Dockerfile @@ -0,0 +1,63 @@ +FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 + +# Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +# Full license terms provided in LICENSE.md file. + +# Build with: +# docker build -t nvidia-redtail-tf:1.5 -f tf1.5.Dockerfile . + +ENV HOME /root + +ARG TENSORRT_TAR_FILE + +WORKDIR ${HOME} + +RUN apt-get update && apt-get -y --no-install-recommends install software-properties-common \ + && add-apt-repository ppa:ubuntu-toolchain-r/test + +# REVIEW alexeyk: this fixes the problem with TF 1.5 built against cuDNN 7.0 while recent version of the image has 7.1. +# https://github.com/tensorflow/tensorflow/issues/17566 +RUN apt-get update && apt-get install -y --allow-downgrades --no-install-recommends libcudnn7=7.0.5.15-1+cuda9.0 libcudnn7-dev=7.0.5.15-1+cuda9.0 + +RUN apt-get update && apt-get -y --no-install-recommends install \ + curl \ + git \ + gosu \ + libgtk2.0-dev \ + libjpeg-dev \ + libpng-dev \ + iproute \ + iputils-ping \ + less \ + libasound2 \ + libx11-xcb-dev \ + libxss1 \ + mc \ + nano \ + net-tools \ + patch \ + pkg-config \ + protobuf-compiler \ + rsync \ + sudo \ + unzip \ + wget \ + zip \ + && apt-get -y autoremove \ + && apt-get clean autoclean \ + # cleanup + && rm -rf /var/lib/apt/lists/{apt,dpkg,cache,log} /tmp/* /var/tmp/* + +# Anaconda +WORKDIR ${HOME} +RUN wget --quiet https://repo.continuum.io/archive/Anaconda3-5.0.0.1-Linux-x86_64.sh \ + && chmod +x ./Anaconda3-5.0.0.1-Linux-x86_64.sh \ + && ./Anaconda3-5.0.0.1-Linux-x86_64.sh -b + +ENV PATH ${HOME}/anaconda3/bin:${PATH} + +# TensorFlow conda environment setup. +COPY ./scripts/install_tensorflow.sh ${HOME} +RUN ${HOME}/install_tensorflow.sh + +WORKDIR ${HOME}
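+
+# Typical usage with the helper scripts in this directory:
+#   ./build_redtail_tf_image.sh 1.5   # builds the nvidia-redtail-tf:1.5 image
+#   ./run_redtail_tf_docker.sh        # creates or attaches the container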