diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2b4103b..ed1241f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -94,7 +94,7 @@ endif()
 ###################
 
 # See https://github.com/hunter-packages/check_ci_tag when changing VERSION
-project(acf VERSION 0.1.7)
+project(acf VERSION 0.1.8)
 
 set(ACF_ROOT_DIR "${CMAKE_CURRENT_LIST_DIR}")
diff --git a/cmake/Hunter/config.cmake b/cmake/Hunter/config.cmake
index 4c46ab8..8a0a5ee 100644
--- a/cmake/Hunter/config.cmake
+++ b/cmake/Hunter/config.cmake
@@ -1,17 +1,14 @@
-if(IOS OR ANDROID)
-  # local workaround for protobuf compiler crash with Xcode 8.1
-  # see https://github.com/elucideye/acf/issues/41
-  set(opencv_cmake_args
-    WITH_PROTOBUF=OFF
-    BUILD_PROTOBUF=OFF
-    BUILD_LIBPROTOBUF_FROM_SOURCES=NO
-    BUILD_opencv_dnn=OFF
-
-    WITH_JASPER=OFF
-    BUILD_JASPER=OFF
+# the OpenCV protobuf isn't friendly to a lot of compilers, skip it by default
+set(opencv_cmake_args
+  WITH_PROTOBUF=OFF
+  BUILD_PROTOBUF=OFF
+  BUILD_LIBPROTOBUF_FROM_SOURCES=NO
+  BUILD_opencv_dnn=OFF
+
+  WITH_JASPER=OFF
+  BUILD_JASPER=OFF
 )
-  hunter_config(OpenCV VERSION ${HUNTER_OpenCV_VERSION} CMAKE_ARGS ${opencv_cmake_args})
-endif()
+hunter_config(OpenCV VERSION ${HUNTER_OpenCV_VERSION} CMAKE_ARGS ${opencv_cmake_args})
 
 ### ogles_gpgpu ###
 set(ogles_gpgpu_cmake_args
diff --git a/src/app/acf/GLDetector.cpp b/src/app/acf/GLDetector.cpp
index 157c54f..49ab1f6 100644
--- a/src/app/acf/GLDetector.cpp
+++ b/src/app/acf/GLDetector.cpp
@@ -66,7 +66,7 @@ void GLDetector::init(const cv::Mat& I)
     computePyramid(I, m_impl->Pcpu);
     const int shrink = opts.pPyramid->pChns->shrink.get();
     const auto sizes = getPyramidSizes(m_impl->Pcpu, shrink);
-    static const bool doGray = false;
+    static const bool doGray = false;
     const ogles_gpgpu::Size2d inputSize(I.cols, I.rows);
     m_impl->acf = std::make_shared<ogles_gpgpu::ACF>(nullptr, inputSize, sizes, m_impl->featureKind, doGray, shrink);
     m_impl->acf->setDoLuvTransfer(false);
@@ -84,7 +84,7 @@ const acf::Detector::Pyramid& GLDetector::getPyramid(const cv::Mat& input, const
     (*m_impl->context)();
 
     // Fill in the pyramid:
-    (*m_impl->acf)({{ input.cols, input.rows }, void_ptr(input.ptr()), true, 0, DFLT_TEXTURE_FORMAT});
+    (*m_impl->acf)({ { input.cols, input.rows }, void_ptr(input.ptr()), true, 0, DFLT_TEXTURE_FORMAT });
     glFlush();
     m_impl->acf->fill(m_impl->Pgpu, m_impl->Pcpu);
 
@@ -129,7 +129,7 @@ std::vector<ogles_gpgpu::Size2d> getPyramidSizes(acf::Detector::Pyramid& Pcpu, i
 
 void GLDetector::clear()
 {
-    m_impl->size = {0,0};
+    m_impl->size = { 0, 0 };
 }
 
 cv::Mat GLDetector::draw(bool doGpu)
diff --git a/src/app/acf/GLDetector.h b/src/app/acf/GLDetector.h
index 7ee707c..2511609 100644
--- a/src/app/acf/GLDetector.h
+++ b/src/app/acf/GLDetector.h
@@ -37,7 +37,7 @@ class GLDetector : public acf::Detector
 
     cv::Mat draw(bool gpu); // debug routine
     void clear();
-    
+
 protected:
     void init(const cv::Mat& I);
     void initContext();
diff --git a/src/app/acf/acf.cpp b/src/app/acf/acf.cpp
index cbc80d1..c6ffd88 100644
--- a/src/app/acf/acf.cpp
+++ b/src/app/acf/acf.cpp
@@ -51,7 +51,7 @@ using ObjectDetectorPtr = std::shared_ptr<acf::ObjectDetector>;
 using AcfPtr = std::shared_ptr<acf::Detector>;
 using RectVec = std::vector<cv::Rect>;
 
-static void randomShapes(cv::Mat &image, int n);
+static void randomShapes(cv::Mat& image, int n);
 
 struct VideoSource
 {
@@ -65,16 +65,16 @@ struct VideoSource
     {
         filenames = util::cli::expand(filename);
     }
-    
-    VideoSource(int n) : m_n(n) // random frames
+
+    VideoSource(int n)
+        : m_n(n) // random frames
     {
-        
     }
 
     virtual Frame operator()(int i)
     {
         Frame frame;
-        if(filenames.size())
+        if (filenames.size())
         {
             frame.name = filenames[i];
             frame.image = cv::imread(filenames[i], cv::IMREAD_COLOR);
@@ -83,9 +83,9 @@ struct VideoSource
         {
             frame.name = std::to_string(i);
             frame.image = cv::Mat::zeros(640, 480, CV_8UC3);
-            randomShapes(frame.image, rand()%32);
+            randomShapes(frame.image, rand() % 32);
         }
-        
+
         return frame;
     }
@@ -244,7 +244,7 @@ int gauze_main(int argc, char** argv)
     }
 
     std::shared_ptr<VideoSource> video;
-    if(doRandom)
+    if (doRandom)
     {
         video = std::make_shared<VideoSource>(1000);
     }
@@ -308,9 +308,9 @@ int gauze_main(int argc, char** argv)
         // Get thread specific segmenter lazily:
         auto& detector = manager[std::this_thread::get_id()];
         assert(detector);
-        
+
         auto winSize = detector->getWindowSize();
-        if(!detector->getIsRowMajor())
+        if (!detector->getIsRowMajor())
         {
             std::swap(winSize.width, winSize.height);
         }
@@ -367,7 +367,7 @@ int gauze_main(int argc, char** argv)
         {
             maxScore = *iter;
         }
-        
+
         if (doPyramids)
         {
             // The "--pyramid" command line option can be used to visualize the
@@ -378,7 +378,7 @@ int gauze_main(int argc, char** argv)
             // method in order to ensure the CPU pyramid will be computed for each
             // frame.
 #if defined(ACF_DO_GPU)
-            if(acf::GLDetector *handle = dynamic_cast<acf::GLDetector*>(detector.get()))
+            if (acf::GLDetector* handle = dynamic_cast<acf::GLDetector*>(detector.get()))
             {
                 cv::Mat Pcpu = handle->draw(false);
                 cv::Mat Pgpu = handle->draw(true);
@@ -450,17 +450,20 @@ int main(int argc, char** argv)
 {
     try
     {
-        const std::string home=getenv("HOME");
-        std::vector<char*> args(argc);
+        std::string home;
+#if !(defined(_WIN32) || defined(_WIN64))
+        home = getenv("HOME");
+#endif
+        std::vector<char*> args(argc);
         args[0] = argv[0];
-        
+
         std::vector<std::string> storage(argc);
-        for(int i = 0; i < argc; i++)
+        for (int i = 0; i < argc; i++)
         {
             storage[i] = std::regex_replace(std::string(argv[i]), std::regex("HOME"), home);
             args[i] = const_cast<char*>(storage[i].c_str());
         }
-        
+
         return gauze_main(argc, &args.front());
     }
     catch (std::exception& e)
@@ -534,60 +537,63 @@ static cv::Rect2f operator*(const cv::Rect2f& roi, float scale)
     return { roi.x * scale, roi.y * scale, roi.width * scale, roi.height * scale };
 }
 
-static void randomEllipse(cv::Mat &image, int n)
+static void randomEllipse(cv::Mat& image, int n)
 {
-    for(int i = 0; i < n; i++)
+    for (int i = 0; i < n; i++)
     {
-        const cv::Point2f center(rand()%image.cols, rand()%image.rows);
-        const cv::Size2f size(rand()%image.cols, rand()%image.rows);
-        const cv::RotatedRect ellipse(center, size, static_cast<float>(rand() % 1000)/1000.f * M_PI);
-        const cv::Scalar bgr(rand()%255, rand()%255, rand()%255);
+        const cv::Point2f center(rand() % image.cols, rand() % image.rows);
+        const cv::Size2f size(rand() % image.cols, rand() % image.rows);
+        const cv::RotatedRect ellipse(center, size, static_cast<float>(rand() % 1000) / 1000.f * M_PI);
+        const cv::Scalar bgr(rand() % 255, rand() % 255, rand() % 255);
         cv::ellipse(image, ellipse, bgr, -1);
     }
 }
 
-static void randomRectangle(cv::Mat &image, int n)
+static void randomRectangle(cv::Mat& image, int n)
 {
-    for(int i = 0; i < n; i++)
+    for (int i = 0; i < n; i++)
     {
         const cv::Point p1(rand() % image.cols, rand() % image.rows);
         const cv::Point p2(rand() % image.cols, rand() % image.rows);
-        
-        if((rand() % 8) > 4)
+
+        if ((rand() % 8) > 4)
         {
             cv::randu(image(cv::Rect(p1, p2)), cv::Scalar::all(0), cv::Scalar::all(255));
         }
         else
         {
-            const cv::Scalar bgr(rand()%255, rand()%255, rand()%255);
+            const cv::Scalar bgr(rand() % 255, rand() % 255, rand() % 255);
             cv::rectangle(image, p1, p2, bgr, -1);
         }
     }
 }
 
-static void randomLines(cv::Mat &image, int n)
+static void randomLines(cv::Mat& image, int n)
 {
-    for(int i = 0; i < n; i++)
+    for (int i = 0; i < n; i++)
     {
         const cv::Point u1(rand() % image.cols, rand() % image.rows);
         const cv::Point u2(rand() % image.cols, rand() % image.rows);
-        const cv::Scalar bgr(rand()%255, rand()%255, rand()%255);
-        cv::line(image, u1, u2, bgr, (rand() % 16)+1, 8);
+        const cv::Scalar bgr(rand() % 255, rand() % 255, rand() % 255);
+        cv::line(image, u1, u2, bgr, (rand() % 16) + 1, 8);
     }
 }
 
 // Provide a simple mechanism for testing the ACF pyramids (GPU and CPU)
 // without the need for reading actual images. This was added initially
 // to aid testing on mobile devices.
-static void randomShapes(cv::Mat &image, int n)
+static void randomShapes(cv::Mat& image, int n)
 {
-    for(int i = 0; i < n; i++)
+    for (int i = 0; i < n; i++)
     {
-        switch(rand()%3)
+        switch (rand() % 3)
         {
-            case 0: randomLines(image, 1);
-            case 1: randomRectangle(image, 1);
-            case 2: randomEllipse(image, 1);
+            case 0:
+                randomLines(image, 1); break; // break: draw exactly one shape per iteration
+            case 1:
+                randomRectangle(image, 1); break;
+            case 2:
+                randomEllipse(image, 1); break;
         }
     }
 }
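Note on the `main()` change above: even off Windows, `getenv("HOME")` can return `nullptr` when the variable is unset, and assigning `nullptr` to a `std::string` is undefined behavior. A minimal null-safe sketch (not part of this patch; `getEnvOr` is a hypothetical helper):

```cpp
#include <cstdlib>
#include <string>

// Hypothetical helper: returns the variable's value, or a fallback when unset.
static std::string getEnvOr(const char* name, const std::string& fallback = {})
{
    const char* value = std::getenv(name); // may be nullptr
    return value ? std::string(value) : fallback;
}

// e.g., const std::string home = getEnvOr("HOME");
```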
diff --git a/src/app/acf/mat2cpb.cpp b/src/app/acf/mat2cpb.cpp
index 2b9a82c..44d50f6 100644
--- a/src/app/acf/mat2cpb.cpp
+++ b/src/app/acf/mat2cpb.cpp
@@ -83,7 +83,7 @@ int gauze_main(int argc, char** argv)
 
     acf::Detector acf(sInput);
     save_cpb(sOutput, acf);
-    
+
     return 0;
 }
diff --git a/src/app/pipeline/CMakeLists.txt b/src/app/pipeline/CMakeLists.txt
index 3a46116..018958a 100644
--- a/src/app/pipeline/CMakeLists.txt
+++ b/src/app/pipeline/CMakeLists.txt
@@ -9,6 +9,9 @@ set(acf_srcs
   pipeline.cpp
   GPUDetectionPipeline.h
   GPUDetectionPipeline.cpp
+
+  VideoCaptureImage.h
+  VideoCaptureImage.cpp
 
   # Simple line segment shader for the usual green box annotations:
   lines.h
@@ -38,16 +41,6 @@ target_compile_definitions(${test_app} PUBLIC ACF_DO_GPU=1)
 set_property(TARGET ${test_app} PROPERTY FOLDER "app/console")
 install(TARGETS ${test_app} DESTINATION bin)
 
-set_target_properties(
-  ${test_app}
-  PROPERTIES
-  MACOSX_BUNDLE_INFO_PLIST "${CMAKE_CURRENT_LIST_DIR}/plist.in" # file sharing
-  XCODE_ATTRIBUTE_PRODUCT_NAME "${test_app}"
-  XCODE_ATTRIBUTE_BUNDLE_IDENTIFIER "com.elucideye.acf.${test_app}"
-  XCODE_ATTRIBUTE_PRODUCT_BUNDLE_IDENTIFIER "com.elucideye.acf.${test_app}"
-  XCODE_ATTRIBUTE_TARGETED_DEVICE_FAMILY "1,2" # iPhone/iPad
-)
-
 #############
 ### TEST ####
 #############
@@ -58,7 +51,7 @@ gauze_add_test(
   NAME ${test_name}
   COMMAND ${test_app}
   --input=$
-  --repeat=64
+  --repeat=600
   --model=$
   --minimum=128
   --calibration=0.01
diff --git a/src/app/pipeline/GPUDetectionPipeline.cpp b/src/app/pipeline/GPUDetectionPipeline.cpp
index 5da6c76..c0f6bed 100644
--- a/src/app/pipeline/GPUDetectionPipeline.cpp
+++ b/src/app/pipeline/GPUDetectionPipeline.cpp
@@ -14,6 +14,14 @@
 
 static void chooseBest(std::vector<cv::Rect>& objects, std::vector<double>& scores);
 
+#define ACF_DEBUG_PYRAMIDS 0
+
+#if ACF_DEBUG_PYRAMIDS
+#include <opencv2/imgcodecs.hpp>
+static cv::Mat draw(const acf::Detector::Pyramid& pyramid);
+static void logPyramid(const std::string& filename, const acf::Detector::Pyramid& P);
+#endif
+
 template <typename Container>
 void push_fifo(Container& container, const typename Container::value_type& value, int size)
 {
@@ -74,6 +82,10 @@ struct GPUDetectionPipeline::Impl
 
     std::vector<DetectionCallback> callbacks;
 
+    bool doOptimizedPipeline = true;
+    bool doCpuACF = false;
+    bool doAnnotations = true;
+
     uint64_t frameIndex = 0;
     float ACFScale = 1.f;
     float acfCalibration = 0.f;
@@ -94,8 +106,7 @@ struct GPUDetectionPipeline::Impl
         double read = 0.0;
         double detect = 0.0;
         double complete = 0.0;
-    }
-    log;
+    } log;
 };
 
 GPUDetectionPipeline::GPUDetectionPipeline(DetectionPtr& detector, const cv::Size& inputSize, std::size_t n, int rotation, int minObjectWidth)
@@ -109,7 +120,7 @@ GPUDetectionPipeline::~GPUDetectionPipeline()
 {
     try
     {
-        if(impl && impl->scene.valid())
+        if (impl && impl->scene.valid())
         {
             // If this has already been retrieved it will throw
             impl->scene.get(); // block on any abandoned calls
@@ -120,6 +131,11 @@ GPUDetectionPipeline::~GPUDetectionPipeline()
     }
 }
 
+GLuint GPUDetectionPipeline::getInputTexture()
+{
+    return impl->acf->getInputTexId();
+}
+
 void GPUDetectionPipeline::operator+=(const DetectionCallback& callback)
 {
     impl->callbacks.push_back(callback);
@@ -230,6 +246,14 @@ int GPUDetectionPipeline::computeDetectionWidth(const cv::Size& inputSizeUp) con
 
 void GPUDetectionPipeline::fill(acf::Detector::Pyramid& P)
 {
     impl->acf->fill(P, impl->P);
+
+#if ACF_DEBUG_PYRAMIDS
+    // One can compare CPU and GPU pyramids using logging like this:
+    //std::string home = ".";
+    //cv::Mat channels = impl->acf->getChannels();
+    //cv::imwrite(home + "/acf_channels.png", channels);
+    //logPyramid(home + "/acf_pyramid.png", P);
+#endif
 }
 
 void GPUDetectionPipeline::computeAcf(const ogles_gpgpu::FrameInput& frame, bool doLuv, bool doDetection)
@@ -247,7 +271,7 @@ void GPUDetectionPipeline::computeAcf(const ogles_gpgpu::FrameInput& frame, bool
 
 GLuint GPUDetectionPipeline::paint(const Detections& scene, GLuint inputTexture)
 {
-    //if(impl->lines)
+    if (scene.roi.size())
     {
         std::vector> segments;
         for (const auto& r : scene.roi)
@@ -316,24 +340,20 @@ int GPUDetectionPipeline::detect(const ogles_gpgpu::FrameInput& frame, Detection
     return 0;
 }
 
-std::pair<GLuint, Detections> GPUDetectionPipeline::operator()(const ogles_gpgpu::FrameInput& frame2, bool doDetection)
+std::pair<GLuint, Detections> GPUDetectionPipeline::runFast(const ogles_gpgpu::FrameInput& frame2, bool doDetection)
 {
     ogles_gpgpu::FrameInput frame1;
     frame1.size = frame2.size;
 
-    util::ScopeTimeLogger logger = [&](double elapsed) { impl->log.complete += elapsed; };
-
     Detections scene2(impl->frameIndex), scene1, scene0, *outputScene = &scene2;
 
     if (impl->fifo->getBufferCount() > 0)
     {
         util::ScopeTimeLogger logger = [&](double elapsed) { impl->log.read += elapsed; };
-        
-        // read GPU results for frame n-1
-        // Here we always trigger GPU pipeline reads
-        // to ensure upright + redeuced grayscale images will
-        // be available for regression, even if we won't be using ACF detection.
+
+        // Read GPU results for frame n-1.
+        // Here we always trigger GPU pipeline reads to ensure upright + reduced grayscale images
+        // will be available for regression, even if we won't be using ACF detection.
         impl->acf->getChannels();
 
         if (impl->acf->getChannelStatus())
@@ -365,14 +385,21 @@ std::pair<GLuint, Detections> GPUDetectionPipeline::operator()(const ogles_gpgpu
             scene0 = impl->scene.get();                     // scene n-2
             texture0 = (*impl->fifo)[-2]->getOutputTexId(); // texture n-2
-            outputTexture = paint(scene0, texture0);
             outputScene = &scene0;
+            if (impl->doAnnotations)
+            {
+                outputTexture = paint(scene0, texture0);
+            }
+            else
+            {
+                outputTexture = texture0;
+            }
         }
 
         // Run CPU detection + regression for frame n-1
         impl->scene = impl->threads->process([scene1, frame1, this]() {
             util::ScopeTimeLogger logger = [&](double elapsed) { impl->log.detect += elapsed; };
-            
+
             Detections sceneOut = scene1;
             detect(frame1, sceneOut, scene1.P != nullptr);
             return sceneOut;
@@ -390,25 +417,135 @@ std::pair<GLuint, Detections> GPUDetectionPipeline::operator()(const ogles_gpgpu
     // Add the current frame to FIFO
     impl->fifo->useTexture(texture2, 1);
     impl->fifo->render();
+    push_fifo(impl->scenePrimitives, *outputScene, impl->history);
+
+    return std::make_pair(outputTexture, *outputScene);
+}
+
+auto GPUDetectionPipeline::runSimple(const ogles_gpgpu::FrameInput& frame1, bool doDetection) -> DetectionTex
+{
+    // Run GPU based processing on current thread and package results as a task for CPU
+    // processing so that it will be available on the next frame. This method will compute
+    // ACF output using shaders on the GPU, and may optionally extract other GPU related
+    // features.
+    Detections scene1(impl->frameIndex), *outputScene = nullptr; // time: n+1 and n
+    preprocess(frame1, scene1, doDetection);
+
+    // Initialize input texture with ACF upright texture:
+    GLuint texture1 = impl->acf->first()->getOutputTexId(), outputTexture = 0;
+
+    detect(frame1, scene1, doDetection);
+
+    outputScene = &scene1;
+    if (impl->doAnnotations)
+    {
+        outputTexture = paint(scene1, texture1);
+    }
+    else
+    {
+        outputTexture = texture1;
+    }
 
-    // Clear face motion estimate, update window
+    // Add the current frame to FIFO
+    impl->fifo->useTexture(texture1, 1);
+    impl->fifo->render();
     push_fifo(impl->scenePrimitives, *outputScene, impl->history);
 
+    return std::make_pair(outputTexture, *outputScene);
+}
+
+auto GPUDetectionPipeline::run(const FrameInput& frame2, bool doDetection) -> DetectionTex
+{
+    if (impl->doOptimizedPipeline)
+    {
+        return runFast(frame2, doDetection);
+    }
+    else
+    {
+        return runSimple(frame2, doDetection);
+    }
+}
+
+auto GPUDetectionPipeline::operator()(const FrameInput& frame2, bool doDetection) -> DetectionTex
+{
+    util::ScopeTimeLogger logger = [&](double elapsed) { impl->log.complete += elapsed; };
+
+    std::pair<GLuint, Detections> result = run(frame2, doDetection);
+
     for (auto& c : impl->callbacks)
     {
-        c(outputTexture, *outputScene);
+        c(result.first, result.second);
     }
 
-    return std::make_pair(outputTexture, *outputScene);
+    return result;
 }
 
-std::map<std::string, double> GPUDetectionPipeline::summary()
+void GPUDetectionPipeline::preprocess(const FrameInput& frame, Detections& scene, bool doDetection)
 {
-    return
+    if (impl->doCpuACF)
     {
-        {"read", impl->log.read},
-        {"detect", impl->log.detect},
-        {"complete", impl->log.complete}
+        scene.P = createAcfCpu(frame, doDetection);
+    }
+    else
+    {
+        scene.P = createAcfGpu(frame, doDetection);
+    }
+}
+
+std::shared_ptr<acf::Detector::Pyramid> GPUDetectionPipeline::createAcfGpu(const FrameInput& frame, bool doDetection)
+{
+    computeAcf(frame, false, doDetection);
+
+    std::shared_ptr<decltype(impl->P)> P;
+
+    // Here we always trigger channel processing
+    // to ensure grayscale images will be available
+    // for regression, even if we won't be using ACF detection.
+    cv::Mat acf = impl->acf->getChannels();
+
+    if (doDetection)
+    {
+        assert(acf.type() == CV_8UC1);
+        assert(acf.channels() == 1);
+
+        if (impl->acf->getChannelStatus())
+        {
+            P = std::make_shared<decltype(impl->P)>();
+            fill(*P);
+        }
+    }
+
+    return P;
+}
+
+std::shared_ptr<acf::Detector::Pyramid> GPUDetectionPipeline::createAcfCpu(const FrameInput& frame, bool doDetection)
+{
+    computeAcf(frame, true, doDetection);
+
+    std::shared_ptr<decltype(impl->P)> P;
+    if (doDetection)
+    {
+        cv::Mat acf = impl->acf->getChannels();
+        assert(acf.type() == CV_8UC1);
+        assert(acf.channels() == 1);
+
+        P = std::make_shared<decltype(impl->P)>();
+
+        MatP LUVp = impl->acf->getLuvPlanar();
+        impl->detector->setIsLuv(true);
+        impl->detector->setIsTranspose(true);
+        impl->detector->computePyramid(LUVp, *P);
+    }
+
+    return P;
+}
+
+std::map<std::string, double> GPUDetectionPipeline::summary()
+{
+    return {
+        { "read", impl->log.read },
+        { "detect", impl->log.detect },
+        { "complete", impl->log.complete }
     };
 }
@@ -430,3 +567,41 @@ static void chooseBest(std::vector<cv::Rect>& objects, std::vector<double>& scor
         scores = { scores[best] };
     }
 }
+
+#if ACF_DEBUG_PYRAMIDS
+
+static cv::Mat draw(const acf::Detector::Pyramid& pyramid)
+{
+    cv::Mat canvas;
+    std::vector<cv::Mat> levels;
+    for (int i = 0; i < pyramid.nScales; i++)
+    {
+        // Concatenate the transposed faces, so they are compatible with the GPU layout
+        cv::Mat Ccpu;
+        std::vector<cv::Mat> images;
+        for (const auto& image : pyramid.data[i][0].get())
+        {
+            images.push_back(image.t());
+        }
+        cv::vconcat(images, Ccpu);
+
+        // Instead of upright:
+        //cv::vconcat(pyramid.data[i][0].get(), Ccpu);
+
+        if (levels.size())
+        {
+            cv::copyMakeBorder(Ccpu, Ccpu, 0, levels.front().rows - Ccpu.rows, 0, 0, cv::BORDER_CONSTANT);
+        }
+
+        levels.push_back(Ccpu);
+    }
+    cv::hconcat(levels, canvas);
+    return canvas;
+}
+
+static void logPyramid(const std::string& filename, const acf::Detector::Pyramid& P)
+{
+    cv::Mat canvas = draw(P);
+    cv::imwrite(filename, canvas);
+}
+#endif // ACF_DEBUG_PYRAMIDS
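The `push_fifo` helper above keeps a bounded history of per-frame results (`impl->scenePrimitives`, capped at `impl->history`); its body falls outside the visible hunks. A sketch of the intended bounded push, assuming a `std::deque`-like container — the newest-first ordering and eviction policy are assumptions:

```cpp
#include <deque>

// Sketch only: push the newest value and evict the oldest once `size` is exceeded.
template <typename Container>
void push_fifo(Container& container, const typename Container::value_type& value, int size)
{
    container.push_front(value); // newest result at the front
    while (static_cast<int>(container.size()) > size)
    {
        container.pop_back(); // drop the oldest entries
    }
}
```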
diff --git a/src/app/pipeline/GPUDetectionPipeline.h b/src/app/pipeline/GPUDetectionPipeline.h
index a6a3f5b..15e8b63 100644
--- a/src/app/pipeline/GPUDetectionPipeline.h
+++ b/src/app/pipeline/GPUDetectionPipeline.h
@@ -37,25 +37,37 @@ class GPUDetectionPipeline
     using HighResolutionClock = std::chrono::high_resolution_clock;
     using TimePoint = HighResolutionClock::time_point; // ;
     using DetectionPtr = std::shared_ptr<acf::Detector>;
+    using DetectionTex = std::pair<GLuint, Detections>;
     using DetectionCallback = std::function<void(GLuint, const Detections&)>;
+    using FrameInput = ogles_gpgpu::FrameInput;
 
     GPUDetectionPipeline(DetectionPtr& detector, const cv::Size& inputSize, std::size_t n, int rotation, int minObjectWidth);
 
     virtual ~GPUDetectionPipeline();
 
-    // This method receives an input frame descriptor (pixel buffer or texture ID) on which to run
-    // ACF object detection. The doDetection parameter is provided in order to allow the user to
-    // control the duty cycle of the detector (perhaps adaptively). The detection pipeline introduces
+    GLuint getInputTexture();
+
+    // This method receives an input frame descriptor (pixel buffer or texture ID) on which to run
+    // ACF object detection. The doDetection parameter is provided in order to allow the user to
+    // control the duty cycle of the detector (perhaps adaptively). The detection pipeline introduces
     // two frames of latency so that the GPU->CPU overhead can be hidden. For input frame N, the results
     // are returned for frame N-2 (along with the corresponding texture ID).
-    std::pair<GLuint, Detections> operator()(const ogles_gpgpu::FrameInput& frame, bool doDetection=true);
-    
+    DetectionTex operator()(const ogles_gpgpu::FrameInput& frame, bool doDetection = true);
+
     void operator+=(const DetectionCallback& callback);
 
     std::map<std::string, double> summary();
-    
+
     void setDoGlobalNMS(bool flag);
 
 protected:
+    DetectionTex run(const FrameInput& frame2, bool doDetection);
+    DetectionTex runSimple(const ogles_gpgpu::FrameInput& frame, bool doDetection = true);
+    DetectionTex runFast(const ogles_gpgpu::FrameInput& frame, bool doDetection = true);
+
+    void preprocess(const ogles_gpgpu::FrameInput& frame, Detections& scene, bool doDetection);
+
+    std::shared_ptr<acf::Detector::Pyramid> createAcfGpu(const FrameInput& frame, bool doDetection);
+    std::shared_ptr<acf::Detector::Pyramid> createAcfCpu(const FrameInput& frame, bool doDetection);
 
     // Allow user defined object detection drawing via inheritance.
     virtual GLuint paint(const Detections& scene, GLuint inputTexture);
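The header comment above is the key contract: for input frame N, the optimized path returns the texture and detections for frame N-2, hiding GPU->CPU readback latency. A hedged caller-side sketch using only the API declared above (the frame source, GL context, and `void_ptr`/`TEXTURE_FORMAT` conventions are taken from pipeline.cpp later in this patch; everything else is illustrative):

```cpp
// Per-frame driver sketch; `pipeline` is a constructed GPUDetectionPipeline
// and `frame` a BGRA cv::Mat for the current camera image (assumptions).
ogles_gpgpu::FrameInput input = { { frame.cols, frame.rows },
                                  void_ptr(frame.data), true, false, TEXTURE_FORMAT };

auto result = (*pipeline)(input, /* doDetection = */ true); // DetectionTex
GLuint texture = result.first;              // annotated texture for frame N-2
const Detections& objects = result.second;  // detections for frame N-2
```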
diff --git a/src/app/pipeline/VideoCaptureImage.cpp b/src/app/pipeline/VideoCaptureImage.cpp
new file mode 100644
index 0000000..e89199c
--- /dev/null
+++ b/src/app/pipeline/VideoCaptureImage.cpp
@@ -0,0 +1,66 @@
+#include "VideoCaptureImage.h"
+
+VideoCaptureImage::VideoCaptureImage(const cv::Mat& image, int frames)
+    : image(image)
+    , frames(frames)
+{
+}
+
+VideoCaptureImage::VideoCaptureImage(const std::string& filename, int frames)
+    : frames(frames)
+{
+    image = cv::imread(filename, cv::IMREAD_COLOR);
+}
+
+VideoCaptureImage::~VideoCaptureImage() = default;
+
+void VideoCaptureImage::setRepeat(int n)
+{
+    frames = n;
+}
+
+bool VideoCaptureImage::grab()
+{
+    return false;
+}
+
+bool VideoCaptureImage::isOpened() const
+{
+    return !image.empty();
+}
+
+void VideoCaptureImage::release()
+{
+    image.release();
+}
+
+bool VideoCaptureImage::open(const cv::String& filename)
+{
+    image = cv::imread(filename);
+    return !image.empty();
+}
+
+bool VideoCaptureImage::read(cv::OutputArray image)
+{
+    if (++index <= frames)
+    {
+        image.assign(this->image);
+        return true;
+    }
+    return false;
+}
+
+double VideoCaptureImage::get(int propId) const
+{
+    switch (propId)
+    {
+        case CV_CAP_PROP_FRAME_WIDTH:
+            return static_cast<double>(image.cols);
+        case CV_CAP_PROP_FRAME_HEIGHT:
+            return static_cast<double>(image.rows);
+        case CV_CAP_PROP_FRAME_COUNT:
+            return static_cast<double>(frames);
+        default:
+            return 0.0;
+    }
+}
diff --git a/src/app/pipeline/VideoCaptureImage.h b/src/app/pipeline/VideoCaptureImage.h
new file mode 100644
index 0000000..7bbb4d7
--- /dev/null
+++ b/src/app/pipeline/VideoCaptureImage.h
@@ -0,0 +1,36 @@
+/*! -*-c++-*-
+  @file   VideoCaptureImage.h
+  @author David Hirvonen
+  @brief  Present a cv::Mat as a cv::VideoCapture
+
+  \copyright Copyright 2018 Elucideye, Inc. All rights reserved.
+  \license{This project is released under the 3 Clause BSD License.}
+
+*/
+
+#ifndef __acf_VideoCaptureImage_h__
+#define __acf_VideoCaptureImage_h__
+
+#include <opencv2/videoio.hpp>
+
+class VideoCaptureImage : public cv::VideoCapture
+{
+public:
+    VideoCaptureImage(const cv::Mat& image, int frames = 100);
+    VideoCaptureImage(const std::string& filename, int frames = 100);
+    virtual ~VideoCaptureImage();
+
+    void setRepeat(int n);
+    virtual bool grab();
+    virtual bool isOpened() const;
+    virtual void release();
+    virtual bool open(const cv::String& filename);
+    virtual bool read(cv::OutputArray image);
+    double get(int propId) const;
+
+    cv::Mat image;
+    int frames = 0;
+    int index = -1;
+};
+
+#endif // __acf_VideoCaptureImage_h__
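`VideoCaptureImage` makes a single image look like a finite video, which keeps the pipeline test (`--repeat=600` above) deterministic and independent of camera hardware. A small usage sketch (the file name is hypothetical):

```cpp
#include "VideoCaptureImage.h"

VideoCaptureImage video("input.png", /* frames = */ 600); // hypothetical input
cv::Mat frame;
while (video.read(frame)) // read() returns false once `frames` is exhausted
{
    // ... feed `frame` to the detection pipeline ...
}
```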
diff --git a/src/app/pipeline/pipeline.cpp b/src/app/pipeline/pipeline.cpp
index 2a09c33..98337e5 100644
--- a/src/app/pipeline/pipeline.cpp
+++ b/src/app/pipeline/pipeline.cpp
@@ -39,10 +39,12 @@
 
     acf-pipeline \
         --input=0 \
-        --model=${SOME_PATH_VAR}/drishti-assets/drishti_face_gray_80x80.cpb
+        --model=${SOME_PATH_VAR}/drishti-assets/drishti_face_gray_80x80.cpb \
         --minimum=200 \
         --calibration=0.01 \
-        --window
+        --global \
+        --window \
+        --size=1920x1080
 
     In the above command, "minimum=200" means we are ignoring all faces less
     than 200 pixels wide.  You should set this to the largest value
@@ -58,13 +60,14 @@
 */
 
 #if defined(ACF_ADD_TO_STRING)
-#  include // first
+#include // first
 #endif
 
 #include
 #include
 
 #include "GPUDetectionPipeline.h"
+#include "VideoCaptureImage.h"
 
 #include
 #include
@@ -75,6 +78,7 @@
 #include
 #include
+#include <regex>
 
 // clang-format off
 #ifdef ANDROID
@@ -93,81 +97,6 @@ void* void_ptr(const T* ptr)
 
 static std::shared_ptr<cv::VideoCapture> create(const std::string& filename);
 static cv::Size getSize(cv::VideoCapture& video);
 
-class VideoCaptureImage : public cv::VideoCapture
-{
-public:
-    VideoCaptureImage(const cv::Mat &image, int frames=100)
-        : image(image)
-        , frames(frames)
-    {
-    }
-
-    VideoCaptureImage(const std::string &filename, int frames=100)
-        : frames(frames)
-    {
-        image = cv::imread(filename, cv::IMREAD_COLOR);
-    }
-
-    virtual ~VideoCaptureImage()
-    {
-
-    }
-
-    void setRepeat(int n)
-    {
-        frames = n;
-    }
-
-    virtual bool grab ()
-    {
-        return false;
-    }
-
-    virtual bool isOpened () const
-    {
-        return !image.empty();
-    }
-
-    virtual void release()
-    {
-        image.release();
-    }
-
-    virtual bool open (const cv::String &filename)
-    {
-        image = cv::imread(filename);
-        return !image.empty();
-    }
-
-    virtual bool read (cv::OutputArray image)
-    {
-        if(++index <= frames)
-        {
-            image.assign(this->image);
-            return true;
-        }
-        return false;
-    }
-
-    double get(int propId) const
-    {
-        switch (propId)
-        {
-            case CV_CAP_PROP_FRAME_WIDTH:
-                return static_cast<double>(image.cols);
-            case CV_CAP_PROP_FRAME_HEIGHT:
-                return static_cast<double>(image.rows);
-            case CV_CAP_PROP_FRAME_COUNT:
-                return static_cast<double>(frames);
-            default:
-                return 0.0;
-        }
-    }
-
-    cv::Mat image;
-    int frames = 0;
-    int index = -1;
-};
-
 struct Application
 {
     // clang-format off
@@ -178,7 +107,8 @@ struct Application
         float acfCalibration,
         int minWidth,
         bool window,
-        float resolution
+        float resolution,
+        const cv::Size &sizeIn = {}
     )
         : resolution(resolution)
     // clang-format on
     {
@@ -189,6 +119,20 @@ struct Application
         // http://answers.opencv.org/answers/761/revisions/
         video = create(input);
 
+        // ::::::::::::::::::: CAVEAT ::::::::::::::::::::::::::::::
+        // Using a MAX resolution approach will not work in all cases.
+        // It may lead to strange behavior: all gray, all black + very slow.
+        // You may have to specify the desired resolution explicitly as shown below
+        //video->set(cv::CAP_PROP_FRAME_WIDTH, 16000.0);
+        //video->set(cv::CAP_PROP_FRAME_HEIGHT, 16000.0);
+
+        if (sizeIn.area())
+        {
+            // If the resolution is known in advance you can set it explicitly like this:
+            video->set(cv::CAP_PROP_FRAME_WIDTH, static_cast<double>(sizeIn.width));
+            video->set(cv::CAP_PROP_FRAME_HEIGHT, static_cast<double>(sizeIn.height));
+        }
+
         // Create an OpenGL context:
         const auto size = getSize(*video);
         context = aglet::GLContext::create(aglet::GLContext::kAuto, window ? "acf" : "", size.width, size.height);
@@ -221,10 +165,10 @@ struct Application
     {
         this->logger = logger;
     }
-    
+
     void setRepeat(int n)
     {
-        if(VideoCaptureImage *cap = dynamic_cast<VideoCaptureImage*>(video.get()))
+        if (VideoCaptureImage* cap = dynamic_cast<VideoCaptureImage*>(video.get()))
         {
             cap->setRepeat(n);
         }
@@ -235,16 +179,10 @@ struct Application
         pipeline->setDoGlobalNMS(flag);
     }
 
-    bool update()
+    virtual cv::Mat grab()
     {
         cv::Mat frame;
-        (*video) >> frame;
-
-        if (frame.empty())
-        {
-            return false; // indicate failure, exit loop
-        }
-
+        (*video) >> frame;
         std::cout << "MU: " << cv::mean(frame) << std::endl;
         if (frame.channels() == 3)
         {
             // ogles_gpgpu supports both {BGR,RGB}A and NV{21,12} inputs, and
@@ -257,8 +195,26 @@ struct Application
             cv::cvtColor(frame, frame, cv::COLOR_BGR2BGRA); // assume all others are GL_BGRA
 #endif
         }
+        return frame;
+    };
+
+    virtual cv::Mat getFrameInput(ogles_gpgpu::FrameInput& input)
+    {
+        cv::Mat frame = grab();
+        input = { { frame.cols, frame.rows }, void_ptr(frame.data), true, false, TEXTURE_FORMAT };
+        return frame;
+    }
+
+    virtual bool update()
+    {
+        ogles_gpgpu::FrameInput frame;
+        cv::Mat storage = getFrameInput(frame);
+        if (storage.empty())
+        {
+            return false;
+        }
 
-        auto result = (*pipeline)({ { frame.cols, frame.rows }, void_ptr(frame.data), true, false, TEXTURE_FORMAT }, true);
+        auto result = (*pipeline)(frame, true);
 
         if (logger)
         {
@@ -269,13 +225,13 @@ struct Application
         {
             show(result.first);
         }
-        
+
         counter++;
 
         return true; // continue sequence
     }
 
-    void show(GLuint texture)
+    virtual void show(GLuint texture)
     {
         auto& geometry = context->getGeometry();
         display->setOffset(geometry.tx, geometry.ty);
@@ -287,40 +243,87 @@ struct Application
 
     float resolution = 1.f;
 
     std::shared_ptr logger;
-
     std::shared_ptr context;
     std::shared_ptr display;
-
     std::shared_ptr video;
     std::shared_ptr detector;
     std::shared_ptr pipeline;
-    
+
     std::size_t counter = 0;
 };
 
+struct ApplicationBenchmark : public Application
+{
+    // clang-format off
+    ApplicationBenchmark
+    (
+        const std::string &input,
+        const std::string &model,
+        float acfCalibration,
+        int minWidth,
+        bool window,
+        float resolution,
+        const cv::Size &size = {}
+    )
+        : Application(input, model, acfCalibration, minWidth, window, resolution, size)
+    // clang-format on
+    {
+    }
+
+    virtual cv::Mat getFrameInput(ogles_gpgpu::FrameInput& input)
+    {
+        if (counter > 256)
+        {
+            return cv::Mat();
+        }
+
+        static cv::Mat frame = grab(); // for the benchmark we can repeat the first frame
+        input = { { frame.cols, frame.rows }, void_ptr(frame.data), true, false, TEXTURE_FORMAT };
+        if (counter++ > 0)
+        {
+            input.inputTexture = pipeline->getInputTexture();
+            input.pixelBuffer = nullptr;
+        }
+
+        return frame;
+    }
+};
+
+static std::vector<std::string> split(const std::string& input, const std::string& regex)
+{
+    // passing -1 as the submatch index parameter performs splitting
+    std::regex re(regex);
+    std::sregex_token_iterator first{ input.begin(), input.end(), re, -1 }, last;
+    return { first, last };
+}
+
 int gauze_main(int argc, char** argv)
 {
     auto logger = util::Logger::create("acf-pipeline");
 
-    for(int i = 0; i < argc; i++)
+    for (int i = 0; i < argc; i++)
     {
         logger->info("arg[{}] = {}", i, argv[i]);
     }
 
-    bool help = false, doWindow = false, doGlobal = false;
+    bool help = false, doWindow = false, doGlobal = false, doBenchmark = false;
     float resolution = 1.f, acfCalibration = 0.f;
     std::string sInput, sOutput, sModel;
     int minWidth = 0, repeat = 1;
+    std::string sDimensions;
 
     const int argumentCount = argc;
     cxxopts::Options options("acf-pipeline", "GPU accelerated ACF object detection (see Piotr's toolbox)");
 
     // clang-format off
     options.add_options()
         ("i,input", "Input file", cxxopts::value<std::string>(sInput))
+        ("size", "Input video dimensions: wxh", cxxopts::value<std::string>(sDimensions))
         ("o,output", "Output directory", cxxopts::value<std::string>(sOutput))
         ("m,model", "Model file", cxxopts::value<std::string>(sModel))
         ("c,calibration", "ACF calibration", cxxopts::value<float>(acfCalibration))
+        ("b,benchmark", "Run benchmark by repeating first input texture", cxxopts::value<bool>(doBenchmark))
         ("r,resolution", "Resolution", cxxopts::value<float>(resolution))
         ("g,global", "Global nms", cxxopts::value<bool>(doGlobal))
         ("w,window", "Window", cxxopts::value<bool>(doWindow))
@@ -337,6 +340,18 @@ int gauze_main(int argc, char** argv)
         return 0;
     }
 
+    cv::Size size; // video dimensions
+    if (!sDimensions.empty())
+    {
+        std::vector<std::string> dimensions = split(sDimensions, "x");
+        if (!dimensions.size())
+        {
+            logger->error("Must specify input dimensions in format: <width>x<height>, received {}", sDimensions);
+            return 1;
+        }
+        size = { std::stoi(dimensions[0]), std::stoi(dimensions[1]) };
+    }
+
     if (sModel.empty())
     {
         logger->error("Must specify a valid model");
@@ -349,35 +364,43 @@ int gauze_main(int argc, char** argv)
         return 1;
     }
 
-    Application app(sInput, sModel, acfCalibration, minWidth, doWindow, resolution);
-    app.setLogger(logger);
-    app.setRepeat(repeat);
-    app.setDoGlobalNMS(doGlobal);
+    std::shared_ptr<Application> app;
+    if (doBenchmark)
+    {
+        app = std::make_shared<ApplicationBenchmark>(sInput, sModel, acfCalibration, minWidth, doWindow, resolution, size);
+    }
+    else
+    {
+        app = std::make_shared<Application>(sInput, sModel, acfCalibration, minWidth, doWindow, resolution, size);
+    }
+
+    app->setLogger(logger);
+    app->setRepeat(repeat);
+    app->setDoGlobalNMS(doGlobal);
 
     std::size_t count = 0;
-    aglet::GLContext::RenderDelegate delegate = [&]() -> bool
-    {
-        bool status = app.update();
-        if(status)
+    aglet::GLContext::RenderDelegate delegate = [&]() -> bool {
+        bool status = app->update();
+        if (status)
         {
             count++;
         }
         return status;
     };
 
-    double seconds = 0.0;    
+    double seconds = 0.0;
     { // Process all frames (main loop) and record the total time:
         util::ScopeTimeLogger timer = [&](double total) { seconds = total; };
-        (*app.context)(delegate);
+        (*app->context)(delegate);
     }
 
-    const double fps = (seconds > 0.0) ? static_cast<double>(count)/seconds : 0.0;
+    const double fps = (seconds > 0.0) ? static_cast<double>(count) / seconds : 0.0;
     logger->info("ACF FULL: FPS={}", fps);
 
-    if(count > 0)
+    if (count > 0)
     {
-        auto summary = app.pipeline->summary();
-        for(auto &entry : summary)
+        auto summary = app->pipeline->summary();
+        for (auto& entry : summary)
         {
             entry.second /= static_cast<double>(count);
             logger->info("\tACF STAGE {} = {}", entry.first, entry.second);
@@ -397,7 +420,7 @@ static std::shared_ptr<cv::VideoCapture> create(const std::string& filename)
     }
     else
     {
-        if(filename.find(".png") != std::string::npos)
+        if (filename.find(".png") != std::string::npos)
        {
            return std::make_shared<VideoCaptureImage>(filename);
        }
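One caveat in the new `--size` parsing above: `split()` returns the tokens between matches, so a malformed value like "1920" yields a single token, the `!dimensions.size()` guard still passes, and `dimensions[1]` indexes past the end of the vector. A stricter check is a small change (review sketch, not part of the patch):

```cpp
std::vector<std::string> dimensions = split(sDimensions, "x");
if (dimensions.size() != 2) // require exactly <width> and <height>
{
    logger->error("Must specify input dimensions in format: <width>x<height>, received {}", sDimensions);
    return 1;
}
const cv::Size size(std::stoi(dimensions[0]), std::stoi(dimensions[1]));
```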
diff --git a/src/lib/acf/ACF.cpp b/src/lib/acf/ACF.cpp
index 3bb2654..88dbcf3 100644
--- a/src/lib/acf/ACF.cpp
+++ b/src/lib/acf/ACF.cpp
@@ -14,8 +14,8 @@
 
 #include
 #include
 
-#include
+#include <numeric> // for iota
 
 ACF_NAMESPACE_BEGIN
@@ -256,6 +256,14 @@ int Detector::operator()(const MatP& IpTranspose, std::vector<cv::Rect>& objects
     return (*this)(P, objects, scores);
 }
 
+static std::vector<int> create_random_indices(int n)
+{
+    std::vector<int> indices(n);
+    std::iota(indices.begin(), indices.end(), 0);
+    std::random_shuffle(indices.begin(), indices.end());
+    return indices;
+}
+
 // Multiscale search:
 int Detector::operator()(const Pyramid& P, std::vector<cv::Rect>& objects, std::vector<double>* scores)
 {
@@ -266,37 +274,59 @@ int Detector::operator()(const Pyramid& P, std::vector<cv::Rect>& objects, std::
     auto modelDs = *(opts.modelDs);
     auto shift = (modelDsPad - modelDs) / 2 - pad;
 
-    std::vector<Detection> bbs;
-    for (int i = 0; i < P.nScales; i++)
-    {
-        DetectionVec ds;
+    // Here we create random indices so that (on average) for each `const cv::Range &r` slice
+    // in the cv::parallel_for_(const cv::Range &r, ...) call, the total ACF Pyramid area
+    // for all levels (specified by Range::{start,end}) will be equal for every thread.
+    auto scales = create_random_indices(P.nScales);
+    std::vector<DetectionVec> bbs_(P.nScales);
 
-        // ROI fields indicates row major storage, else column major:
-        if (P.rois.size() > i)
-        {
-            acfDetect1(P.data[i][0], P.rois[i], shrink, modelDsPad, *(opts.stride), *(opts.cascThr), ds);
-        }
-        else
+    std::function<void(const cv::Range&)> worker = [&](const cv::Range& r) {
+        for (int j = r.start; j < r.end; j++)
         {
-            acfDetect1(P.data[i][0], {}, shrink, modelDsPad, *(opts.stride), *(opts.cascThr), ds);
-        }
+            int i = scales[j];
 
-        // Scale up the detections
-        for (auto& bb : ds)
-        {
-            //std::cout << bb.weight << std::endl;
-            cv::Size size(cv::Size2d(modelDs) / P.scales[i]);
-            bb.roi.x = double(bb.roi.x + shift.width) / P.scaleshw[i].width;
-            bb.roi.y = double(bb.roi.y + shift.height) / P.scaleshw[i].height;
-            bb.roi.width = size.width;
-            bb.roi.height = size.height;
+            DetectionVec ds;
 
-            std::swap(bb.roi.x, bb.roi.y); // TODO: review
+            // ROI fields indicates row major storage, else column major:
+            if (P.rois.size() > i)
+            {
+                acfDetect1(P.data[i][0], P.rois[i], shrink, modelDsPad, *(opts.stride), *(opts.cascThr), ds);
+            }
+            else
+            {
+                acfDetect1(P.data[i][0], {}, shrink, modelDsPad, *(opts.stride), *(opts.cascThr), ds);
+            }
 
-            std::swap(bb.roi.width, bb.roi.height); // TRANSPOSE
+            // Scale up the detections
+            for (auto& bb : ds)
+            {
+                cv::Size size(cv::Size2d(modelDs) / P.scales[i]);
+                bb.roi.x = double(bb.roi.x + shift.width) / P.scaleshw[i].width;
+                bb.roi.y = double(bb.roi.y + shift.height) / P.scaleshw[i].height;
+                bb.roi.width = size.width;
+                bb.roi.height = size.height;
+
+                std::swap(bb.roi.x, bb.roi.y);
+                std::swap(bb.roi.width, bb.roi.height);
+            }
+            std::copy(ds.begin(), ds.end(), std::back_inserter(bbs_[i]));
         }
-        std::copy(ds.begin(), ds.end(), std::back_inserter(bbs));
+    };
+
+    if (m_doParallel)
+    {
+        cv::parallel_for_({ 0, P.nScales }, worker);
+    }
+    else
+    {
+        worker({ 0, P.nScales });
+    }
+
+    for (int i = 1; i < bbs_.size(); i++)
+    {
+        std::copy(bbs_[i].begin(), bbs_[i].end(), std::back_inserter(bbs_[0]));
     }
+    auto& bbs = bbs_[0];
 
     if (m_doNms)
     {
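A portability note on `create_random_indices` above: `std::random_shuffle` was deprecated in C++14 and removed in C++17. A drop-in equivalent using `<random>` (the fixed seed is an arbitrary choice for reproducibility, not something the patch specifies):

```cpp
#include <algorithm>
#include <numeric>
#include <random>
#include <vector>

static std::vector<int> create_random_indices(int n)
{
    std::vector<int> indices(n);
    std::iota(indices.begin(), indices.end(), 0);
    std::mt19937 rng(42); // fixed seed: shuffles are reproducible run to run
    std::shuffle(indices.begin(), indices.end(), rng);
    return indices;
}
```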
diff --git a/src/lib/acf/ACF.h b/src/lib/acf/ACF.h
index 99e6e75..5e19c58 100644
--- a/src/lib/acf/ACF.h
+++ b/src/lib/acf/ACF.h
@@ -449,7 +449,7 @@ class ACF_EXPORT Detector : public acf::ObjectDetector
         Size2dVec& scaleshw
     );
     // clang-format on
-    
+
     static int convTri(const MatP& I, MatP& J, double r = 1.0, int s = 1);
 
     // clang-format off
@@ -466,7 +466,7 @@ class ACF_EXPORT Detector : public acf::ObjectDetector
     );
     // clang-format on
 
-    // clang-format off 
+    // clang-format off
     static int gradientHist
     (
         const cv::Mat& M,
@@ -481,6 +481,16 @@ class ACF_EXPORT Detector : public acf::ObjectDetector
     );
     // clang-format on
 
+    virtual void setDoParallel(bool flag)
+    {
+        m_doParallel = flag;
+    }
+
+    virtual bool getDoParallel() const
+    {
+        return m_doParallel;
+    }
+
     virtual void setDetectionScorePruneRatio(double ratio)
     {
         m_detectionScorePruneRatio = ratio;
@@ -515,7 +525,7 @@ class ACF_EXPORT Detector : public acf::ObjectDetector
         DetectionVec& objects
     );
     // clang-format on
-    
+
     int bbNms(const DetectionVec& bbsIn, const Options::Nms& pNms, DetectionVec& bbs);
 
     int acfModify(const Detector::Modify& params);
@@ -593,6 +603,7 @@ class ACF_EXPORT Detector : public acf::ObjectDetector
     std::shared_ptr m_streamLogger;
 
     double m_detectScorePruneRatio = 0.0;
+    bool m_doParallel = true;
     bool m_isLuv = false;
     bool m_isTranspose = false;
diff --git a/src/lib/acf/ACFIO.cpp b/src/lib/acf/ACFIO.cpp
index c013514..0d16784 100644
--- a/src/lib/acf/ACFIO.cpp
+++ b/src/lib/acf/ACFIO.cpp
@@ -165,15 +165,15 @@ int Detector::deserialize(ParserNodeDetector& detector_)
             auto&& pJitter_ = opts_.create("pJitter", opts_->pJitter);
             pJitter_.parse, decltype((*pJitter_)->flip)>("flip", (*pJitter_)->flip);
         }
-        catch(...)
+        catch (...)
         {
             opts_->pJitter->flip.set("jitter", false, true, 0);
         }
 
         opts_.parse, decltype(opts_->winsSave)>("winsSave", opts_->winsSave);
     }
-    
-    clf.thrsU8 = clf.thrs * 255.0; // add uint8_t compatible thresholds
+
+    clf.thrs.convertTo(clf.thrsU8, CV_8UC1, 255.0f); // add uint8_t compatible thresholds
 
     return 0;
 }
diff --git a/src/lib/acf/ACFIOArchive.h b/src/lib/acf/ACFIOArchive.h
index 96cbfce..9666e62 100644
--- a/src/lib/acf/ACFIOArchive.h
+++ b/src/lib/acf/ACFIOArchive.h
@@ -95,7 +95,7 @@ void Detector::Classifier::serialize(Archive& ar, const std::uint32_t version)
 
     if (Archive::is_loading::value)
     {
-        thrsU8 = thrs * 255.0; // precompute uint8_t thresholds
+        thrs.convertTo(thrsU8, CV_8UC1, 255.0f); // precompute uint8_t thresholds
     }
 }
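The `convertTo` change above is behavioral, not cosmetic: a `cv::Mat` expression such as `thrs * 255.0` stays in the source depth (CV_32F), whereas `convertTo` with `CV_8UC1` scales, rounds, and saturates into an actual 8-bit matrix — which is what the stricter type assertions in `getScaledThresholds` (later in this patch) require. A minimal illustration with made-up values:

```cpp
#include <opencv2/core.hpp>

cv::Mat thrs = (cv::Mat_<float>(1, 3) << 0.25f, 0.5f, 2.0f);

cv::Mat a = thrs * 255.0; // still CV_32F: {63.75, 127.5, 510}
cv::Mat b;
thrs.convertTo(b, CV_8UC1, 255.0); // CV_8U, rounded and saturated: {64, 128, 255}
```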
diff --git a/src/lib/acf/GPUACF.cpp b/src/lib/acf/GPUACF.cpp
index c41229b..ea924e5 100644
--- a/src/lib/acf/GPUACF.cpp
+++ b/src/lib/acf/GPUACF.cpp
@@ -82,7 +82,7 @@ struct ACF::Impl
             rgb2luvProc->add(luvTransposeOut.get());
         }
     }
-    
+
     void initACF(const SizeVec& scales, FeatureKind kind)
     {
         // Rotation + rescale and ACF pipeline:
@@ -99,7 +99,7 @@ struct ACF::Impl
         smoothNormGradProc = util::make_unique(1.0);
         smoothGradHistProcA = util::make_unique(1.0);
         smoothGradHistProcB = util::make_unique(1.0);
-        
+
         // Reduction:
         reduceRgbProc = util::make_unique();
         reduceLuvProc = util::make_unique();
@@ -147,7 +147,7 @@ struct ACF::Impl
         // ((( histA -> smooth(histA) )))
         gradHistProcA->add(smoothGradHistProcA.get());
         smoothGradHistProcA->add(reduceGradHistProcA.get());
-        
+
         // ((( histB -> smooth(histB) )))
         gradHistProcB->add(smoothGradHistProcB.get());
         smoothGradHistProcB->add(reduceGradHistProcB.get());
@@ -180,7 +180,7 @@ struct ACF::Impl
             CV_Assert(false);
         }
     }
-    
+
     // This provides a map for unpacking/swizzling OpenGL textures (i.e., RGBA or BGRA) to user
     // memory using NEON optimized instructions.
     ChannelSpecification getACFChannelSpecification(MatP& acf) const
@@ -248,7 +248,7 @@ struct ACF::Impl
         return ChannelSpecification();
         // clang-format on
     }
-    
+
     bool needsTextures() const
     {
         bool status = false;
@@ -277,16 +277,16 @@ struct ACF::Impl
     float m_grayscaleScale = 1.0f;
     bool m_hasGrayscaleOutput = false;
     cv::Mat m_grayscale;
-    
+
     int m_shrink = 4;
 
     std::unique_ptr rotationProc; // make sure we have an unmodified upright image
     std::unique_ptr rgb2luvProc;
     std::unique_ptr pyramidProc;
-    std::unique_ptr gradProc; // (1.0);
-    std::unique_ptr normProc; // (5, true, 0.005);
-    std::unique_ptr gradHistProcA; // (6, 0, 1.f);
-    std::unique_ptr gradHistProcB; // (6, 4, 1.f);
+    std::unique_ptr gradProc;      // (1.0);
+    std::unique_ptr normProc;      // (5, true, 0.005);
+    std::unique_ptr gradHistProcA; // (6, 0, 1.f);
+    std::unique_ptr gradHistProcB; // (6, 4, 1.f);
 
     // Reduction:
     std::unique_ptr reduceRgbProc; // initial reduction
@@ -294,8 +294,8 @@ struct ACF::Impl
     std::unique_ptr reduceNormGradProc;
     std::unique_ptr reduceGradHistProcA; // (1);
     std::unique_ptr reduceGradHistProcB; // (1);
-    std::unique_ptr reduceForGrayProc; // (optional) reduce for grayscale output
-    
+    std::unique_ptr reduceForGrayProc;   // (optional) reduce for grayscale output
+
     // Strategic smoothing (hand tuned to match ACF output)
     std::unique_ptr smoothProc;
     std::unique_ptr smoothNormGradProc;
@@ -303,7 +303,7 @@ struct ACF::Impl
     std::unique_ptr smoothGradHistProcB;
 
     // #### OUTPUT ###
-    std::unique_ptr luvTransposeOut; // transposed LUV output
+    std::unique_ptr luvTransposeOut;     // transposed LUV output
 
     // Multi-texture swizzle (one or the other for 7 vs 10 channels)
     std::unique_ptr mergeProcLUVG;
diff --git a/src/lib/acf/draw.cpp b/src/lib/acf/draw.cpp
index de903a2..5ed912d 100644
--- a/src/lib/acf/draw.cpp
+++ b/src/lib/acf/draw.cpp
@@ -17,31 +17,31 @@ ACF_NAMESPACE_BEGIN
 // This function demonstrates how to visualize a pyramid structure:
 cv::Mat draw(acf::Detector::Pyramid& pyramid)
 {
-	cv::Mat canvas;
-	std::vector<cv::Mat> levels;
-	for (int i = 0; i < pyramid.nScales; i++)
-	{
-		// Concatenate the transposed faces, so they are compatible with the GPU layout
-		cv::Mat Ccpu;
-		std::vector<cv::Mat> images;
-		for (const auto& image : pyramid.data[i][0].get())
-		{
-			images.push_back(image.t());
-		}
-		cv::vconcat(images, Ccpu);
-
-		// Instead of upright:
-		//cv::vconcat(pyramid.data[i][0].get(), Ccpu);
-
-		if (levels.size())
-		{
-			cv::copyMakeBorder(Ccpu, Ccpu, 0, levels.front().rows - Ccpu.rows, 0, 0, cv::BORDER_CONSTANT);
-		}
-
-		levels.push_back(Ccpu);
-	}
-	cv::hconcat(levels, canvas);
-	return canvas;
+    cv::Mat canvas;
+    std::vector<cv::Mat> levels;
+    for (int i = 0; i < pyramid.nScales; i++)
+    {
+        // Concatenate the transposed faces, so they are compatible with the GPU layout
+        cv::Mat Ccpu;
+        std::vector<cv::Mat> images;
+        for (const auto& image : pyramid.data[i][0].get())
+        {
+            images.push_back(image.t());
+        }
+        cv::vconcat(images, Ccpu);
+
+        // Instead of upright:
+        //cv::vconcat(pyramid.data[i][0].get(), Ccpu);
+
+        if (levels.size())
+        {
+            cv::copyMakeBorder(Ccpu, Ccpu, 0, levels.front().rows - Ccpu.rows, 0, 0, cv::BORDER_CONSTANT);
+        }
+
+        levels.push_back(Ccpu);
+    }
+    cv::hconcat(levels, canvas);
+    return canvas;
 }
 
 ACF_NAMESPACE_END
diff --git a/src/lib/acf/gpu/multipass/triangle_pass.cpp b/src/lib/acf/gpu/multipass/triangle_pass.cpp
index fa77ef1..596da47 100644
--- a/src/lib/acf/gpu/multipass/triangle_pass.cpp
+++ b/src/lib/acf/gpu/multipass/triangle_pass.cpp
@@ -82,7 +82,7 @@ std::string fragmentShaderForTriangle(int blurRadius, bool doNorm = false, int p
     ss << "void main()\n";
     ss << "{\n";
     ss << "   vec4 sum = vec4(0.0);\n";
-    ss << "   vec4 center = texture2D(inputImageTexture, blurCoordinates[" << numberOfOffsets/2 << "]);\n";
+    ss << "   vec4 center = texture2D(inputImageTexture, blurCoordinates[" << numberOfOffsets / 2 << "]);\n";
 
     for (int currentBlurCoordinateIndex = 0; currentBlurCoordinateIndex < numberOfOffsets; currentBlurCoordinateIndex++)
     {
diff --git a/src/lib/acf/toolbox/acfDetect1.cpp b/src/lib/acf/toolbox/acfDetect1.cpp
index 7be6d66..31a2802 100644
--- a/src/lib/acf/toolbox/acfDetect1.cpp
+++ b/src/lib/acf/toolbox/acfDetect1.cpp
@@ -163,12 +163,12 @@ const cv::Mat& Detector::Classifier::getScaledThresholds(int type) const
 {
     switch (type)
     {
-        case CV_32FC1:
-            CV_Assert(!thrs.empty());
-            return thrs;
         case CV_8UC1:
-            CV_Assert(!thrsU8.empty());
+            CV_Assert(!thrsU8.empty() && (thrsU8.type() == CV_8UC1));
             return thrsU8;
+        case CV_32FC1:
+            CV_Assert(!thrs.empty() && (thrs.type() == CV_32FC1));
+            return thrs;
         default:
             CV_Assert(type == CV_32FC1 || type == CV_8UC1);
     }
@@ -176,13 +176,15 @@ const cv::Mat& Detector::Classifier::getScaledThresholds(int type) const
 
 template
-std::shared_ptr allocDetector(const MatP& I, const cv::Mat &thrs, DetectionSink* sink)
+std::shared_ptr allocDetector(const MatP& I, const cv::Mat& thrs, DetectionSink* sink)
 {
     switch (I.depth())
     {
         case CV_8UC1:
+            CV_Assert(thrs.type() == CV_8UC1);
             return std::make_shared>(I[0].ptr(), thrs.ptr(), sink);
         case CV_32FC1:
+            CV_Assert(thrs.type() == CV_32FC1);
             return std::make_shared>(I[0].ptr(), thrs.ptr(), sink);
         default:
             CV_Assert(I.depth() == CV_8UC1 || I.depth() == CV_32FC1);
@@ -190,7 +192,7 @@ std::shared_ptr allocDetector(const MatP& I, const cv::Mat &thrs, DetectionSink*
     return nullptr; // unused: for static analyzer
 }
 
-std::shared_ptr allocDetector(const MatP& I, const cv::Mat &thrs, DetectionSink* sink, int depth)
+std::shared_ptr allocDetector(const MatP& I, const cv::Mat& thrs, DetectionSink* sink, int depth)
 {
     // Enforce compile time constants in inner tree search:
     switch (depth)
@@ -219,6 +221,7 @@ std::shared_ptr allocDetector(const MatP& I, const cv::Mat& thrs, DetectionSink*
     return nullptr;
 }
 
+// clang-format off
 auto Detector::createDetector
 (
     const MatP& I,
@@ -228,6 +231,7 @@ auto Detector::createDetector
     int stride,
     DetectionSink* sink
 )
+// clang-format on
 const -> DetectionParamPtr
 {
     int modelHt = modelDsPad.height;
@@ -295,6 +299,7 @@ auto Detector::createDetector
 //
 // 3/21/2015: Rework arithmetic for row-major storage order
 
+// clang-format off
 void Detector::acfDetect1
 (
     const MatP& I,
@@ -305,6 +310,7 @@ void Detector::acfDetect1
     double cascThr,
     std::vector<Detection>& objects
 )
+// clang-format on
 {
     DetectionSink detections;
     auto detector = createDetector(I, rois, shrink, modelDsPad, stride, &detections);
diff --git a/src/lib/acf/toolbox/gradientMex.cpp b/src/lib/acf/toolbox/gradientMex.cpp
index e6bbdfb..0bcde5d 100644
--- a/src/lib/acf/toolbox/gradientMex.cpp
+++ b/src/lib/acf/toolbox/gradientMex.cpp
@@ -121,7 +121,7 @@ class ACosTable
 #endif
         return a1[i];
     }
-    
+
     const int max()
     {
         return +(n + b - 1);
@@ -185,10 +185,10 @@ void gradMag(float* I, float* M, float* O, int h, int w, int d, bool full)
     _Gx = (__m128*)Gx;
     Gy = (float*)alMalloc(s, 16);
     _Gy = (__m128*)Gy;
-    
+
     __m128 upper = SET(static_cast<float>(ACosTable::getInstance().max()));
     __m128 lower = SET(static_cast<float>(ACosTable::getInstance().min()));
-    
+
     // compute gradient magnitude and orientation for each column
     for (x = 0; x < w; x++)
     {
diff --git a/src/lib/acf/ut/test-acf.cpp b/src/lib/acf/ut/test-acf.cpp
index a9c4b67..fe12853 100644
--- a/src/lib/acf/ut/test-acf.cpp
+++ b/src/lib/acf/ut/test-acf.cpp
@@ -244,7 +244,7 @@ class ACFTest : public ::testing::Test
     // State:
     // 1) Allocates acf::Detector
     // 2) Allocates ogles_gpgpu::ACF
-    
+
     void initGPUAndCreatePyramid(acf::Detector::Pyramid& Pgpu, ogles_gpgpu::ACF::FeatureKind kind)
     {
         m_detector = create(modelFilename);
@@ -253,7 +253,7 @@ class ACFTest : public ::testing::Test
         acf::Detector::Pyramid Pcpu;
         m_detector->setIsTranspose(true);
         m_detector->computePyramid(m_IpT, Pcpu);
-        const int shrink = m_detector->opts.pPyramid->pChns->shrink.get();        
+        const int shrink = m_detector->opts.pPyramid->pChns->shrink.get();
         auto sizes = getPyramidSizes(Pcpu);
         static const bool doGray = false;
         ogles_gpgpu::Size2d inputSize(image.cols, image.rows);
@@ -265,7 +265,7 @@ class ACFTest : public ::testing::Test
         cv::Mat input = image;
 
         // Fill in the pyramid:
-        (*m_acf)({{ input.cols, input.rows }, input.ptr(), true, 0, DFLT_TEXTURE_FORMAT});
+        (*m_acf)({ { input.cols, input.rows }, input.ptr(), true, 0, DFLT_TEXTURE_FORMAT });
         glFlush();
         m_acf->fill(Pgpu, Pcpu);
     }
@@ -585,7 +585,6 @@ TEST_F(ACFTest, ACFCaltechDetector)
 }
 #endif // defined(ACF_SERIALIZE_WITH_CVMATIO)
 
-
 // ### utility ###
 // http://stackoverflow.com/a/32647694