matrix_reductions.cu

/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/type_traits.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;

namespace detail
{
    __device__ __forceinline__ int cvAtomicAdd(int* address, int val)
    {
        return ::atomicAdd(address, val);
    }
    __device__ __forceinline__ unsigned int cvAtomicAdd(unsigned int* address, unsigned int val)
    {
        return ::atomicAdd(address, val);
    }
    __device__ __forceinline__ float cvAtomicAdd(float* address, float val)
    {
    #if __CUDA_ARCH__ >= 200
        return ::atomicAdd(address, val);
    #else
        int* address_as_i = (int*) address;
        int old = *address_as_i, assumed;
        do {
            assumed = old;
            old = ::atomicCAS(address_as_i, assumed,
                __float_as_int(val + __int_as_float(assumed)));
        } while (assumed != old);
        return __int_as_float(old);
    #endif
    }
    __device__ __forceinline__ double cvAtomicAdd(double* address, double val)
    {
    #if __CUDA_ARCH__ >= 130
        unsigned long long int* address_as_ull = (unsigned long long int*) address;
        unsigned long long int old = *address_as_ull, assumed;
        do {
            assumed = old;
            old = ::atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val + __longlong_as_double(assumed)));
        } while (assumed != old);
        return __longlong_as_double(old);
    #else
        (void) address;
        (void) val;
        return 0.0;
    #endif
    }

    __device__ __forceinline__ int cvAtomicMin(int* address, int val)
    {
        return ::atomicMin(address, val);
    }
    __device__ __forceinline__ float cvAtomicMin(float* address, float val)
    {
    #if __CUDA_ARCH__ >= 120
        int* address_as_i = (int*) address;
        int old = *address_as_i, assumed;
        do {
            assumed = old;
            old = ::atomicCAS(address_as_i, assumed,
                __float_as_int(::fminf(val, __int_as_float(assumed))));
        } while (assumed != old);
        return __int_as_float(old);
    #else
        (void) address;
        (void) val;
        return 0.0f;
    #endif
    }
    __device__ __forceinline__ double cvAtomicMin(double* address, double val)
    {
    #if __CUDA_ARCH__ >= 130
        unsigned long long int* address_as_ull = (unsigned long long int*) address;
        unsigned long long int old = *address_as_ull, assumed;
        do {
            assumed = old;
            old = ::atomicCAS(address_as_ull, assumed,
                __double_as_longlong(::fmin(val, __longlong_as_double(assumed))));
        } while (assumed != old);
        return __longlong_as_double(old);
    #else
        (void) address;
        (void) val;
        return 0.0;
    #endif
    }

    __device__ __forceinline__ int cvAtomicMax(int* address, int val)
    {
        return ::atomicMax(address, val);
    }
    __device__ __forceinline__ float cvAtomicMax(float* address, float val)
    {
    #if __CUDA_ARCH__ >= 120
        int* address_as_i = (int*) address;
        int old = *address_as_i, assumed;
        do {
            assumed = old;
            old = ::atomicCAS(address_as_i, assumed,
                __float_as_int(::fmaxf(val, __int_as_float(assumed))));
        } while (assumed != old);
        return __int_as_float(old);
    #else
        (void) address;
        (void) val;
        return 0.0f;
    #endif
    }
    __device__ __forceinline__ double cvAtomicMax(double* address, double val)
    {
    #if __CUDA_ARCH__ >= 130
        unsigned long long int* address_as_ull = (unsigned long long int*) address;
        unsigned long long int old = *address_as_ull, assumed;
        do {
            assumed = old;
            old = ::atomicCAS(address_as_ull, assumed,
                __double_as_longlong(::fmax(val, __longlong_as_double(assumed))));
        } while (assumed != old);
        return __longlong_as_double(old);
    #else
        (void) address;
        (void) val;
        return 0.0;
    #endif
    }
}

namespace detail
{
    template <int cn> struct Unroll;
    template <> struct Unroll<1>
    {
        template <int BLOCK_SIZE, typename R>
        static __device__ __forceinline__ volatile R* smem_tuple(R* smem)
        {
            return smem;
        }

        template <typename R>
        static __device__ __forceinline__ R& tie(R& val)
        {
            return val;
        }

        template <class Op>
        static __device__ __forceinline__ const Op& op(const Op& op)
        {
            return op;
        }
    };
    template <> struct Unroll<2>
    {
        template <int BLOCK_SIZE, typename R>
        static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*> smem_tuple(R* smem)
        {
            return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE);
        }

        template <typename R>
        static __device__ __forceinline__ thrust::tuple<typename VecTraits<R>::elem_type&, typename VecTraits<R>::elem_type&> tie(R& val)
        {
            return thrust::tie(val.x, val.y);
        }

        template <class Op>
        static __device__ __forceinline__ const thrust::tuple<Op, Op> op(const Op& op)
        {
            return thrust::make_tuple(op, op);
        }
    };
    template <> struct Unroll<3>
    {
        template <int BLOCK_SIZE, typename R>
        static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*, volatile R*> smem_tuple(R* smem)
        {
            return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE);
        }

        template <typename R>
        static __device__ __forceinline__ thrust::tuple<typename VecTraits<R>::elem_type&, typename VecTraits<R>::elem_type&, typename VecTraits<R>::elem_type&> tie(R& val)
        {
            return thrust::tie(val.x, val.y, val.z);
        }

        template <class Op>
        static __device__ __forceinline__ const thrust::tuple<Op, Op, Op> op(const Op& op)
        {
            return thrust::make_tuple(op, op, op);
        }
    };
    template <> struct Unroll<4>
    {
        template <int BLOCK_SIZE, typename R>
        static __device__ __forceinline__ thrust::tuple<volatile R*, volatile R*, volatile R*, volatile R*> smem_tuple(R* smem)
        {
            return cv::gpu::device::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE);
        }

        template <typename R>
        static __device__ __forceinline__ thrust::tuple<typename VecTraits<R>::elem_type&, typename VecTraits<R>::elem_type&, typename VecTraits<R>::elem_type&, typename VecTraits<R>::elem_type&> tie(R& val)
        {
            return thrust::tie(val.x, val.y, val.z, val.w);
        }

        template <class Op>
        static __device__ __forceinline__ const thrust::tuple<Op, Op, Op, Op> op(const Op& op)
        {
            return thrust::make_tuple(op, op, op, op);
        }
    };
}

/////////////////////////////////////////////////////////////
// sum

namespace sum
{
    __device__ unsigned int blocks_finished = 0;

    template <typename R, int cn> struct AtomicAdd;
    template <typename R> struct AtomicAdd<R, 1>
    {
        static __device__ void run(R* ptr, R val)
        {
            detail::cvAtomicAdd(ptr, val);
        }
    };
    template <typename R> struct AtomicAdd<R, 2>
    {
        typedef typename TypeVec<R, 2>::vec_type val_type;

        static __device__ void run(R* ptr, val_type val)
        {
            detail::cvAtomicAdd(ptr, val.x);
            detail::cvAtomicAdd(ptr + 1, val.y);
        }
    };
    template <typename R> struct AtomicAdd<R, 3>
    {
        typedef typename TypeVec<R, 3>::vec_type val_type;

        static __device__ void run(R* ptr, val_type val)
        {
            detail::cvAtomicAdd(ptr, val.x);
            detail::cvAtomicAdd(ptr + 1, val.y);
            detail::cvAtomicAdd(ptr + 2, val.z);
        }
    };
    template <typename R> struct AtomicAdd<R, 4>
    {
        typedef typename TypeVec<R, 4>::vec_type val_type;

        static __device__ void run(R* ptr, val_type val)
        {
            detail::cvAtomicAdd(ptr, val.x);
            detail::cvAtomicAdd(ptr + 1, val.y);
            detail::cvAtomicAdd(ptr + 2, val.z);
            detail::cvAtomicAdd(ptr + 3, val.w);
        }
    };

    template <int BLOCK_SIZE, typename R, int cn>
    struct GlobalReduce
    {
        typedef typename TypeVec<R, cn>::vec_type result_type;

        static __device__ void run(result_type& sum, result_type* result, int tid, int bid, R* smem)
        {
        #if __CUDA_ARCH__ >= 200
            if (tid == 0)
                AtomicAdd<R, cn>::run((R*) result, sum);
        #else
            __shared__ bool is_last;

            if (tid == 0)
            {
                result[bid] = sum;

                __threadfence();

                unsigned int ticket = ::atomicAdd(&blocks_finished, 1);
                is_last = (ticket == gridDim.x * gridDim.y - 1);
            }

            __syncthreads();

            if (is_last)
            {
                sum = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<result_type>::all(0);

                device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));

                if (tid == 0)
                {
                    result[0] = sum;
                    blocks_finished = 0;
                }
            }
        #endif
        }
    };

    template <int BLOCK_SIZE, typename src_type, typename result_type, class Mask, class Op>
    __global__ void kernel(const PtrStepSz<src_type> src, result_type* result, const Mask mask, const Op op, const int twidth, const int theight)
    {
        typedef typename VecTraits<src_type>::elem_type T;
        typedef typename VecTraits<result_type>::elem_type R;
        const int cn = VecTraits<src_type>::cn;

        __shared__ R smem[BLOCK_SIZE * cn];

        const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
        const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;

        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
        const int bid = blockIdx.y * gridDim.x + blockIdx.x;

        result_type sum = VecTraits<result_type>::all(0);

        for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
        {
            const src_type* ptr = src.ptr(y);

            for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
            {
                if (mask(y, x))
                {
                    const src_type srcVal = ptr[x];
                    sum = sum + op(saturate_cast<result_type>(srcVal));
                }
            }
        }

        device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(sum), tid, detail::Unroll<cn>::op(plus<R>()));

        GlobalReduce<BLOCK_SIZE, R, cn>::run(sum, result, tid, bid, smem);
    }

    const int threads_x = 32;
    const int threads_y = 8;

    void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
    {
        block = dim3(threads_x, threads_y);

        grid = dim3(divUp(cols, block.x * block.y),
                    divUp(rows, block.y * block.x));

        grid.x = ::min(grid.x, block.x);
        grid.y = ::min(grid.y, block.y);
    }

    void getBufSize(int cols, int rows, int cn, int& bufcols, int& bufrows)
    {
        dim3 block, grid;
        getLaunchCfg(cols, rows, block, grid);

        bufcols = grid.x * grid.y * sizeof(double) * cn;
        bufrows = 1;
    }

    template <typename T, typename R, int cn, template <typename> class Op>
    void caller(PtrStepSzb src_, void* buf_, double* out, PtrStepSzb mask)
    {
        typedef typename TypeVec<T, cn>::vec_type src_type;
        typedef typename TypeVec<R, cn>::vec_type result_type;

        PtrStepSz<src_type> src(src_);
        result_type* buf = (result_type*) buf_;

        dim3 block, grid;
        getLaunchCfg(src.cols, src.rows, block, grid);

        const int twidth = divUp(divUp(src.cols, grid.x), block.x);
        const int theight = divUp(divUp(src.rows, grid.y), block.y);

        Op<result_type> op;

        if (mask.data)
            kernel<threads_x * threads_y><<<grid, block>>>(src, buf, SingleMask(mask), op, twidth, theight);
        else
            kernel<threads_x * threads_y><<<grid, block>>>(src, buf, WithOutMask(), op, twidth, theight);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );

        R result[4] = {0, 0, 0, 0};
        cudaSafeCall( cudaMemcpy(&result, buf, sizeof(result_type), cudaMemcpyDeviceToHost) );

        out[0] = result[0];
        out[1] = result[1];
        out[2] = result[2];
        out[3] = result[3];
    }

    template <typename T> struct SumType;
    template <> struct SumType<uchar> { typedef unsigned int R; };
    template <> struct SumType<schar> { typedef int R; };
    template <> struct SumType<ushort> { typedef unsigned int R; };
    template <> struct SumType<short> { typedef int R; };
    template <> struct SumType<int> { typedef int R; };
    template <> struct SumType<float> { typedef float R; };
    template <> struct SumType<double> { typedef double R; };

    template <typename T, int cn>
    void run(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
    {
        typedef typename SumType<T>::R R;
        caller<T, R, cn, identity>(src, buf, out, mask);
    }

    template void run<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
#ifndef OPENCV_TINY_GPU_MODULE
    template void run<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

    template void run<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

    template void run<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

    template void run<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

    template void run<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
#endif

    template void run<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
#ifndef OPENCV_TINY_GPU_MODULE
    template void run<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

    template void run<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void run<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
#endif

    template <typename T, int cn>
    void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
    {
        typedef typename SumType<T>::R R;
        caller<T, R, cn, abs_func>(src, buf, out, mask);
    }

    template void runAbs<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
#ifndef OPENCV_TINY_GPU_MODULE
    template void runAbs<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

    template void runAbs<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

    template void runAbs<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

    template void runAbs<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

    template void runAbs<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
#endif

    template void runAbs<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
#ifndef OPENCV_TINY_GPU_MODULE
    template void runAbs<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

    template void runAbs<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runAbs<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
#endif

    template <typename T> struct Sqr : unary_function<T, T>
    {
        __device__ __forceinline__ T operator ()(T x) const
        {
            return x * x;
        }
    };

    template <typename T, int cn>
    void runSqr(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
    {
        caller<T, double, cn, Sqr>(src, buf, out, mask);
    }

    template void runSqr<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
#ifndef OPENCV_TINY_GPU_MODULE
    template void runSqr<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

    template void runSqr<schar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<schar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<schar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<schar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

    template void runSqr<ushort, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<ushort, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<ushort, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<ushort, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

    template void runSqr<short, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<short, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<short, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<short, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

    template void runSqr<int, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
#endif

    template void runSqr<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
#ifndef OPENCV_TINY_GPU_MODULE
    template void runSqr<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);

    template void runSqr<double, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
    template void runSqr<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
#endif
}

/////////////////////////////////////////////////////////////
// minMax

namespace minMax
{
    __device__ unsigned int blocks_finished = 0;

    // To avoid shared bank conflicts we convert each value into value of
    // appropriate type (32 bits minimum)
    template <typename T> struct MinMaxTypeTraits;
    template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<schar> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
    template <> struct MinMaxTypeTraits<double> { typedef double best_type; };

    template <int BLOCK_SIZE, typename R>
    struct GlobalReduce
    {
        static __device__ void run(R& mymin, R& mymax, R* minval, R* maxval, int tid, int bid, R* sminval, R* smaxval)
        {
        #if __CUDA_ARCH__ >= 200
            if (tid == 0)
            {
                detail::cvAtomicMin(minval, mymin);
                detail::cvAtomicMax(maxval, mymax);
            }
        #else
            __shared__ bool is_last;

            if (tid == 0)
            {
                minval[bid] = mymin;
                maxval[bid] = mymax;

                __threadfence();

                unsigned int ticket = ::atomicAdd(&blocks_finished, 1);
                is_last = (ticket == gridDim.x * gridDim.y - 1);
            }

            __syncthreads();

            if (is_last)
            {
                int idx = ::min(tid, gridDim.x * gridDim.y - 1);

                mymin = minval[idx];
                mymax = maxval[idx];

                const minimum<R> minOp;
                const maximum<R> maxOp;
                device::reduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp));

                if (tid == 0)
                {
                    minval[0] = mymin;
                    maxval[0] = mymax;

                    blocks_finished = 0;
                }
            }
        #endif
        }
    };

    template <int BLOCK_SIZE, typename T, typename R, class Mask>
    __global__ void kernel(const PtrStepSz<T> src, const Mask mask, R* minval, R* maxval, const int twidth, const int theight)
    {
        __shared__ R sminval[BLOCK_SIZE];
        __shared__ R smaxval[BLOCK_SIZE];

        const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
        const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;

        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
        const int bid = blockIdx.y * gridDim.x + blockIdx.x;

        R mymin = numeric_limits<R>::max();
        R mymax = -numeric_limits<R>::max();

        const minimum<R> minOp;
        const maximum<R> maxOp;

        for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
        {
            const T* ptr = src.ptr(y);

            for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
            {
                if (mask(y, x))
                {
                    const R srcVal = ptr[x];

                    mymin = minOp(mymin, srcVal);
                    mymax = maxOp(mymax, srcVal);
                }
            }
        }

        device::reduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax), tid, thrust::make_tuple(minOp, maxOp));

        GlobalReduce<BLOCK_SIZE, R>::run(mymin, mymax, minval, maxval, tid, bid, sminval, smaxval);
    }

    const int threads_x = 32;
    const int threads_y = 8;

    void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
    {
        block = dim3(threads_x, threads_y);

        grid = dim3(divUp(cols, block.x * block.y),
                    divUp(rows, block.y * block.x));

        grid.x = ::min(grid.x, block.x);
        grid.y = ::min(grid.y, block.y);
    }

    void getBufSize(int cols, int rows, int& bufcols, int& bufrows)
    {
        dim3 block, grid;
        getLaunchCfg(cols, rows, block, grid);

        bufcols = grid.x * grid.y * sizeof(double);
        bufrows = 2;
    }

    __global__ void setDefaultKernel(int* minval_buf, int* maxval_buf)
    {
        *minval_buf = numeric_limits<int>::max();
        *maxval_buf = numeric_limits<int>::min();
    }
    __global__ void setDefaultKernel(float* minval_buf, float* maxval_buf)
    {
        *minval_buf = numeric_limits<float>::max();
        *maxval_buf = -numeric_limits<float>::max();
    }
    __global__ void setDefaultKernel(double* minval_buf, double* maxval_buf)
    {
        *minval_buf = numeric_limits<double>::max();
        *maxval_buf = -numeric_limits<double>::max();
    }

    template <typename R>
    void setDefault(R* minval_buf, R* maxval_buf)
    {
        setDefaultKernel<<<1, 1>>>(minval_buf, maxval_buf);
    }

    template <typename T>
    void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf)
    {
        typedef typename MinMaxTypeTraits<T>::best_type R;

        dim3 block, grid;
        getLaunchCfg(src.cols, src.rows, block, grid);

        const int twidth = divUp(divUp(src.cols, grid.x), block.x);
        const int theight = divUp(divUp(src.rows, grid.y), block.y);

        R* minval_buf = (R*) buf.ptr(0);
        R* maxval_buf = (R*) buf.ptr(1);

        setDefault(minval_buf, maxval_buf);

        if (mask.data)
            kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, SingleMask(mask), minval_buf, maxval_buf, twidth, theight);
        else
            kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, WithOutMask(), minval_buf, maxval_buf, twidth, theight);

        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );

        R minval_, maxval_;
        cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(R), cudaMemcpyDeviceToHost) );
        cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(R), cudaMemcpyDeviceToHost) );
        *minval = minval_;
        *maxval = maxval_;
    }

    template void run<uchar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
#ifndef OPENCV_TINY_GPU_MODULE
    template void run<schar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
    template void run<ushort>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
    template void run<short >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
    template void run<int   >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
#endif
    template void run<float >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
#ifndef OPENCV_TINY_GPU_MODULE
    template void run<double>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
#endif
}

/////////////////////////////////////////////////////////////
// minMaxLoc

namespace minMaxLoc
{
    // To avoid shared bank conflicts we convert each value into value of
    // appropriate type (32 bits minimum)
    template <typename T> struct MinMaxTypeTraits;
    template <> struct MinMaxTypeTraits<unsigned char> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<signed char> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<unsigned short> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
    template <> struct MinMaxTypeTraits<double> { typedef double best_type; };

    template <int BLOCK_SIZE, typename T, class Mask>
    __global__ void kernel_pass_1(const PtrStepSz<T> src, const Mask mask, T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, const int twidth, const int theight)
    {
        typedef typename MinMaxTypeTraits<T>::best_type work_type;

        __shared__ work_type sminval[BLOCK_SIZE];
        __shared__ work_type smaxval[BLOCK_SIZE];
        __shared__ unsigned int sminloc[BLOCK_SIZE];
        __shared__ unsigned int smaxloc[BLOCK_SIZE];

        const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
        const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;

        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
        const int bid = blockIdx.y * gridDim.x + blockIdx.x;

        work_type mymin = numeric_limits<work_type>::max();
        work_type mymax = -numeric_limits<work_type>::max();
        unsigned int myminloc = 0;
        unsigned int mymaxloc = 0;

        for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
        {
            const T* ptr = src.ptr(y);

            for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
            {
                if (mask(y, x))
                {
                    const work_type srcVal = ptr[x];

                    if (srcVal < mymin)
                    {
                        mymin = srcVal;
                        myminloc = y * src.cols + x;
                    }

                    if (srcVal > mymax)
                    {
                        mymax = srcVal;
                        mymaxloc = y * src.cols + x;
                    }
                }
            }
        }

        reduceKeyVal<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax),
                                 smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc),
                                 tid,
                                 thrust::make_tuple(less<work_type>(), greater<work_type>()));

        if (tid == 0)
        {
            minval[bid] = (T) mymin;
            maxval[bid] = (T) mymax;
            minloc[bid] = myminloc;
            maxloc[bid] = mymaxloc;
        }
    }
    template <int BLOCK_SIZE, typename T>
    __global__ void kernel_pass_2(T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, int count)
    {
        typedef typename MinMaxTypeTraits<T>::best_type work_type;

        __shared__ work_type sminval[BLOCK_SIZE];
        __shared__ work_type smaxval[BLOCK_SIZE];
        __shared__ unsigned int sminloc[BLOCK_SIZE];
        __shared__ unsigned int smaxloc[BLOCK_SIZE];

        unsigned int idx = ::min(threadIdx.x, count - 1);

        work_type mymin = minval[idx];
        work_type mymax = maxval[idx];
        unsigned int myminloc = minloc[idx];
        unsigned int mymaxloc = maxloc[idx];

        reduceKeyVal<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax),
                                 smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc),
                                 threadIdx.x,
                                 thrust::make_tuple(less<work_type>(), greater<work_type>()));

        if (threadIdx.x == 0)
        {
            minval[0] = (T) mymin;
            maxval[0] = (T) mymax;
            minloc[0] = myminloc;
            maxloc[0] = mymaxloc;
        }
    }

    const int threads_x = 32;
    const int threads_y = 8;

    void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
    {
        block = dim3(threads_x, threads_y);

        grid = dim3(divUp(cols, block.x * block.y),
                    divUp(rows, block.y * block.x));

        grid.x = ::min(grid.x, block.x);
        grid.y = ::min(grid.y, block.y);
    }

    void getBufSize(int cols, int rows, size_t elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows)
    {
        dim3 block, grid;
        getLaunchCfg(cols, rows, block, grid);

        // For values
        b1cols = (int)(grid.x * grid.y * elem_size);
        b1rows = 2;

        // For locations
        b2cols = grid.x * grid.y * sizeof(int);
        b2rows = 2;
    }

    template <typename T>
    void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf)
    {
        dim3 block, grid;
        getLaunchCfg(src.cols, src.rows, block, grid);

        const int twidth = divUp(divUp(src.cols, grid.x), block.x);
        const int theight = divUp(divUp(src.rows, grid.y), block.y);

        T* minval_buf = (T*) valbuf.ptr(0);
        T* maxval_buf = (T*) valbuf.ptr(1);
        unsigned int* minloc_buf = locbuf.ptr(0);
        unsigned int* maxloc_buf = locbuf.ptr(1);

        if (mask.data)
            kernel_pass_1<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, SingleMask(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight);
        else
            kernel_pass_1<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, WithOutMask(), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight);

        cudaSafeCall( cudaGetLastError() );

        kernel_pass_2<threads_x * threads_y><<<1, threads_x * threads_y>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );

        T minval_, maxval_;
        cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
        cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
        *minval = minval_;
        *maxval = maxval_;

        unsigned int minloc_, maxloc_;
        cudaSafeCall( cudaMemcpy(&minloc_, minloc_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
        cudaSafeCall( cudaMemcpy(&maxloc_, maxloc_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
        minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
        maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
    }

    template void run<unsigned char >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
#ifndef OPENCV_TINY_GPU_MODULE
    template void run<signed char >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
    template void run<unsigned short>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
    template void run<short >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
#endif
    template void run<int   >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
    template void run<float >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
#ifndef OPENCV_TINY_GPU_MODULE
    template void run<double>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
#endif
}

/////////////////////////////////////////////////////////////
// countNonZero

namespace countNonZero
{
    __device__ unsigned int blocks_finished = 0;

    template <int BLOCK_SIZE, typename T>
    __global__ void kernel(const PtrStepSz<T> src, unsigned int* count, const int twidth, const int theight)
    {
        __shared__ unsigned int scount[BLOCK_SIZE];

        const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
        const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;

        const int tid = threadIdx.y * blockDim.x + threadIdx.x;

        unsigned int mycount = 0;

        for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
        {
            const T* ptr = src.ptr(y);

            for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
            {
                const T srcVal = ptr[x];

                mycount += (srcVal != 0);
            }
        }

        device::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());

    #if __CUDA_ARCH__ >= 200
        if (tid == 0)
            ::atomicAdd(count, mycount);
    #else
        __shared__ bool is_last;
        const int bid = blockIdx.y * gridDim.x + blockIdx.x;

        if (tid == 0)
        {
            count[bid] = mycount;

            __threadfence();

            unsigned int ticket = ::atomicInc(&blocks_finished, gridDim.x * gridDim.y);
            is_last = (ticket == gridDim.x * gridDim.y - 1);
        }

        __syncthreads();

        if (is_last)
        {
            mycount = tid < gridDim.x * gridDim.y ? count[tid] : 0;

            device::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());

            if (tid == 0)
            {
                count[0] = mycount;

                blocks_finished = 0;
            }
        }
    #endif
    }

    const int threads_x = 32;
    const int threads_y = 8;

    void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
    {
        block = dim3(threads_x, threads_y);

        grid = dim3(divUp(cols, block.x * block.y),
                    divUp(rows, block.y * block.x));

        grid.x = ::min(grid.x, block.x);
        grid.y = ::min(grid.y, block.y);
    }

    void getBufSize(int cols, int rows, int& bufcols, int& bufrows)
    {
        dim3 block, grid;
        getLaunchCfg(cols, rows, block, grid);

        bufcols = grid.x * grid.y * sizeof(int);
        bufrows = 1;
    }

    template <typename T>
    int run(const PtrStepSzb src, PtrStep<unsigned int> buf)
    {
        dim3 block, grid;
        getLaunchCfg(src.cols, src.rows, block, grid);

        const int twidth = divUp(divUp(src.cols, grid.x), block.x);
        const int theight = divUp(divUp(src.rows, grid.y), block.y);

        unsigned int* count_buf = buf.ptr(0);

        cudaSafeCall( cudaMemset(count_buf, 0, sizeof(unsigned int)) );

        kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, count_buf, twidth, theight);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );

        unsigned int count;
        cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost));

        return count;
    }

    template int run<uchar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
#ifndef OPENCV_TINY_GPU_MODULE
    template int run<schar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
    template int run<ushort>(const PtrStepSzb src, PtrStep<unsigned int> buf);
    template int run<short >(const PtrStepSzb src, PtrStep<unsigned int> buf);
    template int run<int   >(const PtrStepSzb src, PtrStep<unsigned int> buf);
#endif
    template int run<float >(const PtrStepSzb src, PtrStep<unsigned int> buf);
#ifndef OPENCV_TINY_GPU_MODULE
    template int run<double>(const PtrStepSzb src, PtrStep<unsigned int> buf);
#endif
}

//////////////////////////////////////////////////////////////////////////////
// reduce

namespace reduce
{
    struct Sum
    {
        template <typename T>
        __device__ __forceinline__ T startValue() const
        {
            return VecTraits<T>::all(0);
        }

        template <typename T>
        __device__ __forceinline__ T operator ()(T a, T b) const
        {
            return a + b;
        }

        template <typename T>
        __device__ __forceinline__ T result(T r, double) const
        {
            return r;
        }

        __device__ __forceinline__ Sum() {}
        __device__ __forceinline__ Sum(const Sum&) {}
    };

    struct Avg
    {
        template <typename T>
        __device__ __forceinline__ T startValue() const
        {
            return VecTraits<T>::all(0);
        }

        template <typename T>
        __device__ __forceinline__ T operator ()(T a, T b) const
        {
            return a + b;
        }

        template <typename T>
        __device__ __forceinline__ typename TypeVec<double, VecTraits<T>::cn>::vec_type result(T r, double sz) const
        {
            return r / sz;
        }

        __device__ __forceinline__ Avg() {}
        __device__ __forceinline__ Avg(const Avg&) {}
    };

    struct Min
    {
        template <typename T>
        __device__ __forceinline__ T startValue() const
        {
            return VecTraits<T>::all(numeric_limits<typename VecTraits<T>::elem_type>::max());
        }

        template <typename T>
        __device__ __forceinline__ T operator ()(T a, T b) const
        {
            minimum<T> minOp;
            return minOp(a, b);
        }

        template <typename T>
        __device__ __forceinline__ T result(T r, double) const
        {
            return r;
        }

        __device__ __forceinline__ Min() {}
        __device__ __forceinline__ Min(const Min&) {}
    };

    struct Max
    {
        template <typename T>
        __device__ __forceinline__ T startValue() const
        {
            return VecTraits<T>::all(-numeric_limits<typename VecTraits<T>::elem_type>::max());
        }

        template <typename T>
        __device__ __forceinline__ T operator ()(T a, T b) const
        {
            maximum<T> maxOp;
            return maxOp(a, b);
        }

        template <typename T>
        __device__ __forceinline__ T result(T r, double) const
        {
            return r;
        }

        __device__ __forceinline__ Max() {}
        __device__ __forceinline__ Max(const Max&) {}
    };

    ///////////////////////////////////////////////////////////

    template <typename T, typename S, typename D, class Op>
    __global__ void rowsKernel(const PtrStepSz<T> src, D* dst, const Op op)
    {
        __shared__ S smem[16 * 16];

        const int x = blockIdx.x * 16 + threadIdx.x;

        S myVal = op.template startValue<S>();

        if (x < src.cols)
        {
            for (int y = threadIdx.y; y < src.rows; y += 16)
            {
                S srcVal = src(y, x);
                myVal = op(myVal, srcVal);
            }
        }

        smem[threadIdx.x * 16 + threadIdx.y] = myVal;

        __syncthreads();

        volatile S* srow = smem + threadIdx.y * 16;

        myVal = srow[threadIdx.x];
        device::reduce<16>(srow, myVal, threadIdx.x, op);

        if (threadIdx.x == 0)
            srow[0] = myVal;

        __syncthreads();

        if (threadIdx.y == 0 && x < src.cols)
            dst[x] = (D) op.result(smem[threadIdx.x * 16], src.rows);
    }

    template <typename T, typename S, typename D, class Op>
    void rowsCaller(PtrStepSz<T> src, D* dst, cudaStream_t stream)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(src.cols, block.x));

        Op op;
        rowsKernel<T, S, D, Op><<<grid, block, 0, stream>>>(src, dst, op);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <typename T, typename S, typename D>
    void rows(PtrStepSzb src, void* dst, int op, cudaStream_t stream)
    {
        typedef void (*func_t)(PtrStepSz<T> src, D* dst, cudaStream_t stream);
        static const func_t funcs[] =
        {
            rowsCaller<T, S, D, Sum>,
            rowsCaller<T, S, D, Avg>,
            rowsCaller<T, S, D, Max>,
            rowsCaller<T, S, D, Min>
        };

        funcs[op]((PtrStepSz<T>) src, (D*) dst, stream);
    }

#ifdef OPENCV_TINY_GPU_MODULE
    template void rows<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned char, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<float, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
#else
    template void rows<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned char, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned char, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned char, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

    template void rows<unsigned short, int, unsigned short>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned short, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned short, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<unsigned short, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

    template void rows<short, int, short>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<short, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<short, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<short, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

    template void rows<int, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<int, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<int, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

    template void rows<float, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
    template void rows<float, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);

    template void rows<double, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
#endif

    ///////////////////////////////////////////////////////////

    template <int BLOCK_SIZE, typename T, typename S, typename D, int cn, class Op>
    __global__ void colsKernel(const PtrStepSz<typename TypeVec<T, cn>::vec_type> src, typename TypeVec<D, cn>::vec_type* dst, const Op op)
    {
        typedef typename TypeVec<T, cn>::vec_type src_type;
        typedef typename TypeVec<S, cn>::vec_type work_type;
        typedef typename TypeVec<D, cn>::vec_type dst_type;

        __shared__ S smem[BLOCK_SIZE * cn];

        const int y = blockIdx.x;

        const src_type* srcRow = src.ptr(y);

        work_type myVal = op.template startValue<work_type>();

        for (int x = threadIdx.x; x < src.cols; x += BLOCK_SIZE)
            myVal = op(myVal, saturate_cast<work_type>(srcRow[x]));

        device::reduce<BLOCK_SIZE>(detail::Unroll<cn>::template smem_tuple<BLOCK_SIZE>(smem), detail::Unroll<cn>::tie(myVal), threadIdx.x, detail::Unroll<cn>::op(op));

        if (threadIdx.x == 0)
            dst[y] = saturate_cast<dst_type>(op.result(myVal, src.cols));
    }

    template <typename T, typename S, typename D, int cn, class Op> void colsCaller(PtrStepSzb src, void* dst, cudaStream_t stream)
    {
        const int BLOCK_SIZE = 256;

        const dim3 block(BLOCK_SIZE);
        const dim3 grid(src.rows);

        Op op;
        colsKernel<BLOCK_SIZE, T, S, D, cn, Op><<<grid, block, 0, stream>>>((PtrStepSz<typename TypeVec<T, cn>::vec_type>) src, (typename TypeVec<D, cn>::vec_type*) dst, op);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );

    }

    template <typename T, typename S, typename D> void cols(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream)
    {
        typedef void (*func_t)(PtrStepSzb src, void* dst, cudaStream_t stream);
        static const func_t funcs[5][4] =
        {
            {0,0,0,0},
            {colsCaller<T, S, D, 1, Sum>, colsCaller<T, S, D, 1, Avg>, colsCaller<T, S, D, 1, Max>, colsCaller<T, S, D, 1, Min>},
            {colsCaller<T, S, D, 2, Sum>, colsCaller<T, S, D, 2, Avg>, colsCaller<T, S, D, 2, Max>, colsCaller<T, S, D, 2, Min>},
            {colsCaller<T, S, D, 3, Sum>, colsCaller<T, S, D, 3, Avg>, colsCaller<T, S, D, 3, Max>, colsCaller<T, S, D, 3, Min>},
            {colsCaller<T, S, D, 4, Sum>, colsCaller<T, S, D, 4, Avg>, colsCaller<T, S, D, 4, Max>, colsCaller<T, S, D, 4, Min>},
        };

        funcs[cn][op](src, dst, stream);
    }

#ifdef OPENCV_TINY_GPU_MODULE
    template void cols<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned char, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<float, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
#else
    template void cols<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned char, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned char, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned char, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);

    template void cols<unsigned short, int, unsigned short>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned short, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned short, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<unsigned short, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);

    template void cols<short, int, short>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<short, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<short, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<short, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);

    template void cols<int, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<int, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<int, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);

    template void cols<float, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
    template void cols<float, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);

    template void cols<double, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
#endif
}

#endif /* CUDA_DISABLER */