/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "internal_shared.hpp"
#include "opencv2/gpu/device/vec_math.hpp"

namespace cv { namespace gpu { namespace device
{
    namespace match_template
    {
        //////////////////////////////////////////////////////////////////////
        // Small per-pixel helpers
        //
        // sum   - adds all channels of a float vector into one float.
        // first - returns the .x channel of a float vector.
        // mul   - channel-wise product; uchar inputs produce float outputs.
        // sub   - channel-wise difference; uchar inputs produce float outputs.

        __device__ __forceinline__ float sum(float v) { return v; }
        __device__ __forceinline__ float sum(float2 v) { return v.x + v.y; }
        __device__ __forceinline__ float sum(float3 v) { return v.x + v.y + v.z; }
        __device__ __forceinline__ float sum(float4 v) { return v.x + v.y + v.z + v.w; }

        __device__ __forceinline__ float first(float v) { return v; }
        __device__ __forceinline__ float first(float2 v) { return v.x; }
        __device__ __forceinline__ float first(float3 v) { return v.x; }
        __device__ __forceinline__ float first(float4 v) { return v.x; }

        __device__ __forceinline__ float mul(float a, float b) { return a * b; }
        __device__ __forceinline__ float2 mul(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); }
        __device__ __forceinline__ float3 mul(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
        __device__ __forceinline__ float4 mul(float4 a, float4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }

        // uchar operands are promoted to int by the language before the
        // multiplication, so the product is formed without 8-bit wrap-around.
        __device__ __forceinline__ float mul(uchar a, uchar b) { return a * b; }
        __device__ __forceinline__ float2 mul(uchar2 a, uchar2 b) { return make_float2(a.x * b.x, a.y * b.y); }
        __device__ __forceinline__ float3 mul(uchar3 a, uchar3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
        __device__ __forceinline__ float4 mul(uchar4 a, uchar4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }

        __device__ __forceinline__ float sub(float a, float b) { return a - b; }
        __device__ __forceinline__ float2 sub(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); }
        __device__ __forceinline__ float3 sub(float3 a, float3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
        __device__ __forceinline__ float4 sub(float4 a, float4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }

        // Same integer promotion as mul(): the difference may be negative.
        __device__ __forceinline__ float sub(uchar a, uchar b) { return a - b; }
        __device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a.x - b.x, a.y - b.y); }
        __device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
        __device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }

        //////////////////////////////////////////////////////////////////////
        // Shared helpers

        // Four-corner difference over a 2D table:
        //     (T[y+h][(x+w)*cn] - T[y][(x+w)*cn]) - (T[y+h][x*cn] - T[y][x*cn])
        // evaluated in the table's element type T and converted to float once
        // at the end. With the default cn == 1 the column indices are x + w
        // and x.
        // NOTE(review): the callers name their tables image_sum / image_sqsum,
        // so this is presumably a box sum over an integral image - confirm
        // against the host code that builds those tables.
        template <typename T>
        __device__ __forceinline__ float boxSum(const PtrStep<T>& table, int x, int y, int w, int h, int cn = 1)
        {
            return (float)(
                    (table.ptr(y + h)[(x + w) * cn] - table.ptr(y)[(x + w) * cn]) -
                    (table.ptr(y + h)[x * cn] - table.ptr(y)[x * cn]));
        }

        // True when cn is a channel count this file provides kernels for.
        // Every dispatch table / switch below covers exactly 1..4.
        static inline bool isSupportedChannelCount(int cn)
        {
            return cn >= 1 && cn <= 4;
        }

        //////////////////////////////////////////////////////////////////////
        // Naive_CCORR

        // One thread per result element (2D launch). For result(y, x) the
        // thread accumulates mul(image(y + i, x + j), templ(i, j)) over the
        // w x h template window, channel-wise in float, and stores the sum of
        // all channels.
        template <typename T, int cn>
        __global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
        {
            typedef typename TypeVec<T, cn>::vec_type Type;
            typedef typename TypeVec<float, cn>::vec_type Typef;

            int x = blockDim.x * blockIdx.x + threadIdx.x;
            int y = blockDim.y * blockIdx.y + threadIdx.y;

            // The grid is rounded up, so threads past the result edge do nothing.
            if (x < result.cols && y < result.rows)
            {
                Typef res = VecTraits<Typef>::all(0);

                for (int i = 0; i < h; ++i)
                {
                    const Type* image_ptr = (const Type*)image.ptr(y + i);
                    const Type* templ_ptr = (const Type*)templ.ptr(i);
                    for (int j = 0; j < w; ++j)
                        res = res + mul(image_ptr[x + j], templ_ptr[j]);
                }

                result.ptr(y)[x] = sum(res);
            }
        }

        // Launches the naive CCORR kernel with 32x8 blocks covering `result`.
        // Template size is taken from templ.cols / templ.rows. On the default
        // stream (stream == 0) the call also waits for the device to finish.
        template <typename T, int cn>
        void matchTemplateNaive_CCORR(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplateNaiveKernel_CCORR<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        // Channel-count dispatcher for float images. cn must be 1..4; any
        // other value is reported as cudaErrorInvalidValue via cudaSafeCall
        // instead of calling the null entry / indexing past the table.
        void matchTemplateNaive_CCORR_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplateNaive_CCORR<float, 1>, matchTemplateNaive_CCORR<float, 2>, matchTemplateNaive_CCORR<float, 3>, matchTemplateNaive_CCORR<float, 4>
            };

            if (!isSupportedChannelCount(cn))
            {
                cudaSafeCall( cudaErrorInvalidValue );
                return; // in case the error hook returns instead of throwing
            }

            callers[cn](image, templ, result, stream);
        }

        // Channel-count dispatcher for uchar images. Same cn contract as the
        // 32F variant.
        void matchTemplateNaive_CCORR_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplateNaive_CCORR<uchar, 1>, matchTemplateNaive_CCORR<uchar, 2>, matchTemplateNaive_CCORR<uchar, 3>, matchTemplateNaive_CCORR<uchar, 4>
            };

            if (!isSupportedChannelCount(cn))
            {
                cudaSafeCall( cudaErrorInvalidValue );
                return;
            }

            callers[cn](image, templ, result, stream);
        }

        //////////////////////////////////////////////////////////////////////
        // Naive_SQDIFF

        // One thread per result element (2D launch). For result(y, x) the
        // thread accumulates the channel-wise squared difference between
        // image(y + i, x + j) and templ(i, j) over the w x h window and stores
        // the sum of all channels.
        template <typename T, int cn>
        __global__ void matchTemplateNaiveKernel_SQDIFF(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
        {
            typedef typename TypeVec<T, cn>::vec_type Type;
            typedef typename TypeVec<float, cn>::vec_type Typef;

            int x = blockDim.x * blockIdx.x + threadIdx.x;
            int y = blockDim.y * blockIdx.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                Typef res = VecTraits<Typef>::all(0);
                Typef delta;

                for (int i = 0; i < h; ++i)
                {
                    const Type* image_ptr = (const Type*)image.ptr(y + i);
                    const Type* templ_ptr = (const Type*)templ.ptr(i);
                    for (int j = 0; j < w; ++j)
                    {
                        delta = sub(image_ptr[x + j], templ_ptr[j]);
                        res = res + delta * delta;
                    }
                }

                result.ptr(y)[x] = sum(res);
            }
        }

        // Launches the naive SQDIFF kernel with 32x8 blocks covering `result`.
        // On the default stream the call also waits for the device to finish.
        template <typename T, int cn>
        void matchTemplateNaive_SQDIFF(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplateNaiveKernel_SQDIFF<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        // Channel-count dispatcher for float images; cn must be 1..4.
        void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplateNaive_SQDIFF<float, 1>, matchTemplateNaive_SQDIFF<float, 2>, matchTemplateNaive_SQDIFF<float, 3>, matchTemplateNaive_SQDIFF<float, 4>
            };

            if (!isSupportedChannelCount(cn))
            {
                cudaSafeCall( cudaErrorInvalidValue );
                return;
            }

            callers[cn](image, templ, result, stream);
        }

        // Channel-count dispatcher for uchar images; cn must be 1..4.
        void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplateNaive_SQDIFF<uchar, 1>, matchTemplateNaive_SQDIFF<uchar, 2>, matchTemplateNaive_SQDIFF<uchar, 3>, matchTemplateNaive_SQDIFF<uchar, 4>
            };

            if (!isSupportedChannelCount(cn))
            {
                cudaSafeCall( cudaErrorInvalidValue );
                return;
            }

            callers[cn](image, templ, result, stream);
        }

        //////////////////////////////////////////////////////////////////////
        // Prepared_SQDIFF

        // In-place update of `result`: each element is read (the code calls it
        // ccorr - presumably a cross-correlation written by an earlier pass)
        // and replaced by  boxSum(image_sqsum) - 2 * ccorr + templ_sqsum.
        // Columns of image_sqsum are addressed with a stride of cn.
        template <int cn>
        __global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sqsum_ = boxSum(image_sqsum, x, y, w, h, cn);
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = image_sqsum_ - 2.f * ccorr + templ_sqsum;
            }
        }

        // Launches the prepared SQDIFF kernel with 32x8 blocks covering
        // `result`; synchronizes when running on the default stream.
        template <int cn>
        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_SQDIFF_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        // Channel-count dispatcher; cn must be 1..4, otherwise
        // cudaErrorInvalidValue is reported through cudaSafeCall.
        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, int cn,
                                             cudaStream_t stream)
        {
            typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplatePrepared_SQDIFF_8U<1>, matchTemplatePrepared_SQDIFF_8U<2>, matchTemplatePrepared_SQDIFF_8U<3>, matchTemplatePrepared_SQDIFF_8U<4>
            };

            if (!isSupportedChannelCount(cn))
            {
                cudaSafeCall( cudaErrorInvalidValue );
                return;
            }

            callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
        }

        //////////////////////////////////////////////////////////////////////
        // Prepared_SQDIFF_NORMED

        // normAcc* are accurate normalization routines which make GPU matchTemplate
        // consistent with CPU one

        // Returns num / denum when |num| < denum. When |num| is at or above
        // denum but below 1.125 * denum the result is clamped to +1 / -1 by
        // the sign of num. Anything else yields 0.
        __device__ float normAcc(float num, float denum)
        {
            if (::fabs(num) < denum)
                return num / denum;
            if (::fabs(num) < denum * 1.125f)
                return num > 0 ? 1 : -1;
            return 0;
        }

        // Same as normAcc, except that the fall-through value is 1 instead of 0.
        __device__ float normAcc_SQDIFF(float num, float denum)
        {
            if (::fabs(num) < denum)
                return num / denum;
            if (::fabs(num) < denum * 1.125f)
                return num > 0 ? 1 : -1;
            return 1;
        }

        // In-place update of `result`: the stored value (ccorr) is replaced by
        //   normAcc_SQDIFF(boxSum(image_sqsum) - 2 * ccorr + templ_sqsum,
        //                  sqrt(boxSum(image_sqsum) * templ_sqsum)).
        template <int cn>
        __global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
                int w, int h, const PtrStep<unsigned long long> image_sqsum,
                unsigned long long templ_sqsum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sqsum_ = boxSum(image_sqsum, x, y, w, h, cn);
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = normAcc_SQDIFF(image_sqsum_ - 2.f * ccorr + templ_sqsum,
                                                  sqrtf(image_sqsum_ * templ_sqsum));
            }
        }

        // Launches the normalized prepared SQDIFF kernel with 32x8 blocks
        // covering `result`; synchronizes when running on the default stream.
        template <int cn>
        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
                                                    PtrStepSzf result, cudaStream_t stream)
        {
            const dim3 threads(32, 8);
            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_SQDIFF_NORMED_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        // Channel-count dispatcher; cn must be 1..4, otherwise
        // cudaErrorInvalidValue is reported through cudaSafeCall.
        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
                                                    PtrStepSzf result, int cn, cudaStream_t stream)
        {
            typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);

            static const caller_t callers[] =
            {
                0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4>
            };

            if (!isSupportedChannelCount(cn))
            {
                cudaSafeCall( cudaErrorInvalidValue );
                return;
            }

            callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
        }

        //////////////////////////////////////////////////////////////////////
        // Prepared_CCOFF
        //
        // Each kernel below updates `result` in place: the stored value
        // (ccorr) has boxSum(image_sum_<c>) * templ_sum_scale_<c> subtracted
        // for every channel c. The host wrappers pass
        // templ_sum_scale_<c> = (float)templ_sum_<c> / (w * h).

        __global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<unsigned int> image_sum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_ = boxSum(image_sum, x, y, w, h);
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = ccorr - image_sum_ * templ_sum_scale;
            }
        }

        // Single-channel CCOFF launcher (32x8 blocks over `result`);
        // synchronizes when running on the default stream.
        void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<unsigned int> image_sum, unsigned int templ_sum, PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads, 0, stream>>>(w, h, (float)templ_sum / (w * h), image_sum, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        __global__ void matchTemplatePreparedKernel_CCOFF_8UC2(
                int w, int h, float templ_sum_scale_r, float templ_sum_scale_g,
                const PtrStep<unsigned int> image_sum_r,
                const PtrStep<unsigned int> image_sum_g,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = boxSum(image_sum_r, x, y, w, h);
                float image_sum_g_ = boxSum(image_sum_g, x, y, w, h);
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
                                         - image_sum_g_ * templ_sum_scale_g;
            }
        }

        // Two-channel CCOFF launcher; one table and one template sum per channel.
        void matchTemplatePrepared_CCOFF_8UC2(
                int w, int h,
                const PtrStepSz<unsigned int> image_sum_r,
                const PtrStepSz<unsigned int> image_sum_g,
                unsigned int templ_sum_r, unsigned int templ_sum_g,
                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads, 0, stream>>>(
                    w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h),
                    image_sum_r, image_sum_g, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        __global__ void matchTemplatePreparedKernel_CCOFF_8UC3(
                int w, int h,
                float templ_sum_scale_r,
                float templ_sum_scale_g,
                float templ_sum_scale_b,
                const PtrStep<unsigned int> image_sum_r,
                const PtrStep<unsigned int> image_sum_g,
                const PtrStep<unsigned int> image_sum_b,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = boxSum(image_sum_r, x, y, w, h);
                float image_sum_g_ = boxSum(image_sum_g, x, y, w, h);
                float image_sum_b_ = boxSum(image_sum_b, x, y, w, h);
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
                                         - image_sum_g_ * templ_sum_scale_g
                                         - image_sum_b_ * templ_sum_scale_b;
            }
        }

        // Three-channel CCOFF launcher; one table and one template sum per channel.
        void matchTemplatePrepared_CCOFF_8UC3(
                int w, int h,
                const PtrStepSz<unsigned int> image_sum_r,
                const PtrStepSz<unsigned int> image_sum_g,
                const PtrStepSz<unsigned int> image_sum_b,
                unsigned int templ_sum_r,
                unsigned int templ_sum_g,
                unsigned int templ_sum_b,
                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, threads, 0, stream>>>(
                    w, h,
                    (float)templ_sum_r / (w * h),
                    (float)templ_sum_g / (w * h),
                    (float)templ_sum_b / (w * h),
                    image_sum_r, image_sum_g, image_sum_b, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        __global__ void matchTemplatePreparedKernel_CCOFF_8UC4(
                int w, int h,
                float templ_sum_scale_r,
                float templ_sum_scale_g,
                float templ_sum_scale_b,
                float templ_sum_scale_a,
                const PtrStep<unsigned int> image_sum_r,
                const PtrStep<unsigned int> image_sum_g,
                const PtrStep<unsigned int> image_sum_b,
                const PtrStep<unsigned int> image_sum_a,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = boxSum(image_sum_r, x, y, w, h);
                float image_sum_g_ = boxSum(image_sum_g, x, y, w, h);
                float image_sum_b_ = boxSum(image_sum_b, x, y, w, h);
                float image_sum_a_ = boxSum(image_sum_a, x, y, w, h);
                float ccorr = result.ptr(y)[x];
                result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
                                         - image_sum_g_ * templ_sum_scale_g
                                         - image_sum_b_ * templ_sum_scale_b
                                         - image_sum_a_ * templ_sum_scale_a;
            }
        }

        // Four-channel CCOFF launcher; one table and one template sum per channel.
        void matchTemplatePrepared_CCOFF_8UC4(
                int w, int h,
                const PtrStepSz<unsigned int> image_sum_r,
                const PtrStepSz<unsigned int> image_sum_g,
                const PtrStepSz<unsigned int> image_sum_b,
                const PtrStepSz<unsigned int> image_sum_a,
                unsigned int templ_sum_r,
                unsigned int templ_sum_g,
                unsigned int templ_sum_b,
                unsigned int templ_sum_a,
                PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            matchTemplatePreparedKernel_CCOFF_8UC4<<<grid, threads, 0, stream>>>(
                    w, h,
                    (float)templ_sum_r / (w * h),
                    (float)templ_sum_g / (w * h),
                    (float)templ_sum_b / (w * h),
                    (float)templ_sum_a / (w * h),
                    image_sum_r, image_sum_g, image_sum_b, image_sum_a,
                    result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        //////////////////////////////////////////////////////////////////////
        // Prepared_CCOFF_NORMED
        //
        // Each kernel below updates `result` in place with normAcc(num, denum):
        //   num   = stored value - sum over channels of
        //           boxSum(image_sum_<c>) * templ_sum_scale_<c>
        //   denum = sqrt(templ_sqsum_scale * sum over channels of
        //           (boxSum(image_sqsum_<c>) - weight * boxSum(image_sum_<c>)^2))
        // The host wrappers pass weight = 1 / (w * h),
        // templ_sum_scale_<c> = templ_sum_<c> * weight and
        // templ_sqsum_scale = sum over channels of
        //           (templ_sqsum_<c> - weight * templ_sum_<c>^2).

        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
                int w, int h, float weight,
                float templ_sum_scale, float templ_sqsum_scale,
                const PtrStep<unsigned int> image_sum,
                const PtrStep<unsigned long long> image_sqsum,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float ccorr = result.ptr(y)[x];
                float image_sum_ = boxSum(image_sum, x, y, w, h);
                float image_sqsum_ = boxSum(image_sqsum, x, y, w, h);
                result.ptr(y)[x] = normAcc(ccorr - image_sum_ * templ_sum_scale,
                                           sqrtf(templ_sqsum_scale * (image_sqsum_ - weight * image_sum_ * image_sum_)));
            }
        }

        // Single-channel normalized CCOFF launcher (32x8 blocks over `result`);
        // synchronizes when running on the default stream.
        void matchTemplatePrepared_CCOFF_NORMED_8U(
                    int w, int h, const PtrStepSz<unsigned int> image_sum,
                    const PtrStepSz<unsigned long long> image_sqsum,
                    unsigned int templ_sum, unsigned long long templ_sqsum,
                    PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            float weight = 1.f / (w * h);
            float templ_sum_scale = templ_sum * weight;
            float templ_sqsum_scale = templ_sqsum - weight * templ_sum * templ_sum;

            matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads, 0, stream>>>(
                    w, h, weight, templ_sum_scale, templ_sqsum_scale,
                    image_sum, image_sqsum, result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(
                int w, int h, float weight,
                float templ_sum_scale_r, float templ_sum_scale_g,
                float templ_sqsum_scale,
                const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
                const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = boxSum(image_sum_r, x, y, w, h);
                float image_sqsum_r_ = boxSum(image_sqsum_r, x, y, w, h);
                float image_sum_g_ = boxSum(image_sum_g, x, y, w, h);
                float image_sqsum_g_ = boxSum(image_sqsum_g, x, y, w, h);

                float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
                                             - image_sum_g_ * templ_sum_scale_g;
                float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
                                                         + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_));
                result.ptr(y)[x] = normAcc(num, denum);
            }
        }

        // Two-channel normalized CCOFF launcher.
        void matchTemplatePrepared_CCOFF_NORMED_8UC2(
                    int w, int h,
                    const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
                    const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
                    unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
                    unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
                    PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            float weight = 1.f / (w * h);
            float templ_sum_scale_r = templ_sum_r * weight;
            float templ_sum_scale_g = templ_sum_g * weight;
            float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
                                       + templ_sqsum_g - weight * templ_sum_g * templ_sum_g;

            matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, threads, 0, stream>>>(
                    w, h, weight,
                    templ_sum_scale_r, templ_sum_scale_g,
                    templ_sqsum_scale,
                    image_sum_r, image_sqsum_r,
                    image_sum_g, image_sqsum_g,
                    result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(
                int w, int h, float weight,
                float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
                float templ_sqsum_scale,
                const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
                const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
                const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = boxSum(image_sum_r, x, y, w, h);
                float image_sqsum_r_ = boxSum(image_sqsum_r, x, y, w, h);
                float image_sum_g_ = boxSum(image_sum_g, x, y, w, h);
                float image_sqsum_g_ = boxSum(image_sqsum_g, x, y, w, h);
                float image_sum_b_ = boxSum(image_sum_b, x, y, w, h);
                float image_sqsum_b_ = boxSum(image_sqsum_b, x, y, w, h);

                float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
                                             - image_sum_g_ * templ_sum_scale_g
                                             - image_sum_b_ * templ_sum_scale_b;
                float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
                                                         + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
                                                         + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_));
                result.ptr(y)[x] = normAcc(num, denum);
            }
        }

        // Three-channel normalized CCOFF launcher.
        void matchTemplatePrepared_CCOFF_NORMED_8UC3(
                    int w, int h,
                    const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
                    const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
                    const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
                    unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
                    unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
                    unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
                    PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            float weight = 1.f / (w * h);
            float templ_sum_scale_r = templ_sum_r * weight;
            float templ_sum_scale_g = templ_sum_g * weight;
            float templ_sum_scale_b = templ_sum_b * weight;
            float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
                                      + templ_sqsum_g - weight * templ_sum_g * templ_sum_g
                                      + templ_sqsum_b - weight * templ_sum_b * templ_sum_b;

            matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, threads, 0, stream>>>(
                    w, h, weight,
                    templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b,
                    templ_sqsum_scale,
                    image_sum_r, image_sqsum_r,
                    image_sum_g, image_sqsum_g,
                    image_sum_b, image_sqsum_b,
                    result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(
                int w, int h, float weight,
                float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
                float templ_sum_scale_a, float templ_sqsum_scale,
                const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
                const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
                const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
                const PtrStep<unsigned int> image_sum_a, const PtrStep<unsigned long long> image_sqsum_a,
                PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sum_r_ = boxSum(image_sum_r, x, y, w, h);
                float image_sqsum_r_ = boxSum(image_sqsum_r, x, y, w, h);
                float image_sum_g_ = boxSum(image_sum_g, x, y, w, h);
                float image_sqsum_g_ = boxSum(image_sqsum_g, x, y, w, h);
                float image_sum_b_ = boxSum(image_sum_b, x, y, w, h);
                float image_sqsum_b_ = boxSum(image_sqsum_b, x, y, w, h);
                float image_sum_a_ = boxSum(image_sum_a, x, y, w, h);
                float image_sqsum_a_ = boxSum(image_sqsum_a, x, y, w, h);

                float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
                                             - image_sum_g_ * templ_sum_scale_g
                                             - image_sum_b_ * templ_sum_scale_b
                                             - image_sum_a_ * templ_sum_scale_a;
                float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
                                                         + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
                                                         + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_
                                                         + image_sqsum_a_ - weight * image_sum_a_ * image_sum_a_));
                result.ptr(y)[x] = normAcc(num, denum);
            }
        }

        // Four-channel normalized CCOFF launcher.
        void matchTemplatePrepared_CCOFF_NORMED_8UC4(
                    int w, int h,
                    const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
                    const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
                    const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
                    const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<unsigned long long> image_sqsum_a,
                    unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
                    unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
                    unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
                    unsigned int templ_sum_a, unsigned long long templ_sqsum_a,
                    PtrStepSzf result, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            float weight = 1.f / (w * h);
            float templ_sum_scale_r = templ_sum_r * weight;
            float templ_sum_scale_g = templ_sum_g * weight;
            float templ_sum_scale_b = templ_sum_b * weight;
            float templ_sum_scale_a = templ_sum_a * weight;
            float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
                                      + templ_sqsum_g - weight * templ_sum_g * templ_sum_g
                                      + templ_sqsum_b - weight * templ_sum_b * templ_sum_b
                                      + templ_sqsum_a - weight * templ_sum_a * templ_sum_a;

            matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, threads, 0, stream>>>(
                    w, h, weight,
                    templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sum_scale_a,
                    templ_sqsum_scale,
                    image_sum_r, image_sqsum_r,
                    image_sum_g, image_sqsum_g,
                    image_sum_b, image_sqsum_b,
                    image_sum_a, image_sqsum_a,
                    result);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        //////////////////////////////////////////////////////////////////////
        // normalize

        // In-place update of `result`: each element v becomes
        //   normAcc(v, sqrt(boxSum(image_sqsum) * templ_sqsum)).
        // Columns of image_sqsum are addressed with a stride of cn.
        template <int cn>
        __global__ void normalizeKernel_8U(
                int w, int h, const PtrStep<unsigned long long> image_sqsum,
                unsigned long long templ_sqsum, PtrStepSzf result)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                float image_sqsum_ = boxSum(image_sqsum, x, y, w, h, cn);
                result.ptr(y)[x] = normAcc(result.ptr(y)[x], sqrtf(image_sqsum_ * templ_sqsum));
            }
        }

        // Launches normalizeKernel_8U<cn> with 32x8 blocks covering `result`.
        // cn must be 1..4; any other value is reported as
        // cudaErrorInvalidValue via cudaSafeCall rather than silently skipping
        // the normalization.
        void normalize_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum,
                          unsigned long long templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            switch (cn)
            {
            case 1:
                normalizeKernel_8U<1><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
                break;
            case 2:
                normalizeKernel_8U<2><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
                break;
            case 3:
                normalizeKernel_8U<3><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
                break;
            case 4:
                normalizeKernel_8U<4><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
                break;
            default:
                cudaSafeCall( cudaErrorInvalidValue );
                return; // in case the error hook returns instead of throwing
            }

            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        //////////////////////////////////////////////////////////////////////
        // extractFirstChannel

        // One thread per result element: reads the cn-channel float pixel at
        // (y, x) from `image` and stores its .x channel into result(y, x).
        template <int cn>
        __global__ void extractFirstChannel_32F(const PtrStepb image, PtrStepSzf result)
        {
            typedef typename TypeVec<float, cn>::vec_type Typef;

            int x = blockDim.x * blockIdx.x + threadIdx.x;
            int y = blockDim.y * blockIdx.y + threadIdx.y;

            if (x < result.cols && y < result.rows)
            {
                Typef val = ((const Typef*)image.ptr(y))[x];
                result.ptr(y)[x] = first(val);
            }
        }

        // Launches extractFirstChannel_32F<cn> with 32x8 blocks covering
        // `result`. cn must be 1..4; any other value is reported as
        // cudaErrorInvalidValue via cudaSafeCall rather than leaving `result`
        // untouched without notice.
        void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream)
        {
            dim3 threads(32, 8);
            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

            switch (cn)
            {
            case 1:
                extractFirstChannel_32F<1><<<grid, threads, 0, stream>>>(image, result);
                break;
            case 2:
                extractFirstChannel_32F<2><<<grid, threads, 0, stream>>>(image, result);
                break;
            case 3:
                extractFirstChannel_32F<3><<<grid, threads, 0, stream>>>(image, result);
                break;
            case 4:
                extractFirstChannel_32F<4><<<grid, threads, 0, stream>>>(image, result);
                break;
            default:
                cudaSafeCall( cudaErrorInvalidValue );
                return;
            }

            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
    } //namespace match_template
}}} // namespace cv { namespace gpu { namespace device


#endif /* CUDA_DISABLER */