/* * By downloading, copying, installing or using the software you agree to this license. * If you do not agree to this license, do not download, install, * copy or use the software. * * * License Agreement * For Open Source Computer Vision Library * (3-clause BSD License) * * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. * Third party copyrights are property of their respective owners. * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * * Neither the names of the copyright holders nor the names of the contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * This software is provided by the copyright holders and contributors "as is" and * any express or implied warranties, including, but not limited to, the implied * warranties of merchantability and fitness for a particular purpose are disclaimed. * In no event shall copyright holders or contributors be liable for any direct, * indirect, incidental, special, exemplary, or consequential damages * (including, but not limited to, procurement of substitute goods or services; * loss of use, data, or profits; or business interruption) however caused * and on any theory of liability, whether in contract, strict liability, * or tort (including negligence or otherwise) arising in any way out of * the use of this software, even if advised of the possibility of such damage. */ #include <vector> #include "common.hpp" namespace CAROTENE_NS { bool isScharr3x3Supported(const Size2D &size, BORDER_MODE border, s32 dx, s32 dy, Margin borderMargin) { return (dx == 0 && dy == 1 && isSeparableFilter3x3Supported(size, border, 3, 1, borderMargin)) || (dx == 1 && dy == 0 && isSeparableFilter3x3Supported(size, border, 1, 3, borderMargin)); } void Scharr3x3(const Size2D &size, const u8 * srcBase, ptrdiff_t srcStride, s16 * dstBase, ptrdiff_t dstStride, s32 dx, s32 dy, BORDER_MODE border, u8 borderValue, Margin borderMargin) { internal::assertSupportedConfiguration(isScharr3x3Supported(size, border, dx, dy, borderMargin)); #ifdef CAROTENE_NEON static s16 dw[] = {3, 10, 3}; if (dy == 1) SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride, 3, 1, dw, 0, border, borderValue, borderMargin); else SeparableFilter3x3(size, srcBase, srcStride, dstBase, dstStride, 1, 3, 0, dw, border, borderValue, borderMargin); #else (void)srcBase; (void)srcStride; (void)dstBase; (void)dstStride; (void)borderValue; #endif } void ScharrDeriv(const Size2D &size, s32 cn, const u8 * srcBase, ptrdiff_t srcStride, s16 * dstBase, ptrdiff_t dstStride) { internal::assertSupportedConfiguration(); #ifdef CAROTENE_NEON size_t colsn = size.width*cn; size_t roiw8 = colsn > 7 ? colsn - 7 : 0; ptrdiff_t delta = (ptrdiff_t)(((size.width + 2)*cn + 15) & -16);//align size std::vector<s16> _tempBuf((delta << 1) + 64); s16 *trow0 = internal::alignPtr(&_tempBuf[cn], 16), *trow1 = internal::alignPtr(trow0 + delta, 16); int16x8_t vc3 = vmovq_n_s16(3); int16x8_t vc10 = vmovq_n_s16(10); uint8x8_t v8c10 = vmov_n_u8(10); for(size_t y = 0; y < size.height; y++ ) { const u8* srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : size.height > 1 ? 1 : 0); const u8* srow1 = internal::getRowPtr(srcBase, srcStride, y); const u8* srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height > 1 ? size.height-2 : 0); s16* drow = internal::getRowPtr(dstBase, dstStride, y); // do vertical convolution size_t x = 0; for( ; x < roiw8; x += 8 ) { internal::prefetch(srow0 + x); internal::prefetch(srow1 + x); internal::prefetch(srow2 + x); #if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 __asm__ ( "vld1.8 {d0}, [%[src0]] \n\t" "vld1.8 {d2}, [%[src2]] \n\t" "vld1.8 {d1}, [%[src1]] \n\t" "vaddl.u8 q2, d2, d0 \n\t" "vmull.u8 q3, d1, %[vc10] \n\t" "vsubl.u8 q4, d2, d0 \n\t" "vmla.s16 q3, q2, %q[vc3] \n\t" "vst1.16 {d8-d9}, [%[out1],:128] \n\t" "vst1.16 {d6-d7}, [%[out0],:128] \n\t" : : [out0] "r" (trow0 + x), [out1] "r" (trow1 + x), [src0] "r" (srow0 + x), [src1] "r" (srow1 + x), [src2] "r" (srow2 + x), [vc10] "w" (v8c10), [vc3] "w" (vc3) : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15" ); #else uint8x8_t s0 = vld1_u8(srow0 + x); uint8x8_t s1 = vld1_u8(srow1 + x); uint8x8_t s2 = vld1_u8(srow2 + x); int16x8_t s1x10 = vreinterpretq_s16_u16(vmull_u8(s1, v8c10)); int16x8_t s02 = vreinterpretq_s16_u16(vaddl_u8(s2, s0)); int16x8_t t1 = vreinterpretq_s16_u16(vsubl_u8(s2, s0)); int16x8_t t0 = vmlaq_s16(s1x10, s02, vc3); vst1q_s16(trow1 + x, t1); vst1q_s16(trow0 + x, t0); #endif } for( ; x < colsn; x++ ) { trow0[x] = (s16)((srow0[x] + srow2[x])*3 + srow1[x]*10); trow1[x] = (s16)(srow2[x] - srow0[x]); } // make border size_t x0 = (size.width > 1 ? cn : 0), x1 = (size.width > 1 ? (size.width-2)*cn : 0); for( s32 k = 0; k < cn; k++ ) { trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k]; trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k]; } // do horizontal convolution, interleave the results and store them to dst x = 0; for( ; x < roiw8; x += 8 ) { #if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 __asm__ ( "vld1.16 {d4-d5}, [%[s2ptr]] \n\t" "vld1.16 {d8-d9}, [%[s4ptr]] \n\t" "vld1.16 {d6-d7}, [%[s3ptr],:128] \n\t" "vld1.16 {d0-d1}, [%[s0ptr]] \n\t" "vld1.16 {d2-d3}, [%[s1ptr]] \n\t" "vadd.i16 q7, q2, q4 \n\t" "vmul.s16 q6, q3, %q[vc10] \n\t" "vsub.s16 q5, q1, q0 \n\t" "vmla.s16 q6, q7, %q[vc3] \n\t" "vst2.16 {d10-d13}, [%[out]] \n\t" : : [out] "r" (drow + x * 2), [s0ptr] "r" (trow0 + x - cn), [s1ptr] "r" (trow0 + x + cn), [s2ptr] "r" (trow1 + x - cn), [s3ptr] "r" (trow1 + x), [s4ptr] "r" (trow1 + x + cn), [vc10] "w" (vc10), [vc3] "w" (vc3) : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13","d14","d15" ); #else int16x8_t s0 = vld1q_s16(trow0 + x - cn); int16x8_t s1 = vld1q_s16(trow0 + x + cn); int16x8_t s2 = vld1q_s16(trow1 + x - cn); int16x8_t s3 = vld1q_s16(trow1 + x); int16x8_t s4 = vld1q_s16(trow1 + x + cn); int16x8_t s3x10 = vmulq_s16(s3, vc10); int16x8_t s24 = vaddq_s16(s2, s4); int16x8x2_t vr; vr.val[0] = vsubq_s16(s1, s0); vr.val[1] = vmlaq_s16(s3x10, s24, vc3); vst2q_s16(drow + x*2, vr); #endif } for( ; x < colsn; x++ ) { drow[x*2] = (s16)(trow0[x+cn] - trow0[x-cn]); drow[x*2+1] = (s16)((trow1[x+cn] + trow1[x-cn])*3 + trow1[x]*10); } } #else (void)size; (void)cn; (void)srcBase; (void)srcStride; (void)dstBase; (void)dstStride; #endif } } // namespace CAROTENE_NS