/* * By downloading, copying, installing or using the software you agree to this license. * If you do not agree to this license, do not download, install, * copy or use the software. * * * License Agreement * For Open Source Computer Vision Library * (3-clause BSD License) * * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved. * Third party copyrights are property of their respective owners. * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * * Neither the names of the copyright holders nor the names of the contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * This software is provided by the copyright holders and contributors "as is" and * any express or implied warranties, including, but not limited to, the implied * warranties of merchantability and fitness for a particular purpose are disclaimed. * In no event shall copyright holders or contributors be liable for any direct, * indirect, incidental, special, exemplary, or consequential damages * (including, but not limited to, procurement of substitute goods or services; * loss of use, data, or profits; or business interruption) however caused * and on any theory of liability, whether in contract, strict liability, * or tort (including negligence or otherwise) arising in any way out of * the use of this software, even if advised of the possibility of such damage. 
*/

#include "common.hpp"
#include "saturate_cast.hpp"

#include <vector>
#include <float.h> // For FLT_EPSILON

namespace CAROTENE_NS {

// Round-to-nearest right shift: descale a fixed-point value x by n bits.
#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))

/*
 * Pyramidal Lucas-Kanade Optical Flow level processing
 *
 * Tracks ptCount feature points from the previous image to the next image at a
 * single pyramid level, refining nextPts in place with iterative Newton steps.
 *
 * Parameters (per the visible usage below):
 *   size                 - dimensions of the images at this pyramid level
 *   cn                   - number of interleaved channels per pixel
 *   prevData/prevStride  - previous image at this level (u8, stride in bytes)
 *   prevDerivData/...    - interleaved (dx, dy) s16 derivatives of prevData
 *   nextData/nextStride  - next image at this level
 *   prevPts              - input points, full-resolution coords, x/y interleaved
 *   nextPts              - in/out tracked points; on entry holds either the
 *                          coarser-level estimate (scaled by 2 below) or, at the
 *                          top level, the initial guess
 *   status               - optional per-point success flag (cleared on failure
 *                          only at level 0)
 *   err                  - optional per-point error output (level 0 only)
 *   winSize              - integration window size
 *   terminationCount     - max refinement iterations per point
 *   terminationEpsilon   - squared step length threshold for convergence
 *   level/maxLevel       - current / top pyramid level index
 *   useInitialFlow       - at the top level, seed from nextPts instead of prevPts
 *   getMinEigenVals      - report min-eigenvalue instead of intensity error
 *   minEigThreshold      - reject points whose (normalized) min eigenvalue is below this
 */
void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
                       const u8 *prevData, ptrdiff_t prevStride,
                       const s16 *prevDerivData, ptrdiff_t prevDerivStride,
                       const u8 *nextData, ptrdiff_t nextStride,
                       u32 ptCount,
                       const f32 *prevPts, f32 *nextPts,
                       u8 *status, f32 *err,
                       const Size2D &winSize,
                       u32 terminationCount, f64 terminationEpsilon,
                       u32 level, u32 maxLevel, bool useInitialFlow, bool getMinEigenVals,
                       f32 minEigThreshold)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    f32 halfWinX = (winSize.width-1)*0.5f, halfWinY = (winSize.height-1)*0.5f;
    s32 cn2 = cn*2; // elements per pixel in the interleaved (dx, dy) derivative data

    // Scratch window buffers: one winSize patch of the previous image (cn
    // channels) followed by one patch of its derivatives (cn2 channels).
    std::vector<s16> _buf(winSize.total()*(cn + cn2));
    s16* IWinBuf = &_buf[0];
    s32 IWinBufStride = winSize.width*cn;
    s16* derivIWinBuf = &_buf[winSize.total()*cn];
    s32 derivIWinBufStride = winSize.width*cn2;

    for( u32 ptidx = 0; ptidx < ptCount; ptidx++ )
    {
        // Map the full-resolution input point onto this pyramid level.
        f32 levscale = (1./(1 << level));
        u32 ptref = ptidx << 1; // index of this point's x coord in the interleaved arrays
        f32 prevPtX = prevPts[ptref+0]*levscale;
        f32 prevPtY = prevPts[ptref+1]*levscale;
        f32 nextPtX;
        f32 nextPtY;
        if( level == maxLevel )
        {
            // Top level: start from the caller's guess or from the source point.
            if( useInitialFlow )
            {
                nextPtX = nextPts[ptref+0]*levscale;
                nextPtY = nextPts[ptref+1]*levscale;
            }
            else
            {
                nextPtX = prevPtX;
                nextPtY = prevPtY;
            }
        }
        else
        {
            // Finer level: upscale the estimate propagated from the coarser level.
            nextPtX = nextPts[ptref+0]*2.f;
            nextPtY = nextPts[ptref+1]*2.f;
        }
        nextPts[ptref+0] = nextPtX;
        nextPts[ptref+1] = nextPtY;

        s32 iprevPtX, iprevPtY;
        s32 inextPtX, inextPtY;
        // Work with the window's top-left corner from here on.
        prevPtX -= halfWinX;
        prevPtY -= halfWinY;
        iprevPtX = floor(prevPtX);
        iprevPtY = floor(prevPtY);

        // Window completely outside the image: the point is lost.
        if( iprevPtX < -(s32)winSize.width || iprevPtX >= (s32)size.width ||
            iprevPtY < -(s32)winSize.height || iprevPtY >= (s32)size.height )
        {
            if( level == 0 )
            {
                if( status )
                    status[ptidx] = false;
                if( err )
                    err[ptidx] = 0;
            }
            continue;
        }

        // Bilinear interpolation weights in Q14 fixed point (iw11 is derived
        // so the four weights sum exactly to 1 << W_BITS).
        f32 a = prevPtX - iprevPtX;
        f32 b = prevPtY - iprevPtY;
        const s32 W_BITS = 14, W_BITS1 = 14;
        const f32 FLT_SCALE = 1.f/(1 << 20); // normalization of the fixed-point accumulators
        s32 iw00 = round((1.f - a)*(1.f - b)*(1 << W_BITS));
        s32 iw01 = round(a*(1.f - b)*(1 << W_BITS));
        s32 iw10 = round((1.f - a)*b*(1 << W_BITS));
        s32 iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;

        s32 dstep = prevDerivStride/sizeof(s16); // derivative row stride in s16 elements
        f32 A11 = 0, A12 = 0, A22 = 0; // 2x2 spatial gradient (structure) matrix

        int16x4_t viw00 = vmov_n_s16((s16)iw00);
        int16x4_t viw01 = vmov_n_s16((s16)iw01);
        int16x4_t viw10 = vmov_n_s16((s16)iw10);
        int16x4_t viw11 = vmov_n_s16((s16)iw11);

        float32x4_t vA11 = vmovq_n_f32(0);
        float32x4_t vA12 = vmovq_n_f32(0);
        float32x4_t vA22 = vmovq_n_f32(0);

        s32 wwcn = winSize.width*cn; // window row length in s16/u8 elements

        // extract the patch from the first image, compute covariation matrix of derivatives
        s32 x = 0;
        for(s32 y = 0; y < (s32)winSize.height; y++ )
        {
            const u8* src = prevData + prevStride*(y + iprevPtY) + iprevPtX*cn;
            const s16* dsrc = prevDerivData + dstep*(y + iprevPtY) + iprevPtX*cn2;

            s16* Iptr = IWinBuf + y*IWinBufStride;
            s16* dIptr = derivIWinBuf + y*derivIWinBufStride;

            internal::prefetch(src + x + prevStride * 2, 0);

            // Vector path, 8 pixels per iteration: bilinear-interpolate the
            // image patch.  The descale by W_BITS1-5 keeps 5 extra fractional
            // bits in IWinBuf (values scaled by 2^5).
            for(x = 0; x <= wwcn - 8; x += 8)
            {
                uint8x8_t vsrc00 = vld1_u8(src + x);
                uint8x8_t vsrc10 = vld1_u8(src + x + prevStride);
                uint8x8_t vsrc01 = vld1_u8(src + x + cn);
                uint8x8_t vsrc11 = vld1_u8(src + x + prevStride + cn);

                int16x8_t vs00 = vreinterpretq_s16_u16(vmovl_u8(vsrc00));
                int16x8_t vs10 = vreinterpretq_s16_u16(vmovl_u8(vsrc10));
                int16x8_t vs01 = vreinterpretq_s16_u16(vmovl_u8(vsrc01));
                int16x8_t vs11 = vreinterpretq_s16_u16(vmovl_u8(vsrc11));

                // Low and high halves accumulate the four weighted taps in an
                // interleaved order (scheduling choice; the sums are the same).
                int32x4_t vsuml = vmull_s16(vget_low_s16(vs00), viw00);
                int32x4_t vsumh = vmull_s16(vget_high_s16(vs10), viw10);

                vsuml = vmlal_s16(vsuml, vget_low_s16(vs01), viw01);
                vsumh = vmlal_s16(vsumh, vget_high_s16(vs11), viw11);

                vsuml = vmlal_s16(vsuml, vget_low_s16(vs10), viw10);
                vsumh = vmlal_s16(vsumh, vget_high_s16(vs00), viw00);

                vsuml = vmlal_s16(vsuml, vget_low_s16(vs11), viw11);
                vsumh = vmlal_s16(vsumh, vget_high_s16(vs01), viw01);

                int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5);
                int16x4_t vsumnh = vrshrn_n_s32(vsumh, W_BITS1-5);

                vst1q_s16(Iptr + x, vcombine_s16(vsumnl, vsumnh));
            }

            // 4-pixel tail of the image-patch interpolation.
            for(; x <= wwcn - 4; x += 4)
            {
                uint8x8_t vsrc00 = vld1_u8(src + x);
                uint8x8_t vsrc10 = vld1_u8(src + x + prevStride);
                uint8x8_t vsrc01 = vld1_u8(src + x + cn);
                uint8x8_t vsrc11 = vld1_u8(src + x + prevStride + cn);

                int16x4_t vs00 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc00)));
                int16x4_t vs10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc10)));
                int16x4_t vs01 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc01)));
                int16x4_t vs11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc11)));

                int32x4_t vsuml1 = vmull_s16(vs00, viw00);
                int32x4_t vsuml2 = vmull_s16(vs01, viw01);
                vsuml1 = vmlal_s16(vsuml1, vs10, viw10);
                vsuml2 = vmlal_s16(vsuml2, vs11, viw11);
                int32x4_t vsuml = vaddq_s32(vsuml1, vsuml2);

                int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5);

                vst1_s16(Iptr + x, vsumnl);
            }

            internal::prefetch(dsrc + dstep * 2, 0);

            // Interpolate the derivative patch (fully descaled by W_BITS1) and
            // accumulate the structure matrix A = [A11 A12; A12 A22] from
            // (dx*dx, dx*dy, dy*dy).  Note: x restarts from 0 here; this loop
            // intentionally re-walks the row over the derivative data.
            for(x = 0; x <= wwcn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 )
            {
#if 0
                // Hand-written ARM32 inline-asm equivalent of the intrinsics
                // below; disabled (kept for reference).
                __asm__ (
                    "vld2.16 {d0-d1}, [%[dsrc00]] \n\t"
                    "vld2.16 {d2-d3}, [%[dsrc10]] \n\t"
                    "vld2.16 {d4-d5}, [%[dsrc01]] \n\t"
                    "vld2.16 {d6-d7}, [%[dsrc11]] \n\t"
                    "vmull.s16 q4, d3, %P[viw10] \n\t"
                    "vmull.s16 q5, d0, %P[viw00] \n\t"
                    "vmlal.s16 q4, d7, %P[viw11] \n\t"
                    "vmlal.s16 q5, d4, %P[viw01] \n\t"
                    "vmlal.s16 q4, d1, %P[viw00] \n\t"
                    "vmlal.s16 q5, d2, %P[viw10] \n\t"
                    "vmlal.s16 q4, d5, %P[viw01] \n\t"
                    "vmlal.s16 q5, d6, %P[viw11] \n\t"
                    "vrshrn.s32 d13, q4, %[W_BITS1] \n\t"
                    "vrshrn.s32 d12, q5, %[W_BITS1] \n\t"
                    "vmull.s16 q3, d13, d13 \n\t"
                    "vmull.s16 q4, d12, d12 \n\t"
                    "vmull.s16 q5, d13, d12 \n\t"
                    "vcvt.f32.s32 q3, q3 \n\t"
                    "vcvt.f32.s32 q4, q4 \n\t"
                    "vcvt.f32.s32 q5, q5 \n\t"
                    "vadd.f32 %q[vA22], q3 \n\t"
                    "vadd.f32 %q[vA11], q4 \n\t"
                    "vadd.f32 %q[vA12], q5 \n\t"
                    "vst2.16 {d12-d13}, [%[out]] \n\t"
                    : [vA22] "=w" (vA22),
                      [vA11] "=w" (vA11),
                      [vA12] "=w" (vA12)
                    : "0" (vA22),
                      "1" (vA11),
                      "2" (vA12),
                      [out] "r" (dIptr),
                      [dsrc00] "r" (dsrc),
                      [dsrc10] "r" (dsrc + dstep),
                      [dsrc01] "r" (dsrc + cn2),
                      [dsrc11] "r" (dsrc + dstep + cn2),
                      [viw00] "w" (viw00),
                      [viw10] "w" (viw10),
                      [viw01] "w" (viw01),
                      [viw11] "w" (viw11),
                      [W_BITS1] "I" (W_BITS1)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
                );
#else
                // vld2 de-interleaves: val[0] = dx lane, val[1] = dy lane.
                int16x4x2_t vdsrc00 = vld2_s16(dsrc);
                int16x4x2_t vdsrc10 = vld2_s16(dsrc + dstep);
                int16x4x2_t vdsrc01 = vld2_s16(dsrc + cn2);
                int16x4x2_t vdsrc11 = vld2_s16(dsrc + dstep + cn2);

                int32x4_t vsumy = vmull_s16(vdsrc10.val[1], viw10);
                int32x4_t vsumx = vmull_s16(vdsrc00.val[0], viw00);

                vsumy = vmlal_s16(vsumy, vdsrc11.val[1], viw11);
                vsumx = vmlal_s16(vsumx, vdsrc01.val[0], viw01);

                vsumy = vmlal_s16(vsumy, vdsrc00.val[1], viw00);
                vsumx = vmlal_s16(vsumx, vdsrc10.val[0], viw10);

                vsumy = vmlal_s16(vsumy, vdsrc01.val[1], viw01);
                vsumx = vmlal_s16(vsumx, vdsrc11.val[0], viw11);

                // Full descale here (W_BITS1), unlike the image patch above.
                int16x4_t vsumny = vrshrn_n_s32(vsumy, W_BITS1);
                int16x4_t vsumnx = vrshrn_n_s32(vsumx, W_BITS1);

                int32x4_t va22i = vmull_s16(vsumny, vsumny);
                int32x4_t va11i = vmull_s16(vsumnx, vsumnx);
                int32x4_t va12i = vmull_s16(vsumnx, vsumny);

                float32x4_t va22f = vcvtq_f32_s32(va22i);
                float32x4_t va11f = vcvtq_f32_s32(va11i);
                float32x4_t va12f = vcvtq_f32_s32(va12i);

                vA22 = vaddq_f32(vA22, va22f);
                vA11 = vaddq_f32(vA11, va11f);
                vA12 = vaddq_f32(vA12, va12f);

                // Store interleaved (dx, dy) back into the window buffer.
                int16x4x2_t vsum;
                vsum.val[0] = vsumnx;
                vsum.val[1] = vsumny;
                vst2_s16(dIptr, vsum);
#endif
            }

            // Scalar tail: remaining elements of both the image and the
            // derivative patches, accumulated into the scalar A11/A12/A22.
            for( ; x < wwcn; x++, dsrc += 2, dIptr += 2 )
            {
                s32 ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 +
                                      src[x+prevStride]*iw10 + src[x+prevStride+cn]*iw11, W_BITS1-5);
                s32 ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 +
                                       dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1);
                s32 iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 +
                                       dsrc[dstep+cn2+1]*iw11, W_BITS1);

                Iptr[x] = (s16)ival;
                dIptr[0] = (s16)ixval;
                dIptr[1] = (s16)iyval;

                A11 += (f32)(ixval*ixval);
                A12 += (f32)(ixval*iyval);
                A22 += (f32)(iyval*iyval);
            }
        }

        // Horizontal-add the vector accumulators into the scalar totals.
        f32 A11buf[2], A12buf[2], A22buf[2];
        vst1_f32(A11buf, vadd_f32(vget_low_f32(vA11), vget_high_f32(vA11)));
        vst1_f32(A12buf, vadd_f32(vget_low_f32(vA12), vget_high_f32(vA12)));
        vst1_f32(A22buf, vadd_f32(vget_low_f32(vA22), vget_high_f32(vA22)));
        A11 += A11buf[0] + A11buf[1];
        A12 += A12buf[0] + A12buf[1];
        A22 += A22buf[0] + A22buf[1];

        A11 *= FLT_SCALE;
        A12 *= FLT_SCALE;
        A22 *= FLT_SCALE;

        // Determinant and smaller eigenvalue of A, the latter normalized by
        // the window area; small values mean an untrackable (flat/edge) patch.
        f32 D = A11*A22 - A12*A12;
        f32 minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) +
                      4.f*A12*A12))/(2*winSize.width*winSize.height);

        if( err && getMinEigenVals )
            err[ptidx] = (f32)minEig;

        if( minEig < minEigThreshold || D < FLT_EPSILON )
        {
            if( level == 0 && status )
                status[ptidx] = false;
            continue;
        }

        D = 1.f/D;

        nextPtX -= halfWinX;
        nextPtY -= halfWinY;
        f32 prevDeltaX = 0;
        f32 prevDeltaY = 0;

        // Iterative refinement: each pass interpolates the window in the next
        // image, builds the mismatch vector b, and takes a Newton step
        // delta = -inv(A) * b.
        for(u32 j = 0; j < terminationCount; j++ )
        {
            inextPtX = floor(nextPtX);
            inextPtY = floor(nextPtY);

            if( inextPtX < -(s32)winSize.width || inextPtX >= (s32)size.width ||
                inextPtY < -(s32)winSize.height || inextPtY >= (s32)size.height )
            {
                if( level == 0 && status )
                    status[ptidx] = false;
                break;
            }

            // Recompute the Q14 bilinear weights at the current estimate.
            a = nextPtX - inextPtX;
            b = nextPtY - inextPtY;
            iw00 = round((1.f - a)*(1.f - b)*(1 << W_BITS));
            iw01 = round(a*(1.f - b)*(1 << W_BITS));
            iw10 = round((1.f - a)*b*(1 << W_BITS));
            iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
            f32 b1 = 0, b2 = 0;

            viw00 = vmov_n_s16((s16)iw00);
            viw01 = vmov_n_s16((s16)iw01);
            viw10 = vmov_n_s16((s16)iw10);
            viw11 = vmov_n_s16((s16)iw11);

            float32x4_t vb1 = vmovq_n_f32(0);
            float32x4_t vb2 = vmovq_n_f32(0);

            for(s32 y = 0; y < (s32)winSize.height; y++ )
            {
                const u8* Jptr = nextData + nextStride*(y + inextPtY) + inextPtX*cn;
                const s16* Iptr = IWinBuf + y*IWinBufStride;
                const s16* dIptr = derivIWinBuf + y*derivIWinBufStride;

                x = 0;

                internal::prefetch(Jptr, nextStride * 2);
                internal::prefetch(Iptr, IWinBufStride/2);
                internal::prefetch(dIptr, derivIWinBufStride/2);

                // Vector path: diff = J(window) - I(window) (saturating), then
                // b1 += diff*dx, b2 += diff*dy.
                for( ; x <= wwcn - 8; x += 8, dIptr += 8*2 )
                {
                    uint8x8_t vj00 = vld1_u8(Jptr + x);
                    uint8x8_t vj10 = vld1_u8(Jptr + x + nextStride);
                    uint8x8_t vj01 = vld1_u8(Jptr + x + cn);
                    uint8x8_t vj11 = vld1_u8(Jptr + x + nextStride + cn);

                    int16x8_t vI = vld1q_s16(Iptr + x);
                    int16x8x2_t vDerivI = vld2q_s16(dIptr);

                    int16x8_t vs00 = vreinterpretq_s16_u16(vmovl_u8(vj00));
                    int16x8_t vs10 = vreinterpretq_s16_u16(vmovl_u8(vj10));
                    int16x8_t vs01 = vreinterpretq_s16_u16(vmovl_u8(vj01));
                    int16x8_t vs11 = vreinterpretq_s16_u16(vmovl_u8(vj11));

                    int32x4_t vsuml = vmull_s16(vget_low_s16(vs00), viw00);
                    int32x4_t vsumh = vmull_s16(vget_high_s16(vs10), viw10);

                    vsuml = vmlal_s16(vsuml, vget_low_s16(vs01), viw01);
                    vsumh = vmlal_s16(vsumh, vget_high_s16(vs11), viw11);

                    vsuml = vmlal_s16(vsuml, vget_low_s16(vs10), viw10);
                    vsumh = vmlal_s16(vsumh, vget_high_s16(vs00), viw00);

                    vsuml = vmlal_s16(vsuml, vget_low_s16(vs11), viw11);
                    vsumh = vmlal_s16(vsumh, vget_high_s16(vs01), viw01);

                    // Same W_BITS1-5 descale as the template patch, so the
                    // subtraction below is in a consistent scale.
                    int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5);
                    int16x4_t vsumnh = vrshrn_n_s32(vsumh, W_BITS1-5);

                    int16x8_t diff = vqsubq_s16(vcombine_s16(vsumnl, vsumnh), vI);

                    int32x4_t vb1l = vmull_s16(vget_low_s16(diff), vget_low_s16(vDerivI.val[0]));
                    int32x4_t vb2h = vmull_s16(vget_high_s16(diff), vget_high_s16(vDerivI.val[1]));
                    int32x4_t vb1i = vmlal_s16(vb1l, vget_high_s16(diff), vget_high_s16(vDerivI.val[0]));
                    int32x4_t vb2i = vmlal_s16(vb2h, vget_low_s16(diff), vget_low_s16(vDerivI.val[1]));

                    float32x4_t vb1f = vcvtq_f32_s32(vb1i);
                    float32x4_t vb2f = vcvtq_f32_s32(vb2i);

                    vb1 = vaddq_f32(vb1, vb1f);
                    vb2 = vaddq_f32(vb2, vb2f);
                }

                // Scalar tail of the mismatch accumulation.
                for( ; x < wwcn; x++, dIptr += 2 )
                {
                    s32 diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
                                          Jptr[x+nextStride]*iw10 + Jptr[x+nextStride+cn]*iw11,
                                          W_BITS1-5) - Iptr[x];

                    b1 += (f32)(diff*dIptr[0]);
                    b2 += (f32)(diff*dIptr[1]);
                }
            }

            // Horizontal-add the vector b accumulators into b1/b2.
            f32 bbuf[2];
            float32x2_t vb = vpadd_f32(vadd_f32(vget_low_f32(vb1), vget_high_f32(vb1)),
                                       vadd_f32(vget_low_f32(vb2), vget_high_f32(vb2)));
            vst1_f32(bbuf, vb);
            b1 += bbuf[0];
            b2 += bbuf[1];

            b1 *= FLT_SCALE;
            b2 *= FLT_SCALE;

            // Newton step: delta = -inv(A) * b, with D = 1/det(A) already.
            f32 deltaX = (f32)((A12*b2 - A22*b1) * D);
            f32 deltaY = (f32)((A12*b1 - A11*b2) * D);

            nextPtX += deltaX;
            nextPtY += deltaY;
            nextPts[ptref+0] = nextPtX + halfWinX;
            nextPts[ptref+1] = nextPtY + halfWinY;

            // Converged: squared step length below the epsilon threshold.
            if( ((double)deltaX*deltaX + (double)deltaY*deltaY) <= terminationEpsilon )
                break;

            // Oscillation damping: if this step nearly cancels the previous
            // one, settle halfway back and stop iterating.
            if( j > 0 && std::abs(deltaX + prevDeltaX) < 0.01 &&
                std::abs(deltaY + prevDeltaY) < 0.01 )
            {
                nextPts[ptref+0] -= deltaX*0.5f;
                nextPts[ptref+1] -= deltaY*0.5f;
                break;
            }
            prevDeltaX = deltaX;
            prevDeltaY = deltaY;
        }

        // At the finest level, optionally report the mean absolute intensity
        // difference over the window as the tracking error.
        if( status && status[ptidx] && err && level == 0 && !getMinEigenVals )
        {
            f32 nextPointX = nextPts[ptref+0] - halfWinX;
            f32 nextPointY = nextPts[ptref+1] - halfWinY;

            s32 inextPointX = floor(nextPointX);
            s32 inextPointY = floor(nextPointY);

            if( inextPointX < -(s32)winSize.width || inextPointX >= (s32)size.width ||
                inextPointY < -(s32)winSize.height || inextPointY >= (s32)size.height )
            {
                if( status )
                    status[ptidx] = false;
                continue;
            }

            f32 aa = nextPointX - inextPointX;
            f32 bb = nextPointY - inextPointY;
            iw00 = round((1.f - aa)*(1.f - bb)*(1 << W_BITS));
            iw01 = round(aa*(1.f - bb)*(1 << W_BITS));
            iw10 = round((1.f - aa)*bb*(1 << W_BITS));
            iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;

            f32 errval = 0.f;

            for(s32 y = 0; y < (s32)winSize.height; y++ )
            {
                const u8* Jptr = nextData + nextStride*(y + inextPointY) + inextPointX*cn;
                const s16* Iptr = IWinBuf + y*IWinBufStride;

                for( x = 0; x < wwcn; x++ )
                {
                    s32 diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
                                          Jptr[x+nextStride]*iw10 + Jptr[x+nextStride+cn]*iw11,
                                          W_BITS1-5) - Iptr[x];
                    errval += std::abs((f32)diff);
                }
            }
            // The factor 32 (= 2^5) undoes the 5 extra fractional bits kept
            // in the window buffers; the rest averages over the window.
            err[ptidx] = errval / (32*wwcn*winSize.height);
        }
    }
#else
    (void)size;
    (void)cn;
    (void)prevData;
    (void)prevStride;
    (void)prevDerivData;
    (void)prevDerivStride;
    (void)nextData;
    (void)nextStride;
    (void)prevPts;
    (void)nextPts;
    (void)status;
    (void)err;
    (void)winSize;
    (void)terminationCount;
    (void)terminationEpsilon;
    (void)level;
    (void)maxLevel;
    (void)useInitialFlow;
    (void)getMinEigenVals;
    (void)minEigThreshold;
    (void)ptCount;
#endif
}

}//CAROTENE_NS