stereobm.cpp 42.7 KB
Newer Older
wester committed
1 2 3 4 5 6 7 8 9
//M*//////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
wester committed
10
//                        Intel License Agreement
wester committed
11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
wester committed
26
//   * The name of Intel Corporation may not be used to endorse or promote products
wester committed
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/****************************************************************************************\
*    Very fast SAD-based (Sum-of-Absolute-Diffrences) stereo correspondence algorithm.   *
*    Contributed by Kurt Konolige                                                        *
\****************************************************************************************/

#include "precomp.hpp"
#include <stdio.h>

wester committed
50 51 52
//#undef CV_SSE2
//#define CV_SSE2 0
//#include "emmintrin.h"
wester committed
53 54


wester committed
55
#include <limits>
wester committed
56

wester committed
57
CV_IMPL CvStereoBMState* cvCreateStereoBMState( int /*preset*/, int numberOfDisparities )
wester committed
58
{
wester committed
59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
    CvStereoBMState* state = (CvStereoBMState*)cvAlloc( sizeof(*state) );
    if( !state )
        return 0;

    state->preFilterType = CV_STEREO_BM_XSOBEL; //CV_STEREO_BM_NORMALIZED_RESPONSE;
    state->preFilterSize = 9;
    state->preFilterCap = 31;
    state->SADWindowSize = 15;
    state->minDisparity = 0;
    state->numberOfDisparities = numberOfDisparities > 0 ? numberOfDisparities : 64;
    state->textureThreshold = 10;
    state->uniquenessRatio = 15;
    state->speckleRange = state->speckleWindowSize = 0;
    state->trySmallerWindows = 0;
    state->roi1 = state->roi2 = cvRect(0,0,0,0);
    state->disp12MaxDiff = -1;

    state->preFilteredImg0 = state->preFilteredImg1 = state->slidingSumBuf =
    state->disp = state->cost = 0;

    return state;
}
wester committed
81

wester committed
82 83 84 85 86 87 88 89 90 91 92 93 94 95
CV_IMPL void cvReleaseStereoBMState( CvStereoBMState** state )
{
    if( !state )
        CV_Error( CV_StsNullPtr, "" );

    if( !*state )
        return;

    cvReleaseMat( &(*state)->preFilteredImg0 );
    cvReleaseMat( &(*state)->preFilteredImg1 );
    cvReleaseMat( &(*state)->slidingSumBuf );
    cvReleaseMat( &(*state)->disp );
    cvReleaseMat( &(*state)->cost );
    cvFree( state );
wester committed
96
}
wester committed
97 98 99

namespace cv
{
wester committed
100 101 102 103 104 105 106 107

static void prefilterNorm( const Mat& src, Mat& dst, int winsize, int ftzero, uchar* buf )
{
    int x, y, wsz2 = winsize/2;
    int* vsum = (int*)alignPtr(buf + (wsz2 + 1)*sizeof(vsum[0]), 32);
    int scale_g = winsize*winsize/8, scale_s = (1024 + scale_g)/(scale_g*2);
    const int OFS = 256*5, TABSZ = OFS*2 + 256;
    uchar tab[TABSZ];
wester committed
108
    const uchar* sptr = src.data;
wester committed
109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
    int srcstep = (int)src.step;
    Size size = src.size();

    scale_g *= scale_s;

    for( x = 0; x < TABSZ; x++ )
        tab[x] = (uchar)(x - OFS < -ftzero ? 0 : x - OFS > ftzero ? ftzero*2 : x - OFS + ftzero);

    for( x = 0; x < size.width; x++ )
        vsum[x] = (ushort)(sptr[x]*(wsz2 + 2));

    for( y = 1; y < wsz2; y++ )
    {
        for( x = 0; x < size.width; x++ )
            vsum[x] = (ushort)(vsum[x] + sptr[srcstep*y + x]);
    }

    for( y = 0; y < size.height; y++ )
    {
        const uchar* top = sptr + srcstep*MAX(y-wsz2-1,0);
        const uchar* bottom = sptr + srcstep*MIN(y+wsz2,size.height-1);
        const uchar* prev = sptr + srcstep*MAX(y-1,0);
        const uchar* curr = sptr + srcstep*y;
        const uchar* next = sptr + srcstep*MIN(y+1,size.height-1);
        uchar* dptr = dst.ptr<uchar>(y);
wester committed
134
        x = 0;
wester committed
135

wester committed
136
        for( ; x < size.width; x++ )
wester committed
137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170
            vsum[x] = (ushort)(vsum[x] + bottom[x] - top[x]);

        for( x = 0; x <= wsz2; x++ )
        {
            vsum[-x-1] = vsum[0];
            vsum[size.width+x] = vsum[size.width-1];
        }

        int sum = vsum[0]*(wsz2 + 1);
        for( x = 1; x <= wsz2; x++ )
            sum += vsum[x];

        int val = ((curr[0]*5 + curr[1] + prev[0] + next[0])*scale_g - sum*scale_s) >> 10;
        dptr[0] = tab[val + OFS];

        for( x = 1; x < size.width-1; x++ )
        {
            sum += vsum[x+wsz2] - vsum[x-wsz2-1];
            val = ((curr[x]*4 + curr[x-1] + curr[x+1] + prev[x] + next[x])*scale_g - sum*scale_s) >> 10;
            dptr[x] = tab[val + OFS];
        }

        sum += vsum[x+wsz2] - vsum[x-wsz2-1];
        val = ((curr[x]*5 + curr[x-1] + prev[x] + next[x])*scale_g - sum*scale_s) >> 10;
        dptr[x] = tab[val + OFS];
    }
}


static void
prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
{
    int x, y;
    const int OFS = 256*4, TABSZ = OFS*2 + 256;
a  
Kai Westerkamp committed
171
    uchar tab[TABSZ];
wester committed
172 173 174 175 176 177
    Size size = src.size();

    for( x = 0; x < TABSZ; x++ )
        tab[x] = (uchar)(x - OFS < -ftzero ? 0 : x - OFS > ftzero ? ftzero*2 : x - OFS + ftzero);
    uchar val0 = tab[0 + OFS];

a  
Kai Westerkamp committed
178 179
#if CV_SSE2
    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
wester committed
180 181 182 183 184 185 186 187 188 189 190 191 192 193
#endif

    for( y = 0; y < size.height-1; y += 2 )
    {
        const uchar* srow1 = src.ptr<uchar>(y);
        const uchar* srow0 = y > 0 ? srow1 - src.step : size.height > 1 ? srow1 + src.step : srow1;
        const uchar* srow2 = y < size.height-1 ? srow1 + src.step : size.height > 1 ? srow1 - src.step : srow1;
        const uchar* srow3 = y < size.height-2 ? srow1 + src.step*2 : srow1;
        uchar* dptr0 = dst.ptr<uchar>(y);
        uchar* dptr1 = dptr0 + dst.step;

        dptr0[0] = dptr0[size.width-1] = dptr1[0] = dptr1[size.width-1] = val0;
        x = 1;

wester committed
194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227
    #if CV_SSE2
        if( useSIMD )
        {
            __m128i z = _mm_setzero_si128(), ftz = _mm_set1_epi16((short)ftzero),
                ftz2 = _mm_set1_epi8(CV_CAST_8U(ftzero*2));
            for( ; x <= size.width-9; x += 8 )
            {
                __m128i c0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow0 + x - 1)), z);
                __m128i c1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow1 + x - 1)), z);
                __m128i d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow0 + x + 1)), z);
                __m128i d1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow1 + x + 1)), z);

                d0 = _mm_sub_epi16(d0, c0);
                d1 = _mm_sub_epi16(d1, c1);

                __m128i c2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x - 1)), z);
                __m128i c3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow3 + x - 1)), z);
                __m128i d2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x + 1)), z);
                __m128i d3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow3 + x + 1)), z);

                d2 = _mm_sub_epi16(d2, c2);
                d3 = _mm_sub_epi16(d3, c3);

                __m128i v0 = _mm_add_epi16(d0, _mm_add_epi16(d2, _mm_add_epi16(d1, d1)));
                __m128i v1 = _mm_add_epi16(d1, _mm_add_epi16(d3, _mm_add_epi16(d2, d2)));
                v0 = _mm_packus_epi16(_mm_add_epi16(v0, ftz), _mm_add_epi16(v1, ftz));
                v0 = _mm_min_epu8(v0, ftz2);

                _mm_storel_epi64((__m128i*)(dptr0 + x), v0);
                _mm_storel_epi64((__m128i*)(dptr1 + x), _mm_unpackhi_epi64(v0, v0));
            }
        }
    #endif
    #if CV_NEON
a  
Kai Westerkamp committed
228 229 230 231
        int16x8_t ftz = vdupq_n_s16 ((short) ftzero);
        uint8x8_t ftz2 = vdup_n_u8 (cv::saturate_cast<uchar>(ftzero*2));

        for(; x <=size.width-9; x += 8 )
wester committed
232
        {
a  
Kai Westerkamp committed
233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251
            uint8x8_t c0 = vld1_u8 (srow0 + x - 1);
            uint8x8_t c1 = vld1_u8 (srow1 + x - 1);
            uint8x8_t d0 = vld1_u8 (srow0 + x + 1);
            uint8x8_t d1 = vld1_u8 (srow1 + x + 1);

            int16x8_t t0 = vreinterpretq_s16_u16 (vsubl_u8 (d0, c0));
            int16x8_t t1 = vreinterpretq_s16_u16 (vsubl_u8 (d1, c1));

            uint8x8_t c2 = vld1_u8 (srow2 + x - 1);
            uint8x8_t c3 = vld1_u8 (srow3 + x - 1);
            uint8x8_t d2 = vld1_u8 (srow2 + x + 1);
            uint8x8_t d3 = vld1_u8 (srow3 + x + 1);

            int16x8_t t2 = vreinterpretq_s16_u16 (vsubl_u8 (d2, c2));
            int16x8_t t3 = vreinterpretq_s16_u16 (vsubl_u8 (d3, c3));

            int16x8_t v0 = vaddq_s16 (vaddq_s16 (t2, t0), vaddq_s16 (t1, t1));
            int16x8_t v1 = vaddq_s16 (vaddq_s16 (t3, t1), vaddq_s16 (t2, t2));

wester committed
252

a  
Kai Westerkamp committed
253 254 255 256 257 258 259 260 261
            uint8x8_t v0_u8 = vqmovun_s16 (vaddq_s16 (v0, ftz));
            uint8x8_t v1_u8 = vqmovun_s16 (vaddq_s16 (v1, ftz));
            v0_u8 =  vmin_u8 (v0_u8, ftz2);
            v1_u8 =  vmin_u8 (v1_u8, ftz2);
            vqmovun_s16 (vaddq_s16 (v1, ftz));

            vst1_u8 (dptr0 + x, v0_u8);
            vst1_u8 (dptr1 + x, v1_u8);
        }
wester committed
262
    #endif
wester committed
263 264 265 266

        for( ; x < size.width-1; x++ )
        {
            int d0 = srow0[x+1] - srow0[x-1], d1 = srow1[x+1] - srow1[x-1],
wester committed
267
                d2 = srow2[x+1] - srow2[x-1], d3 = srow3[x+1] - srow3[x-1];
wester committed
268 269 270 271 272 273 274
            int v0 = tab[d0 + d1*2 + d2 + OFS];
            int v1 = tab[d1 + d2*2 + d3 + OFS];
            dptr0[x] = (uchar)v0;
            dptr1[x] = (uchar)v1;
        }
    }

a  
Kai Westerkamp committed
275 276 277 278
#if CV_NEON
    uint8x16_t val0_16 = vdupq_n_u8 (val0);
#endif

wester committed
279 280 281 282
    for( ; y < size.height; y++ )
    {
        uchar* dptr = dst.ptr<uchar>(y);
        x = 0;
a  
Kai Westerkamp committed
283 284 285 286
    #if CV_NEON
        for(; x <= size.width-16; x+=16 )
            vst1q_u8 (dptr + x, val0_16);
    #endif
wester committed
287 288 289 290 291 292
        for(; x < size.width; x++ )
            dptr[x] = val0;
    }
}


a  
Kai Westerkamp committed
293
static const int DISPARITY_SHIFT = 4;
wester committed
294

a  
Kai Westerkamp committed
295 296
#if CV_SSE2
static void findStereoCorrespondenceBM_SSE2( const Mat& left, const Mat& right,
wester committed
297 298
                                             Mat& disp, Mat& cost, CvStereoBMState& state,
                                             uchar* buf, int _dy0, int _dy1 )
wester committed
299 300 301 302 303
{
    const int ALIGN = 16;
    int x, y, d;
    int wsz = state.SADWindowSize, wsz2 = wsz/2;
    int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1);
wester committed
304
    int ndisp = state.numberOfDisparities;
wester committed
305 306 307 308 309 310 311
    int mindisp = state.minDisparity;
    int lofs = MAX(ndisp - 1 + mindisp, 0);
    int rofs = -MIN(ndisp - 1 + mindisp, 0);
    int width = left.cols, height = left.rows;
    int width1 = width - rofs - ndisp + 1;
    int ftzero = state.preFilterCap;
    int textureThreshold = state.textureThreshold;
wester committed
312
    int uniquenessRatio = state.uniquenessRatio*256/100;
a  
Kai Westerkamp committed
313
    short FILTERED = (short)((mindisp - 1) << DISPARITY_SHIFT);
wester committed
314 315 316 317

    ushort *sad, *hsad0, *hsad, *hsad_sub;
    int *htext;
    uchar *cbuf0, *cbuf;
wester committed
318 319
    const uchar* lptr0 = left.data + lofs;
    const uchar* rptr0 = right.data + rofs;
wester committed
320
    const uchar *lptr, *lptr_sub, *rptr;
wester committed
321
    short* dptr = (short*)disp.data;
wester committed
322 323 324 325 326 327 328
    int sstep = (int)left.step;
    int dstep = (int)(disp.step/sizeof(dptr[0]));
    int cstep = (height + dy0 + dy1)*ndisp;
    short costbuf = 0;
    int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
    const int TABSZ = 256;
    uchar tab[TABSZ];
a  
Kai Westerkamp committed
329
    const __m128i d0_8 = _mm_setr_epi16(0,1,2,3,4,5,6,7), dd_8 = _mm_set1_epi16(8);
wester committed
330 331 332 333

    sad = (ushort*)alignPtr(buf + sizeof(sad[0]), ALIGN);
    hsad0 = (ushort*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN);
    htext = (int*)alignPtr((int*)(hsad0 + (height+dy1)*ndisp) + wsz2 + 2, ALIGN);
wester committed
334
    cbuf0 = (uchar*)alignPtr(htext + height + wsz2 + 2 + dy0*ndisp, ALIGN);
wester committed
335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351

    for( x = 0; x < TABSZ; x++ )
        tab[x] = (uchar)std::abs(x - ftzero);

    // initialize buffers
    memset( hsad0 - dy0*ndisp, 0, (height + dy0 + dy1)*ndisp*sizeof(hsad0[0]) );
    memset( htext - wsz2 - 1, 0, (height + wsz + 1)*sizeof(htext[0]) );

    for( x = -wsz2-1; x < wsz2; x++ )
    {
        hsad = hsad0 - dy0*ndisp; cbuf = cbuf0 + (x + wsz2 + 1)*cstep - dy0*ndisp;
        lptr = lptr0 + MIN(MAX(x, -lofs), width-lofs-1) - dy0*sstep;
        rptr = rptr0 + MIN(MAX(x, -rofs), width-rofs-ndisp) - dy0*sstep;

        for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep )
        {
            int lval = lptr[0];
a  
Kai Westerkamp committed
352
            __m128i lv = _mm_set1_epi8((char)lval), z = _mm_setzero_si128();
wester committed
353 354
            for( d = 0; d < ndisp; d += 16 )
            {
a  
Kai Westerkamp committed
355 356 357 358 359 360 361 362 363
                __m128i rv = _mm_loadu_si128((const __m128i*)(rptr + d));
                __m128i hsad_l = _mm_load_si128((__m128i*)(hsad + d));
                __m128i hsad_h = _mm_load_si128((__m128i*)(hsad + d + 8));
                __m128i diff = _mm_adds_epu8(_mm_subs_epu8(lv, rv), _mm_subs_epu8(rv, lv));
                _mm_store_si128((__m128i*)(cbuf + d), diff);
                hsad_l = _mm_add_epi16(hsad_l, _mm_unpacklo_epi8(diff,z));
                hsad_h = _mm_add_epi16(hsad_h, _mm_unpackhi_epi8(diff,z));
                _mm_store_si128((__m128i*)(hsad + d), hsad_l);
                _mm_store_si128((__m128i*)(hsad + d + 8), hsad_h);
wester committed
364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380
            }
            htext[y] += tab[lval];
        }
    }

    // initialize the left and right borders of the disparity map
    for( y = 0; y < height; y++ )
    {
        for( x = 0; x < lofs; x++ )
            dptr[y*dstep + x] = FILTERED;
        for( x = lofs + width1; x < width; x++ )
            dptr[y*dstep + x] = FILTERED;
    }
    dptr += lofs;

    for( x = 0; x < width1; x++, dptr++ )
    {
wester committed
381
        short* costptr = cost.data ? (short*)cost.data + lofs + x : &costbuf;
wester committed
382 383 384 385 386 387 388 389 390
        int x0 = x - wsz2 - 1, x1 = x + wsz2;
        const uchar* cbuf_sub = cbuf0 + ((x0 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
        cbuf = cbuf0 + ((x1 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
        hsad = hsad0 - dy0*ndisp;
        lptr_sub = lptr0 + MIN(MAX(x0, -lofs), width-1-lofs) - dy0*sstep;
        lptr = lptr0 + MIN(MAX(x1, -lofs), width-1-lofs) - dy0*sstep;
        rptr = rptr0 + MIN(MAX(x1, -rofs), width-ndisp-rofs) - dy0*sstep;

        for( y = -dy0; y < height + dy1; y++, cbuf += ndisp, cbuf_sub += ndisp,
wester committed
391
             hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep )
wester committed
392 393
        {
            int lval = lptr[0];
a  
Kai Westerkamp committed
394
            __m128i lv = _mm_set1_epi8((char)lval), z = _mm_setzero_si128();
wester committed
395 396
            for( d = 0; d < ndisp; d += 16 )
            {
a  
Kai Westerkamp committed
397 398 399 400 401 402 403 404 405 406 407 408
                __m128i rv = _mm_loadu_si128((const __m128i*)(rptr + d));
                __m128i hsad_l = _mm_load_si128((__m128i*)(hsad + d));
                __m128i hsad_h = _mm_load_si128((__m128i*)(hsad + d + 8));
                __m128i cbs = _mm_load_si128((const __m128i*)(cbuf_sub + d));
                __m128i diff = _mm_adds_epu8(_mm_subs_epu8(lv, rv), _mm_subs_epu8(rv, lv));
                __m128i diff_h = _mm_sub_epi16(_mm_unpackhi_epi8(diff, z), _mm_unpackhi_epi8(cbs, z));
                _mm_store_si128((__m128i*)(cbuf + d), diff);
                diff = _mm_sub_epi16(_mm_unpacklo_epi8(diff, z), _mm_unpacklo_epi8(cbs, z));
                hsad_h = _mm_add_epi16(hsad_h, diff_h);
                hsad_l = _mm_add_epi16(hsad_l, diff);
                _mm_store_si128((__m128i*)(hsad + d), hsad_l);
                _mm_store_si128((__m128i*)(hsad + d + 8), hsad_h);
wester committed
409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424
            }
            htext[y] += tab[lval] - tab[lptr_sub[0]];
        }

        // fill borders
        for( y = dy1; y <= wsz2; y++ )
            htext[height+y] = htext[height+dy1-1];
        for( y = -wsz2-1; y < -dy0; y++ )
            htext[y] = htext[-dy0];

        // initialize sums
        for( d = 0; d < ndisp; d++ )
            sad[d] = (ushort)(hsad0[d-ndisp*dy0]*(wsz2 + 2 - dy0));

        hsad = hsad0 + (1 - dy0)*ndisp;
        for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
a  
Kai Westerkamp committed
425
            for( d = 0; d < ndisp; d += 16 )
wester committed
426
            {
a  
Kai Westerkamp committed
427 428 429 430 431 432 433 434
                __m128i s0 = _mm_load_si128((__m128i*)(sad + d));
                __m128i s1 = _mm_load_si128((__m128i*)(sad + d + 8));
                __m128i t0 = _mm_load_si128((__m128i*)(hsad + d));
                __m128i t1 = _mm_load_si128((__m128i*)(hsad + d + 8));
                s0 = _mm_add_epi16(s0, t0);
                s1 = _mm_add_epi16(s1, t1);
                _mm_store_si128((__m128i*)(sad + d), s0);
                _mm_store_si128((__m128i*)(sad + d + 8), s1);
wester committed
435 436 437 438 439 440 441 442 443 444 445
            }
        int tsum = 0;
        for( y = -wsz2-1; y < wsz2; y++ )
            tsum += htext[y];

        // finally, start the real processing
        for( y = 0; y < height; y++ )
        {
            int minsad = INT_MAX, mind = -1;
            hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
            hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
a  
Kai Westerkamp committed
446 447
            __m128i minsad8 = _mm_set1_epi16(SHRT_MAX);
            __m128i mind8 = _mm_set1_epi16(0), d8 = d0_8, mask;
wester committed
448 449 450

            for( d = 0; d < ndisp; d += 16 )
            {
a  
Kai Westerkamp committed
451 452
                __m128i u0 = _mm_load_si128((__m128i*)(hsad_sub + d));
                __m128i u1 = _mm_load_si128((__m128i*)(hsad + d));
wester committed
453

a  
Kai Westerkamp committed
454 455
                __m128i v0 = _mm_load_si128((__m128i*)(hsad_sub + d + 8));
                __m128i v1 = _mm_load_si128((__m128i*)(hsad + d + 8));
wester committed
456

a  
Kai Westerkamp committed
457 458
                __m128i usad8 = _mm_load_si128((__m128i*)(sad + d));
                __m128i vsad8 = _mm_load_si128((__m128i*)(sad + d + 8));
wester committed
459

a  
Kai Westerkamp committed
460 461 462 463
                u1 = _mm_sub_epi16(u1, u0);
                v1 = _mm_sub_epi16(v1, v0);
                usad8 = _mm_add_epi16(usad8, u1);
                vsad8 = _mm_add_epi16(vsad8, v1);
wester committed
464

a  
Kai Westerkamp committed
465 466 467
                mask = _mm_cmpgt_epi16(minsad8, usad8);
                minsad8 = _mm_min_epi16(minsad8, usad8);
                mind8 = _mm_max_epi16(mind8, _mm_and_si128(mask, d8));
wester committed
468

a  
Kai Westerkamp committed
469 470
                _mm_store_si128((__m128i*)(sad + d), usad8);
                _mm_store_si128((__m128i*)(sad + d + 8), vsad8);
wester committed
471

a  
Kai Westerkamp committed
472 473
                mask = _mm_cmpgt_epi16(minsad8, vsad8);
                minsad8 = _mm_min_epi16(minsad8, vsad8);
wester committed
474

a  
Kai Westerkamp committed
475 476 477
                d8 = _mm_add_epi16(d8, dd_8);
                mind8 = _mm_max_epi16(mind8, _mm_and_si128(mask, d8));
                d8 = _mm_add_epi16(d8, dd_8);
wester committed
478 479 480 481 482 483 484 485 486
            }

            tsum += htext[y + wsz2] - htext[y - wsz2 - 1];
            if( tsum < textureThreshold )
            {
                dptr[y*dstep] = FILTERED;
                continue;
            }

wester committed
487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504
            __m128i minsad82 = _mm_unpackhi_epi64(minsad8, minsad8);
            __m128i mind82 = _mm_unpackhi_epi64(mind8, mind8);
            mask = _mm_cmpgt_epi16(minsad8, minsad82);
            mind8 = _mm_xor_si128(mind8,_mm_and_si128(_mm_xor_si128(mind82,mind8),mask));
            minsad8 = _mm_min_epi16(minsad8, minsad82);

            minsad82 = _mm_shufflelo_epi16(minsad8, _MM_SHUFFLE(3,2,3,2));
            mind82 = _mm_shufflelo_epi16(mind8, _MM_SHUFFLE(3,2,3,2));
            mask = _mm_cmpgt_epi16(minsad8, minsad82);
            mind8 = _mm_xor_si128(mind8,_mm_and_si128(_mm_xor_si128(mind82,mind8),mask));
            minsad8 = _mm_min_epi16(minsad8, minsad82);

            minsad82 = _mm_shufflelo_epi16(minsad8, 1);
            mind82 = _mm_shufflelo_epi16(mind8, 1);
            mask = _mm_cmpgt_epi16(minsad8, minsad82);
            mind8 = _mm_xor_si128(mind8,_mm_and_si128(_mm_xor_si128(mind82,mind8),mask));
            mind = (short)_mm_cvtsi128_si32(mind8);
            minsad = sad[mind];
wester committed
505 506 507

            if( uniquenessRatio > 0 )
            {
wester committed
508
                int thresh = minsad + ((minsad * uniquenessRatio) >> 8);
a  
Kai Westerkamp committed
509 510 511
                __m128i thresh8 = _mm_set1_epi16((short)(thresh + 1));
                __m128i d1 = _mm_set1_epi16((short)(mind-1)), d2 = _mm_set1_epi16((short)(mind+1));
                __m128i dd_16 = _mm_add_epi16(dd_8, dd_8);
wester committed
512
                 d8 = _mm_sub_epi16(d0_8, dd_16);
wester committed
513

a  
Kai Westerkamp committed
514
                for( d = 0; d < ndisp; d += 16 )
wester committed
515
                {
a  
Kai Westerkamp committed
516 517 518 519 520 521 522 523 524
                    __m128i usad8 = _mm_load_si128((__m128i*)(sad + d));
                    __m128i vsad8 = _mm_load_si128((__m128i*)(sad + d + 8));
                    mask = _mm_cmpgt_epi16( thresh8, _mm_min_epi16(usad8,vsad8));
                    d8 = _mm_add_epi16(d8, dd_16);
                    if( !_mm_movemask_epi8(mask) )
                        continue;
                    mask = _mm_cmpgt_epi16( thresh8, usad8);
                    mask = _mm_and_si128(mask, _mm_or_si128(_mm_cmpgt_epi16(d1,d8), _mm_cmpgt_epi16(d8,d2)));
                    if( _mm_movemask_epi8(mask) )
wester committed
525
                        break;
a  
Kai Westerkamp committed
526 527 528 529
                    __m128i t8 = _mm_add_epi16(d8, dd_8);
                    mask = _mm_cmpgt_epi16( thresh8, vsad8);
                    mask = _mm_and_si128(mask, _mm_or_si128(_mm_cmpgt_epi16(d1,t8), _mm_cmpgt_epi16(t8,d2)));
                    if( _mm_movemask_epi8(mask) )
wester committed
530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554
                        break;
                }
                if( d < ndisp )
                {
                    dptr[y*dstep] = FILTERED;
                    continue;
                }
            }

            if( 0 < mind && mind < ndisp - 1 )
            {
                int p = sad[mind+1], n = sad[mind-1];
                d = p + n - 2*sad[mind] + std::abs(p - n);
                dptr[y*dstep] = (short)(((ndisp - mind - 1 + mindisp)*256 + (d != 0 ? (p-n)*256/d : 0) + 15) >> 4);
            }
            else
                dptr[y*dstep] = (short)((ndisp - mind - 1 + mindisp)*16);
            costptr[y*coststep] = sad[mind];
        }
    }
}
#endif

static void
findStereoCorrespondenceBM( const Mat& left, const Mat& right,
wester committed
555 556
                            Mat& disp, Mat& cost, const CvStereoBMState& state,
                            uchar* buf, int _dy0, int _dy1 )
wester committed
557 558 559 560 561 562
{

    const int ALIGN = 16;
    int x, y, d;
    int wsz = state.SADWindowSize, wsz2 = wsz/2;
    int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1);
wester committed
563
    int ndisp = state.numberOfDisparities;
wester committed
564 565 566 567 568 569 570 571
    int mindisp = state.minDisparity;
    int lofs = MAX(ndisp - 1 + mindisp, 0);
    int rofs = -MIN(ndisp - 1 + mindisp, 0);
    int width = left.cols, height = left.rows;
    int width1 = width - rofs - ndisp + 1;
    int ftzero = state.preFilterCap;
    int textureThreshold = state.textureThreshold;
    int uniquenessRatio = state.uniquenessRatio;
a  
Kai Westerkamp committed
572 573 574 575 576 577 578 579 580
    short FILTERED = (short)((mindisp - 1) << DISPARITY_SHIFT);

#if CV_NEON
    CV_Assert (ndisp % 8 == 0);
    int32_t d0_4_temp [4];
    for (int i = 0; i < 4; i ++)
        d0_4_temp[i] = i;
    int32x4_t d0_4 = vld1q_s32 (d0_4_temp);
    int32x4_t dd_4 = vdupq_n_s32 (4);
wester committed
581 582 583 584
#endif

    int *sad, *hsad0, *hsad, *hsad_sub, *htext;
    uchar *cbuf0, *cbuf;
wester committed
585 586
    const uchar* lptr0 = left.data + lofs;
    const uchar* rptr0 = right.data + rofs;
wester committed
587
    const uchar *lptr, *lptr_sub, *rptr;
wester committed
588
    short* dptr = (short*)disp.data;
wester committed
589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613
    int sstep = (int)left.step;
    int dstep = (int)(disp.step/sizeof(dptr[0]));
    int cstep = (height+dy0+dy1)*ndisp;
    int costbuf = 0;
    int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
    const int TABSZ = 256;
    uchar tab[TABSZ];

    sad = (int*)alignPtr(buf + sizeof(sad[0]), ALIGN);
    hsad0 = (int*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN);
    htext = (int*)alignPtr((int*)(hsad0 + (height+dy1)*ndisp) + wsz2 + 2, ALIGN);
    cbuf0 = (uchar*)alignPtr((uchar*)(htext + height + wsz2 + 2) + dy0*ndisp, ALIGN);

    for( x = 0; x < TABSZ; x++ )
        tab[x] = (uchar)std::abs(x - ftzero);

    // initialize buffers
    memset( hsad0 - dy0*ndisp, 0, (height + dy0 + dy1)*ndisp*sizeof(hsad0[0]) );
    memset( htext - wsz2 - 1, 0, (height + wsz + 1)*sizeof(htext[0]) );

    for( x = -wsz2-1; x < wsz2; x++ )
    {
        hsad = hsad0 - dy0*ndisp; cbuf = cbuf0 + (x + wsz2 + 1)*cstep - dy0*ndisp;
        lptr = lptr0 + std::min(std::max(x, -lofs), width-lofs-1) - dy0*sstep;
        rptr = rptr0 + std::min(std::max(x, -rofs), width-rofs-ndisp) - dy0*sstep;
wester committed
614

wester committed
615 616 617
        for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep )
        {
            int lval = lptr[0];
a  
Kai Westerkamp committed
618 619
        #if CV_NEON
            int16x8_t lv = vdupq_n_s16 ((int16_t)lval);
wester committed
620

a  
Kai Westerkamp committed
621 622 623 624 625 626 627 628 629 630 631
            for( d = 0; d < ndisp; d += 8 )
            {
                int16x8_t rv = vreinterpretq_s16_u16 (vmovl_u8 (vld1_u8 (rptr + d)));
                int32x4_t hsad_l = vld1q_s32 (hsad + d);
                int32x4_t hsad_h = vld1q_s32 (hsad + d + 4);
                int16x8_t diff = vabdq_s16 (lv, rv);
                vst1_u8 (cbuf + d, vmovn_u16(vreinterpretq_u16_s16(diff)));
                hsad_l = vaddq_s32 (hsad_l, vmovl_s16(vget_low_s16 (diff)));
                hsad_h = vaddq_s32 (hsad_h, vmovl_s16(vget_high_s16 (diff)));
                vst1q_s32 ((hsad + d), hsad_l);
                vst1q_s32 ((hsad + d + 4), hsad_h);
wester committed
632
            }
a  
Kai Westerkamp committed
633 634
        #else
            for( d = 0; d < ndisp; d++ )
wester committed
635 636 637 638 639
            {
                int diff = std::abs(lval - rptr[d]);
                cbuf[d] = (uchar)diff;
                hsad[d] = (int)(hsad[d] + diff);
            }
a  
Kai Westerkamp committed
640
        #endif
wester committed
641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656
            htext[y] += tab[lval];
        }
    }

    // initialize the left and right borders of the disparity map
    for( y = 0; y < height; y++ )
    {
        for( x = 0; x < lofs; x++ )
            dptr[y*dstep + x] = FILTERED;
        for( x = lofs + width1; x < width; x++ )
            dptr[y*dstep + x] = FILTERED;
    }
    dptr += lofs;

    for( x = 0; x < width1; x++, dptr++ )
    {
wester committed
657
        int* costptr = cost.data ? (int*)cost.data + lofs + x : &costbuf;
wester committed
658 659 660 661 662 663 664 665 666
        int x0 = x - wsz2 - 1, x1 = x + wsz2;
        const uchar* cbuf_sub = cbuf0 + ((x0 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
        cbuf = cbuf0 + ((x1 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
        hsad = hsad0 - dy0*ndisp;
        lptr_sub = lptr0 + MIN(MAX(x0, -lofs), width-1-lofs) - dy0*sstep;
        lptr = lptr0 + MIN(MAX(x1, -lofs), width-1-lofs) - dy0*sstep;
        rptr = rptr0 + MIN(MAX(x1, -rofs), width-ndisp-rofs) - dy0*sstep;

        for( y = -dy0; y < height + dy1; y++, cbuf += ndisp, cbuf_sub += ndisp,
wester committed
667
             hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep )
wester committed
668 669
        {
            int lval = lptr[0];
a  
Kai Westerkamp committed
670 671 672
        #if CV_NEON
            int16x8_t lv = vdupq_n_s16 ((int16_t)lval);
            for( d = 0; d < ndisp; d += 8 )
wester committed
673
            {
a  
Kai Westerkamp committed
674 675 676 677 678 679 680 681 682 683 684 685
                int16x8_t rv = vreinterpretq_s16_u16 (vmovl_u8 (vld1_u8 (rptr + d)));
                int32x4_t hsad_l = vld1q_s32 (hsad + d);
                int32x4_t hsad_h = vld1q_s32 (hsad + d + 4);
                int16x8_t cbs = vreinterpretq_s16_u16 (vmovl_u8 (vld1_u8 (cbuf_sub + d)));
                int16x8_t diff = vabdq_s16 (lv, rv);
                int32x4_t diff_h = vsubl_s16 (vget_high_s16 (diff), vget_high_s16 (cbs));
                int32x4_t diff_l = vsubl_s16 (vget_low_s16 (diff), vget_low_s16 (cbs));
                vst1_u8 (cbuf + d, vmovn_u16(vreinterpretq_u16_s16(diff)));
                hsad_h = vaddq_s32 (hsad_h, diff_h);
                hsad_l = vaddq_s32 (hsad_l, diff_l);
                vst1q_s32 ((hsad + d), hsad_l);
                vst1q_s32 ((hsad + d + 4), hsad_h);
wester committed
686
            }
a  
Kai Westerkamp committed
687 688
        #else
            for( d = 0; d < ndisp; d++ )
wester committed
689 690 691 692 693
            {
                int diff = std::abs(lval - rptr[d]);
                cbuf[d] = (uchar)diff;
                hsad[d] = hsad[d] + diff - cbuf_sub[d];
            }
a  
Kai Westerkamp committed
694
        #endif
wester committed
695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710
            htext[y] += tab[lval] - tab[lptr_sub[0]];
        }

        // fill borders
        for( y = dy1; y <= wsz2; y++ )
            htext[height+y] = htext[height+dy1-1];
        for( y = -wsz2-1; y < -dy0; y++ )
            htext[y] = htext[-dy0];

        // initialize sums
        for( d = 0; d < ndisp; d++ )
            sad[d] = (int)(hsad0[d-ndisp*dy0]*(wsz2 + 2 - dy0));

        hsad = hsad0 + (1 - dy0)*ndisp;
        for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
        {
a  
Kai Westerkamp committed
711 712
        #if CV_NEON
            for( d = 0; d <= ndisp-8; d += 8 )
wester committed
713
            {
a  
Kai Westerkamp committed
714 715 716 717 718 719 720 721
                int32x4_t s0 = vld1q_s32 (sad + d);
                int32x4_t s1 = vld1q_s32 (sad + d + 4);
                int32x4_t t0 = vld1q_s32 (hsad + d);
                int32x4_t t1 = vld1q_s32 (hsad + d + 4);
                s0 = vaddq_s32 (s0, t0);
                s1 = vaddq_s32 (s1, t1);
                vst1q_s32 (sad + d, s0);
                vst1q_s32 (sad + d + 4, s1);
wester committed
722
            }
a  
Kai Westerkamp committed
723 724
        #else
            for( d = 0; d < ndisp; d++ )
wester committed
725
                sad[d] = (int)(sad[d] + hsad[d]);
a  
Kai Westerkamp committed
726
        #endif
wester committed
727 728 729 730 731 732 733 734 735 736 737
        }
        int tsum = 0;
        for( y = -wsz2-1; y < wsz2; y++ )
            tsum += htext[y];

        // finally, start the real processing
        for( y = 0; y < height; y++ )
        {
            int minsad = INT_MAX, mind = -1;
            hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
            hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
a  
Kai Westerkamp committed
738 739 740 741 742
        #if CV_NEON
            int32x4_t minsad4 = vdupq_n_s32 (INT_MAX);
            int32x4_t mind4 = vdupq_n_s32(0), d4 = d0_4;

            for( d = 0; d <= ndisp-8; d += 8 )
wester committed
743
            {
a  
Kai Westerkamp committed
744 745
                int32x4_t u0 = vld1q_s32 (hsad_sub + d);
                int32x4_t u1 = vld1q_s32 (hsad + d);
wester committed
746

a  
Kai Westerkamp committed
747 748
                int32x4_t v0 = vld1q_s32 (hsad_sub + d + 4);
                int32x4_t v1 = vld1q_s32 (hsad + d + 4);
wester committed
749

a  
Kai Westerkamp committed
750 751
                int32x4_t usad4 = vld1q_s32(sad + d);
                int32x4_t vsad4 = vld1q_s32(sad + d + 4);
wester committed
752

a  
Kai Westerkamp committed
753 754 755 756
                u1 = vsubq_s32 (u1, u0);
                v1 = vsubq_s32 (v1, v0);
                usad4 = vaddq_s32 (usad4, u1);
                vsad4 = vaddq_s32 (vsad4, v1);
wester committed
757

a  
Kai Westerkamp committed
758 759 760
                uint32x4_t mask = vcgtq_s32 (minsad4, usad4);
                minsad4 = vminq_s32 (minsad4, usad4);
                mind4 = vbslq_s32(mask, d4, mind4);
wester committed
761

a  
Kai Westerkamp committed
762 763 764
                vst1q_s32 (sad + d, usad4);
                vst1q_s32 (sad + d + 4, vsad4);
                d4 = vaddq_s32 (d4, dd_4);
wester committed
765

a  
Kai Westerkamp committed
766 767 768
                mask = vcgtq_s32 (minsad4, vsad4);
                minsad4 = vminq_s32 (minsad4, vsad4);
                mind4 = vbslq_s32(mask, d4, mind4);
wester committed
769

a  
Kai Westerkamp committed
770
                d4 = vaddq_s32 (d4, dd_4);
wester committed
771 772

            }
a  
Kai Westerkamp committed
773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793
            int32x2_t mind4_h = vget_high_s32 (mind4);
            int32x2_t mind4_l = vget_low_s32 (mind4);
            int32x2_t minsad4_h = vget_high_s32 (minsad4);
            int32x2_t minsad4_l = vget_low_s32 (minsad4);

            uint32x2_t mask = vorr_u32 (vclt_s32 (minsad4_h, minsad4_l), vand_u32 (vceq_s32 (minsad4_h, minsad4_l), vclt_s32 (mind4_h, mind4_l)));
            mind4_h = vbsl_s32 (mask, mind4_h, mind4_l);
            minsad4_h = vbsl_s32 (mask, minsad4_h, minsad4_l);

            mind4_l = vext_s32 (mind4_h,mind4_h,1);
            minsad4_l = vext_s32 (minsad4_h,minsad4_h,1);

            mask = vorr_u32 (vclt_s32 (minsad4_h, minsad4_l), vand_u32 (vceq_s32 (minsad4_h, minsad4_l), vclt_s32 (mind4_h, mind4_l)));
            mind4_h = vbsl_s32 (mask, mind4_h, mind4_l);
            minsad4_h = vbsl_s32 (mask, minsad4_h, minsad4_l);

            mind = (int) vget_lane_s32 (mind4_h, 0);
            minsad = sad[mind];

        #else
            for( d = 0; d < ndisp; d++ )
wester committed
794 795 796 797 798 799 800 801 802
            {
                int currsad = sad[d] + hsad[d] - hsad_sub[d];
                sad[d] = currsad;
                if( currsad < minsad )
                {
                    minsad = currsad;
                    mind = d;
                }
            }
a  
Kai Westerkamp committed
803
        #endif
wester committed
804 805 806 807 808 809 810 811 812 813 814 815
            tsum += htext[y + wsz2] - htext[y - wsz2 - 1];
            if( tsum < textureThreshold )
            {
                dptr[y*dstep] = FILTERED;
                continue;
            }

            if( uniquenessRatio > 0 )
            {
                int thresh = minsad + (minsad * uniquenessRatio/100);
                for( d = 0; d < ndisp; d++ )
                {
wester committed
816
                    if( sad[d] <= thresh && (d < mind-1 || d > mind+1))
wester committed
817 818 819 820 821 822 823 824 825 826
                        break;
                }
                if( d < ndisp )
                {
                    dptr[y*dstep] = FILTERED;
                    continue;
                }
            }

            {
wester committed
827 828 829 830 831 832
            sad[-1] = sad[1];
            sad[ndisp] = sad[ndisp-2];
            int p = sad[mind+1], n = sad[mind-1];
            d = p + n - 2*sad[mind] + std::abs(p - n);
            dptr[y*dstep] = (short)(((ndisp - mind - 1 + mindisp)*256 + (d != 0 ? (p-n)*256/d : 0) + 15) >> 4);
            costptr[y*coststep] = sad[mind];
wester committed
833 834 835 836 837
            }
        }
    }
}

wester committed
838
struct PrefilterInvoker
wester committed
839 840
{
    PrefilterInvoker(const Mat& left0, const Mat& right0, Mat& left, Mat& right,
wester committed
841
                     uchar* buf0, uchar* buf1, CvStereoBMState* _state )
wester committed
842 843 844 845 846 847 848
    {
        imgs0[0] = &left0; imgs0[1] = &right0;
        imgs[0] = &left; imgs[1] = &right;
        buf[0] = buf0; buf[1] = buf1;
        state = _state;
    }

wester committed
849
    void operator()( int ind ) const
wester committed
850
    {
wester committed
851 852 853 854
        if( state->preFilterType == CV_STEREO_BM_NORMALIZED_RESPONSE )
            prefilterNorm( *imgs0[ind], *imgs[ind], state->preFilterSize, state->preFilterCap, buf[ind] );
        else
            prefilterXSobel( *imgs0[ind], *imgs[ind], state->preFilterCap );
wester committed
855 856 857 858 859
    }

    const Mat* imgs0[2];
    Mat* imgs[2];
    uchar* buf[2];
wester committed
860
    CvStereoBMState *state;
wester committed
861 862 863
};


wester committed
864
struct FindStereoCorrespInvoker : ParallelLoopBody
wester committed
865 866
{
    FindStereoCorrespInvoker( const Mat& _left, const Mat& _right,
wester committed
867 868 869
                              Mat& _disp, CvStereoBMState* _state,
                              int _nstripes, int _stripeBufSize,
                              bool _useShorts, Rect _validDisparityRect )
wester committed
870 871 872 873 874 875 876 877 878 879 880
    {
        left = &_left; right = &_right;
        disp = &_disp; state = _state;
        nstripes = _nstripes; stripeBufSize = _stripeBufSize;
        useShorts = _useShorts;
        validDisparityRect = _validDisparityRect;
    }

    void operator()( const Range& range ) const
    {
        int cols = left->cols, rows = left->rows;
wester committed
881 882 883
        int _row0 = min(cvRound(range.start * rows / nstripes), rows);
        int _row1 = min(cvRound(range.end * rows / nstripes), rows);
        uchar *ptr = state->slidingSumBuf->data.ptr + range.start * stripeBufSize;
wester committed
884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906
        int FILTERED = (state->minDisparity - 1)*16;

        Rect roi = validDisparityRect & Rect(0, _row0, cols, _row1 - _row0);
        if( roi.height == 0 )
            return;
        int row0 = roi.y;
        int row1 = roi.y + roi.height;

        Mat part;
        if( row0 > _row0 )
        {
            part = disp->rowRange(_row0, row0);
            part = Scalar::all(FILTERED);
        }
        if( _row1 > row1 )
        {
            part = disp->rowRange(row1, _row1);
            part = Scalar::all(FILTERED);
        }

        Mat left_i = left->rowRange(row0, row1);
        Mat right_i = right->rowRange(row0, row1);
        Mat disp_i = disp->rowRange(row0, row1);
wester committed
907
        Mat cost_i = state->disp12MaxDiff >= 0 ? Mat(state->cost).rowRange(row0, row1) : Mat();
wester committed
908

a  
Kai Westerkamp committed
909 910 911
#if CV_SSE2
        if( useShorts )
            findStereoCorrespondenceBM_SSE2( left_i, right_i, disp_i, cost_i, *state, ptr, row0, rows - row1 );
wester committed
912 913
        else
#endif
a  
Kai Westerkamp committed
914
            findStereoCorrespondenceBM( left_i, right_i, disp_i, cost_i, *state, ptr, row0, rows - row1 );
wester committed
915 916

        if( state->disp12MaxDiff >= 0 )
wester committed
917
            validateDisparity( disp_i, cost_i, state->minDisparity, state->numberOfDisparities, state->disp12MaxDiff );
wester committed
918 919 920 921 922 923 924 925 926 927 928 929 930 931 932

        if( roi.x > 0 )
        {
            part = disp_i.colRange(0, roi.x);
            part = Scalar::all(FILTERED);
        }
        if( roi.x + roi.width < cols )
        {
            part = disp_i.colRange(roi.x + roi.width, cols);
            part = Scalar::all(FILTERED);
        }
    }

protected:
    const Mat *left, *right;
wester committed
933 934
    Mat* disp;
    CvStereoBMState *state;
wester committed
935 936

    int nstripes;
wester committed
937
    int stripeBufSize;
wester committed
938 939 940 941
    bool useShorts;
    Rect validDisparityRect;
};

wester committed
942
static void findStereoCorrespondenceBM( const Mat& left0, const Mat& right0, Mat& disp0, CvStereoBMState* state)
wester committed
943
{
wester committed
944 945
    if (left0.size() != right0.size() || disp0.size() != left0.size())
        CV_Error( CV_StsUnmatchedSizes, "All the images must have the same size" );
wester committed
946

wester committed
947 948
    if (left0.type() != CV_8UC1 || right0.type() != CV_8UC1)
        CV_Error( CV_StsUnsupportedFormat, "Both input images must have CV_8UC1" );
wester committed
949

wester committed
950 951
    if (disp0.type() != CV_16SC1 && disp0.type() != CV_32FC1)
        CV_Error( CV_StsUnsupportedFormat, "Disparity image must have CV_16SC1 or CV_32FC1 format" );
wester committed
952

wester committed
953 954
    if( !state )
        CV_Error( CV_StsNullPtr, "Stereo BM state is NULL." );
wester committed
955

wester committed
956 957
    if( state->preFilterType != CV_STEREO_BM_NORMALIZED_RESPONSE && state->preFilterType != CV_STEREO_BM_XSOBEL )
        CV_Error( CV_StsOutOfRange, "preFilterType must be = CV_STEREO_BM_NORMALIZED_RESPONSE" );
wester committed
958

wester committed
959 960
    if( state->preFilterSize < 5 || state->preFilterSize > 255 || state->preFilterSize % 2 == 0 )
        CV_Error( CV_StsOutOfRange, "preFilterSize must be odd and be within 5..255" );
wester committed
961

wester committed
962 963
    if( state->preFilterCap < 1 || state->preFilterCap > 63 )
        CV_Error( CV_StsOutOfRange, "preFilterCap must be within 1..63" );
wester committed
964

wester committed
965 966 967
    if( state->SADWindowSize < 5 || state->SADWindowSize > 255 || state->SADWindowSize % 2 == 0 ||
        state->SADWindowSize >= min(left0.cols, left0.rows) )
        CV_Error( CV_StsOutOfRange, "SADWindowSize must be odd, be within 5..255 and be not larger than image width or height" );
wester committed
968

wester committed
969 970
    if( state->numberOfDisparities <= 0 || state->numberOfDisparities % 16 != 0 )
        CV_Error( CV_StsOutOfRange, "numberOfDisparities must be positive and divisble by 16" );
wester committed
971

wester committed
972 973
    if( state->textureThreshold < 0 )
        CV_Error( CV_StsOutOfRange, "texture threshold must be non-negative" );
wester committed
974

wester committed
975 976
    if( state->uniquenessRatio < 0 )
        CV_Error( CV_StsOutOfRange, "uniqueness ratio must be non-negative" );
wester committed
977

wester committed
978 979 980 981 982
    if( !state->preFilteredImg0 || state->preFilteredImg0->cols * state->preFilteredImg0->rows < left0.cols * left0.rows )
    {
        cvReleaseMat( &state->preFilteredImg0 );
        cvReleaseMat( &state->preFilteredImg1 );
        cvReleaseMat( &state->cost );
wester committed
983

wester committed
984 985 986 987 988 989
        state->preFilteredImg0 = cvCreateMat( left0.rows, left0.cols, CV_8U );
        state->preFilteredImg1 = cvCreateMat( left0.rows, left0.cols, CV_8U );
        state->cost = cvCreateMat( left0.rows, left0.cols, CV_16S );
    }
    Mat left(left0.size(), CV_8U, state->preFilteredImg0->data.ptr);
    Mat right(right0.size(), CV_8U, state->preFilteredImg1->data.ptr);
wester committed
990

wester committed
991 992
    int mindisp = state->minDisparity;
    int ndisp = state->numberOfDisparities;
wester committed
993

wester committed
994 995 996 997 998 999
    int width = left0.cols;
    int height = left0.rows;
    int lofs = max(ndisp - 1 + mindisp, 0);
    int rofs = -min(ndisp - 1 + mindisp, 0);
    int width1 = width - rofs - ndisp + 1;
    int FILTERED = (state->minDisparity - 1) << DISPARITY_SHIFT;
wester committed
1000

wester committed
1001 1002 1003 1004 1005
    if( lofs >= width || rofs >= width || width1 < 1 )
    {
        disp0 = Scalar::all( FILTERED * ( disp0.type() < CV_32F ? 1 : 1./(1 << DISPARITY_SHIFT) ) );
        return;
    }
wester committed
1006

wester committed
1007
    Mat disp = disp0;
wester committed
1008

wester committed
1009 1010 1011
    if( disp0.type() == CV_32F)
    {
        if( !state->disp || state->disp->rows != disp0.rows || state->disp->cols != disp0.cols )
wester committed
1012
        {
wester committed
1013 1014
            cvReleaseMat( &state->disp );
            state->disp = cvCreateMat(disp0.rows, disp0.cols, CV_16S);
wester committed
1015
        }
wester committed
1016 1017
        disp = cv::cvarrToMat(state->disp);
    }
wester committed
1018

wester committed
1019 1020 1021 1022 1023
    int wsz = state->SADWindowSize;
    int bufSize0 = (int)((ndisp + 2)*sizeof(int));
    bufSize0 += (int)((height+wsz+2)*ndisp*sizeof(int));
    bufSize0 += (int)((height + wsz + 2)*sizeof(int));
    bufSize0 += (int)((height+wsz+2)*ndisp*(wsz+2)*sizeof(uchar) + 256);
wester committed
1024

wester committed
1025 1026 1027 1028
    int bufSize1 = (int)((width + state->preFilterSize + 2) * sizeof(int) + 256);
    int bufSize2 = 0;
    if( state->speckleRange >= 0 && state->speckleWindowSize > 0 )
        bufSize2 = width*height*(sizeof(cv::Point_<short>) + sizeof(int) + sizeof(uchar));
wester committed
1029

a  
Kai Westerkamp committed
1030
#if CV_SSE2
wester committed
1031
    bool useShorts = state->preFilterCap <= 31 && state->SADWindowSize <= 21 && checkHardwareSupport(CV_CPU_SSE2);
a  
Kai Westerkamp committed
1032
#else
wester committed
1033
    const bool useShorts = false;
a  
Kai Westerkamp committed
1034 1035
#endif

wester committed
1036 1037 1038 1039
    const double SAD_overhead_coeff = 10.0;
    double N0 = 8000000 / (useShorts ? 1 : 4);  // approx tbb's min number instructions reasonable for one thread
    double maxStripeSize = min(max(N0 / (width * ndisp), (wsz-1) * SAD_overhead_coeff), (double)height);
    int nstripes = cvCeil(height / maxStripeSize);
wester committed
1040

wester committed
1041
    int bufSize = max(bufSize0 * nstripes, max(bufSize1 * 2, bufSize2));
wester committed
1042

wester committed
1043 1044 1045 1046
    if( !state->slidingSumBuf || state->slidingSumBuf->cols < bufSize )
    {
        cvReleaseMat( &state->slidingSumBuf );
        state->slidingSumBuf = cvCreateMat( 1, bufSize, CV_8U );
wester committed
1047 1048
    }

wester committed
1049 1050 1051
    uchar *_buf = state->slidingSumBuf->data.ptr;
    int idx[] = {0,1};
    parallel_do(idx, idx+2, PrefilterInvoker(left0, right0, left, right, _buf, _buf + bufSize1, state));
wester committed
1052

wester committed
1053 1054 1055 1056 1057
    Rect validDisparityRect(0, 0, width, height), R1 = state->roi1, R2 = state->roi2;
    validDisparityRect = getValidDisparityROI(R1.area() > 0 ? Rect(0, 0, width, height) : validDisparityRect,
                                              R2.area() > 0 ? Rect(0, 0, width, height) : validDisparityRect,
                                              state->minDisparity, state->numberOfDisparities,
                                              state->SADWindowSize);
wester committed
1058

wester committed
1059 1060 1061
    parallel_for_(Range(0, nstripes),
                  FindStereoCorrespInvoker(left, right, disp, state, nstripes,
                                           bufSize0, useShorts, validDisparityRect));
wester committed
1062

wester committed
1063
    if( state->speckleRange >= 0 && state->speckleWindowSize > 0 )
wester committed
1064
    {
wester committed
1065 1066
        Mat buf(state->slidingSumBuf);
        filterSpeckles(disp, FILTERED, state->speckleWindowSize, state->speckleRange, buf);
wester committed
1067 1068
    }

wester committed
1069 1070 1071
    if (disp0.data != disp.data)
        disp.convertTo(disp0, disp0.type(), 1./(1 << DISPARITY_SHIFT), 0);
}
wester committed
1072

wester committed
1073 1074
StereoBM::StereoBM()
{ state = cvCreateStereoBMState(); }
wester committed
1075

wester committed
1076 1077
StereoBM::StereoBM(int _preset, int _ndisparities, int _SADWindowSize)
{ init(_preset, _ndisparities, _SADWindowSize); }
wester committed
1078

wester committed
1079 1080 1081 1082 1083
void StereoBM::init(int _preset, int _ndisparities, int _SADWindowSize)
{
    state = cvCreateStereoBMState(_preset, _ndisparities);
    state->SADWindowSize = _SADWindowSize;
}
wester committed
1084

wester committed
1085 1086
void StereoBM::operator()( InputArray _left, InputArray _right,
                           OutputArray _disparity, int disptype )
wester committed
1087
{
wester committed
1088 1089 1090 1091 1092 1093
    Mat left = _left.getMat(), right = _right.getMat();
    CV_Assert( disptype == CV_16S || disptype == CV_32F );
    _disparity.create(left.size(), disptype);
    Mat disparity = _disparity.getMat();

    findStereoCorrespondenceBM(left, right, disparity, state);
wester committed
1094 1095
}

wester committed
1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107
template<> void Ptr<CvStereoBMState>::delete_obj()
{ cvReleaseStereoBMState(&obj); }

}

CV_IMPL void cvFindStereoCorrespondenceBM( const CvArr* leftarr, const CvArr* rightarr,
                                           CvArr* disparr, CvStereoBMState* state )
{
    cv::Mat left = cv::cvarrToMat(leftarr),
        right = cv::cvarrToMat(rightarr),
        disp = cv::cvarrToMat(disparr);
    cv::findStereoCorrespondenceBM(left, right, disp, state);
wester committed
1108 1109 1110
}

/* End of file. */