static void findStereoCorrespondenceBM_SSE2()

in modules/calib3d/src/stereobm.cpp [319:564]


static void findStereoCorrespondenceBM_SSE2( const Mat& left, const Mat& right,
                                            Mat& disp, Mat& cost, StereoBMParams& state,
                                            uchar* buf, int _dy0, int _dy1 )
{
    const int ALIGN = 16;
    int x, y, d;
    int wsz = state.SADWindowSize, wsz2 = wsz/2;
    int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1);
    int ndisp = state.numDisparities;
    int mindisp = state.minDisparity;
    int lofs = MAX(ndisp - 1 + mindisp, 0);
    int rofs = -MIN(ndisp - 1 + mindisp, 0);
    int width = left.cols, height = left.rows;
    int width1 = width - rofs - ndisp + 1;
    int ftzero = state.preFilterCap;
    int textureThreshold = state.textureThreshold;
    int uniquenessRatio = state.uniquenessRatio;
    short FILTERED = (short)((mindisp - 1) << DISPARITY_SHIFT);

    ushort *sad, *hsad0, *hsad, *hsad_sub;
    int *htext;
    uchar *cbuf0, *cbuf;
    const uchar* lptr0 = left.ptr() + lofs;
    const uchar* rptr0 = right.ptr() + rofs;
    const uchar *lptr, *lptr_sub, *rptr;
    short* dptr = disp.ptr<short>();
    int sstep = (int)left.step;
    int dstep = (int)(disp.step/sizeof(dptr[0]));
    int cstep = (height + dy0 + dy1)*ndisp;
    short costbuf = 0;
    int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
    const int TABSZ = 256;
    uchar tab[TABSZ];
    const __m128i d0_8 = _mm_setr_epi16(0,1,2,3,4,5,6,7), dd_8 = _mm_set1_epi16(8);

    sad = (ushort*)alignPtr(buf + sizeof(sad[0]), ALIGN);
    hsad0 = (ushort*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN);
    htext = (int*)alignPtr((int*)(hsad0 + (height+dy1)*ndisp) + wsz2 + 2, ALIGN);
    cbuf0 = (uchar*)alignPtr((uchar*)(htext + height + wsz2 + 2) + dy0*ndisp, ALIGN);

    for( x = 0; x < TABSZ; x++ )
        tab[x] = (uchar)std::abs(x - ftzero);

    // initialize buffers
    memset( hsad0 - dy0*ndisp, 0, (height + dy0 + dy1)*ndisp*sizeof(hsad0[0]) );
    memset( htext - wsz2 - 1, 0, (height + wsz + 1)*sizeof(htext[0]) );

    for( x = -wsz2-1; x < wsz2; x++ )
    {
        hsad = hsad0 - dy0*ndisp; cbuf = cbuf0 + (x + wsz2 + 1)*cstep - dy0*ndisp;
        lptr = lptr0 + MIN(MAX(x, -lofs), width-lofs-1) - dy0*sstep;
        rptr = rptr0 + MIN(MAX(x, -rofs), width-rofs-ndisp) - dy0*sstep;

        for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep )
        {
            int lval = lptr[0];
            __m128i lv = _mm_set1_epi8((char)lval), z = _mm_setzero_si128();
            for( d = 0; d < ndisp; d += 16 )
            {
                __m128i rv = _mm_loadu_si128((const __m128i*)(rptr + d));
                __m128i hsad_l = _mm_load_si128((__m128i*)(hsad + d));
                __m128i hsad_h = _mm_load_si128((__m128i*)(hsad + d + 8));
                __m128i diff = _mm_adds_epu8(_mm_subs_epu8(lv, rv), _mm_subs_epu8(rv, lv));
                _mm_store_si128((__m128i*)(cbuf + d), diff);
                hsad_l = _mm_add_epi16(hsad_l, _mm_unpacklo_epi8(diff,z));
                hsad_h = _mm_add_epi16(hsad_h, _mm_unpackhi_epi8(diff,z));
                _mm_store_si128((__m128i*)(hsad + d), hsad_l);
                _mm_store_si128((__m128i*)(hsad + d + 8), hsad_h);
            }
            htext[y] += tab[lval];
        }
    }

    // initialize the left and right borders of the disparity map
    for( y = 0; y < height; y++ )
    {
        for( x = 0; x < lofs; x++ )
            dptr[y*dstep + x] = FILTERED;
        for( x = lofs + width1; x < width; x++ )
            dptr[y*dstep + x] = FILTERED;
    }
    dptr += lofs;

    for( x = 0; x < width1; x++, dptr++ )
    {
        short* costptr = cost.data ? cost.ptr<short>() + lofs + x : &costbuf;
        int x0 = x - wsz2 - 1, x1 = x + wsz2;
        const uchar* cbuf_sub = cbuf0 + ((x0 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
        cbuf = cbuf0 + ((x1 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
        hsad = hsad0 - dy0*ndisp;
        lptr_sub = lptr0 + MIN(MAX(x0, -lofs), width-1-lofs) - dy0*sstep;
        lptr = lptr0 + MIN(MAX(x1, -lofs), width-1-lofs) - dy0*sstep;
        rptr = rptr0 + MIN(MAX(x1, -rofs), width-ndisp-rofs) - dy0*sstep;

        for( y = -dy0; y < height + dy1; y++, cbuf += ndisp, cbuf_sub += ndisp,
            hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep )
        {
            int lval = lptr[0];
            __m128i lv = _mm_set1_epi8((char)lval), z = _mm_setzero_si128();
            for( d = 0; d < ndisp; d += 16 )
            {
                __m128i rv = _mm_loadu_si128((const __m128i*)(rptr + d));
                __m128i hsad_l = _mm_load_si128((__m128i*)(hsad + d));
                __m128i hsad_h = _mm_load_si128((__m128i*)(hsad + d + 8));
                __m128i cbs = _mm_load_si128((const __m128i*)(cbuf_sub + d));
                __m128i diff = _mm_adds_epu8(_mm_subs_epu8(lv, rv), _mm_subs_epu8(rv, lv));
                __m128i diff_h = _mm_sub_epi16(_mm_unpackhi_epi8(diff, z), _mm_unpackhi_epi8(cbs, z));
                _mm_store_si128((__m128i*)(cbuf + d), diff);
                diff = _mm_sub_epi16(_mm_unpacklo_epi8(diff, z), _mm_unpacklo_epi8(cbs, z));
                hsad_h = _mm_add_epi16(hsad_h, diff_h);
                hsad_l = _mm_add_epi16(hsad_l, diff);
                _mm_store_si128((__m128i*)(hsad + d), hsad_l);
                _mm_store_si128((__m128i*)(hsad + d + 8), hsad_h);
            }
            htext[y] += tab[lval] - tab[lptr_sub[0]];
        }

        // fill borders
        for( y = dy1; y <= wsz2; y++ )
            htext[height+y] = htext[height+dy1-1];
        for( y = -wsz2-1; y < -dy0; y++ )
            htext[y] = htext[-dy0];

        // initialize sums
        for( d = 0; d < ndisp; d++ )
            sad[d] = (ushort)(hsad0[d-ndisp*dy0]*(wsz2 + 2 - dy0));

        hsad = hsad0 + (1 - dy0)*ndisp;
        for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
            for( d = 0; d < ndisp; d += 16 )
            {
                __m128i s0 = _mm_load_si128((__m128i*)(sad + d));
                __m128i s1 = _mm_load_si128((__m128i*)(sad + d + 8));
                __m128i t0 = _mm_load_si128((__m128i*)(hsad + d));
                __m128i t1 = _mm_load_si128((__m128i*)(hsad + d + 8));
                s0 = _mm_add_epi16(s0, t0);
                s1 = _mm_add_epi16(s1, t1);
                _mm_store_si128((__m128i*)(sad + d), s0);
                _mm_store_si128((__m128i*)(sad + d + 8), s1);
            }
        int tsum = 0;
        for( y = -wsz2-1; y < wsz2; y++ )
            tsum += htext[y];

        // finally, start the real processing
        for( y = 0; y < height; y++ )
        {
            int minsad = INT_MAX, mind = -1;
            hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
            hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
            __m128i minsad8 = _mm_set1_epi16(SHRT_MAX);
            __m128i mind8 = _mm_set1_epi16(0), d8 = d0_8, mask;

            for( d = 0; d < ndisp; d += 16 )
            {
                __m128i u0 = _mm_load_si128((__m128i*)(hsad_sub + d));
                __m128i u1 = _mm_load_si128((__m128i*)(hsad + d));

                __m128i v0 = _mm_load_si128((__m128i*)(hsad_sub + d + 8));
                __m128i v1 = _mm_load_si128((__m128i*)(hsad + d + 8));

                __m128i usad8 = _mm_load_si128((__m128i*)(sad + d));
                __m128i vsad8 = _mm_load_si128((__m128i*)(sad + d + 8));

                u1 = _mm_sub_epi16(u1, u0);
                v1 = _mm_sub_epi16(v1, v0);
                usad8 = _mm_add_epi16(usad8, u1);
                vsad8 = _mm_add_epi16(vsad8, v1);

                mask = _mm_cmpgt_epi16(minsad8, usad8);
                minsad8 = _mm_min_epi16(minsad8, usad8);
                mind8 = _mm_max_epi16(mind8, _mm_and_si128(mask, d8));

                _mm_store_si128((__m128i*)(sad + d), usad8);
                _mm_store_si128((__m128i*)(sad + d + 8), vsad8);

                mask = _mm_cmpgt_epi16(minsad8, vsad8);
                minsad8 = _mm_min_epi16(minsad8, vsad8);

                d8 = _mm_add_epi16(d8, dd_8);
                mind8 = _mm_max_epi16(mind8, _mm_and_si128(mask, d8));
                d8 = _mm_add_epi16(d8, dd_8);
            }

            tsum += htext[y + wsz2] - htext[y - wsz2 - 1];
            if( tsum < textureThreshold )
            {
                dptr[y*dstep] = FILTERED;
                continue;
            }

            ushort CV_DECL_ALIGNED(16) minsad_buf[8], mind_buf[8];
            _mm_store_si128((__m128i*)minsad_buf, minsad8);
            _mm_store_si128((__m128i*)mind_buf, mind8);
            for( d = 0; d < 8; d++ )
                if(minsad > (int)minsad_buf[d] || (minsad == (int)minsad_buf[d] && mind > mind_buf[d]))
                {
                    minsad = minsad_buf[d];
                    mind = mind_buf[d];
                }

            if( uniquenessRatio > 0 )
            {
                int thresh = minsad + (minsad * uniquenessRatio/100);
                __m128i thresh8 = _mm_set1_epi16((short)(thresh + 1));
                __m128i d1 = _mm_set1_epi16((short)(mind-1)), d2 = _mm_set1_epi16((short)(mind+1));
                __m128i dd_16 = _mm_add_epi16(dd_8, dd_8);
                d8 = _mm_sub_epi16(d0_8, dd_16);

                for( d = 0; d < ndisp; d += 16 )
                {
                    __m128i usad8 = _mm_load_si128((__m128i*)(sad + d));
                    __m128i vsad8 = _mm_load_si128((__m128i*)(sad + d + 8));
                    mask = _mm_cmpgt_epi16( thresh8, _mm_min_epi16(usad8,vsad8));
                    d8 = _mm_add_epi16(d8, dd_16);
                    if( !_mm_movemask_epi8(mask) )
                        continue;
                    mask = _mm_cmpgt_epi16( thresh8, usad8);
                    mask = _mm_and_si128(mask, _mm_or_si128(_mm_cmpgt_epi16(d1,d8), _mm_cmpgt_epi16(d8,d2)));
                    if( _mm_movemask_epi8(mask) )
                        break;
                    __m128i t8 = _mm_add_epi16(d8, dd_8);
                    mask = _mm_cmpgt_epi16( thresh8, vsad8);
                    mask = _mm_and_si128(mask, _mm_or_si128(_mm_cmpgt_epi16(d1,t8), _mm_cmpgt_epi16(t8,d2)));
                    if( _mm_movemask_epi8(mask) )
                        break;
                }
                if( d < ndisp )
                {
                    dptr[y*dstep] = FILTERED;
                    continue;
                }
            }

            if( 0 < mind && mind < ndisp - 1 )
            {
                int p = sad[mind+1], n = sad[mind-1];
                d = p + n - 2*sad[mind] + std::abs(p - n);
                dptr[y*dstep] = (short)(((ndisp - mind - 1 + mindisp)*256 + (d != 0 ? (p-n)*256/d : 0) + 15) >> 4);
            }
            else
                dptr[y*dstep] = (short)((ndisp - mind - 1 + mindisp)*16);
            costptr[y*coststep] = sad[mind];
        }
    }
}