in modules/calib3d/src/stereobm.cpp [319:564]
static void findStereoCorrespondenceBM_SSE2( const Mat& left, const Mat& right,
Mat& disp, Mat& cost, StereoBMParams& state,
uchar* buf, int _dy0, int _dy1 )
{
const int ALIGN = 16;
int x, y, d;
int wsz = state.SADWindowSize, wsz2 = wsz/2;
int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1);
int ndisp = state.numDisparities;
int mindisp = state.minDisparity;
int lofs = MAX(ndisp - 1 + mindisp, 0);
int rofs = -MIN(ndisp - 1 + mindisp, 0);
int width = left.cols, height = left.rows;
int width1 = width - rofs - ndisp + 1;
int ftzero = state.preFilterCap;
int textureThreshold = state.textureThreshold;
int uniquenessRatio = state.uniquenessRatio;
short FILTERED = (short)((mindisp - 1) << DISPARITY_SHIFT);
ushort *sad, *hsad0, *hsad, *hsad_sub;
int *htext;
uchar *cbuf0, *cbuf;
const uchar* lptr0 = left.ptr() + lofs;
const uchar* rptr0 = right.ptr() + rofs;
const uchar *lptr, *lptr_sub, *rptr;
short* dptr = disp.ptr<short>();
int sstep = (int)left.step;
int dstep = (int)(disp.step/sizeof(dptr[0]));
int cstep = (height + dy0 + dy1)*ndisp;
short costbuf = 0;
int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
const int TABSZ = 256;
uchar tab[TABSZ];
const __m128i d0_8 = _mm_setr_epi16(0,1,2,3,4,5,6,7), dd_8 = _mm_set1_epi16(8);
sad = (ushort*)alignPtr(buf + sizeof(sad[0]), ALIGN);
hsad0 = (ushort*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN);
htext = (int*)alignPtr((int*)(hsad0 + (height+dy1)*ndisp) + wsz2 + 2, ALIGN);
cbuf0 = (uchar*)alignPtr((uchar*)(htext + height + wsz2 + 2) + dy0*ndisp, ALIGN);
for( x = 0; x < TABSZ; x++ )
tab[x] = (uchar)std::abs(x - ftzero);
// initialize buffers
memset( hsad0 - dy0*ndisp, 0, (height + dy0 + dy1)*ndisp*sizeof(hsad0[0]) );
memset( htext - wsz2 - 1, 0, (height + wsz + 1)*sizeof(htext[0]) );
for( x = -wsz2-1; x < wsz2; x++ )
{
hsad = hsad0 - dy0*ndisp; cbuf = cbuf0 + (x + wsz2 + 1)*cstep - dy0*ndisp;
lptr = lptr0 + MIN(MAX(x, -lofs), width-lofs-1) - dy0*sstep;
rptr = rptr0 + MIN(MAX(x, -rofs), width-rofs-ndisp) - dy0*sstep;
for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep )
{
int lval = lptr[0];
__m128i lv = _mm_set1_epi8((char)lval), z = _mm_setzero_si128();
for( d = 0; d < ndisp; d += 16 )
{
__m128i rv = _mm_loadu_si128((const __m128i*)(rptr + d));
__m128i hsad_l = _mm_load_si128((__m128i*)(hsad + d));
__m128i hsad_h = _mm_load_si128((__m128i*)(hsad + d + 8));
__m128i diff = _mm_adds_epu8(_mm_subs_epu8(lv, rv), _mm_subs_epu8(rv, lv));
_mm_store_si128((__m128i*)(cbuf + d), diff);
hsad_l = _mm_add_epi16(hsad_l, _mm_unpacklo_epi8(diff,z));
hsad_h = _mm_add_epi16(hsad_h, _mm_unpackhi_epi8(diff,z));
_mm_store_si128((__m128i*)(hsad + d), hsad_l);
_mm_store_si128((__m128i*)(hsad + d + 8), hsad_h);
}
htext[y] += tab[lval];
}
}
// initialize the left and right borders of the disparity map
for( y = 0; y < height; y++ )
{
for( x = 0; x < lofs; x++ )
dptr[y*dstep + x] = FILTERED;
for( x = lofs + width1; x < width; x++ )
dptr[y*dstep + x] = FILTERED;
}
dptr += lofs;
for( x = 0; x < width1; x++, dptr++ )
{
short* costptr = cost.data ? cost.ptr<short>() + lofs + x : &costbuf;
int x0 = x - wsz2 - 1, x1 = x + wsz2;
const uchar* cbuf_sub = cbuf0 + ((x0 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
cbuf = cbuf0 + ((x1 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
hsad = hsad0 - dy0*ndisp;
lptr_sub = lptr0 + MIN(MAX(x0, -lofs), width-1-lofs) - dy0*sstep;
lptr = lptr0 + MIN(MAX(x1, -lofs), width-1-lofs) - dy0*sstep;
rptr = rptr0 + MIN(MAX(x1, -rofs), width-ndisp-rofs) - dy0*sstep;
for( y = -dy0; y < height + dy1; y++, cbuf += ndisp, cbuf_sub += ndisp,
hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep )
{
int lval = lptr[0];
__m128i lv = _mm_set1_epi8((char)lval), z = _mm_setzero_si128();
for( d = 0; d < ndisp; d += 16 )
{
__m128i rv = _mm_loadu_si128((const __m128i*)(rptr + d));
__m128i hsad_l = _mm_load_si128((__m128i*)(hsad + d));
__m128i hsad_h = _mm_load_si128((__m128i*)(hsad + d + 8));
__m128i cbs = _mm_load_si128((const __m128i*)(cbuf_sub + d));
__m128i diff = _mm_adds_epu8(_mm_subs_epu8(lv, rv), _mm_subs_epu8(rv, lv));
__m128i diff_h = _mm_sub_epi16(_mm_unpackhi_epi8(diff, z), _mm_unpackhi_epi8(cbs, z));
_mm_store_si128((__m128i*)(cbuf + d), diff);
diff = _mm_sub_epi16(_mm_unpacklo_epi8(diff, z), _mm_unpacklo_epi8(cbs, z));
hsad_h = _mm_add_epi16(hsad_h, diff_h);
hsad_l = _mm_add_epi16(hsad_l, diff);
_mm_store_si128((__m128i*)(hsad + d), hsad_l);
_mm_store_si128((__m128i*)(hsad + d + 8), hsad_h);
}
htext[y] += tab[lval] - tab[lptr_sub[0]];
}
// fill borders
for( y = dy1; y <= wsz2; y++ )
htext[height+y] = htext[height+dy1-1];
for( y = -wsz2-1; y < -dy0; y++ )
htext[y] = htext[-dy0];
// initialize sums
for( d = 0; d < ndisp; d++ )
sad[d] = (ushort)(hsad0[d-ndisp*dy0]*(wsz2 + 2 - dy0));
hsad = hsad0 + (1 - dy0)*ndisp;
for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
for( d = 0; d < ndisp; d += 16 )
{
__m128i s0 = _mm_load_si128((__m128i*)(sad + d));
__m128i s1 = _mm_load_si128((__m128i*)(sad + d + 8));
__m128i t0 = _mm_load_si128((__m128i*)(hsad + d));
__m128i t1 = _mm_load_si128((__m128i*)(hsad + d + 8));
s0 = _mm_add_epi16(s0, t0);
s1 = _mm_add_epi16(s1, t1);
_mm_store_si128((__m128i*)(sad + d), s0);
_mm_store_si128((__m128i*)(sad + d + 8), s1);
}
int tsum = 0;
for( y = -wsz2-1; y < wsz2; y++ )
tsum += htext[y];
// finally, start the real processing
for( y = 0; y < height; y++ )
{
int minsad = INT_MAX, mind = -1;
hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
__m128i minsad8 = _mm_set1_epi16(SHRT_MAX);
__m128i mind8 = _mm_set1_epi16(0), d8 = d0_8, mask;
for( d = 0; d < ndisp; d += 16 )
{
__m128i u0 = _mm_load_si128((__m128i*)(hsad_sub + d));
__m128i u1 = _mm_load_si128((__m128i*)(hsad + d));
__m128i v0 = _mm_load_si128((__m128i*)(hsad_sub + d + 8));
__m128i v1 = _mm_load_si128((__m128i*)(hsad + d + 8));
__m128i usad8 = _mm_load_si128((__m128i*)(sad + d));
__m128i vsad8 = _mm_load_si128((__m128i*)(sad + d + 8));
u1 = _mm_sub_epi16(u1, u0);
v1 = _mm_sub_epi16(v1, v0);
usad8 = _mm_add_epi16(usad8, u1);
vsad8 = _mm_add_epi16(vsad8, v1);
mask = _mm_cmpgt_epi16(minsad8, usad8);
minsad8 = _mm_min_epi16(minsad8, usad8);
mind8 = _mm_max_epi16(mind8, _mm_and_si128(mask, d8));
_mm_store_si128((__m128i*)(sad + d), usad8);
_mm_store_si128((__m128i*)(sad + d + 8), vsad8);
mask = _mm_cmpgt_epi16(minsad8, vsad8);
minsad8 = _mm_min_epi16(minsad8, vsad8);
d8 = _mm_add_epi16(d8, dd_8);
mind8 = _mm_max_epi16(mind8, _mm_and_si128(mask, d8));
d8 = _mm_add_epi16(d8, dd_8);
}
tsum += htext[y + wsz2] - htext[y - wsz2 - 1];
if( tsum < textureThreshold )
{
dptr[y*dstep] = FILTERED;
continue;
}
ushort CV_DECL_ALIGNED(16) minsad_buf[8], mind_buf[8];
_mm_store_si128((__m128i*)minsad_buf, minsad8);
_mm_store_si128((__m128i*)mind_buf, mind8);
for( d = 0; d < 8; d++ )
if(minsad > (int)minsad_buf[d] || (minsad == (int)minsad_buf[d] && mind > mind_buf[d]))
{
minsad = minsad_buf[d];
mind = mind_buf[d];
}
if( uniquenessRatio > 0 )
{
int thresh = minsad + (minsad * uniquenessRatio/100);
__m128i thresh8 = _mm_set1_epi16((short)(thresh + 1));
__m128i d1 = _mm_set1_epi16((short)(mind-1)), d2 = _mm_set1_epi16((short)(mind+1));
__m128i dd_16 = _mm_add_epi16(dd_8, dd_8);
d8 = _mm_sub_epi16(d0_8, dd_16);
for( d = 0; d < ndisp; d += 16 )
{
__m128i usad8 = _mm_load_si128((__m128i*)(sad + d));
__m128i vsad8 = _mm_load_si128((__m128i*)(sad + d + 8));
mask = _mm_cmpgt_epi16( thresh8, _mm_min_epi16(usad8,vsad8));
d8 = _mm_add_epi16(d8, dd_16);
if( !_mm_movemask_epi8(mask) )
continue;
mask = _mm_cmpgt_epi16( thresh8, usad8);
mask = _mm_and_si128(mask, _mm_or_si128(_mm_cmpgt_epi16(d1,d8), _mm_cmpgt_epi16(d8,d2)));
if( _mm_movemask_epi8(mask) )
break;
__m128i t8 = _mm_add_epi16(d8, dd_8);
mask = _mm_cmpgt_epi16( thresh8, vsad8);
mask = _mm_and_si128(mask, _mm_or_si128(_mm_cmpgt_epi16(d1,t8), _mm_cmpgt_epi16(t8,d2)));
if( _mm_movemask_epi8(mask) )
break;
}
if( d < ndisp )
{
dptr[y*dstep] = FILTERED;
continue;
}
}
if( 0 < mind && mind < ndisp - 1 )
{
int p = sad[mind+1], n = sad[mind-1];
d = p + n - 2*sad[mind] + std::abs(p - n);
dptr[y*dstep] = (short)(((ndisp - mind - 1 + mindisp)*256 + (d != 0 ? (p-n)*256/d : 0) + 15) >> 4);
}
else
dptr[y*dstep] = (short)((ndisp - mind - 1 + mindisp)*16);
costptr[y*coststep] = sad[mind];
}
}
}