// From modules/imgproc/src/filter.cpp [616:918]
// SSE2 fast path for a horizontal (row) filter: reads unsigned 8-bit samples
// from `src`, weights them with the 32-bit integer taps stored in `kernel`,
// and writes 32-bit integer sums to `_dst`.
//
// Parameters:
//   src   - input samples (bytes). The tail loop reads 4 bytes at
//           src + i + k*cn for every tap k in [0, _ksize), so the row must be
//           readable up to (_ksize - 1)*cn bytes past width*cn.
//   _dst  - output buffer, reinterpreted as int*. All stores use
//           _mm_store_si128 (an aligned store), so _dst must be 16-byte aligned.
//   width - row length; it is multiplied by cn below, so it is presumably
//           given in pixels and the loops count individual channel samples.
//   cn    - channel count; used as the byte distance between horizontally
//           adjacent samples of the same channel.
//
// Returns the number of output elements (channel samples) that were written,
// always a multiple of 4. Returns 0 when nothing was processed: SSE2 is not
// available, `smallValues` is false, or the kernel is symmetric with a single
// tap. NOTE(review): the caller presumably finishes the remaining
// [return value, width*cn) samples with scalar code - confirm at the call site.
int operator()(const uchar* src, uchar* _dst, int width, int cn) const
{
if( !checkHardwareSupport(CV_CPU_SSE2) )
return 0;
// _ksize is the tap count when the kernel is a single row or a single column.
int i = 0, j, k, _ksize = kernel.rows + kernel.cols - 1;
int* dst = (int*)_dst;
bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
// kx points at the centre tap, so kx[1], kx[2] are the taps at offsets 1 and 2.
const int* kx = kernel.ptr<int>() + _ksize/2;
// NOTE(review): smallValues is set outside this block; presumably it means every
// tap fits in a signed 16-bit value, which the 16-bit multiplies below rely on
// (_mm_packs_epi32 would otherwise saturate the taps) - confirm in the constructor.
if( !smallValues )
return 0;
// Move src to the sample under the centre tap; src - cn and src + cn are then
// the left and right neighbours of the same channel.
src += (_ksize/2)*cn;
width *= cn;
__m128i z = _mm_setzero_si128();
if( symmetrical )
{
// A symmetric 1-tap kernel is not handled here at all.
if( _ksize == 1 )
return 0;
if( _ksize == 3 )
{
// Taps (1, 2, 1): out = left + 2*centre + right. The sum is at most 4*255, so it
// fits in 16-bit lanes and is never negative.
if( kx[0] == 2 && kx[1] == 1 )
for( ; i <= width - 16; i += 16, src += 16 )
{
__m128i x0, x1, x2, y0, y1, y2;
x0 = _mm_loadu_si128((__m128i*)(src - cn));
x1 = _mm_loadu_si128((__m128i*)src);
x2 = _mm_loadu_si128((__m128i*)(src + cn));
// Widen 16 bytes to two vectors of eight 16-bit lanes (x* = low half, y* = high half).
y0 = _mm_unpackhi_epi8(x0, z);
x0 = _mm_unpacklo_epi8(x0, z);
y1 = _mm_unpackhi_epi8(x1, z);
x1 = _mm_unpacklo_epi8(x1, z);
y2 = _mm_unpackhi_epi8(x2, z);
x2 = _mm_unpacklo_epi8(x2, z);
x0 = _mm_add_epi16(x0, _mm_add_epi16(_mm_add_epi16(x1, x1), x2));
y0 = _mm_add_epi16(y0, _mm_add_epi16(_mm_add_epi16(y1, y1), y2));
// Zero-extend the non-negative 16-bit sums to 32 bits by interleaving with zero.
_mm_store_si128((__m128i*)(dst + i), _mm_unpacklo_epi16(x0, z));
_mm_store_si128((__m128i*)(dst + i + 4), _mm_unpackhi_epi16(x0, z));
_mm_store_si128((__m128i*)(dst + i + 8), _mm_unpacklo_epi16(y0, z));
_mm_store_si128((__m128i*)(dst + i + 12), _mm_unpackhi_epi16(y0, z));
}
// Taps (1, -2, 1): out = left - 2*centre + right, which can be negative.
else if( kx[0] == -2 && kx[1] == 1 )
for( ; i <= width - 16; i += 16, src += 16 )
{
__m128i x0, x1, x2, y0, y1, y2;
x0 = _mm_loadu_si128((__m128i*)(src - cn));
x1 = _mm_loadu_si128((__m128i*)src);
x2 = _mm_loadu_si128((__m128i*)(src + cn));
y0 = _mm_unpackhi_epi8(x0, z);
x0 = _mm_unpacklo_epi8(x0, z);
y1 = _mm_unpackhi_epi8(x1, z);
x1 = _mm_unpacklo_epi8(x1, z);
y2 = _mm_unpackhi_epi8(x2, z);
x2 = _mm_unpacklo_epi8(x2, z);
x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
// Sign-extend 16 -> 32 bits: duplicate each 16-bit lane into both halves of a
// 32-bit lane, then arithmetic-shift right by 16.
_mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
_mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
_mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
_mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
}
// General symmetric 3-tap kernel: out = centre*kx[0] + (left + right)*kx[1].
else
{
// Broadcast each tap to all 32-bit lanes, then narrow (with signed saturation)
// so that every 16-bit lane holds the tap.
__m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
k0 = _mm_packs_epi32(k0, k0);
k1 = _mm_packs_epi32(k1, k1);
for( ; i <= width - 16; i += 16, src += 16 )
{
__m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
x0 = _mm_loadu_si128((__m128i*)(src - cn));
x1 = _mm_loadu_si128((__m128i*)src);
x2 = _mm_loadu_si128((__m128i*)(src + cn));
// left + right in 16-bit lanes (y0 = high 8 samples, x0 = low 8 samples).
y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));
y1 = _mm_unpackhi_epi8(x1, z);
x1 = _mm_unpacklo_epi8(x1, z);
// mulhi/mullo give the high and low 16 bits of each signed 16x16 product;
// interleaving low with high rebuilds the full 32-bit products. The mulhi
// results are taken before the mullo results overwrite the operand.
t1 = _mm_mulhi_epi16(x1, k0);
t0 = _mm_mullo_epi16(x1, k0);
x2 = _mm_mulhi_epi16(x0, k1);
x0 = _mm_mullo_epi16(x0, k1);
z0 = _mm_unpacklo_epi16(t0, t1);
z1 = _mm_unpackhi_epi16(t0, t1);
z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2));
z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2));
// Same computation for the high 8 samples.
t1 = _mm_mulhi_epi16(y1, k0);
t0 = _mm_mullo_epi16(y1, k0);
y1 = _mm_mulhi_epi16(y0, k1);
y0 = _mm_mullo_epi16(y0, k1);
z2 = _mm_unpacklo_epi16(t0, t1);
z3 = _mm_unpackhi_epi16(t0, t1);
z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));
_mm_store_si128((__m128i*)(dst + i), z0);
_mm_store_si128((__m128i*)(dst + i + 4), z1);
_mm_store_si128((__m128i*)(dst + i + 8), z2);
_mm_store_si128((__m128i*)(dst + i + 12), z3);
}
}
}
else if( _ksize == 5 )
{
// Taps (1, 0, -2, 0, 1): out = src[-2*cn] - 2*src[0] + src[+2*cn]; the samples at
// distance cn are not loaded because their tap is zero.
if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
for( ; i <= width - 16; i += 16, src += 16 )
{
__m128i x0, x1, x2, y0, y1, y2;
x0 = _mm_loadu_si128((__m128i*)(src - cn*2));
x1 = _mm_loadu_si128((__m128i*)src);
x2 = _mm_loadu_si128((__m128i*)(src + cn*2));
y0 = _mm_unpackhi_epi8(x0, z);
x0 = _mm_unpacklo_epi8(x0, z);
y1 = _mm_unpackhi_epi8(x1, z);
x1 = _mm_unpacklo_epi8(x1, z);
y2 = _mm_unpackhi_epi8(x2, z);
x2 = _mm_unpacklo_epi8(x2, z);
x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
// Result may be negative: sign-extend 16 -> 32 bits (duplicate lane, shift right 16).
_mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
_mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
_mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
_mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
}
// General symmetric 5-tap kernel:
// out = src[0]*kx[0] + (src[-cn] + src[+cn])*kx[1] + (src[-2*cn] + src[+2*cn])*kx[2].
else
{
__m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
k0 = _mm_packs_epi32(k0, k0);
k1 = _mm_packs_epi32(k1, k1);
k2 = _mm_packs_epi32(k2, k2);
for( ; i <= width - 16; i += 16, src += 16 )
{
__m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
// Centre tap and the pair at distance cn (same scheme as the 3-tap general case).
x0 = _mm_loadu_si128((__m128i*)(src - cn));
x1 = _mm_loadu_si128((__m128i*)src);
x2 = _mm_loadu_si128((__m128i*)(src + cn));
y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));
y1 = _mm_unpackhi_epi8(x1, z);
x1 = _mm_unpacklo_epi8(x1, z);
t1 = _mm_mulhi_epi16(x1, k0);
t0 = _mm_mullo_epi16(x1, k0);
x2 = _mm_mulhi_epi16(x0, k1);
x0 = _mm_mullo_epi16(x0, k1);
z0 = _mm_unpacklo_epi16(t0, t1);
z1 = _mm_unpackhi_epi16(t0, t1);
z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2));
z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2));
t1 = _mm_mulhi_epi16(y1, k0);
t0 = _mm_mullo_epi16(y1, k0);
y1 = _mm_mulhi_epi16(y0, k1);
y0 = _mm_mullo_epi16(y0, k1);
z2 = _mm_unpacklo_epi16(t0, t1);
z3 = _mm_unpackhi_epi16(t0, t1);
z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));
// Outer pair at distance 2*cn, weighted by kx[2] and accumulated into z0..z3.
x0 = _mm_loadu_si128((__m128i*)(src - cn*2));
x1 = _mm_loadu_si128((__m128i*)(src + cn*2));
y1 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
y0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));
t1 = _mm_mulhi_epi16(y0, k2);
t0 = _mm_mullo_epi16(y0, k2);
// Here mullo goes into y0 first; y1 is still intact for the following mulhi.
y0 = _mm_mullo_epi16(y1, k2);
y1 = _mm_mulhi_epi16(y1, k2);
z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1));
z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1));
z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));
_mm_store_si128((__m128i*)(dst + i), z0);
_mm_store_si128((__m128i*)(dst + i + 4), z1);
_mm_store_si128((__m128i*)(dst + i + 8), z2);
_mm_store_si128((__m128i*)(dst + i + 12), z3);
}
}
}
}
// Not flagged symmetric: the vector loops below use only right-minus-left
// differences and never the centre tap, i.e. they treat the kernel as
// antisymmetric. NOTE(review): this assumes the caller only reaches this
// branch with an antisymmetric kernel (centre tap 0) - confirm.
else
{
if( _ksize == 3 )
{
// Taps (-1, 0, 1): out = right - left, no multiply needed.
if( kx[0] == 0 && kx[1] == 1 )
for( ; i <= width - 16; i += 16, src += 16 )
{
__m128i x0, x1, y0;
x0 = _mm_loadu_si128((__m128i*)(src + cn));
x1 = _mm_loadu_si128((__m128i*)(src - cn));
y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));
// Differences are signed: sign-extend 16 -> 32 bits before storing.
_mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
_mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
_mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
_mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
}
// General 3-tap case: out = (right - left)*kx[1]; kx[0] is not used.
else
{
__m128i k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
k1 = _mm_packs_epi32(k1, k1);
for( ; i <= width - 16; i += 16, src += 16 )
{
__m128i x0, x1, y0, y1, z0, z1, z2, z3;
x0 = _mm_loadu_si128((__m128i*)(src + cn));
x1 = _mm_loadu_si128((__m128i*)(src - cn));
y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));
// Rebuild the 32-bit products from their high (mulhi) and low (mullo) halves.
x1 = _mm_mulhi_epi16(x0, k1);
x0 = _mm_mullo_epi16(x0, k1);
z0 = _mm_unpacklo_epi16(x0, x1);
z1 = _mm_unpackhi_epi16(x0, x1);
y1 = _mm_mulhi_epi16(y0, k1);
y0 = _mm_mullo_epi16(y0, k1);
z2 = _mm_unpacklo_epi16(y0, y1);
z3 = _mm_unpackhi_epi16(y0, y1);
_mm_store_si128((__m128i*)(dst + i), z0);
_mm_store_si128((__m128i*)(dst + i + 4), z1);
_mm_store_si128((__m128i*)(dst + i + 8), z2);
_mm_store_si128((__m128i*)(dst + i + 12), z3);
}
}
}
// 5-tap case:
// out = (src[+cn] - src[-cn])*kx[1] + (src[+2*cn] - src[-2*cn])*kx[2].
else if( _ksize == 5 )
{
// NOTE(review): k0 is computed here but never read in the loop below.
__m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
k0 = _mm_packs_epi32(k0, k0);
k1 = _mm_packs_epi32(k1, k1);
k2 = _mm_packs_epi32(k2, k2);
for( ; i <= width - 16; i += 16, src += 16 )
{
__m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
// Inner pair (distance cn) weighted by kx[1].
x0 = _mm_loadu_si128((__m128i*)(src + cn));
x2 = _mm_loadu_si128((__m128i*)(src - cn));
y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));
x2 = _mm_mulhi_epi16(x0, k1);
x0 = _mm_mullo_epi16(x0, k1);
z0 = _mm_unpacklo_epi16(x0, x2);
z1 = _mm_unpackhi_epi16(x0, x2);
y1 = _mm_mulhi_epi16(y0, k1);
y0 = _mm_mullo_epi16(y0, k1);
z2 = _mm_unpacklo_epi16(y0, y1);
z3 = _mm_unpackhi_epi16(y0, y1);
// Outer pair (distance 2*cn) weighted by kx[2], accumulated into z0..z3.
x0 = _mm_loadu_si128((__m128i*)(src + cn*2));
x1 = _mm_loadu_si128((__m128i*)(src - cn*2));
y1 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
y0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));
t1 = _mm_mulhi_epi16(y0, k2);
t0 = _mm_mullo_epi16(y0, k2);
y0 = _mm_mullo_epi16(y1, k2);
y1 = _mm_mulhi_epi16(y1, k2);
z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1));
z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1));
z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));
_mm_store_si128((__m128i*)(dst + i), z0);
_mm_store_si128((__m128i*)(dst + i + 4), z1);
_mm_store_si128((__m128i*)(dst + i + 8), z2);
_mm_store_si128((__m128i*)(dst + i + 12), z3);
}
}
}
// Undo the centre offset: src now points at the first tap's sample for output
// i (it has already advanced by i in the loops above), and kx at the first tap.
src -= (_ksize/2)*cn;
kx -= _ksize/2;
// Generic loop, 4 outputs per iteration, walking every tap explicitly. It
// finishes what the 16-wide loops left over, and handles the whole row when
// _ksize is neither 3 nor 5 (no 16-wide loop ran, so i is still 0).
for( ; i <= width - 4; i += 4, src += 4 )
{
__m128i f, s0 = z, x0, x1;
for( k = j = 0; k < _ksize; k++, j += cn )
{
// Broadcast tap k to 16-bit lanes.
f = _mm_cvtsi32_si128(kx[k]);
f = _mm_shuffle_epi32(f, 0);
f = _mm_packs_epi32(f, f);
// Load 4 consecutive bytes with one int read and widen them to 16 bits.
x0 = _mm_cvtsi32_si128(*(const int*)(src + j));
x0 = _mm_unpacklo_epi8(x0, z);
x1 = _mm_mulhi_epi16(x0, f);
x0 = _mm_mullo_epi16(x0, f);
// Accumulate the four 32-bit products.
s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
}
_mm_store_si128((__m128i*)(dst + i), s0);
}
// Fewer than 4 samples remain unprocessed at this point.
return i;
}