// Compute the integral image of an 8-bit image: out[y][x] holds the sum of all
// input pixels in the rectangle from (0,0) to (x,y) inclusive.
void integral_image( const uint8_t* in, int32_t* out, int w, int h ) {
  int32_t* out_top = out;
  const uint8_t* line_end = in + w;
  const uint8_t* in_end   = in + w*h;
  // first row: plain horizontal prefix sums
  int32_t line_sum = 0;
  for( ; in != line_end; in++, out++ ) {
    line_sum += *in;
    *out = line_sum;
  }
  // remaining rows: horizontal prefix sum plus the integral value directly above
  for( ; in != in_end; ) {
    int32_t line_sum = 0;
    const uint8_t* line_end = in + w;
    for( ; in != line_end; in++, out++, out_top++ ) {
      line_sum += *in;
      *out = *out_top + line_sum;
    }
  }
}
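
// Zero-extend the 16 packed 8-bit values in a into two registers of eight
// 16-bit values each (b0 holds the low eight bytes, b1 the high eight bytes).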
void unpack_8bit_to_16bit( const __m128i a, __m128i& b0, __m128i& b1 ) {
  __m128i zero = _mm_setzero_si128();
  b0 = _mm_unpacklo_epi8( a, zero );
  b1 = _mm_unpackhi_epi8( a, zero );
}
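
// Pack two registers of eight 16-bit values into one register of sixteen
// 8-bit values, saturating to [0,255].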
void pack_16bit_to_8bit_saturate( const __m128i a0, const __m128i a1, __m128i& b ) {
  b = _mm_packus_epi16( a0, a1 );
}
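
// Horizontal convolution of a 16-bit image with the binomial kernel [1 4 6 4 1].
// The sum is scaled by 1/128, offset by +128 and saturated to 8 bit;
// 16 output pixels are produced per iteration (two blocks of 8).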
void convolve_14641_row_5x5_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
  assert( w % 16 == 0 && "width must be multiple of 16!" );
  const __m128i* i0 = (const __m128i*)(in);
  const int16_t* i1 = in+1;
  const int16_t* i2 = in+2;
  const int16_t* i3 = in+3;
  const int16_t* i4 = in+4;
  uint8_t* result   = out + 2;
  const int16_t* const end_input = in + w*h;
  __m128i offs = _mm_set1_epi16( 128 );
  for( ; i4 < end_input; i0 += 1, i1 += 8, i2 += 8, i3 += 8, i4 += 8, result += 16 ) {
    __m128i result_register_lo;
    __m128i result_register_hi;
    // two passes: i==0 fills the low 8 pixels, i==1 the high 8 pixels
    for( int i=0; i<2; i++ ) {
      __m128i* result_register;
      if( i==0 ) result_register = &result_register_lo;
      else       result_register = &result_register_hi;
      __m128i i0_register = *i0;
      __m128i i1_register = _mm_loadu_si128( (__m128i*)( i1 ) );
      __m128i i2_register = _mm_loadu_si128( (__m128i*)( i2 ) );
      __m128i i3_register = _mm_loadu_si128( (__m128i*)( i3 ) );
      __m128i i4_register = _mm_loadu_si128( (__m128i*)( i4 ) );
      // accumulate 1*i0 + 4*i1 + 6*i2 + 4*i3 + 1*i4 (multiples built by repeated doubling)
      *result_register = _mm_setzero_si128();
      *result_register = _mm_add_epi16( i0_register, *result_register );
      i1_register      = _mm_add_epi16( i1_register, i1_register );
      i1_register      = _mm_add_epi16( i1_register, i1_register );
      *result_register = _mm_add_epi16( i1_register, *result_register );
      i2_register      = _mm_add_epi16( i2_register, i2_register );
      *result_register = _mm_add_epi16( i2_register, *result_register );
      i2_register      = _mm_add_epi16( i2_register, i2_register );
      *result_register = _mm_add_epi16( i2_register, *result_register );
      i3_register      = _mm_add_epi16( i3_register, i3_register );
      i3_register      = _mm_add_epi16( i3_register, i3_register );
      *result_register = _mm_add_epi16( i3_register, *result_register );
      *result_register = _mm_add_epi16( i4_register, *result_register );
      // scale by 1/128 and shift into the unsigned 8-bit range
      *result_register = _mm_srai_epi16( *result_register, 7 );
      *result_register = _mm_add_epi16( *result_register, offs );
      if( i==0 ) {
        // advance to the next 8 input pixels for the high half
        i0 += 1;
        i1 += 8;
        i2 += 8;
        i3 += 8;
        i4 += 8;
      }
    }
    pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
    _mm_storeu_si128( ((__m128i*)( result )), result_register_lo );
  }
}
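
// Horizontal convolution with the derivative kernel [1 2 0 -2 -1], using the
// same 1/128 scaling and +128 offset as the [1 4 6 4 1] pass above.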
void convolve_12021_row_5x5_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
  assert( w % 16 == 0 && "width must be multiple of 16!" );
  const __m128i* i0 = (const __m128i*)(in);
  const int16_t* i1 = in+1;
  const int16_t* i3 = in+3;
  const int16_t* i4 = in+4;
  uint8_t* result   = out + 2;
  const int16_t* const end_input = in + w*h;
  __m128i offs = _mm_set1_epi16( 128 );
  for( ; i4 < end_input; i0 += 1, i1 += 8, i3 += 8, i4 += 8, result += 16 ) {
    __m128i result_register_lo;
    __m128i result_register_hi;
    // two passes: i==0 fills the low 8 pixels, i==1 the high 8 pixels
    for( int i=0; i<2; i++ ) {
      __m128i* result_register;
      if( i==0 ) result_register = &result_register_lo;
      else       result_register = &result_register_hi;
      __m128i i0_register = *i0;
      __m128i i1_register = _mm_loadu_si128( (__m128i*)( i1 ) );
      __m128i i3_register = _mm_loadu_si128( (__m128i*)( i3 ) );
      __m128i i4_register = _mm_loadu_si128( (__m128i*)( i4 ) );
      // accumulate 1*i0 + 2*i1 - 2*i3 - 1*i4
      *result_register = _mm_setzero_si128();
      *result_register = _mm_add_epi16( i0_register, *result_register );
      i1_register      = _mm_add_epi16( i1_register, i1_register );
      *result_register = _mm_add_epi16( i1_register, *result_register );
      i3_register      = _mm_add_epi16( i3_register, i3_register );
      *result_register = _mm_sub_epi16( *result_register, i3_register );
      *result_register = _mm_sub_epi16( *result_register, i4_register );
      // scale by 1/128 and shift into the unsigned 8-bit range
      *result_register = _mm_srai_epi16( *result_register, 7 );
      *result_register = _mm_add_epi16( *result_register, offs );
      if( i==0 ) {
        // advance to the next 8 input pixels for the high half
        i0 += 1;
        i1 += 8;
        i3 += 8;
        i4 += 8;
      }
    }
    pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
    _mm_storeu_si128( ((__m128i*)( result )), result_register_lo );
  }
}
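
// Horizontal convolution with the smoothing kernel [1 2 1]; the sum is divided
// by 4, offset by +128 and saturated to 8 bit.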
void convolve_121_row_3x3_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
  assert( w % 16 == 0 && "width must be multiple of 16!" );
  const __m128i* i0 = (const __m128i*)(in);
  const int16_t* i1 = in+1;
  const int16_t* i2 = in+2;
  uint8_t* result   = out + 1;
  const int16_t* const end_input = in + w*h;
  const size_t blocked_loops = (w*h-2)/16;
  __m128i offs = _mm_set1_epi16( 128 );
  for( size_t i=0; i != blocked_loops; i++ ) {
    __m128i result_register_lo;
    __m128i result_register_hi;
    __m128i i1_register;
    __m128i i2_register;

    // low 8 output pixels: ( in[x-1] + 2*in[x] + in[x+1] ) / 4 + 128
    i1_register        = _mm_loadu_si128( (__m128i*)( i1 ) );
    i2_register        = _mm_loadu_si128( (__m128i*)( i2 ) );
    result_register_lo = *i0;
    i1_register        = _mm_add_epi16( i1_register, i1_register );
    result_register_lo = _mm_add_epi16( i1_register, result_register_lo );
    result_register_lo = _mm_add_epi16( i2_register, result_register_lo );
    result_register_lo = _mm_srai_epi16( result_register_lo, 2 );
    result_register_lo = _mm_add_epi16( result_register_lo, offs );

    i0 += 1;
    i1 += 8;
    i2 += 8;

    // high 8 output pixels
    i1_register        = _mm_loadu_si128( (__m128i*)( i1 ) );
    i2_register        = _mm_loadu_si128( (__m128i*)( i2 ) );
    result_register_hi = *i0;
    i1_register        = _mm_add_epi16( i1_register, i1_register );
    result_register_hi = _mm_add_epi16( i1_register, result_register_hi );
    result_register_hi = _mm_add_epi16( i2_register, result_register_hi );
    result_register_hi = _mm_srai_epi16( result_register_hi, 2 );
    result_register_hi = _mm_add_epi16( result_register_hi, offs );

    pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
    _mm_storeu_si128( ((__m128i*)( result )), result_register_lo );

    i0 += 1;
    i1 += 8;
    i2 += 8;
    result += 16;
  }
}
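
// Horizontal convolution with the central-difference kernel [1 0 -1], divided by 4
// and offset by +128; a scalar loop handles the tail that does not fill a full
// 16-pixel block.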
void convolve_101_row_3x3_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
  assert( w % 16 == 0 && "width must be multiple of 16!" );
  const __m128i* i0 = (const __m128i*)(in);
  const int16_t* i2 = in+2;
  uint8_t* result   = out + 1;
  const int16_t* const end_input = in + w*h;
  const size_t blocked_loops = (w*h-2)/16;
  __m128i offs = _mm_set1_epi16( 128 );
  for( size_t i=0; i != blocked_loops; i++ ) {
    __m128i result_register_lo;
    __m128i result_register_hi;
    __m128i i2_register;

    // low 8 output pixels: ( in[x-1] - in[x+1] ) / 4 + 128
    i2_register        = _mm_loadu_si128( (__m128i*)( i2 ) );
    result_register_lo = *i0;
    result_register_lo = _mm_sub_epi16( result_register_lo, i2_register );
    result_register_lo = _mm_srai_epi16( result_register_lo, 2 );
    result_register_lo = _mm_add_epi16( result_register_lo, offs );

    i0 += 1;
    i2 += 8;

    // high 8 output pixels
    i2_register        = _mm_loadu_si128( (__m128i*)( i2 ) );
    result_register_hi = *i0;
    result_register_hi = _mm_sub_epi16( result_register_hi, i2_register );
    result_register_hi = _mm_srai_epi16( result_register_hi, 2 );
    result_register_hi = _mm_add_epi16( result_register_hi, offs );

    pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
    _mm_storeu_si128( ((__m128i*)( result )), result_register_lo );

    i0 += 1;
    i2 += 8;
    result += 16;
  }
  // scalar tail for the remaining pixels
  for( ; i2 < end_input; i2++, result++ ) {
    *result = ((*(i2-2) - *i2)>>2) + 128;
  }
}
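
// Vertical 5x5 column passes for the Sobel pair: out_v accumulates the five rows
// with weights (1,4,6,4,1), out_h with weights (1,2,0,-2,-1). The input is read
// with aligned 128-bit loads, so it must be 16-byte aligned and w a multiple of 16.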
void convolve_cols_5x5( const unsigned char* in, int16_t* out_v, int16_t* out_h, int w, int h ) {
  memset( out_h, 0, w*h*sizeof(int16_t) );
  memset( out_v, 0, w*h*sizeof(int16_t) );
  assert( w % 16 == 0 && "width must be multiple of 16!" );
  const int w_chunk  = w/16;
  __m128i* i0        = (__m128i*)( in );
  __m128i* i1        = (__m128i*)( in ) + w_chunk*1;
  __m128i* i2        = (__m128i*)( in ) + w_chunk*2;
  __m128i* i3        = (__m128i*)( in ) + w_chunk*3;
  __m128i* i4        = (__m128i*)( in ) + w_chunk*4;
  // outputs are written two rows below the window start (the window center); border rows stay zero
  __m128i* result_h  = (__m128i*)( out_h ) + 4*w_chunk;
  __m128i* result_v  = (__m128i*)( out_v ) + 4*w_chunk;
  __m128i* end_input = (__m128i*)( in ) + w_chunk*h;
  __m128i sixes      = _mm_set1_epi16( 6 );
  __m128i fours      = _mm_set1_epi16( 4 );
  for( ; i4 != end_input; i0++, i1++, i2++, i3++, i4++, result_v+=2, result_h+=2 ) {
    __m128i ilo, ihi;
    // row 0: +1 into out_h and out_v
    unpack_8bit_to_16bit( *i0, ihi, ilo );
    *result_h     = _mm_add_epi16( ihi, *result_h );
    *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
    *result_v     = _mm_add_epi16( *result_v, ihi );
    *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
    // row 1: +2 into out_h, +4 into out_v
    unpack_8bit_to_16bit( *i1, ihi, ilo );
    *result_h     = _mm_add_epi16( ihi, *result_h );
    *result_h     = _mm_add_epi16( ihi, *result_h );
    *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
    *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
    ihi = _mm_mullo_epi16( ihi, fours );
    ilo = _mm_mullo_epi16( ilo, fours );
    *result_v     = _mm_add_epi16( *result_v, ihi );
    *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
    // row 2: 0 into out_h, +6 into out_v
    unpack_8bit_to_16bit( *i2, ihi, ilo );
    ihi = _mm_mullo_epi16( ihi, sixes );
    ilo = _mm_mullo_epi16( ilo, sixes );
    *result_v     = _mm_add_epi16( *result_v, ihi );
    *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
    // row 3: -2 into out_h, +4 into out_v
    unpack_8bit_to_16bit( *i3, ihi, ilo );
    *result_h     = _mm_sub_epi16( *result_h, ihi );
    *result_h     = _mm_sub_epi16( *result_h, ihi );
    *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
    *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
    ihi = _mm_mullo_epi16( ihi, fours );
    ilo = _mm_mullo_epi16( ilo, fours );
    *result_v     = _mm_add_epi16( *result_v, ihi );
    *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
    // row 4: -1 into out_h, +1 into out_v
    unpack_8bit_to_16bit( *i4, ihi, ilo );
    *result_h     = _mm_sub_epi16( *result_h, ihi );
    *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
    *result_v     = _mm_add_epi16( *result_v, ihi );
    *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
  }
}
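
// Vertical convolution with the column kernel (1,1,0,-1,-1), used by the
// 5x5 checkerboard filter.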
void convolve_col_p1p1p0m1m1_5x5( const unsigned char* in, int16_t* out, int w, int h ) {
  memset( out, 0, w*h*sizeof(int16_t) );
  assert( w % 16 == 0 && "width must be multiple of 16!" );
  const int w_chunk  = w/16;
  __m128i* i0        = (__m128i*)( in );
  __m128i* i1        = (__m128i*)( in ) + w_chunk*1;
  __m128i* i3        = (__m128i*)( in ) + w_chunk*3;
  __m128i* i4        = (__m128i*)( in ) + w_chunk*4;
  __m128i* result    = (__m128i*)( out ) + 4*w_chunk;
  __m128i* end_input = (__m128i*)( in ) + w_chunk*h;
  for( ; i4 != end_input; i0++, i1++, i3++, i4++, result+=2 ) {
    // result = i0 + i1 - i3 - i4 (row 2 has weight 0)
    __m128i ilo0, ihi0;
    unpack_8bit_to_16bit( *i0, ihi0, ilo0 );
    __m128i ilo1, ihi1;
    unpack_8bit_to_16bit( *i1, ihi1, ilo1 );
    *result     = _mm_add_epi16( ihi0, ihi1 );
    *(result+1) = _mm_add_epi16( ilo0, ilo1 );
    __m128i ilo, ihi;
    unpack_8bit_to_16bit( *i3, ihi, ilo );
    *result     = _mm_sub_epi16( *result, ihi );
    *(result+1) = _mm_sub_epi16( *(result+1), ilo );
    unpack_8bit_to_16bit( *i4, ihi, ilo );
    *result     = _mm_sub_epi16( *result, ihi );
    *(result+1) = _mm_sub_epi16( *(result+1), ilo );
  }
}
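
// Horizontal convolution with the row kernel (1,1,0,-1,-1) on 16-bit data;
// no rescaling is applied here.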
void convolve_row_p1p1p0m1m1_5x5( const int16_t* in, int16_t* out, int w, int h ) {
  assert( w % 16 == 0 && "width must be multiple of 16!" );
  const __m128i* i0 = (const __m128i*)(in);
  const int16_t* i1 = in+1;
  const int16_t* i3 = in+3;
  const int16_t* i4 = in+4;
  int16_t* result   = out + 2;
  const int16_t* const end_input = in + w*h;
  for( ; i4+8 < end_input; i0 += 1, i1 += 8, i3 += 8, i4 += 8, result += 8 ) {
    __m128i result_register;
    __m128i i0_register = *i0;
    __m128i i1_register = _mm_loadu_si128( (__m128i*)( i1 ) );
    __m128i i3_register = _mm_loadu_si128( (__m128i*)( i3 ) );
    __m128i i4_register = _mm_loadu_si128( (__m128i*)( i4 ) );
    result_register = _mm_add_epi16( i0_register, i1_register );
    result_register = _mm_sub_epi16( result_register, i3_register );
    result_register = _mm_sub_epi16( result_register, i4_register );
    _mm_storeu_si128( ((__m128i*)( result )), result_register );
  }
}
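
// Vertical 3x3 column passes for the Sobel pair: out_v accumulates the three rows
// with weights (1,2,1), out_h with weights (1,0,-1).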
void convolve_cols_3x3( const unsigned char* in, int16_t* out_v, int16_t* out_h, int w, int h ) {
  assert( w % 16 == 0 && "width must be multiple of 16!" );
  const int w_chunk  = w/16;
  __m128i* i0        = (__m128i*)( in );
  __m128i* i1        = (__m128i*)( in ) + w_chunk*1;
  __m128i* i2        = (__m128i*)( in ) + w_chunk*2;
  __m128i* result_h  = (__m128i*)( out_h ) + 2*w_chunk;
  __m128i* result_v  = (__m128i*)( out_v ) + 2*w_chunk;
  __m128i* end_input = (__m128i*)( in ) + w_chunk*h;
  for( ; i2 != end_input; i0++, i1++, i2++, result_v+=2, result_h+=2 ) {
    *result_h     = _mm_setzero_si128();
    *(result_h+1) = _mm_setzero_si128();
    *result_v     = _mm_setzero_si128();
    *(result_v+1) = _mm_setzero_si128();
    __m128i ilo, ihi;
    // row 0: +1 into out_h and out_v
    unpack_8bit_to_16bit( *i0, ihi, ilo );
    *result_h     = _mm_add_epi16( ihi, *result_h );
    *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
    *result_v     = _mm_add_epi16( *result_v, ihi );
    *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
    // row 1: +2 into out_v only
    unpack_8bit_to_16bit( *i1, ihi, ilo );
    *result_v     = _mm_add_epi16( *result_v, ihi );
    *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
    *result_v     = _mm_add_epi16( *result_v, ihi );
    *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
    // row 2: -1 into out_h, +1 into out_v
    unpack_8bit_to_16bit( *i2, ihi, ilo );
    *result_h     = _mm_sub_epi16( *result_h, ihi );
    *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
    *result_v     = _mm_add_epi16( *result_v, ihi );
    *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
  }
}
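
// 3x3 Sobel filter: one vertical column pass followed by the two horizontal row
// passes. Responses are scaled and offset by +128, so a zero gradient maps to 128.
// All buffers must hold w*h pixels, with w a multiple of 16 and in 16-byte aligned.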
void sobel3x3( const uint8_t* in, uint8_t* out_v, uint8_t* out_h, int w, int h ) {
  int16_t* temp_h = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
  int16_t* temp_v = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
  detail::convolve_cols_3x3( in, temp_v, temp_h, w, h );
  detail::convolve_101_row_3x3_16bit( temp_v, out_v, w, h );
  detail::convolve_121_row_3x3_16bit( temp_h, out_h, w, h );
  _mm_free( temp_h );
  _mm_free( temp_v );
}
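
// 5x5 Sobel filter, same structure as sobel3x3 but with the larger binomial and
// derivative kernels and a 1/128 scaling.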
void sobel5x5( const uint8_t* in, uint8_t* out_v, uint8_t* out_h, int w, int h ) {
  int16_t* temp_h = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
  int16_t* temp_v = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
  detail::convolve_cols_5x5( in, temp_v, temp_h, w, h );
  detail::convolve_12021_row_5x5_16bit( temp_v, out_v, w, h );
  detail::convolve_14641_row_5x5_16bit( temp_h, out_h, w, h );
  _mm_free( temp_h );
  _mm_free( temp_v );
}
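
// 5x5 checkerboard filter: separable (1,1,0,-1,-1) column pass followed by the
// matching row pass; the 16-bit output is not rescaled.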
void checkerboard5x5( const uint8_t* in, int16_t* out, int w, int h ) {
  int16_t* temp = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
  detail::convolve_col_p1p1p0m1m1_5x5( in, temp, w, h );
  detail::convolve_row_p1p1p0m1m1_5x5( temp, out, w, h );
  _mm_free( temp );
}
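
// 5x5 blob filter computed from box sums over an integral image rather than an
// explicit convolution.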
void blob5x5( const uint8_t* in, int16_t* out, int w, int h ) {
  int32_t* integral = (int32_t*)( _mm_malloc( w*h*sizeof( int32_t ), 16 ) );
  detail::integral_image( in, integral, w, h );
  int16_t* out_ptr   = out + 3 + 3*w;
  int16_t* out_end   = out + w*h - 2 - 2*w;
  // corners of the 5x5 box and of its inner 3x3 box in the integral image
  const int32_t* i00 = integral;
  const int32_t* i50 = integral + 5;
  const int32_t* i05 = integral + 5*w;
  const int32_t* i55 = integral + 5 + 5*w;
  const int32_t* i11 = integral + 1 + 1*w;
  const int32_t* i41 = integral + 4 + 1*w;
  const int32_t* i14 = integral + 1 + 4*w;
  const int32_t* i44 = integral + 4 + 4*w;
  const uint8_t* im22 = in + 3 + 3*w;  // center pixel of the 5x5 window
  for( ; out_ptr != out_end; out_ptr++, i00++, i50++, i05++, i55++, i11++, i41++, i14++, i44++, im22++ ) {
    // -1 over the full 5x5 box, +2 over the inner 3x3 box (box sums via the integral image)
    int32_t result = -( *i55 - *i50 - *i05 + *i00 );
    result += 2*( *i44 - *i41 - *i14 + *i11 );
    // center-pixel term; the weight 7 is an assumption chosen so that the effective
    // 5x5 mask (outer ring -1, inner ring +1, center +8) sums to zero
    result += 7 * (int32_t)(*im22);
    *out_ptr = (int16_t)result;
  }
  _mm_free( integral );
}
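
// Usage sketch (illustrative only; buffer names and sizes are assumptions, and the
// calls may need namespace qualification depending on where these functions live):
//
//   int w = 640, h = 480;                             // w must be a multiple of 16
//   uint8_t* img = (uint8_t*)_mm_malloc( w*h, 16 );   // 16-byte aligned grayscale input
//   uint8_t* gv  = (uint8_t*)_mm_malloc( w*h, 16 );
//   uint8_t* gh  = (uint8_t*)_mm_malloc( w*h, 16 );
//   // ... fill img with pixel data ...
//   sobel5x5( img, gv, gh, w, h );                    // gradient responses centered at 128
//   _mm_free( img ); _mm_free( gv ); _mm_free( gh );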