stereo-vision
All Data Structures Namespaces Functions Modules Pages
filter.cpp
1 /*
2 Copyright 2011. All rights reserved.
3 Institute of Measurement and Control Systems
4 Karlsruhe Institute of Technology, Germany
5 
6 This file is part of libelas.
7 Authors: Julius Ziegler, Andreas Geiger
8 
9 libelas is free software; you can redistribute it and/or modify it under the
10 terms of the GNU General Public License as published by the Free Software
11 Foundation; either version 3 of the License, or any later version.
12 
13 libelas is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
15 PARTICULAR PURPOSE. See the GNU General Public License for more details.
16 
17 You should have received a copy of the GNU General Public License along with
18 libelas; if not, write to the Free Software Foundation, Inc., 51 Franklin
19 Street, Fifth Floor, Boston, MA 02110-1301, USA
20 */
21 
22 #include <stdio.h>
23 #include <string.h>
24 #include <stdint.h>
25 #include <cassert>
26 #include "filter.h"
27 
28 // fast filters: implements 3x3 and 5x5 sobel filters and
29 // 5x5 blob and corner filters based on SSE2/3 instructions
30 namespace filter {
31 
32  // private namespace, public user functions at the bottom of this file
33  namespace detail {
// Builds a 2-D integral image: out(r,c) = sum of in over rows 0..r, cols 0..c.
// in:  8-bit image, row-major, w*h pixels
// out: 32-bit accumulator image of the same size
void integral_image( const uint8_t* in, int32_t* out, int w, int h ) {
  // first row: plain horizontal prefix sum
  int32_t row_sum = 0;
  for( int x = 0; x < w; x++ ) {
    row_sum += in[x];
    out[x] = row_sum;
  }
  // remaining rows: prefix sum of the row plus the integral cell directly above
  for( int y = 1; y < h; y++ ) {
    const uint8_t* src   = in  + y*w;
    const int32_t* above = out + (y-1)*w;
    int32_t*       dst   = out + y*w;
    row_sum = 0;
    for( int x = 0; x < w; x++ ) {
      row_sum += src[x];
      dst[x] = above[x] + row_sum;
    }
  }
}
52 
// Zero-extends the 16 unsigned 8-bit lanes of a into two registers of eight
// 16-bit lanes each: b0 receives the low 8 bytes, b1 the high 8 bytes.
void unpack_8bit_to_16bit( const __m128i a, __m128i& b0, __m128i& b1 ) {
  // interleaving with a zero register performs the unsigned widening
  const __m128i zero_reg = _mm_setzero_si128();
  b0 = _mm_unpacklo_epi8( a, zero_reg );
  b1 = _mm_unpackhi_epi8( a, zero_reg );
}
58 
// Packs two registers of eight signed 16-bit lanes into one register of 16
// unsigned 8-bit lanes, saturating every lane to [0,255].
// a0 supplies the low 8 output bytes, a1 the high 8.
void pack_16bit_to_8bit_saturate( const __m128i a0, const __m128i a1, __m128i& b ) {
  b = _mm_packus_epi16( a0, a1 );
}
62 
// Convolve image rows with the (1,4,6,4,1) vector: the horizontal smoothing
// pass of the 5x5 Sobel filter. 16-bit input, 8-bit output.
// NOTE(review): despite the original comment, the result is NOT accumulated
// into the output -- the accumulator register is zeroed each iteration and
// the output is overwritten.
// Each result is scaled by 1/128 (arithmetic shift by 7), shifted by +128 and
// saturated to [0,255]. Results start at out+2 so the 5-tap kernel stays
// centered. Requires w % 16 == 0 and 16-byte aligned `in` (aligned *i0 loads).
// NOTE(review): the loop runs while i4 < end_input and every iteration
// consumes 16 samples / stores 16 bytes, so the final iteration can read and
// write a few elements past in+w*h / out+w*h-1 -- callers appear to rely on
// suitably padded buffers; confirm before reusing elsewhere.
void convolve_14641_row_5x5_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
  assert( w % 16 == 0 && "width must be multiple of 16!" );
  const __m128i* i0 = (const __m128i*)(in);   // tap at offset 0 (aligned loads)
  const int16_t* i1 = in+1;                   // taps at offsets 1..4 (unaligned)
  const int16_t* i2 = in+2;
  const int16_t* i3 = in+3;
  const int16_t* i4 = in+4;
  uint8_t* result = out + 2;                  // center the kernel on the output
  const int16_t* const end_input = in + w*h;
  __m128i offs = _mm_set1_epi16( 128 );       // +128 output offset
  for( ; i4 < end_input; i0 += 1, i1 += 8, i2 += 8, i3 += 8, i4 += 8, result += 16 ) {
    __m128i result_register_lo;
    __m128i result_register_hi;
    // pass i==0 fills the low 8 results, i==1 the high 8; the tap pointers
    // are advanced by 8 between the passes (see the if at the bottom)
    for( int i=0; i<2; i++ ) {
      __m128i* result_register;
      if( i==0 ) result_register = &result_register_lo;
      else result_register = &result_register_hi;
      __m128i i0_register = *i0;
      __m128i i1_register = _mm_loadu_si128( (__m128i*)( i1 ) );
      __m128i i2_register = _mm_loadu_si128( (__m128i*)( i2 ) );
      __m128i i3_register = _mm_loadu_si128( (__m128i*)( i3 ) );
      __m128i i4_register = _mm_loadu_si128( (__m128i*)( i4 ) );
      *result_register = _mm_setzero_si128();
      *result_register = _mm_add_epi16( i0_register, *result_register );  // + 1*i0
      i1_register = _mm_add_epi16( i1_register, i1_register );            // 2*i1
      i1_register = _mm_add_epi16( i1_register, i1_register );            // 4*i1
      *result_register = _mm_add_epi16( i1_register, *result_register );  // + 4*i1
      i2_register = _mm_add_epi16( i2_register, i2_register );            // 2*i2
      *result_register = _mm_add_epi16( i2_register, *result_register );  // + 2*i2
      i2_register = _mm_add_epi16( i2_register, i2_register );            // 4*i2
      *result_register = _mm_add_epi16( i2_register, *result_register );  // + 4*i2 (6*i2 total)
      i3_register = _mm_add_epi16( i3_register, i3_register );            // 2*i3
      i3_register = _mm_add_epi16( i3_register, i3_register );            // 4*i3
      *result_register = _mm_add_epi16( i3_register, *result_register );  // + 4*i3
      *result_register = _mm_add_epi16( i4_register, *result_register );  // + 1*i4
      *result_register = _mm_srai_epi16( *result_register, 7 );           // scale by 1/128
      *result_register = _mm_add_epi16( *result_register, offs );         // shift to [0,255]
      if( i==0 ) {
        i0 += 1;
        i1 += 8;
        i2 += 8;
        i3 += 8;
        i4 += 8;
      }
    }
    // saturating pack of all 16 results, unaligned store
    pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
    _mm_storeu_si128( ((__m128i*)( result )), result_register_lo );
  }
}
114 
// Convolve image rows with the (1,2,0,-2,-1) vector: the horizontal
// derivative pass of the 5x5 Sobel filter. 16-bit input, 8-bit output.
// NOTE(review): despite the original comment, the result is NOT accumulated
// into the output -- the accumulator is zeroed every iteration and the
// output is overwritten.
// Each result is scaled by 1/128, shifted by +128 and saturated to [0,255].
// Results start at out+2 (kernel centering). Requires w % 16 == 0 and a
// 16-byte aligned `in` (aligned *i0 loads).
// NOTE(review): as in convolve_14641_row_5x5_16bit, the final iteration can
// read/store a few elements past the end of the buffers -- confirm sizes.
void convolve_12021_row_5x5_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
  assert( w % 16 == 0 && "width must be multiple of 16!" );
  const __m128i* i0 = (const __m128i*)(in);   // tap 0 (aligned)
  const int16_t* i1 = in+1;                   // tap +1 (tap +2 has weight 0)
  const int16_t* i3 = in+3;                   // tap +3
  const int16_t* i4 = in+4;                   // tap +4
  uint8_t* result = out + 2;
  const int16_t* const end_input = in + w*h;
  __m128i offs = _mm_set1_epi16( 128 );       // +128 output offset
  for( ; i4 < end_input; i0 += 1, i1 += 8, i3 += 8, i4 += 8, result += 16 ) {
    __m128i result_register_lo;
    __m128i result_register_hi;
    // pass i==0 computes the low 8 results, i==1 the high 8
    for( int i=0; i<2; i++ ) {
      __m128i* result_register;
      if( i==0 ) result_register = &result_register_lo;
      else result_register = &result_register_hi;
      __m128i i0_register = *i0;
      __m128i i1_register = _mm_loadu_si128( (__m128i*)( i1 ) );
      __m128i i3_register = _mm_loadu_si128( (__m128i*)( i3 ) );
      __m128i i4_register = _mm_loadu_si128( (__m128i*)( i4 ) );
      *result_register = _mm_setzero_si128();
      *result_register = _mm_add_epi16( i0_register, *result_register );  // + 1*i0
      i1_register = _mm_add_epi16( i1_register, i1_register );            // 2*i1
      *result_register = _mm_add_epi16( i1_register, *result_register );  // + 2*i1
      i3_register = _mm_add_epi16( i3_register, i3_register );            // 2*i3
      *result_register = _mm_sub_epi16( *result_register, i3_register );  // - 2*i3
      *result_register = _mm_sub_epi16( *result_register, i4_register );  // - 1*i4
      *result_register = _mm_srai_epi16( *result_register, 7 );           // scale by 1/128
      *result_register = _mm_add_epi16( *result_register, offs );         // shift to [0,255]
      if( i==0 ) {
        i0 += 1;
        i1 += 8;
        i3 += 8;
        i4 += 8;
      }
    }
    // saturating pack of all 16 results, unaligned store
    pack_16bit_to_8bit_saturate( result_register_lo, result_register_hi, result_register_lo );
    _mm_storeu_si128( ((__m128i*)( result )), result_register_lo );
  }
}
158 
// Convolve image rows with the (1,2,1) vector: the horizontal smoothing pass
// of the 3x3 Sobel filter. 16-bit input, 8-bit output.
// Each result is (in[j-1] + 2*in[j] + in[j+1]) / 4 + 128, saturated to
// [0,255]. Results start at out+1 so the kernel stays centered; out[0] and
// the very last sample are left untouched.
// Requires w % 16 == 0 and 16-byte aligned `in` (aligned *i0 loads).
void convolve_121_row_3x3_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
  assert( w % 16 == 0 && "width must be multiple of 16!" );
  const __m128i* i0 = (const __m128i*)(in);
  const int16_t* i1 = in+1;
  const int16_t* i2 = in+2;
  uint8_t* result = out + 1;
  const int16_t* const end_input = in + w*h;
  const size_t blocked_loops = (w*h-2)/16;
  __m128i offs = _mm_set1_epi16( 128 );
  for( size_t i=0; i != blocked_loops; i++ ) {
    __m128i result_register_lo;
    __m128i result_register_hi;
    __m128i i1_register;
    __m128i i2_register;

    // low 8 results: in[j-1] + 2*in[j] + in[j+1]
    i1_register = _mm_loadu_si128( (__m128i*)( i1 ) );
    i2_register = _mm_loadu_si128( (__m128i*)( i2 ) );
    result_register_lo = *i0;
    i1_register = _mm_add_epi16( i1_register, i1_register );            // 2*center
    result_register_lo = _mm_add_epi16( i1_register, result_register_lo );
    result_register_lo = _mm_add_epi16( i2_register, result_register_lo );
    result_register_lo = _mm_srai_epi16( result_register_lo, 2 );       // scale by 1/4
    result_register_lo = _mm_add_epi16( result_register_lo, offs );     // +128

    i0++;
    i1+=8;
    i2+=8;

    // high 8 results
    i1_register = _mm_loadu_si128( (__m128i*)( i1 ) );
    i2_register = _mm_loadu_si128( (__m128i*)( i2 ) );
    result_register_hi = *i0;
    i1_register = _mm_add_epi16( i1_register, i1_register );
    result_register_hi = _mm_add_epi16( i1_register, result_register_hi );
    result_register_hi = _mm_add_epi16( i2_register, result_register_hi );
    result_register_hi = _mm_srai_epi16( result_register_hi, 2 );
    result_register_hi = _mm_add_epi16( result_register_hi, offs );

    i0++;
    i1+=8;
    i2+=8;

    // saturating pack to 8 bit, store 16 results at once
    _mm_storeu_si128( ((__m128i*)( result )), _mm_packus_epi16( result_register_lo, result_register_hi ) );

    result += 16;
  }

  // Fix: process the trailing (w*h-2) % 16 samples which the blocked loop
  // left unwritten (convolve_101_row_3x3_16bit already had such a tail, and
  // end_input was previously computed but never used). Clamp to [0,255] to
  // match the SIMD path's saturating pack.
  for( ; i2 < end_input; i2++, result++ ) {
    int32_t v = ( ( *(i2-2) + 2 * *(i2-1) + *i2 ) >> 2 ) + 128;
    if( v < 0 ) v = 0;
    if( v > 255 ) v = 255;
    *result = (uint8_t)v;
  }
}
209 
// Convolve image rows with the (1,0,-1) vector: the horizontal derivative
// pass of the 3x3 Sobel filter. 16-bit input, 8-bit output.
// Each result is (in[j-1] - in[j+1]) / 4 + 128, saturated to [0,255].
// Results start at out+1 so the kernel stays centered; out[0] and the very
// last sample are left untouched.
// Requires w % 16 == 0 and 16-byte aligned `in` (aligned *i0 loads).
void convolve_101_row_3x3_16bit( const int16_t* in, uint8_t* out, int w, int h ) {
  assert( w % 16 == 0 && "width must be multiple of 16!" );
  const __m128i* i0 = (const __m128i*)(in);
  const int16_t* i2 = in+2;
  uint8_t* result = out + 1;
  const int16_t* const end_input = in + w*h;
  const size_t blocked_loops = (w*h-2)/16;
  __m128i offs = _mm_set1_epi16( 128 );
  for( size_t i=0; i != blocked_loops; i++ ) {
    __m128i result_register_lo;
    __m128i result_register_hi;
    __m128i i2_register;

    // low 8 results: in[j-1] - in[j+1]
    i2_register = _mm_loadu_si128( (__m128i*)( i2 ) );
    result_register_lo = *i0;
    result_register_lo = _mm_sub_epi16( result_register_lo, i2_register );
    result_register_lo = _mm_srai_epi16( result_register_lo, 2 );       // scale by 1/4
    result_register_lo = _mm_add_epi16( result_register_lo, offs );     // +128

    i0 += 1;
    i2 += 8;

    // high 8 results
    i2_register = _mm_loadu_si128( (__m128i*)( i2 ) );
    result_register_hi = *i0;
    result_register_hi = _mm_sub_epi16( result_register_hi, i2_register );
    result_register_hi = _mm_srai_epi16( result_register_hi, 2 );
    result_register_hi = _mm_add_epi16( result_register_hi, offs );

    i0 += 1;
    i2 += 8;

    // saturating pack to 8 bit, store 16 results at once
    _mm_storeu_si128( ((__m128i*)( result )), _mm_packus_epi16( result_register_lo, result_register_hi ) );

    result += 16;
  }

  // scalar tail for the trailing samples.
  // Fix: clamp to [0,255] like the SIMD path's saturating pack -- the
  // original truncated, so out-of-range values wrapped around near the end
  // of the image.
  for( ; i2 < end_input; i2++, result++ ) {
    int32_t v = ( ( *(i2-2) - *i2 ) >> 2 ) + 128;
    if( v < 0 ) v = 0;
    if( v > 255 ) v = 255;
    *result = (uint8_t)v;
  }
}
254 
// Vertical column pass of the 5x5 Sobel filter over an 8-bit image,
// producing two 16-bit intermediate images: out_v accumulates the smoothing
// kernel (1,4,6,4,1)^T and out_h the derivative kernel (1,2,0,-2,-1)^T.
// Both outputs are zeroed first, then accumulated. Result rows are written
// with a 4-row offset (result pointers start at 4*w_chunk), i.e. the window
// over input rows r..r+4 is stored at output row r+4; the first four output
// rows keep their memset value of 0.
// Requires w % 16 == 0; in/out_v/out_h are assumed 16-byte aligned (direct
// __m128i dereferences).
// NOTE(review): unpack_8bit_to_16bit(x, b0, b1) puts the LOW 8 pixels in b0;
// the calls below pass (ihi, ilo), so the variable named "ihi" actually
// holds the low half. The stores are ordered consistently, so the results
// are correct -- only the names are misleading.
void convolve_cols_5x5( const unsigned char* in, int16_t* out_v, int16_t* out_h, int w, int h ) {
  using namespace std;
  memset( out_h, 0, w*h*sizeof(int16_t) );
  memset( out_v, 0, w*h*sizeof(int16_t) );
  assert( w % 16 == 0 && "width must be multiple of 16!" );
  const int w_chunk = w/16;                      // 16 pixels per __m128i load
  __m128i* i0 = (__m128i*)( in );                // row r
  __m128i* i1 = (__m128i*)( in ) + w_chunk*1;    // row r+1
  __m128i* i2 = (__m128i*)( in ) + w_chunk*2;    // row r+2
  __m128i* i3 = (__m128i*)( in ) + w_chunk*3;    // row r+3
  __m128i* i4 = (__m128i*)( in ) + w_chunk*4;    // row r+4
  __m128i* result_h = (__m128i*)( out_h ) + 4*w_chunk;
  __m128i* result_v = (__m128i*)( out_v ) + 4*w_chunk;
  __m128i* end_input = (__m128i*)( in ) + w_chunk*h;
  __m128i sixes = _mm_set1_epi16( 6 );
  __m128i fours = _mm_set1_epi16( 4 );
  for( ; i4 != end_input; i0++, i1++, i2++, i3++, i4++, result_v+=2, result_h+=2 ) {
    __m128i ilo, ihi;
    // row r: weight +1 for both kernels
    unpack_8bit_to_16bit( *i0, ihi, ilo );
    *result_h = _mm_add_epi16( ihi, *result_h );
    *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
    *result_v = _mm_add_epi16( *result_v, ihi );
    *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
    // row r+1: +2 for the derivative kernel (added twice), +4 for smoothing
    unpack_8bit_to_16bit( *i1, ihi, ilo );
    *result_h = _mm_add_epi16( ihi, *result_h );
    *result_h = _mm_add_epi16( ihi, *result_h );
    *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
    *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
    ihi = _mm_mullo_epi16( ihi, fours );
    ilo = _mm_mullo_epi16( ilo, fours );
    *result_v = _mm_add_epi16( *result_v, ihi );
    *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
    // row r+2: weight 0 for the derivative kernel, +6 for smoothing
    unpack_8bit_to_16bit( *i2, ihi, ilo );
    ihi = _mm_mullo_epi16( ihi, sixes );
    ilo = _mm_mullo_epi16( ilo, sixes );
    *result_v = _mm_add_epi16( *result_v, ihi );
    *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
    // row r+3: -2 for the derivative kernel (subtracted twice), +4 for smoothing
    unpack_8bit_to_16bit( *i3, ihi, ilo );
    *result_h = _mm_sub_epi16( *result_h, ihi );
    *result_h = _mm_sub_epi16( *result_h, ihi );
    *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
    *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
    ihi = _mm_mullo_epi16( ihi, fours );
    ilo = _mm_mullo_epi16( ilo, fours );
    *result_v = _mm_add_epi16( *result_v, ihi );
    *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
    // row r+4: -1 for the derivative kernel, +1 for smoothing
    unpack_8bit_to_16bit( *i4, ihi, ilo );
    *result_h = _mm_sub_epi16( *result_h, ihi );
    *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
    *result_v = _mm_add_epi16( *result_v, ihi );
    *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
  }
}
308 
// Vertical (1,1,0,-1,-1)^T column convolution on an 8-bit image, 16-bit
// output: column pass of the 5x5 checkerboard filter.
// The output is zeroed first; the loop then OVERWRITES (not accumulates) the
// covered rows, so the memset only matters for the 4 border rows the loop
// never reaches. Result rows are written with a 4-row offset, as in
// convolve_cols_5x5.
// Requires w % 16 == 0; in/out are assumed 16-byte aligned.
// NOTE(review): as elsewhere in this file, the "ihi*" variables actually
// receive the LOW 8 pixels from unpack_8bit_to_16bit; the store order is
// consistent, so results are correct.
void convolve_col_p1p1p0m1m1_5x5( const unsigned char* in, int16_t* out, int w, int h ) {
  memset( out, 0, w*h*sizeof(int16_t) );
  using namespace std;
  assert( w % 16 == 0 && "width must be multiple of 16!" );
  const int w_chunk = w/16;                    // 16 pixels per __m128i load
  __m128i* i0 = (__m128i*)( in );              // row r   (weight +1)
  __m128i* i1 = (__m128i*)( in ) + w_chunk*1;  // row r+1 (weight +1)
  __m128i* i3 = (__m128i*)( in ) + w_chunk*3;  // row r+3 (weight -1)
  __m128i* i4 = (__m128i*)( in ) + w_chunk*4;  // row r+4 (weight -1)
  __m128i* result = (__m128i*)( out ) + 4*w_chunk;
  __m128i* end_input = (__m128i*)( in ) + w_chunk*h;
  for( ; i4 != end_input; i0++, i1++, i3++, i4++, result+=2 ) {
    __m128i ilo0, ihi0;
    unpack_8bit_to_16bit( *i0, ihi0, ilo0 );
    __m128i ilo1, ihi1;
    unpack_8bit_to_16bit( *i1, ihi1, ilo1 );
    *result = _mm_add_epi16( ihi0, ihi1 );               // i0 + i1
    *(result+1) = _mm_add_epi16( ilo0, ilo1 );
    __m128i ilo, ihi;
    unpack_8bit_to_16bit( *i3, ihi, ilo );
    *result = _mm_sub_epi16( *result, ihi );             // - i3
    *(result+1) = _mm_sub_epi16( *(result+1), ilo );
    unpack_8bit_to_16bit( *i4, ihi, ilo );
    *result = _mm_sub_epi16( *result, ihi );             // - i4
    *(result+1) = _mm_sub_epi16( *(result+1), ilo );
  }
}
336 
// Horizontal (1,1,0,-1,-1) row convolution on 16-bit data: row pass of the
// 5x5 checkerboard filter. For each sample j,
//   out[j] = in[j-2] + in[j-1] - in[j+1] - in[j+2]
// written starting at out+2 (kernel centering); processes 8 samples per
// iteration and stops before the rightmost tap would cross the input end,
// so the last few samples (and out[0..1]) are left untouched.
// Requires w % 16 == 0 and 16-byte aligned `in` (aligned tap-0 loads).
void convolve_row_p1p1p0m1m1_5x5( const int16_t* in, int16_t* out, int w, int h ) {
  assert( w % 16 == 0 && "width must be multiple of 16!" );
  const int16_t* src = in;
  const int16_t* const end_input = in + w*h;
  int16_t* dst = out + 2;
  while( src + 4 + 8 < end_input ) {
    const __m128i tap0 = _mm_load_si128( (const __m128i*)( src ) );
    const __m128i tap1 = _mm_loadu_si128( (const __m128i*)( src + 1 ) );
    const __m128i tap3 = _mm_loadu_si128( (const __m128i*)( src + 3 ) );
    const __m128i tap4 = _mm_loadu_si128( (const __m128i*)( src + 4 ) );
    const __m128i sum = _mm_sub_epi16( _mm_sub_epi16( _mm_add_epi16( tap0, tap1 ), tap3 ), tap4 );
    _mm_storeu_si128( (__m128i*)( dst ), sum );
    src += 8;
    dst += 8;
  }
}
357 
358  void convolve_cols_3x3( const unsigned char* in, int16_t* out_v, int16_t* out_h, int w, int h ) {
359  using namespace std;
360  assert( w % 16 == 0 && "width must be multiple of 16!" );
361  const int w_chunk = w/16;
362  __m128i* i0 = (__m128i*)( in );
363  __m128i* i1 = (__m128i*)( in ) + w_chunk*1;
364  __m128i* i2 = (__m128i*)( in ) + w_chunk*2;
365  __m128i* result_h = (__m128i*)( out_h ) + 2*w_chunk;
366  __m128i* result_v = (__m128i*)( out_v ) + 2*w_chunk;
367  __m128i* end_input = (__m128i*)( in ) + w_chunk*h;
368  for( ; i2 != end_input; i0++, i1++, i2++, result_v+=2, result_h+=2 ) {
369  *result_h = _mm_setzero_si128();
370  *(result_h+1) = _mm_setzero_si128();
371  *result_v = _mm_setzero_si128();
372  *(result_v+1) = _mm_setzero_si128();
373  __m128i ilo, ihi;
374  unpack_8bit_to_16bit( *i0, ihi, ilo );
375  unpack_8bit_to_16bit( *i0, ihi, ilo );
376  *result_h = _mm_add_epi16( ihi, *result_h );
377  *(result_h+1) = _mm_add_epi16( ilo, *(result_h+1) );
378  *result_v = _mm_add_epi16( *result_v, ihi );
379  *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
380  unpack_8bit_to_16bit( *i1, ihi, ilo );
381  *result_v = _mm_add_epi16( *result_v, ihi );
382  *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
383  *result_v = _mm_add_epi16( *result_v, ihi );
384  *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
385  unpack_8bit_to_16bit( *i2, ihi, ilo );
386  *result_h = _mm_sub_epi16( *result_h, ihi );
387  *(result_h+1) = _mm_sub_epi16( *(result_h+1), ilo );
388  *result_v = _mm_add_epi16( *result_v, ihi );
389  *(result_v+1) = _mm_add_epi16( *(result_v+1), ilo );
390  }
391  }
392  };
393 
394  void sobel3x3( const uint8_t* in, uint8_t* out_v, uint8_t* out_h, int w, int h ) {
395  int16_t* temp_h = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
396  int16_t* temp_v = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
397  detail::convolve_cols_3x3( in, temp_v, temp_h, w, h );
398  detail::convolve_101_row_3x3_16bit( temp_v, out_v, w, h );
399  detail::convolve_121_row_3x3_16bit( temp_h, out_h, w, h );
400  _mm_free( temp_h );
401  _mm_free( temp_v );
402  }
403 
404  void sobel5x5( const uint8_t* in, uint8_t* out_v, uint8_t* out_h, int w, int h ) {
405  int16_t* temp_h = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
406  int16_t* temp_v = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
407  detail::convolve_cols_5x5( in, temp_v, temp_h, w, h );
408  detail::convolve_12021_row_5x5_16bit( temp_v, out_v, w, h );
409  detail::convolve_14641_row_5x5_16bit( temp_h, out_h, w, h );
410  _mm_free( temp_h );
411  _mm_free( temp_v );
412  }
413 
414  // -1 -1 0 1 1
415  // -1 -1 0 1 1
416  // 0 0 0 0 0
417  // 1 1 0 -1 -1
418  // 1 1 0 -1 -1
419  void checkerboard5x5( const uint8_t* in, int16_t* out, int w, int h ) {
420  int16_t* temp = (int16_t*)( _mm_malloc( w*h*sizeof( int16_t ), 16 ) );
421  detail::convolve_col_p1p1p0m1m1_5x5( in, temp, w, h );
422  detail::convolve_row_p1p1p0m1m1_5x5( temp, out, w, h );
423  _mm_free( temp );
424  }
425 
// -1 -1 -1 -1 -1
// -1 1 1 1 -1
// -1 1 8 1 -1
// -1 1 1 1 -1
// -1 -1 -1 -1 -1
// Applies the 5x5 blob mask above via an integral image. For a window
// centered at a pixel:
//   response = -sum5x5 + 2*sum3x3 + 7*center
// which equals the mask (-1 border, +1 inner ring, +8 center), since the
// center is counted once in sum5x5 and twice in 2*sum3x3.
void blob5x5( const uint8_t* in, int16_t* out, int w, int h ) {
  // integral(r,c) = sum of in over rows 0..r, cols 0..c (see integral_image)
  int32_t* integral = (int32_t*)( _mm_malloc( w*h*sizeof( int32_t ), 16 ) );
  detail::integral_image( in, integral, w, h );
  // skip the top/bottom border rows; outputs outside [out_ptr, out_end) are
  // left untouched
  int16_t* out_ptr = out + 3 + 3*w;
  int16_t* out_end = out + w * h - 2 - 2*w;
  // corner samples of the 5x5 box: I(5,5)-I(0,5)-I(5,0)+I(0,0) sums
  // rows 1..5 x cols 1..5, i.e. the 5x5 window centered at (3,3)
  const int32_t* i00 = integral;
  const int32_t* i50 = integral + 5;
  const int32_t* i05 = integral + 5*w;
  const int32_t* i55 = integral + 5 + 5*w;
  // corner samples of the inner 3x3 box (rows/cols 2..4, same center)
  const int32_t* i11 = integral + 1 + 1*w;
  const int32_t* i41 = integral + 4 + 1*w;
  const int32_t* i14 = integral + 1 + 4*w;
  const int32_t* i44 = integral + 4 + 4*w;
  // center pixel at (3,3), read directly from the input image
  // (despite the name "im22")
  const uint8_t* im22 = in + 3 + 3*w;
  // NOTE(review): the loop walks the whole row-major range with no per-row
  // bounds check, so windows near the left/right image border wrap across
  // rows -- border responses are not meaningful.
  for( ; out_ptr != out_end; out_ptr++, i00++, i50++, i05++, i55++, i11++, i41++, i14++, i44++, im22++ ) {
    int32_t result = 0;
    result = -( *i55 - *i50 - *i05 + *i00 );    // -1 * (sum over the 5x5 box)
    result += 2*( *i44 - *i41 - *i14 + *i11 );  // +2 * (sum over the inner 3x3)
    result += 7* *im22;                         // center pixel -> net weight 8
    // narrowing int32 -> int16 is safe: |result| <= 25*255 = 6375
    *out_ptr = result;
  }
  _mm_free( integral );
}
454 };