stereo-vision
util.h
#ifndef UTIL_H_
#define UTIL_H_

#include <iostream>
#include <dirent.h>
#include <stdio.h>
#include <stdlib.h>        // exit
#include <stdint.h>        // uint8_t, uint32_t, uint64_t
#include <limits.h>        // INT_MAX
#include <cuda_runtime.h>  // cudaError_t, cudaGetErrorString

#define FERMI false

#define GPU_THREADS_PER_BLOCK_FERMI 256
#define GPU_THREADS_PER_BLOCK_MAXWELL 64

/* Defines related to GPU architecture */
#if FERMI
    #define GPU_THREADS_PER_BLOCK GPU_THREADS_PER_BLOCK_FERMI
#else
    #define GPU_THREADS_PER_BLOCK GPU_THREADS_PER_BLOCK_MAXWELL
#endif

#define WARP_SIZE 32

static void CheckCudaErrorAux(const char *, unsigned, const char *, cudaError_t);
#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__, __LINE__, #value, value)

/* Checks the return value of a CUDA runtime call and aborts with the failing
   statement, file and line if it did not return cudaSuccess. */
static void CheckCudaErrorAux(const char *file, unsigned line, const char *statement, cudaError_t err) {
    if (err == cudaSuccess)
        return;
    std::cerr << statement << " returned " << cudaGetErrorString(err)
              << " (" << err << ") at " << file << ":" << line << std::endl;
    exit(1);
}
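
/* Usage sketch (illustrative, not part of the original header): wrapping CUDA
   runtime calls in CUDA_CHECK_RETURN so any failure aborts with the offending
   statement, file and line. The buffer name and size are hypothetical. */
inline void example_cuda_check_usage() {
    uint32_t *d_cost = NULL;
    const size_t bytes = 1024 * sizeof(uint32_t);
    CUDA_CHECK_RETURN(cudaMalloc((void **)&d_cost, bytes));
    CUDA_CHECK_RETURN(cudaMemset(d_cost, 0, bytes));
    CUDA_CHECK_RETURN(cudaFree(d_cost));
}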

/*************************************
GPU side defines (ASM instructions)
**************************************/

// add, writing the carry-out to the internal carry flag
#define UADD__CARRY_OUT(c, a, b) \
    asm volatile("add.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a), "r"(b))

// add with carry-in, writing the carry-out to the internal carry flag
#define UADD__IN_CARRY_OUT(c, a, b) \
    asm volatile("addc.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a), "r"(b))

// add with carry-in, leaving the carry flag unchanged
#define UADD__IN_CARRY(c, a, b) \
    asm volatile("addc.u32 %0, %1, %2;" : "=r"(c) : "r"(a), "r"(b))

// packing: uint2 (v) -> uint64_t (s)
#define V2S_B64(v,s) \
    asm("mov.b64 %0, {%1,%2};" : "=l"(s) : "r"(v.x), "r"(v.y))

// unpacking: uint64_t (s) -> uint2 (v)
#define S2V_B64(s,v) \
    asm("mov.b64 {%0,%1}, %2;" : "=r"(v.x), "=r"(v.y) : "l"(s))

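/* Illustrative sketch (not part of the original header): a 64-bit addition
   assembled from the 32-bit carry-chain macros, with the result packed into a
   uint64_t via V2S_B64. Function and variable names are hypothetical; the .x
   component holds the low word, matching the packing order of V2S_B64. */
__device__ __forceinline__ uint64_t example_add64(const uint2 a, const uint2 b) {
    uint2 r;
    uint64_t packed;
    UADD__CARRY_OUT(r.x, a.x, b.x); // low words: add and set the carry flag
    UADD__IN_CARRY(r.y, a.y, b.y);  // high words: add consuming the carry
    V2S_B64(r, packed);             // pack {low, high} into one uint64_t
    return packed;
}
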

/*************************************
DEVICE side basic block primitives
**************************************/

// read-only data cache load (__ldg) on architectures that support it
#if FERMI
    #define LDG(ptr) (*(ptr))
#else
    #define LDG(ptr) __ldg(ptr)
#endif

#if FERMI
// Fermi has no warp shuffle instructions, so emulate them through shared memory:
// every lane publishes its value, then reads the requested source lane's slot.
__shared__ int interBuff[GPU_THREADS_PER_BLOCK];
__inline__ __device__ int __emulated_shfl(const int scalarValue, const uint32_t source_lane) {
    const int warpIdx = threadIdx.x / WARP_SIZE;
    const int laneIdx = threadIdx.x % WARP_SIZE;
    volatile int *interShuffle = interBuff + (warpIdx * WARP_SIZE);
    interShuffle[laneIdx] = scalarValue;
    return interShuffle[source_lane % WARP_SIZE];
}
#endif

// Warp shuffle wrappers: hardware __shfl* on Kepler and later, the shared-memory
// emulation above on Fermi.
__inline__ __device__ int shfl_32(int scalarValue, const int lane) {
#if FERMI
    return __emulated_shfl(scalarValue, (uint32_t)lane);
#else
    return __shfl(scalarValue, lane);
#endif
}

__inline__ __device__ int shfl_up_32(int scalarValue, const int n) {
#if FERMI
    int lane = threadIdx.x % WARP_SIZE;
    lane -= n;
    return shfl_32(scalarValue, lane);
#else
    return __shfl_up(scalarValue, n);
#endif
}

__inline__ __device__ int shfl_down_32(int scalarValue, const int n) {
#if FERMI
    int lane = threadIdx.x % WARP_SIZE;
    lane += n;
    return shfl_32(scalarValue, lane);
#else
    return __shfl_down(scalarValue, n);
#endif
}

__inline__ __device__ int shfl_xor_32(int scalarValue, const int n) {
#if FERMI
    int lane = threadIdx.x % WARP_SIZE;
    lane = lane ^ n;
    return shfl_32(scalarValue, lane);
#else
    return __shfl_xor(scalarValue, n);
#endif
}
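
/* Illustrative sketch (not part of the original header): a warp-wide inclusive
   prefix sum built on shfl_up_32, to show how the wrappers are used. Each lane
   ends up with the sum of its own value and all lower lanes' values. */
__inline__ __device__ int example_warp_inclusive_sum(int value) {
    const int lane = threadIdx.x % WARP_SIZE;
    for(int offset = 1; offset < WARP_SIZE; offset *= 2) {
        const int neighbour = shfl_up_32(value, offset);
        if(lane >= offset) {
            value += neighbour;
        }
    }
    return value;
}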

// Global load, cached at all levels (ld.global.ca)
__device__ __forceinline__ uint32_t ld_gbl_ca(const __restrict__ uint32_t *addr) {
    uint32_t return_value;
    asm("ld.global.ca.u32 %0, [%1];" : "=r"(return_value) : "l"(addr));
    return return_value;
}

// Global load with streaming (evict-first) hint (ld.global.cs)
__device__ __forceinline__ uint32_t ld_gbl_cs(const __restrict__ uint32_t *addr) {
    uint32_t return_value;
    asm("ld.global.cs.u32 %0, [%1];" : "=r"(return_value) : "l"(addr));
    return return_value;
}

// Global store, write-through (st.global.wt)
__device__ __forceinline__ void st_gbl_wt(const __restrict__ uint32_t *addr, const uint32_t value) {
    asm("st.global.wt.u32 [%0], %1;" :: "l"(addr), "r"(value));
}

// Global store with streaming (evict-first) hint (st.global.cs)
__device__ __forceinline__ void st_gbl_cs(const __restrict__ uint32_t *addr, const uint32_t value) {
    asm("st.global.cs.u32 [%0], %1;" :: "l"(addr), "r"(value));
}
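
/* Illustrative sketch (not part of the original header): a streaming copy
   kernel built on the cache-hint wrappers above, so the copied data is marked
   evict-first and does not pollute the cache. Kernel and parameter names are
   hypothetical. */
static __global__ void example_streaming_copy(const uint32_t* __restrict__ src,
                                              uint32_t* __restrict__ dst, const int n) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if(idx < n) {
        st_gbl_cs(dst + idx, ld_gbl_cs(src + idx));
    }
}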

// Returns the index of the streaming multiprocessor executing the calling thread
__device__ __forceinline__ uint32_t gpu_get_sm_idx() {
    uint32_t smid;
    asm volatile("mov.u32 %0, %%smid;" : "=r"(smid));
    return smid;
}

// Unpacks the four bytes of s (least significant first) using __byte_perm
__device__ __forceinline__ void uint32_to_uchars(const uint32_t s, int *u1, int *u2, int *u3, int *u4) {
    //*u1 = s & 0xff;
    *u1 = __byte_perm(s, 0, 0x4440);
    //*u2 = (s>>8) & 0xff;
    *u2 = __byte_perm(s, 0, 0x4441);
    //*u3 = (s>>16) & 0xff;
    *u3 = __byte_perm(s, 0, 0x4442);
    //*u4 = s>>24;
    *u4 = __byte_perm(s, 0, 0x4443);
}

// Packs four byte values (u1 = least significant) into one 32-bit word
__device__ __forceinline__ uint32_t uchars_to_uint32(int u1, int u2, int u3, int u4) {
    //return u1 | (u2<<8) | (u3<<16) | (u4<<24);
    //return __byte_perm(u1, u2, 0x7740) + __byte_perm(u3, u4, 0x4077);
    return u1 | (u2<<8) | __byte_perm(u3, u4, 0x4077);
}

// Replicates the low byte of u1 into all four bytes of the result
__device__ __forceinline__ uint32_t uchar_to_uint32(int u1) {
    return __byte_perm(u1, u1, 0x0);
}
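
/* Illustrative sketch (not part of the original header): round-tripping four
   8-bit values through one 32-bit word with the helpers above. The function
   name is hypothetical. */
__device__ __forceinline__ uint32_t example_pack_roundtrip(const uint32_t packed) {
    int u1, u2, u3, u4;
    uint32_to_uchars(packed, &u1, &u2, &u3, &u4); // unpack the four bytes
    // ... per-byte processing would go here ...
    return uchars_to_uint32(u1, u2, u3, u4);      // repack into one word
}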

__device__ __forceinline__ unsigned int vcmpgeu4(unsigned int a, unsigned int b) {
    unsigned int r, c;
    c = a-b;
    asm ("prmt.b32 %0,%1,0,0xba98;" : "=r"(r) : "r"(c)); // build mask from msbs
    return r; // byte-wise unsigned gt-eq comparison with mask result
}

__device__ __forceinline__ unsigned int vminu4(unsigned int a, unsigned int b) {
    unsigned int r, s;
    s = vcmpgeu4(b, a); // mask = 0xff if a >= b
    r = a & s;          // select a when b >= a
    s = b & ~s;         // select b when b < a
    r = r | s;          // combine byte selections
    return r;
}
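
/* Illustrative sketch (not part of the original header): reducing four words of
   packed 8-bit costs with vminu4, which per the comments above keeps the
   byte-wise unsigned minimum. Names are hypothetical. */
__device__ __forceinline__ uint32_t example_min_packed_costs(const uint32_t c0, const uint32_t c1,
                                                             const uint32_t c2, const uint32_t c3) {
    return vminu4(vminu4(c0, c1), vminu4(c2, c3));
}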

// Debug helper: prints the four bytes packed in s
__device__ __forceinline__ void print_uchars(const char* str, const uint32_t s) {
    int u1, u2, u3, u4;
    uint32_to_uchars(s, &u1, &u2, &u3, &u4);
    printf("%s: %d %d %d %d\n", str, u1, u2, u3, u4);
}

// Population count; CSCT / CSCT_RECOMPUTE are configuration macros expected to
// be defined elsewhere and select the 32-bit or 64-bit intrinsic.
template<class T>
__device__ __forceinline__ int popcount(T n) {
#if CSCT or CSCT_RECOMPUTE
    return __popc(n);
#else
    return __popcll(n);
#endif
}

// Returns the minimum of four 8-bit values and writes the label (dis..dis4)
// of the winner to *min_idx
__inline__ __device__ uint8_t minu8_index4(int *min_idx, const uint8_t val1, const int dis, const uint8_t val2, const int dis2, const uint8_t val3, const int dis3, const uint8_t val4, const int dis4) {
    int min_idx1 = dis;
    uint8_t min1 = val1;
    if(val1 > val2) {
        min1 = val2;
        min_idx1 = dis2;
    }

    int min_idx2 = dis3;
    uint8_t min2 = val3;
    if(val3 > val4) {
        min2 = val4;
        min_idx2 = dis4;
    }

    uint8_t minval = min1;
    *min_idx = min_idx1;
    if(min1 > min2) {
        minval = min2;
        *min_idx = min_idx2;
    }
    return minval;
}

// Returns the minimum of eight 8-bit values and writes the label of the winner
// to *min_idx, built as a tournament of two minu8_index4 calls
__inline__ __device__ uint8_t minu8_index8(int *min_idx, const uint8_t val1, const int dis, const uint8_t val2, const int dis2, const uint8_t val3, const int dis3, const uint8_t val4, const int dis4, const uint8_t val5, const int dis5, const uint8_t val6, const int dis6, const uint8_t val7, const int dis7, const uint8_t val8, const int dis8) {
    int min_idx1, min_idx2;
    uint8_t minval1, minval2;

    minval1 = minu8_index4(&min_idx1, val1, dis, val2, dis2, val3, dis3, val4, dis4);
    minval2 = minu8_index4(&min_idx2, val5, dis5, val6, dis6, val7, dis7, val8, dis8);

    *min_idx = min_idx1;
    uint8_t minval = minval1;
    if(minval1 > minval2) {
        *min_idx = min_idx2;
        minval = minval2;
    }
    return minval;
}
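
/* Illustrative sketch (not part of the original header): picking the smallest
   of eight candidate costs together with its label. The array name is
   hypothetical and the labels are assumed to be 0..7. */
__inline__ __device__ uint8_t example_best_of_eight(const uint8_t cost[8], int *best_label) {
    return minu8_index8(best_label,
                        cost[0], 0, cost[1], 1, cost[2], 2, cost[3], 3,
                        cost[4], 4, cost[5], 5, cost[6], 6, cost[7], 7);
}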

// Warp-wide min reduction (XOR/butterfly pattern) that also tracks the index of
// the minimum; *val ends up holding the minimum and the return value is its index
__inline__ __device__ int warpReduceMinIndex2(int *val, int idx) {
    for(int d = 1; d < WARP_SIZE; d *= 2) {
        int tmp = shfl_xor_32(*val, d);
        int tmp_idx = shfl_xor_32(idx, d);
        if(*val > tmp) {
            *val = tmp;
            idx = tmp_idx;
        }
    }
    return idx;
}

// Same as warpReduceMinIndex2, but takes val by value and only returns the index
__inline__ __device__ int warpReduceMinIndex(int val, int idx) {
    for(int d = 1; d < WARP_SIZE; d *= 2) {
        int tmp = shfl_xor_32(val, d);
        int tmp_idx = shfl_xor_32(idx, d);
        if(val > tmp) {
            val = tmp;
            idx = tmp_idx;
        }
    }
    return idx;
}

// Warp-wide min reduction: every lane ends up with the minimum value of the warp
__inline__ __device__ int warpReduceMin(int val) {
    val = min(val, shfl_xor_32(val, 1));
    val = min(val, shfl_xor_32(val, 2));
    val = min(val, shfl_xor_32(val, 4));
    val = min(val, shfl_xor_32(val, 8));
    val = min(val, shfl_xor_32(val, 16));
    return val;
}
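
/* Illustrative sketch (not part of the original header): each warp finds which
   of its lanes holds the smallest value; every lane of the warp receives the
   winning lane index. The cost argument is hypothetical. */
__inline__ __device__ int example_warp_argmin_lane(const int my_cost) {
    const int lane = threadIdx.x % WARP_SIZE;
    return warpReduceMinIndex(my_cost, lane);
}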

__inline__ __device__ int blockReduceMin(int val) {
    static __shared__ int shared[WARP_SIZE]; // Shared mem for WARP_SIZE partial mins
    const int lane = threadIdx.x % WARP_SIZE;
    const int wid = threadIdx.x / WARP_SIZE;

    val = warpReduceMin(val);     // Each warp performs partial reduction

    if (lane==0) shared[wid]=val; // Write reduced value to shared memory

    __syncthreads();              // Wait for all partial reductions

    // read from shared memory only if that warp existed
    val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : INT_MAX;

    if (wid==0) val = warpReduceMin(val); // Final reduce within first warp

    return val;
}

__inline__ __device__ int blockReduceMinIndex(int val, int idx) {
    static __shared__ int shared_val[WARP_SIZE]; // Shared mem for WARP_SIZE partial mins
    static __shared__ int shared_idx[WARP_SIZE]; // Shared mem for WARP_SIZE indexes
    const int lane = threadIdx.x % WARP_SIZE;
    const int wid = threadIdx.x / WARP_SIZE;

    idx = warpReduceMinIndex2(&val, idx); // Each warp performs partial reduction

    if (lane==0) {
        shared_val[wid]=val;
        shared_idx[wid]=idx;
    }

    __syncthreads(); // Wait for all partial reductions

    // read from shared memory only if that warp existed
    val = (threadIdx.x < blockDim.x / WARP_SIZE) ? shared_val[lane] : INT_MAX;
    idx = (threadIdx.x < blockDim.x / WARP_SIZE) ? shared_idx[lane] : INT_MAX;

    if (wid==0) {
        idx = warpReduceMinIndex2(&val, idx); // Final reduce within first warp
    }

    return idx;
}
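
/* Illustrative sketch (not part of the original header): a block-wide argmin
   kernel in the winner-take-all style, where each block writes the index of its
   smallest cost. Kernel and buffer names are hypothetical; blockDim.x is assumed
   to be a multiple of WARP_SIZE, as the shared-memory read above expects. */
static __global__ void example_block_argmin(const int *costs, int *best_index) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int idx = blockReduceMinIndex(costs[tid], threadIdx.x);
    if(threadIdx.x == 0) {
        best_index[blockIdx.x] = idx; // thread 0 holds the final result
    }
}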


__inline__ __device__ bool blockAny(bool local_condition) {
    __shared__ bool conditions[WARP_SIZE];
    const int lane = threadIdx.x % WARP_SIZE;
    const int wid = threadIdx.x / WARP_SIZE;

    local_condition = __any(local_condition); // Each warp performs __any

    if (lane==0) {
        conditions[wid]=local_condition;
    }

    __syncthreads(); // Wait for all partial __any

    // read from shared memory only if that warp existed
    local_condition = (threadIdx.x < blockDim.x / WARP_SIZE) ? conditions[lane] : false;

    if (wid==0) {
        local_condition = __any(local_condition); // Final __any within first warp
    }

    return local_condition;
}
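
/* Illustrative sketch (not part of the original header): using blockAny to let
   thread 0 record whether any thread of the block saw an invalid value. Note
   that, as written above, only the first warp performs the final __any, so the
   result is consumed from thread 0 here. Names and the predicate are
   hypothetical; blockDim.x is assumed to be a multiple of WARP_SIZE. */
static __global__ void example_flag_invalid(const int *values, int *block_has_invalid) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const bool invalid = (values[tid] < 0);          // hypothetical predicate
    const bool any_invalid = blockAny(invalid);
    if(threadIdx.x == 0) {
        block_has_invalid[blockIdx.x] = any_invalid; // valid in the first warp
    }
}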

#endif /* UTIL_H_ */