#define GPU_THREADS_PER_BLOCK_FERMI 256
#define GPU_THREADS_PER_BLOCK_MAXWELL 64

/* FERMI is assumed to be the project-level switch selecting the Fermi code paths below. */
#ifdef FERMI
#define GPU_THREADS_PER_BLOCK GPU_THREADS_PER_BLOCK_FERMI
#else
#define GPU_THREADS_PER_BLOCK GPU_THREADS_PER_BLOCK_MAXWELL
#endif

static void CheckCudaErrorAux (const char *, unsigned, const char *, cudaError_t);
#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)

/**
 * Checks the return value of a CUDA runtime call and, if the call failed,
 * reports the statement, error string, file and line, then aborts.
 */
static void CheckCudaErrorAux (const char *file, unsigned line, const char *statement, cudaError_t err) {
    if (err == cudaSuccess)
        return;
    std::cerr << statement << " returned " << cudaGetErrorString(err) << " (" << err << ") at " << file << ":" << line << std::endl;
    exit(1);
}

#define UADD__CARRY_OUT(c, a, b) \
    asm volatile("add.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a), "r"(b))

#define UADD__IN_CARRY_OUT(c, a, b) \
    asm volatile("addc.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a), "r"(b))

#define UADD__IN_CARRY(c, a, b) \
    asm volatile("addc.u32 %0, %1, %2;" : "=r"(c) : "r"(a), "r"(b))

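/* Usage sketch (not part of the original file): the carry macros are meant to be
 * issued back-to-back so the condition-code flag written by add.cc feeds the
 * following addc (UADD__IN_CARRY_OUT would extend the chain to more limbs).
 * Here they build a 64-bit addition from 32-bit halves; names are hypothetical. */
__inline__ __device__ void example_add_u64(const uint2 a, const uint2 b, uint2 *sum) {
    UADD__CARRY_OUT(sum->x, a.x, b.x);    // low words: add and set the carry flag
    UADD__IN_CARRY(sum->y, a.y, b.y);     // high words: add, consuming the carry
}
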
#define V2S_B64(v,s) \
    asm("mov.b64 %0, {%1,%2};" : "=l"(s) : "r"(v.x), "r"(v.y))

#define S2V_B64(s,v) \
    asm("mov.b64 {%0,%1}, %2;" : "=r"(v.x), "=r"(v.y) : "l"(s))

/* __ldg() needs compute capability 3.5+, so fall back to a plain load on Fermi. */
#ifdef FERMI
#define LDG(ptr) (*(ptr))
#else
#define LDG(ptr) __ldg(ptr)
#endif

#ifdef FERMI
/* Fermi has no shuffle instruction, so emulate it through a per-warp shared-memory buffer. */
__shared__ int interBuff[GPU_THREADS_PER_BLOCK];

__inline__ __device__ int __emulated_shfl(const int scalarValue, const uint32_t source_lane) {
    const int warpIdx = threadIdx.x / WARP_SIZE;
    const int laneIdx = threadIdx.x % WARP_SIZE;
    volatile int *interShuffle = interBuff + (warpIdx * WARP_SIZE);
    interShuffle[laneIdx] = scalarValue;
    return interShuffle[source_lane % WARP_SIZE];
}
#endif

__inline__ __device__ int shfl_32(int scalarValue, const int lane) {
#ifdef FERMI
    return __emulated_shfl(scalarValue, (uint32_t)lane);
#else
    return __shfl(scalarValue, lane);
#endif
}

__inline__ __device__ int shfl_up_32(int scalarValue, const int n) {
#ifdef FERMI
    int lane = threadIdx.x % WARP_SIZE;
    lane -= n;    // source lane is n below the calling lane
    return shfl_32(scalarValue, lane);
#else
    return __shfl_up(scalarValue, n);
#endif
}

__inline__ __device__ int shfl_down_32(int scalarValue, const int n) {
#ifdef FERMI
    int lane = threadIdx.x % WARP_SIZE;
    lane += n;    // source lane is n above the calling lane
    return shfl_32(scalarValue, lane);
#else
    return __shfl_down(scalarValue, n);
#endif
}

__inline__ __device__ int shfl_xor_32(int scalarValue, const int n) {
#ifdef FERMI
    int lane = threadIdx.x % WARP_SIZE;
    lane ^= n;    // butterfly pattern: XOR the lane id with the mask
    return shfl_32(scalarValue, lane);
#else
    return __shfl_xor(scalarValue, n);
#endif
}

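/* Usage sketch (not part of the original file): a warp-level inclusive prefix
 * sum built on shfl_up_32, a typical consumer of these wrappers. The function
 * name is hypothetical. */
__inline__ __device__ int example_warp_inclusive_scan(int value) {
    const int lane = threadIdx.x % WARP_SIZE;
    for(int offset = 1; offset < WARP_SIZE; offset *= 2) {
        const int neighbor = shfl_up_32(value, offset);
        if(lane >= offset)    // lanes below 'offset' have no source lane and keep their value
            value += neighbor;
    }
    return value;
}
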
__device__ __forceinline__ uint32_t ld_gbl_ca(const uint32_t * __restrict__ addr) {
    uint32_t return_value;
    asm("ld.global.ca.u32 %0, [%1];" : "=r"(return_value) : "l"(addr));    // cache at all levels
    return return_value;
}

__device__ __forceinline__ uint32_t ld_gbl_cs(const uint32_t * __restrict__ addr) {
    uint32_t return_value;
    asm("ld.global.cs.u32 %0, [%1];" : "=r"(return_value) : "l"(addr));    // streaming (evict-first) load
    return return_value;
}

__device__ __forceinline__ void st_gbl_wt(const uint32_t * __restrict__ addr, const uint32_t value) {
    asm("st.global.wt.u32 [%0], %1;" :: "l"(addr), "r"(value));    // write-through store
}

__device__ __forceinline__ void st_gbl_cs(const uint32_t * __restrict__ addr, const uint32_t value) {
    asm("st.global.cs.u32 [%0], %1;" :: "l"(addr), "r"(value));    // streaming (evict-first) store
}

__device__ __forceinline__ uint32_t gpu_get_sm_idx() {
    uint32_t smid;
    asm volatile("mov.u32 %0, %%smid;" : "=r"(smid));    // id of the SM executing this thread
    return smid;
}

/* Unpacks the four bytes of s (u1 = least significant byte). */
__device__ __forceinline__ void uint32_to_uchars(const uint32_t s, int *u1, int *u2, int *u3, int *u4) {
    *u1 = __byte_perm(s, 0, 0x4440);    // s & 0xff
    *u2 = __byte_perm(s, 0, 0x4441);    // (s >> 8) & 0xff
    *u3 = __byte_perm(s, 0, 0x4442);    // (s >> 16) & 0xff
    *u4 = __byte_perm(s, 0, 0x4443);    // s >> 24
}

/* Packs four byte-sized ints into one word (u1 = least significant byte). */
__device__ __forceinline__ uint32_t uchars_to_uint32(int u1, int u2, int u3, int u4) {
    return u1 | (u2 << 8) | __byte_perm(u3, u4, 0x4077);    // the prmt covers (u3 << 16) | (u4 << 24)
}

/* Replicates one byte-sized int into all four bytes of a word. */
__device__ __forceinline__ uint32_t uchar_to_uint32(int u1) {
    return __byte_perm(u1, u1, 0x0);    // u1 | (u1 << 8) | (u1 << 16) | (u1 << 24)
}

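/* Usage sketch (not part of the original file): packing four byte-sized costs
 * into one word and unpacking them again with the helpers above. The function
 * name and example values are hypothetical. */
__inline__ __device__ uint32_t example_pack_unpack(void) {
    const uint32_t packed = uchars_to_uint32(10, 20, 30, 40);    // byte 0 = 10 ... byte 3 = 40
    int c1, c2, c3, c4;
    uint32_to_uchars(packed, &c1, &c2, &c3, &c4);    // c1 == 10, c2 == 20, c3 == 30, c4 == 40
    return uchar_to_uint32(c1);    // 0x0a0a0a0a: byte 10 replicated into all four lanes
}
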
/* Byte-wise unsigned a >= b: each result byte is 0xff when true, 0x00 otherwise.
 * The two steps before the prmt are reconstructed from the standard formulation:
 * the rounded average of a and ~b leaves the carry of (a - b) in each byte's MSB,
 * which prmt then replicates into a full byte mask. */
__device__ __forceinline__ unsigned int vcmpgeu4(unsigned int a, unsigned int b) {
    unsigned int r, c;
    asm("not.b32 %0, %0;" : "+r"(b));    // b = ~b
    c = __vavgu4(a, b);                  // per-byte (a + ~b + 1) / 2; MSB holds the a >= b carry
    asm("prmt.b32 %0,%1,0,0xba98;" : "=r"(r) : "r"(c));    // sign-replicate each MSB into a byte mask
    return r;
}

/* Byte-wise unsigned minimum, selected through the comparison mask (reconstructed body). */
__device__ __forceinline__ unsigned int vminu4(unsigned int a, unsigned int b) {
    const unsigned int mask = vcmpgeu4(b, a);    // 0xff where a <= b
    return (a & mask) | (b & ~mask);             // keep a where a <= b, otherwise b
}

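/* Usage sketch (not part of the original file): per-byte minimum of two packed
 * cost words, e.g. 0x40302010 vs 0x10203040 gives 0x10202010. */
__inline__ __device__ uint32_t example_packed_min(const uint32_t a, const uint32_t b) {
    return vminu4(a, b);
}
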
/* Debug helper: prints the four bytes packed in s. */
__device__ __forceinline__ void print_uchars(const char* str, const uint32_t s) {
    int u1, u2, u3, u4;
    uint32_to_uchars(s, &u1, &u2, &u3, &u4);
    printf("%s: %d %d %d %d\n", str, u1, u2, u3, u4);
}

/* Population count. The branch bodies below are assumed: the 64-bit intrinsic
 * for the CSCT paths, the 32-bit one otherwise. */
template <typename T>
__device__ __forceinline__ int popcount(T n) {
#if CSCT or CSCT_RECOMPUTE
    return __popcll(n);
#else
    return __popc(n);
#endif
}

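/* Usage sketch (not part of the original file): in census-based matching the
 * popcount is typically applied to the XOR of two census descriptors, i.e. a
 * Hamming distance. The function name is hypothetical. */
__inline__ __device__ int example_hamming_distance(const uint32_t census_left, const uint32_t census_right) {
    return popcount(census_left ^ census_right);    // number of differing census bits
}
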
/* Minimum of four byte costs; *min_idx receives the index (disparity) of the winner.
 * Reconstructed as a pairwise tournament. */
__inline__ __device__ uint8_t minu8_index4(int *min_idx, const uint8_t val1, const int dis,
        const uint8_t val2, const int dis2, const uint8_t val3, const int dis3,
        const uint8_t val4, const int dis4) {
    uint8_t min1 = val1;
    int min_idx1 = dis;
    if(val1 > val2) { min1 = val2; min_idx1 = dis2; }
    uint8_t min2 = val3;
    int min_idx2 = dis3;
    if(val3 > val4) { min2 = val4; min_idx2 = dis4; }

    uint8_t minval = min1;
    *min_idx = min_idx1;
    if(min1 > min2) { minval = min2; *min_idx = min_idx2; }
    return minval;
}

/* Minimum of eight byte costs; *min_idx receives the index (disparity) of the winner. */
__inline__ __device__ uint8_t minu8_index8(int *min_idx, const uint8_t val1, const int dis,
        const uint8_t val2, const int dis2, const uint8_t val3, const int dis3,
        const uint8_t val4, const int dis4, const uint8_t val5, const int dis5,
        const uint8_t val6, const int dis6, const uint8_t val7, const int dis7,
        const uint8_t val8, const int dis8) {
    int min_idx1, min_idx2;
    uint8_t minval1, minval2;

    minval1 = minu8_index4(&min_idx1, val1, dis, val2, dis2, val3, dis3, val4, dis4);
    minval2 = minu8_index4(&min_idx2, val5, dis5, val6, dis6, val7, dis7, val8, dis8);

    uint8_t minval = minval1;
    *min_idx = min_idx1;
    if(minval1 > minval2) {
        minval = minval2;
        *min_idx = min_idx2;
    }
    return minval;
}

/* Warp-wide min with index tracking: after the butterfly, every lane holds the
 * warp minimum in *val and the returned value is the index it came from. */
__inline__ __device__ int warpReduceMinIndex2(int *val, int idx) {
    for(int d = 1; d < WARP_SIZE; d *= 2) {
        int tmp = shfl_xor_32(*val, d);
        int tmp_idx = shfl_xor_32(idx, d);
        if(tmp < *val) {
            *val = tmp;
            idx = tmp_idx;
        }
    }
    return idx;
}

/* Same reduction as above, but taking the value by copy. */
__inline__ __device__ int warpReduceMinIndex(int val, int idx) {
    for(int d = 1; d < WARP_SIZE; d *= 2) {
        int tmp = shfl_xor_32(val, d);
        int tmp_idx = shfl_xor_32(idx, d);
        if(tmp < val) {
            val = tmp;
            idx = tmp_idx;
        }
    }
    return idx;
}

/* Warp-wide minimum via a fully unrolled butterfly reduction. */
__inline__ __device__ int warpReduceMin(int val) {
    val = min(val, shfl_xor_32(val, 1));
    val = min(val, shfl_xor_32(val, 2));
    val = min(val, shfl_xor_32(val, 4));
    val = min(val, shfl_xor_32(val, 8));
    val = min(val, shfl_xor_32(val, 16));
    return val;
}

/* Block-wide minimum: a warp-level reduction followed by a second reduction over
 * the per-warp results. The result is valid for the first warp. */
__inline__ __device__ int blockReduceMin(int val) {
    static __shared__ int shared[WARP_SIZE];    // one partial minimum per warp
    const int lane = threadIdx.x % WARP_SIZE;
    const int wid = threadIdx.x / WARP_SIZE;

    val = warpReduceMin(val);

    if (lane==0) shared[wid]=val;

    __syncthreads();

    val = (threadIdx.x < blockDim.x / WARP_SIZE) ? shared[lane] : INT_MAX;

    if (wid==0) val = warpReduceMin(val);

    return val;
}

/* Block-wide argmin: returns the index belonging to the minimum value.
 * As above, the result is valid for the first warp. */
__inline__ __device__ int blockReduceMinIndex(int val, int idx) {
    static __shared__ int shared_val[WARP_SIZE];
    static __shared__ int shared_idx[WARP_SIZE];
    const int lane = threadIdx.x % WARP_SIZE;
    const int wid = threadIdx.x / WARP_SIZE;

    idx = warpReduceMinIndex2(&val, idx);

    if (lane==0) {
        shared_val[wid]=val;
        shared_idx[wid]=idx;
    }

    __syncthreads();

    val = (threadIdx.x < blockDim.x / WARP_SIZE) ? shared_val[lane] : INT_MAX;
    idx = (threadIdx.x < blockDim.x / WARP_SIZE) ? shared_idx[lane] : INT_MAX;

    if (wid==0) {
        idx = warpReduceMinIndex2(&val, idx);
    }

    return idx;
}

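/* Usage sketch (not part of the original file): a winner-takes-all step where
 * each thread holds the aggregated cost of one disparity candidate and thread 0
 * stores the disparity with the block-wide minimum cost. Names are hypothetical. */
__inline__ __device__ void example_winner_takes_all(const int my_cost, const int my_disparity, uint8_t *d_disparity) {
    const int best_disparity = blockReduceMinIndex(my_cost, my_disparity);
    if(threadIdx.x == 0) {
        d_disparity[blockIdx.x] = (uint8_t)best_disparity;    // result is valid on the first warp, so thread 0 writes it
    }
}
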
/* Block-wide any(): true if any thread in the block passed a true condition.
 * As with the reductions above, the result is valid for the first warp. */
__inline__ __device__ bool blockAny(bool local_condition) {
    __shared__ bool conditions[WARP_SIZE];    // one flag per warp
    const int lane = threadIdx.x % WARP_SIZE;
    const int wid = threadIdx.x / WARP_SIZE;

    local_condition = __any(local_condition);

    if (lane==0) {
        conditions[wid]=local_condition;
    }

    __syncthreads();

    local_condition = (threadIdx.x < blockDim.x / WARP_SIZE) ? conditions[lane] : false;

    if (wid==0) {
        local_condition = __any(local_condition);
    }

    return local_condition;
}

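/* Usage sketch (not part of the original file): blockAny as a block-wide test,
 * with thread 0 publishing the outcome since the reduced value is only
 * guaranteed for the first warp. d_flag and the threshold are hypothetical. */
__inline__ __device__ void example_flag_high_cost(const int my_cost, int *d_flag) {
    const bool any_high = blockAny(my_cost > 1000);
    if(threadIdx.x == 0 && any_high) {
        *d_flag = 1;    // some thread in this block exceeded the threshold
    }
}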