// test-case
// Author: Internet
#include <stddef.h>  // size_t
#include <stdint.h>  // int8_t

// Participant mask passed to the warp shuffle intrinsic: all 32 lanes.
// Must sit on its own line -- anything following it on the same physical line
// would become part of the macro body.
#define FINAL_MASK 0xffffffff

/**
 * Sums `val` over the 32 lanes of the calling warp using an XOR butterfly
 * (lane masks 16, 8, 4, 2, 1). After the loop every lane holds the warp total.
 *
 * Precondition: all 32 lanes of the warp execute this call, because the
 * shuffle is issued with the full-warp mask FINAL_MASK.
 */
template <typename T>
__inline__ __device__ T warpReduceSum(T val) {
    for (int mask = 16; mask > 0; mask >>= 1)
        val += __shfl_xor_sync(FINAL_MASK, val, mask, 32);
    return val;
}

/**
 * Sums `val` over all threads of a 1-D thread block.
 *
 * Each warp reduces its own values, lane 0 of every warp stores the partial
 * sum in shared memory, and after a block barrier the partial sums are
 * reduced again with a warp reduction.
 *
 * Returns the block total in the threads of warp 0 (threadIdx.x < 32); threads
 * of other warps feed zeros into the second reduction and get 0 back.
 *
 * Preconditions:
 *  - must be reached by every thread of the block (contains __syncthreads());
 *  - blockDim.x is a multiple of 32, so no warp is partially populated;
 *  - uses 32 elements of static shared memory per instantiation; a second call
 *    in the same kernel needs a __syncthreads() in between, since the scratch
 *    array is reused.
 */
template <typename T>
__inline__ __device__ T blockReduceSum(T val) {
    static __shared__ T shared[32];        // one slot per warp (at most 32 warps per block)
    const int lane = threadIdx.x & 0x1f;   // lane index inside the warp
    const int wid = threadIdx.x >> 5;      // warp index inside the block

    val = warpReduceSum<T>(val);
    if (lane == 0)
        shared[wid] = val;
    __syncthreads();

    // Only the first ceil(blockDim.x / 32) slots were written; other threads add 0.
    const unsigned int warpCount = (blockDim.x + 31) >> 5;
    val = (threadIdx.x < warpCount) ? shared[lane] : static_cast<T>(0);
    val = warpReduceSum<T>(val);
    return val;
}

/**
 * Counts, for every row of an m x n row-major byte matrix, the occurrences of
 * `sep` and stores (count + 1) in block_sum[row].
 *
 * Launch layout: one block per row -- gridDim.x == m (blocks with
 * blockIdx.x >= m exit without writing), 1-D blocks whose size is a multiple
 * of 32. The threads of a block stride over the row's columns, so n may be
 * larger than blockDim.x.
 *
 * @param str        device pointer to m * n bytes, row-major
 * @param m          number of rows
 * @param n          number of bytes per row
 * @param block_sum  device pointer to at least m ints; receives one result per row
 * @param sep        byte value to count
 */
__global__ void get_block_sum(const int8_t *str, int m, int n, int *block_sum, char sep) {
    const int row = blockIdx.x;
    // blockIdx is uniform across the block, so the whole block leaves together
    // and no thread is left waiting at the barrier inside blockReduceSum.
    if (row >= m)
        return;

    // Compare as int8_t so the match does not depend on whether `char` is signed.
    const int8_t sepByte = static_cast<int8_t>(sep);
    // Row offset in size_t: row * n may not fit in int for large inputs.
    const int8_t *rowPtr = str + static_cast<size_t>(row) * static_cast<size_t>(n);

    int val = 0;
    for (int col = threadIdx.x; col < n; col += blockDim.x)
        val += (rowPtr[col] == sepByte) ? 1 : 0;

    const int sum = blockReduceSum(val) + 1;
    if (threadIdx.x == 0)
        block_sum[row] = sum;
}
#include <stdint.h>  // int8_t
#include <stdio.h>   // printf, fprintf

// Returns true when `err` is cudaSuccess; otherwise reports the failing step
// on stderr and returns false.
static bool cudaStepOk(cudaError_t err, const char *step) {
    if (err == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error during %s: %s\n", step, cudaGetErrorString(err));
    return false;
}

/**
 * Demo driver: copies a comma-separated string to the device, runs
 * get_block_sum on it as a single row, and prints the resulting count
 * (number of ',' characters plus one).
 *
 * Returns 0 on success and 1 if any CUDA call or the kernel launch fails.
 */
int main() {
    char str[] = "1,41,42,43,41,44,55,66,54,35,1,41,42,43,41,44,55,66,54,35,1";

    const int m = 1;                              // number of rows
    const int n = sizeof(str);                    // row length; includes the trailing '\0', which never equals ','
    const int aligned_n = 32 * ((n + 31) / 32);   // round the block size up to whole warps
    const int block = aligned_n;
    const int grid = m;                           // one block per row

    int8_t *d_str = nullptr;
    int *d_wordcounts = nullptr;
    int host_count = 0;

    bool ok = cudaStepOk(cudaMalloc(&d_str, sizeof(str)), "cudaMalloc(d_str)") &&
              cudaStepOk(cudaMalloc(&d_wordcounts, m * sizeof(int)), "cudaMalloc(d_wordcounts)") &&
              cudaStepOk(cudaMemcpy(d_str, str, sizeof(str), cudaMemcpyHostToDevice),
                         "copy of the input string to the device");

    if (ok) {
        get_block_sum<<<grid, block>>>(d_str, m, n, d_wordcounts, ',');
        // A launch does not return a status itself; configuration errors are
        // picked up here, execution errors by the blocking copy that follows.
        ok = cudaStepOk(cudaGetLastError(), "get_block_sum launch") &&
             cudaStepOk(cudaMemcpy(&host_count, d_wordcounts, m * sizeof(int), cudaMemcpyDeviceToHost),
                        "copy of the result to the host");
    }

    if (ok)
        printf("host count is %d\n", host_count);

    // cudaFree accepts a null pointer, so this is safe on every path.
    cudaFree(d_wordcounts);
    cudaFree(d_str);
    return ok ? 0 : 1;
}
// Tags: case, __, val, int, sum, threadIdx, str, test
// Source: https://www.cnblogs.com/buddho/p/14341692.html