c - A simple reduction program in CUDA


In the code below, I am trying to implement a simple parallel reduction with a block size (the number of threads per block) of 1024. However, after implementing the partial reduction, I wish to see whether my implementation is going right or not, and in that process I make the program print the first element of host memory (after the data has been copied from device memory back to host memory). The host memory is initialized with '1' and is copied to device memory for the reduction. But the printf statement after the reduction process still gives me '1' at the first element of the array.

Is there a problem in how I am printing the result, or is it something logical in the implementation of the reduction? In addition, the printf statements in the kernel do not print anything. Is there something wrong in my syntax or in the call to the printf statement? The code is below:

    ifndef cudacc
    define cudacc
    endif
    include "cuda_runtime.h"
    include "device_launch_parameters.h"
    include <stdio.h>
    include <stdlib.h>
    ifndef threadsperblock
    define threadsperblock 1024
    endif
    ifndef numblocks
    define numblocks 1024
    endif

    global void reducekernel(int *c)
    {
        extern shared int sh_arr[];

        int index = blockDim.x*blockIdx.x + threadIdx.x;
        int sh_index = threadIdx.x;

        // storing data from global memory to shared memory
        sh_arr[sh_index] = c[index];
        __syncthreads();

        for(unsigned int i = blockDim.x/2; i > 0; i >>= 1)
        {
            if(sh_index < i){
                sh_arr[sh_index] += sh_arr[i+sh_index];
            }
            __syncthreads();
        }

        if(sh_index == 0)
            c[blockIdx.x] = sh_arr[sh_index];
        printf("value stored at %d %d \n", blockIdx.x, c[blockIdx.x]);
        return;
    }

    int main()
    {
        int *h_a;
        int *d_a;
        int share_memsize, h_memsize;
        size_t d_memsize;

        share_memsize = threadsperblock*sizeof(int);
        h_memsize = threadsperblock*numblocks;

        h_a = (int*)malloc(sizeof(int)*h_memsize);

        d_memsize = threadsperblock*numblocks;
        cudaMalloc( (void**)&d_a, h_memsize*sizeof(int));

        for(int i=0; i<h_memsize; i++)
        {
            h_a[i] = 1;
        };

        //printf("last element of array %d \n", h_a[h_memsize-1]);

        cudaMemcpy((void**)&d_a, (void**)&h_a, h_memsize, cudaMemcpyHostToDevice);
        reducekernel<<<numblocks, threadsperblock, share_memsize>>>(d_a);
        cudaMemcpy((void**)&h_a, (void**)&d_a, d_memsize, cudaMemcpyDeviceToHost);

        printf("sizeof host memory %d \n", d_memsize); //sizeof(h_a));
        printf("sum after reduction %d \n", h_a[0]);
    }

There are a number of problems with this code.

  1. Much of what you've posted is not valid code. To give just a few examples, the global and shared keywords are supposed to have double underscores before and after, like this: __global__ and __shared__. I assume this is some sort of copy-paste or formatting error. There are problems with the define statements as well. You should endeavor to post code that doesn't have these sorts of problems.

  2. Any time you are having trouble with a CUDA code, you should use proper CUDA error checking and run your code with cuda-memcheck before asking for help. If you had done that, it would have focused your attention on item 3 below. A sketch of one way to add such checking follows.
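    As a minimal sketch of what "proper CUDA error checking" can look like (the checkCuda macro name here is just an illustration, not something from your code), each runtime API call and each kernel launch gets its return status inspected:

        // hedged sketch: report the error string and location of any failing CUDA runtime call
        #define checkCuda(call)                                              \
            do {                                                             \
                cudaError_t err = (call);                                    \
                if (err != cudaSuccess) {                                    \
                    fprintf(stderr, "CUDA error: %s at %s:%d\n",             \
                            cudaGetErrorString(err), __FILE__, __LINE__);    \
                    exit(1);                                                 \
                }                                                            \
            } while (0)

        // usage, e.g.:
        // checkCuda(cudaMalloc((void**)&d_a, h_memsize*sizeof(int)));
        // checkCuda(cudaMemcpy(d_a, h_a, h_memsize*sizeof(int), cudaMemcpyHostToDevice));
        // reducekernel<<<numblocks, threadsperblock, share_memsize>>>(d_a);
        // checkCuda(cudaGetLastError());        // catches kernel launch errors
        // checkCuda(cudaDeviceSynchronize());   // catches asynchronous execution errors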

  3. Your cudaMemcpy operations are broken in a couple of ways:

    cudaMemcpy((void**)&d_a, (void**)&h_a, h_memsize, cudaMemcpyHostToDevice);

    First, unlike cudaMalloc, but like memcpy, cudaMemcpy takes ordinary pointer arguments. Second, the size of the transfer (again like memcpy) is in bytes, so the sizes need to be scaled by sizeof(int):

    cudaMemcpy(d_a, h_a, h_memsize*sizeof(int), cudaMemcpyHostToDevice);

    and similarly for the one after the kernel.
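    That copy then becomes (this matches the full listing further below):

        cudaMemcpy(h_a, d_a, d_memsize*sizeof(int), cudaMemcpyDeviceToHost);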

  4. A printf from every thread in a large kernel (like this one, which has 1048576 threads) is probably not a good idea. You won't get all the output you expect, and on windows (it appears you are running on windows) you may run into a WDDM watchdog timeout because the kernel execution takes too long. If you need to printf from a large kernel, be selective and condition the printf on threadIdx.x and blockIdx.x, for example as sketched below.
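    For instance, a minimal sketch that limits the output to a single thread of a single block (the message text is just an example):

        if ((blockIdx.x == 0) && (threadIdx.x == 0))
            printf("block 0 partial sum: %d \n", c[blockIdx.x]);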

  5. The above items should be enough to get a sensible printout, and I would point out that you're not finished yet anyway: "I wish to see whether the implementation is going right or not". However, this kernel, as crafted, overwrites its input data with its output data:

    __global__ void reducekernel(int *c)
    ...
        c[blockIdx.x] = sh_arr[sh_index];

    This can lead to a race condition. Rather than trying to sort that out for you, I'd suggest separating the output data from the input data, as sketched just below. Better still, you should study the CUDA reduction sample code, which also has an associated presentation.
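    As a rough sketch only (not a finished reduction, and not the approach taken by the CUDA sample), separating input from output could look like this:

        // hypothetical variant: block results go to a separate output array,
        // so the input array is never overwritten by the kernel
        __global__ void reducekernel(const int *in, int *out)
        {
            extern __shared__ int sh_arr[];

            int index = blockDim.x*blockIdx.x + threadIdx.x;
            int sh_index = threadIdx.x;

            sh_arr[sh_index] = in[index];
            __syncthreads();

            for (unsigned int i = blockDim.x/2; i > 0; i >>= 1) {
                if (sh_index < i)
                    sh_arr[sh_index] += sh_arr[i+sh_index];
                __syncthreads();
            }

            if (sh_index == 0)
                out[blockIdx.x] = sh_arr[0];   // one partial sum per block
        }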

Here is a modified version of your code that has most of the above issues fixed. It's still not correct; it still has defect 5 above in it. Rather than rewrite the code to fix defect 5, I'll direct you to the CUDA sample code mentioned above.

    $ cat t820.cu
    #include <stdio.h>

    #ifndef threadsperblock
    #define threadsperblock 1024
    #endif
    #ifndef numblocks
    #define numblocks 1024
    #endif

    __global__ void reducekernel(int *c)
    {
        extern __shared__ int sh_arr[];

        int index = blockDim.x*blockIdx.x + threadIdx.x;
        int sh_index = threadIdx.x;

        // storing data from global memory to shared memory
        sh_arr[sh_index] = c[index];
        __syncthreads();

        for(unsigned int i = blockDim.x/2; i > 0; i >>= 1)
        {
            if(sh_index < i){
                sh_arr[sh_index] += sh_arr[i+sh_index];
            }
            __syncthreads();
        }

        if(sh_index == 0)
            c[blockIdx.x] = sh_arr[sh_index];
        // printf("value stored at %d %d \n", blockIdx.x, c[blockIdx.x]);
        return;
    }

    int main()
    {
        int *h_a;
        int *d_a;
        int share_memsize, h_memsize;
        size_t d_memsize;

        share_memsize = threadsperblock*sizeof(int);
        h_memsize = threadsperblock*numblocks;

        h_a = (int*)malloc(sizeof(int)*h_memsize);

        d_memsize = threadsperblock*numblocks;
        cudaMalloc( (void**)&d_a, h_memsize*sizeof(int));

        for(int i=0; i<h_memsize; i++)
        {
            h_a[i] = 1;
        };

        //printf("last element of array %d \n", h_a[h_memsize-1]);

        cudaMemcpy(d_a, h_a, h_memsize*sizeof(int), cudaMemcpyHostToDevice);
        reducekernel<<<numblocks, threadsperblock, share_memsize>>>(d_a);
        cudaMemcpy(h_a, d_a, d_memsize*sizeof(int), cudaMemcpyDeviceToHost);

        printf("sizeof host memory %d \n", d_memsize); //sizeof(h_a));
        printf("first block sum after reduction %d \n", h_a[0]);
    }
    $ nvcc -o t820 t820.cu
    $ cuda-memcheck ./t820
    ========= CUDA-MEMCHECK
    sizeof host memory 1048576
    first block sum after reduction 1024
    ========= ERROR SUMMARY: 0 errors
    $
