Dorian Bourgeoisat 5 months ago
parent
commit
ed98ec3e47
  1. 25
      TP2_reduction/Reduce.cu

25
TP2_reduction/Reduce.cu

@ -37,13 +37,16 @@ __global__ void reduce_kernel( int n, const int *in_buffer, int *out_buffer, con
{
// Allocate shared memory inside the block.
extern __shared__ int s_mem[];
float my_sum=0;
int my_sum=0;
// The range of data to work with.
int2 range = block_ranges[blockIdx.x];
// Compute the sum of my elements.
// Each thread starts at its own index within the range and advances by blockDim.x.
for(int i = range.x + threadIdx.x; i < range.y; i+=blockDim.x)
{
my_sum += in_buffer[i];
}
// Copy my sum into shared memory.
s_mem[threadIdx.x] = my_sum;
@ -54,6 +57,16 @@ float my_sum=0;
// Compute the sum inside the block.
// TODO: fill-in that section of the code
// Tree reduction of the per-thread partial sums stored in s_mem.
// At each step, every thread whose index is a multiple of 2*stride adds the
// value found stride slots further on. The loop runs while stride < blockDim.x
// (not while 2*stride <= blockDim.x) so that the last partial sums are still
// folded into s_mem[0] when blockDim.x is not a power of two.
// NOTE(review): assumes a __syncthreads() separates the write to s_mem from
// this loop; that part of the kernel is not visible here, so please confirm.
for(unsigned int stride = 1; stride < blockDim.x; stride <<= 1)
{
unsigned int partner = threadIdx.x + stride;
// 2*stride is a power of two, so this mask test is equivalent to
// threadIdx.x % (2*stride) == 0 without the integer modulo.
if((threadIdx.x & (2*stride - 1)) == 0 && partner < blockDim.x)
{
s_mem[threadIdx.x] += s_mem[partner];
}
// The barrier is outside the conditional, so every thread of the block reaches it.
__syncthreads();
}
// The first thread of the block stores its result.
if( threadIdx.x == 0 )
@ -124,7 +137,7 @@ __global__ void reduce_kernel_optimized( int n, const int *in_buffer, int *out_b
{
// The number of warps in the block.
const int NUM_WARPS = BLOCK_DIM / WARP_SIZE;
float my_sum=0;
int my_sum=0;
// Allocate shared memory inside the block.
__shared__ volatile int s_mem[BLOCK_DIM];
@ -138,12 +151,16 @@ float my_sum=0;
// Compute the sum of my elements.
// Each thread starts at its own index within the range and advances by blockDim.x.
for(int i = range.x + threadIdx.x; i < range.y; i+=blockDim.x)
{
my_sum += in_buffer[i];
}
// Copy my sum into shared memory.
s_mem[threadIdx.x] = my_sum;
// Compute the sum inside each warp.
// TODO: fill-in that section of the code
// Each warp leader stores the result for the warp.

Loading…
Cancel
Save