A3

7ae7a078 · Kai Westerkamp · 449a502c · 7ae7a078 · 7ae7a078 · 7ae7a078
Commit 7ae7a078 authored Nov 27, 2016 by Kai Westerkamp
7 changed files
--- a/Assignment3/Assignment3/CAssignment3.cpp
+++ b/Assignment3/Assignment3/CAssignment3.cpp
@@ -40,21 +40,26 @@ bool CAssignment3::DoCompute()
 	}
 	cout<<endl<<"########################################"<<endl;
 	cout<<"Task 2: Separable convolution"<<endl<<endl;
 	{
-		size_t HGroupSize[2] = {32, 16};
+		size_t HGroupSize[2] = { 32, 16 };
-		size_t VGroupSize[2] = {32, 16};
+		size_t VGroupSize[2] = { 32, 16 };
+		//for (int steps = 1; steps < 6; steps++) {
+			int steps = 4;
+			cout << "Stepsize = " << steps << endl;
 			{
 				//simple box filter
 				float ConvKernel[9];
-			for(int i = 0; i < 9; i++)
+				for (int i = 0; i < 9; i++)
 					ConvKernel[i] = 1.0f / 9.0f;
 				CConvolutionSeparableTask convTask("box_4x4", "Images/input.pfm", HGroupSize, VGroupSize,
-				4, 4, 4, ConvKernel, ConvKernel);
+					steps, steps, 4, ConvKernel, ConvKernel);
 				// note: the last argument is ignored, but our framework requires it
 				// for the horizontal and vertical passes different local sizes might be used
 				RunComputeTask(convTask, HGroupSize);
@@ -63,11 +68,11 @@ bool CAssignment3::DoCompute()
 			{
 				//simple box filter
 				float ConvKernel[17];
-			for(int i = 0; i < 17; i++)
+				for (int i = 0; i < 17; i++)
 					ConvKernel[i] = 1.0f / 17.0f;
 				CConvolutionSeparableTask convTask("box_8x8", "Images/input.pfm", HGroupSize, VGroupSize,
-				4, 4, 8, ConvKernel, ConvKernel);
+					steps, steps, 8, ConvKernel, ConvKernel);
 				RunComputeTask(convTask, HGroupSize);
 			}
@@ -77,24 +82,28 @@ bool CAssignment3::DoCompute()
 					0.000817774f, 0.0286433f, 0.235018f, 0.471041f, 0.235018f, 0.0286433f, 0.000817774f
 				};
 				CConvolutionSeparableTask convTask("gauss_3x3", "Images/input.pfm", HGroupSize, VGroupSize,
-				4, 4, 3, ConvKernel, ConvKernel);
+					steps, steps, 3, ConvKernel, ConvKernel);
 				RunComputeTask(convTask, HGroupSize);
 			}
+		//}
 	}
-	cout<<endl<<"########################################"<<endl;
-	cout<<"Task 3: Separable bilateral convolution"<<endl<<endl;
-	{
-		size_t HGroupSize[2] = {32, 4};
-		size_t VGroupSize[2] = {32, 4};
-		float ConvKernel[9] = {0.010284844f,	0.0417071f,	0.113371652f,	0.206576619f,	0.252313252f,	0.206576619f,	0.113371652f,	0.0417071f,	0.010284844f};
-		CConvolutionBilateralTask convTask("Images/color.pfm", "Images/normals.pfm", "Images/depth.pfm", HGroupSize, VGroupSize,
+	//cout<<endl<<"########################################"<<endl;
-			4, 4, 4, ConvKernel, ConvKernel);
+	//cout<<"Task 3: Separable bilateral convolution"<<endl<<endl;
-		RunComputeTask(convTask, HGroupSize);
+	//{
-	}
+	//	size_t HGroupSize[2] = {32, 4};
+	//	size_t VGroupSize[2] = {32, 4};
+	//	float ConvKernel[9] = {0.010284844f,	0.0417071f,	0.113371652f,	0.206576619f,	0.252313252f,	0.206576619f,	0.113371652f,	0.0417071f,	0.010284844f};
+	//	CConvolutionBilateralTask convTask("Images/color.pfm", "Images/normals.pfm", "Images/depth.pfm", HGroupSize, VGroupSize,
+	//		4, 4, 4, ConvKernel, ConvKernel);
+	//	RunComputeTask(convTask, HGroupSize);
+	//}
 	cout<<endl<<"########################################"<<endl;
 	cout<<"Task 4: Histogram"<<endl<<endl;

--- a/Assignment3/Assignment3/CConvolutionSeparableTask.cpp
+++ b/Assignment3/Assignment3/CConvolutionSeparableTask.cpp
@@ -229,7 +229,7 @@ double CConvolutionSeparableTask::ConvolutionChannelGPU(unsigned int Channel, cl
 {
 	cl_int clErr;
-	clErr  = clSetKernelArg(m_HorizontalKernel, 0, sizeof(cl_mem), (void*)&m_dGPUWorkingBuffer);
+	clErr  = clSetKernelArg(m_HorizontalKernel, 0, sizeof(cl_mem), (void*)&m_dGPUWorkingBuffer); //m_dGPUWorkingBuffer
 	clErr |= clSetKernelArg(m_HorizontalKernel, 1, sizeof(cl_mem), (void*)&m_dSourceChannels[Channel]);
 	V_RETURN_0_CL(clErr, "Error setting horizontal kernel arguments");

--- a/Assignment3/Assignment3/Convolution3x3.cl
+++ b/Assignment3/Assignment3/Convolution3x3.cl
@@ -12,6 +12,15 @@ to be the multiple of the given tile-size.
 #define TILE_Y 16
+#define load(X, Y) _load(d_Src, X, Y, Width, Height, Pitch)
+inline float _load(__global const float* d_Src, uint x, uint y, uint Width, uint Height, uint Pitch) {
+	if (x < Width && y < Height) {
+		return d_Src[y * Pitch + x];
+	} else {
+		return 0.0f;
+	}
+}
 // d_Dst is the convolution of d_Src with the kernel c_Kernel
 // c_Kernel is assumed to be a float[11] array of the 3x3 convolution constants, one multiplier (for normalization) and an offset (in this order!)
 // With & Height are the image dimensions (should be multiple of the tile size)
@@ -30,15 +39,71 @@ void Convolution(
 	// the size of the local memory necessary for the convolution is the tile size + the halo area
 	__local float tile[TILE_Y + 2][TILE_X + 2];
-	// TO DO...
+	const uint2 gid = (uint2)(get_global_id(0), get_global_id(1));
+	const uint2 lid = (uint2)(get_local_id(0), get_local_id(1));
-	// Fill the halo with zeros
+	// TO DO...
+	// Fill the halo with zeros -> laod function
 	// Load main filtered area from d_Src
 	// Load halo regions from d_Src (edges and corners separately), check for image bounds!
+	tile[lid.y + 1][lid.x + 1] = load(gid.x, gid.y);
+	if (lid.x == 0) {
+		tile[lid.y + 1][0] = load(gid.x - 1, gid.y);
+		if(lid.y == 0)
+		{
+			tile[0][0] = load(gid.x - 1, gid.y - 1);
+		}
+		if(lid.y ==  TILE_Y - 1)
+		{
+			tile[TILE_Y + 1][0] = load(gid.x - 1, gid.y + 1);
+		}
+	}
+	if (lid.x == TILE_X - 1) {
+		tile[lid.y + 1][lid.x + 2] = load(gid.x + 1, gid.y);
+		if (lid.y == 0) {
+			tile[0][TILE_X + 1] = load(gid.x + 1, gid.y - 1);
+		}
+		if(lid.y ==  TILE_Y - 1)
+		{
+			tile[TILE_Y + 1][TILE_X + 1] = load(gid.x + 1, gid.y + 1);
+		}
+	}
+	if (lid.y == 0) {
+		tile[0][lid.x + 1] = load(gid.x, gid.y - 1);
+	}
+	if (lid.y == TILE_Y - 1) {
+		tile[lid.y + 2][lid.x + 1] = load(gid.x, gid.y + 1);
+	}
 	// Sync threads
+	barrier(CLK_LOCAL_MEM_FENCE);
 	// Perform the convolution and store the convolved signal to d_Dst.
+	float tmp;
+	tmp  = c_Kernel[0] * tile[lid.y+0][lid.x+0];
+	tmp += c_Kernel[1] * tile[lid.y+0][lid.x+1];
+	tmp += c_Kernel[2] * tile[lid.y+0][lid.x+2];
+	tmp += c_Kernel[3] * tile[lid.y+1][lid.x+0];
+	tmp += c_Kernel[4] * tile[lid.y+1][lid.x+1];
+	tmp += c_Kernel[5] * tile[lid.y+1][lid.x+2];
+	tmp += c_Kernel[6] * tile[lid.y+2][lid.x+0];
+	tmp += c_Kernel[7] * tile[lid.y+2][lid.x+1];
+	tmp += c_Kernel[8] * tile[lid.y+2][lid.x+2];
+	tmp = c_Kernel[9] * tmp + c_Kernel[10];
+	d_Dst[gid.y * Pitch + gid.x] = tmp;
 }
\ No newline at end of file
--- a/Assignment3/Assignment3/ConvolutionSeparable.cl
+++ b/Assignment3/Assignment3/ConvolutionSeparable.cl
@@ -52,18 +52,37 @@ void ConvHorizontal(
 	__local float tile[H_GROUPSIZE_Y][(H_RESULT_STEPS + 2) * H_GROUPSIZE_X];
 	// TODO:
-	//const int baseX = ...
+	const int baseX = (get_group_id(0) * H_RESULT_STEPS - 1) * H_GROUPSIZE_X + get_local_id(0);
-	//const int baseY = ...
+	const int baseY = get_group_id(1) * H_GROUPSIZE_Y + get_local_id(1);
 	//const int offset = ...
 	// Load left halo (check for left bound)
 	// Load main data + right halo (check for right bound)
 	// for (int tileID = 1; tileID < ...)
+	tile[get_local_id(1)][get_local_id(0)] = (baseX > 0) ? d_Src[baseY * Pitch + baseX ] : 0.0f;
+	#pragma unroll
+	for (int i = 1; i < H_RESULT_STEPS + 2; i++) {
+		tile[get_local_id(1)][get_local_id(0) + i * H_GROUPSIZE_X] = (baseX + i * H_GROUPSIZE_X < Width) ? d_Src[baseY * Pitch + baseX + i * H_GROUPSIZE_X] : 0.0f;
+	}
 	// Sync the work-items after loading
+	barrier(CLK_LOCAL_MEM_FENCE);
 	// Convolve and store the result
+	#pragma unroll
+	for (int i = 1; i < H_RESULT_STEPS + 1; i++) {
+		if (baseX + i * H_GROUPSIZE_X < Width) {
+			float result = 0.0f;
+			#pragma unroll
+			for (int x = -KERNEL_RADIUS; x <= KERNEL_RADIUS; x++) {
+				result += c_Kernel[KERNEL_RADIUS + x] * tile[get_local_id(1)][get_local_id(0) + i * H_GROUPSIZE_X + x];
+			}
+			d_Dst[baseY * Pitch + baseX + i * H_GROUPSIZE_X] = result;
+		}
+	}
 }
@@ -86,7 +105,31 @@ void ConvVertical(
 	// Conceptually similar to ConvHorizontal
 	// Load top halo + main data + bottom halo
-	// Compute and store results
+	const int baseX = get_group_id(0) * V_GROUPSIZE_X + get_local_id(0);
+	const int baseY = (get_group_id(1) * V_RESULT_STEPS - 1) * V_GROUPSIZE_Y + get_local_id(1);
+	tile[get_local_id(1)][get_local_id(0)] = (baseY >= 0) ? d_Src[baseY * Pitch + baseX] : 0.0f;
+	#pragma unroll
+	for (int i = 1; i < V_RESULT_STEPS + 1; i++) {
+		tile[get_local_id(1) + i * V_GROUPSIZE_Y][get_local_id(0)] = (baseY + i * V_GROUPSIZE_Y < Height) ? d_Src[(baseY + i * V_GROUPSIZE_Y) * Pitch + baseX] : 0.0f;
+	}
+	tile[get_local_id(1) + (V_RESULT_STEPS + 1) * V_GROUPSIZE_Y][get_local_id(0)] = (baseY + (V_RESULT_STEPS + 1) * V_GROUPSIZE_Y < Height) ? d_Src[(baseY + (V_RESULT_STEPS + 1) * V_GROUPSIZE_Y) * Pitch + baseX] : 0.0f;
+	// Compute and store results
+	barrier(CLK_LOCAL_MEM_FENCE);
+	#pragma unroll
+	for (int i = 1; i < V_RESULT_STEPS + 1; i++) {
+		if (baseY + i * V_GROUPSIZE_Y < Height) {
+			float result = 0.0f;
+			#pragma unroll
+			for (int y = -KERNEL_RADIUS; y <= KERNEL_RADIUS; y++) {
+				result += c_Kernel[KERNEL_RADIUS + y] * tile[get_local_id(1) + i * V_GROUPSIZE_Y + y][get_local_id(0)];
+			}
+			d_Dst[(baseY + i * V_GROUPSIZE_Y) * Pitch + baseX] = result;
+		}
+	}
 }
--- a/Assignment3/Assignment3/Images/Times.pdf
+++ b/Assignment3/Assignment3/Images/Times.pdf
--- a/Assignment3/Assignment3/Images/Times.xlsx
+++ b/Assignment3/Assignment3/Images/Times.xlsx
--- a/Assignment3/Assignment3/histogram.cl
+++ b/Assignment3/Assignment3/histogram.cl
@@ -22,6 +22,12 @@ compute_histogram(
 )
 {
 	// Insert your kernel code here
+	const uint2 gid = (uint2)(get_global_id(0), get_global_id(1));
+	if (gid.x < width && gid.y < height) {
+		float pixel = img[gid.y * pitch + gid.x];
+		int bin = min(num_hist_bins - 1, max(0, (int)(pixel * ((float) num_hist_bins))));
+		atomic_inc(&(histogram[bin]));
+	}
 } 
 __kernel void
@@ -35,5 +41,28 @@ compute_histogram_local_memory(
 	__local int *local_hist
 )
 {
-	// Insert your kernel code here
+	const uint index = get_local_id(1) * get_local_size(0) + get_local_id(0);
+	const uint2 gid = (uint2)(get_global_id(0), get_global_id(1));
+	// init 0
+	if (index < num_hist_bins) {
+		local_hist[index] = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if (gid.x < width && gid.y < height) {
+		float pixel = img[gid.y * pitch + gid.x];
+		int bin = min(num_hist_bins - 1, max(0, (int)(pixel * ((float) num_hist_bins))));
+		atomic_inc(&(local_hist[bin]));
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+	//write back
+	if (index < num_hist_bins) {
+		atomic_add(&(histogram[index]), local_hist[index]);
+	}
 }