Decomp Kernel

5c5f4cb3 · Kai Westerkamp · a752caf0 · 5c5f4cb3 · 5c5f4cb3
Commit 5c5f4cb3 authored Nov 07, 2016 by Kai Westerkamp
Show whitespace changes
Inline Side-by-side

Showing with 80 additions and 8 deletions

CReductionTask.cpp Assignment2/Assignment2/CReductionTask.cpp +46 -4

Reduction.cl Assignment2/Assignment2/Reduction.cl +34 -4

No files found.
--- a/Assignment2/Assignment2/CReductionTask.cpp
+++ b/Assignment2/Assignment2/CReductionTask.cpp
@@ -42,8 +42,10 @@ bool CReductionTask::InitResources(cl_device_id Device, cl_context Context)
 	//fill the array with some values
 	for(unsigned int i = 0; i < m_N; i++) 
-		m_hInput[i] = 1;			// Use this for debugging
+		//m_hInput[i] = 1;			// Use this for debugging
-		//m_hInput[i] = rand() & 15;
+		m_hInput[i] = rand() & 15;
+	//cout << "init are 1" << endl;
 	//device resources
 	cl_int clError, clError2;
@@ -96,9 +98,13 @@ void CReductionTask::ReleaseResources()
 void CReductionTask::ComputeGPU(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
 {
 	ExecuteTask(Context, CommandQueue, LocalWorkSize, 0);
+	cout << "Task 1 Done"<< endl;
 	ExecuteTask(Context, CommandQueue, LocalWorkSize, 1);
+	cout << "Task 2 Done" << endl;
 	ExecuteTask(Context, CommandQueue, LocalWorkSize, 2);
+	cout << "Task 3 Done" << endl;
 	ExecuteTask(Context, CommandQueue, LocalWorkSize, 3);
+	cout << "Task 4 Done" << endl;
 	TestPerformance(Context, CommandQueue, LocalWorkSize, 0);
 	TestPerformance(Context, CommandQueue, LocalWorkSize, 1);
@@ -192,10 +198,13 @@ void CReductionTask::Reduction_Decomp(cl_context Context, cl_command_queue Comma
 	cl_int clErr;
 	size_t globalWorkSize[1];
 	size_t localWorkSize[1];
-	localWorkSize[0] = LocalWorkSize[0];
+	//for (int j = 0; j < 32; j++)
+	//	printf("%4d, ", m_hInput[j]);
+	//cout << endl;
 	for (unsigned int i = m_N / 2; i > 1; i /= 2 * localWorkSize[0]) {
-		localWorkSize[0] = min(localWorkSize[0], i);
+		localWorkSize[0] = min(LocalWorkSize[0], i);
 		clErr = clSetKernelArg(m_DecompKernel, 0, sizeof(cl_mem), (void*)&m_dPingArray);
 		clErr |= clSetKernelArg(m_DecompKernel, 1, sizeof(cl_mem), (void*)&m_dPongArray);
@@ -209,6 +218,10 @@ void CReductionTask::Reduction_Decomp(cl_context Context, cl_command_queue Comma
 		swap(m_dPingArray, m_dPongArray);
+		//V_RETURN_CL(clEnqueueReadBuffer(CommandQueue, m_dPingArray, CL_TRUE, 0, m_N * sizeof(cl_uint), m_hDebug, 0, NULL, NULL), "error reading data from device!");
+		//for (int j = 0; j < 32; j++)
+		//	printf("%4d, ", m_hDebug[j]);
+		//cout << endl;
 	}
@@ -223,6 +236,35 @@ void CReductionTask::Reduction_Decomp(cl_context Context, cl_command_queue Comma
 void CReductionTask::Reduction_DecompUnroll(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
 {
+	cl_int clErr;
+	size_t globalWorkSize[1];
+	size_t localWorkSize[1];
+	//for (int j = 0; j < 32; j++)
+	//	printf("%4d, ", m_hInput[j]);
+	//cout << endl;
+	for (unsigned int i = m_N / 2; i > 1; i /= 2 * localWorkSize[0]) {
+		localWorkSize[0] = min(LocalWorkSize[0], i);
+		clErr = clSetKernelArg(m_DecompUnrollKernel, 0, sizeof(cl_mem), (void*)&m_dPingArray);
+		clErr |= clSetKernelArg(m_DecompUnrollKernel, 1, sizeof(cl_mem), (void*)&m_dPongArray);
+		clErr |= clSetKernelArg(m_DecompUnrollKernel, 2, sizeof(cl_uint), (void*)&i);
+		clErr |= clSetKernelArg(m_DecompUnrollKernel, 3, localWorkSize[0] * sizeof(cl_uint), NULL);
+		V_RETURN_CL(clErr, "Error setting Kernel args");
+		globalWorkSize[0] = CLUtil::GetGlobalWorkSize(i, localWorkSize[0]);
+		clErr = clEnqueueNDRangeKernel(CommandQueue, m_DecompUnrollKernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
+		V_RETURN_CL(clErr, "Error running Kernel");
+		swap(m_dPingArray, m_dPongArray);
+		//V_RETURN_CL(clEnqueueReadBuffer(CommandQueue, m_dPingArray, CL_TRUE, 0, m_N * sizeof(cl_uint), m_hDebug, 0, NULL, NULL), "error reading data from device!");
+		//for (int j = 0; j < 32; j++)
+		//	printf("%4d, ", m_hDebug[j]);
+		//cout << endl;
+	}
 	// TO DO: Implement reduction with loop unrolling

--- a/Assignment2/Assignment2/Reduction.cl
+++ b/Assignment2/Assignment2/Reduction.cl
@@ -28,22 +28,24 @@ __kernel void Reduction_Decomp(const __global uint* inArray, __global uint* outA
 {
 	int ls = get_local_size(0);
 	int gs = get_global_size(0);
 	int gid = get_global_id(0);
 	int lid = get_local_id(0);
 	int groupID = get_group_id(0);
 	localBlock[lid] = inArray[gid] + inArray[gid + gs];
 	barrier(CLK_LOCAL_MEM_FENCE);
-	for(uint stride = ls / 2; stride > 0; ls /= 2){
+	for(uint stride = ls / 2; stride > 0; stride /= 2){
 		if(lid < stride){
-			localBlock[lid] = inArray[lid] + inArray[lid + stride];
+			localBlock[lid] = localBlock[lid] + localBlock[lid + stride];
 		}
 		barrier(CLK_LOCAL_MEM_FENCE);
 	}
-	if(lid = 0)
+	if(lid == 0)
 		outArray[groupID] = localBlock[0]; 
 }
@@ -51,5 +53,33 @@ __kernel void Reduction_Decomp(const __global uint* inArray, __global uint* outA
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 __kernel void Reduction_DecompUnroll(const __global uint* inArray, __global uint* outArray, uint N, __local uint* localBlock)
 {
-	// TO DO: Kernel implementation
+	// Each work-group adds up 2 * get_local_size(0) values of inArray and
+	// writes its partial sum to outArray[get_group_id(0)].
+	// localBlock must provide at least get_local_size(0) uints.
+	// Assumes the local size is a power of two (the halving steps would skip
+	// elements otherwise). N is not used by this kernel.
+	int ls = get_local_size(0);
+	int gs = get_global_size(0);
+	int gid = get_global_id(0);
+	int lid = get_local_id(0);
+	int groupID = get_group_id(0);
+	// First addition is done while loading from global memory.
+	localBlock[lid] = inArray[gid] + inArray[gid + gs];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	// Looped part of the tree reduction for strides above 32.
+	for(uint stride = ls / 2; stride > 32; stride /= 2){
+		if(lid < stride){
+			localBlock[lid] = localBlock[lid] + localBlock[lid + stride];
+		}
+		barrier(CLK_LOCAL_MEM_FENCE);
+	}
+	// Unrolled tail for strides 32..2.
+	// - Every step is limited to lid < stride, so with ls >= 2 * stride the
+	//   highest index read is 2 * stride - 1 < ls (no read past localBlock).
+	// - OpenCL does not guarantee that 32 work-items run in lock-step, so the
+	//   steps are separated by barriers. The outer conditions depend only on
+	//   ls, which is the same for all work-items of a group, so every
+	//   work-item reaches the same barriers.
+	if (ls >= 64) {
+		if (lid < 32) localBlock[lid] += localBlock[lid + 32];
+		barrier(CLK_LOCAL_MEM_FENCE);
+	}
+	if (ls >= 32) {
+		if (lid < 16) localBlock[lid] += localBlock[lid + 16];
+		barrier(CLK_LOCAL_MEM_FENCE);
+	}
+	if (ls >= 16) {
+		if (lid < 8) localBlock[lid] += localBlock[lid + 8];
+		barrier(CLK_LOCAL_MEM_FENCE);
+	}
+	if (ls >= 8) {
+		if (lid < 4) localBlock[lid] += localBlock[lid + 4];
+		barrier(CLK_LOCAL_MEM_FENCE);
+	}
+	if (ls >= 4) {
+		if (lid < 2) localBlock[lid] += localBlock[lid + 2];
+		barrier(CLK_LOCAL_MEM_FENCE);
+	}
+	if(lid == 0) {
+		// Last step (stride 1) is folded into the write-out; localBlock[1]
+		// is only touched when it exists, i.e. when the group has >= 2 items.
+		uint sum = localBlock[0];
+		if (ls >= 2) sum += localBlock[1];
+		outArray[groupID] = sum;
+	}
 }