Commit a752caf0 by Kai Westerkamp

interleaved, + sequential

parent e1738542
......@@ -31,8 +31,8 @@ bool CAssignment2::DoCompute()
cout<<"Running parallel prefix sum task..."<<endl<<endl;
{
size_t LocalWorkSize[3] = {256, 1, 1};
CScanTask scan(1024 * 1024 * 64, LocalWorkSize[0]);
RunComputeTask(scan, LocalWorkSize);
//CScanTask scan(1024 * 1024 * 64, LocalWorkSize[0]);
//RunComputeTask(scan, LocalWorkSize);
}
......
......@@ -38,11 +38,12 @@ bool CReductionTask::InitResources(cl_device_id Device, cl_context Context)
{
//CPU resources
m_hInput = new unsigned int[m_N];
m_hDebug = new unsigned int[m_N];
//fill the array with some values
for(unsigned int i = 0; i < m_N; i++)
//m_hInput[i] = 1; // Use this for debugging
m_hInput[i] = rand() & 15;
m_hInput[i] = 1; // Use this for debugging
//m_hInput[i] = rand() & 15;
//device resources
cl_int clError, clError2;
......@@ -141,26 +142,75 @@ bool CReductionTask::ValidateResults()
void CReductionTask::Reduction_InterleavedAddressing(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
cl_int clErr;
size_t globalWorkSize[1];
size_t localWorkSize[1];
clErr = clSetKernelArg(m_InterleavedAddressingKernel, 0, sizeof(cl_mem), (void*)&m_dPingArray);
V_RETURN_CL(clErr, "Error setting Kernel Arg 1");
unsigned int stride = 1;
for (unsigned int i = m_N/2; i > 0; i/=2) {
clErr |= clSetKernelArg(m_InterleavedAddressingKernel, 1, sizeof(cl_uint), (void*)&stride);
V_RETURN_CL(clErr, "Error setting Kernel Arg 2");
// one work-item per remaining addition in this pass; clamp the local size accordingly
localWorkSize[0] = min(LocalWorkSize[0], i);
globalWorkSize[0] = CLUtil::GetGlobalWorkSize(i, localWorkSize[0]);
clErr = clEnqueueNDRangeKernel(CommandQueue, m_InterleavedAddressingKernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
V_RETURN_CL(clErr, "Error running Kernel");
stride *= 2;
}
}
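// For reference, a minimal single-threaded sketch of the same interleaved-addressing
// scheme (illustration only; ReduceInterleavedCPU is a hypothetical helper, not part of
// this file). Each pass adds the element 'stride' slots away into the left element of the
// pair and doubles the stride, so after log2(N) passes the total sits in data[0].
// Assumes N is a power of two.
static unsigned int ReduceInterleavedCPU(unsigned int* data, unsigned int N)
{
    for (unsigned int stride = 1; stride < N; stride *= 2)
        for (unsigned int pos = 0; pos + stride < N; pos += 2 * stride)
            data[pos] += data[pos + stride];
    return data[0];
}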
void CReductionTask::Reduction_SequentialAddressing(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
// Reduction with sequential addressing: each pass folds the upper half of the active range onto the lower half
cl_int clErr;
size_t globalWorkSize[1];
size_t localWorkSize[1];
clErr = clSetKernelArg(m_SequentialAddressingKernel, 0, sizeof(cl_mem), (void*)&m_dPingArray);
V_RETURN_CL(clErr, "Error setting Kernel Arg 1");
for (unsigned int i = m_N / 2; i > 0; i /= 2) {
clErr |= clSetKernelArg(m_SequentialAddressingKernel, 1, sizeof(cl_uint), (void*)&i);
V_RETURN_CL(clErr, "Error setting Kernel Arg 2");
localWorkSize[0] = min(LocalWorkSize[0], i);
globalWorkSize[0] = CLUtil::GetGlobalWorkSize(i, localWorkSize[0]);
clErr = clEnqueueNDRangeKernel(CommandQueue, m_SequentialAddressingKernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
V_RETURN_CL(clErr, "Error running Kernel");
}
}
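// For reference, a minimal single-threaded sketch of the sequential-addressing scheme
// (illustration only; ReduceSequentialCPU is a hypothetical helper, not part of this
// file). Each pass adds element i + stride into element i for i in [0, stride), halving
// the active range until the sum sits in data[0]. Assumes N is a power of two.
static unsigned int ReduceSequentialCPU(unsigned int* data, unsigned int N)
{
    for (unsigned int stride = N / 2; stride > 0; stride /= 2)
        for (unsigned int i = 0; i < stride; i++)
            data[i] += data[i + stride];
    return data[0];
}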
void CReductionTask::Reduction_Decomp(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
cl_int clErr;
size_t globalWorkSize[1];
size_t localWorkSize[1];
localWorkSize[0] = LocalWorkSize[0];
// run one pass per iteration until a single element remains (i = number of additions, i.e. work-items, in the pass)
for (unsigned int i = m_N / 2; i > 0; i /= 2 * localWorkSize[0]) {
localWorkSize[0] = min(localWorkSize[0], i);
clErr = clSetKernelArg(m_DecompKernel, 0, sizeof(cl_mem), (void*)&m_dPingArray);
clErr |= clSetKernelArg(m_DecompKernel, 1, sizeof(cl_mem), (void*)&m_dPongArray);
clErr |= clSetKernelArg(m_DecompKernel, 2, sizeof(cl_uint), (void*)&i);
clErr |= clSetKernelArg(m_DecompKernel, 3, localWorkSize[0] * sizeof(cl_uint), NULL);
V_RETURN_CL(clErr, "Error setting Kernel args");
globalWorkSize[0] = CLUtil::GetGlobalWorkSize(i, localWorkSize[0]);
clErr = clEnqueueNDRangeKernel(CommandQueue, m_DecompKernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
V_RETURN_CL(clErr, "Error running Kernel");
swap(m_dPingArray, m_dPongArray);
}
// after the final swap the total ends up in m_dPingArray
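// For reference, a minimal host-side model of the pass sizes produced by the loop above
// (illustration only; PrintDecompPasses and the example numbers are assumptions, not part
// of this file). With nElements = 1 << 20 and localSize = 256 the passes leave 2048, 4
// and finally 1 partial sum(s): one partial sum per work-group, with a ping/pong swap
// after each launch.
static void PrintDecompPasses(unsigned int nElements, unsigned int localSize)
{
    for (unsigned int i = nElements / 2; i > 0; i /= 2 * localSize)
    {
        localSize = min(localSize, i); // clamp the work-group size for small passes
        cout << "pass: " << i << " work-items, " << (i / localSize) << " partial sum(s) remain" << endl;
    }
}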
......@@ -214,6 +264,8 @@ void CReductionTask::ExecuteTask(cl_context Context, cl_command_queue CommandQue
void CReductionTask::TestPerformance(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3], unsigned int Task)
{
//return; //TODO
cout << "Testing performance of task " << g_kernelNames[Task] << endl;
//write input data to the GPU
......
......@@ -59,6 +59,8 @@ protected:
// input data
unsigned int *m_hInput;
unsigned int *m_hDebug;
// results
unsigned int m_resultCPU;
unsigned int m_resultGPU[4];
......
......@@ -2,21 +2,49 @@
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void Reduction_InterleavedAddressing(__global uint* array, uint stride)
{
// Interleaved addressing: each work-item adds the element 'stride' slots away into its own (left) element
int GID = get_global_id(0);
int pos1 = GID*2*stride;
int pos2 = pos1+stride;
array[pos1] = array[pos1] + array[pos2];
array[pos2] = 0;
}
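// Worked example of the kernel above for 8 elements {1,2,3,4,5,6,7,8} (illustrative values
// only): with stride 1, work-items 0..3 produce {3,0,7,0,11,0,15,0}; with stride 2,
// work-items 0..1 produce {10,0,0,0,26,0,0,0}; with stride 4, work-item 0 produces
// {36,0,...}, i.e. the total in array[0]. The host halves the work-item count and doubles
// the stride between launches.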
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void Reduction_SequentialAddressing(__global uint* array, uint stride)
{
// Sequential addressing: the first 'stride' work-items fold the upper half of the active range onto the lower half
int GID = get_global_id(0);
int pos2 = GID + stride;
array[GID] = array[GID] + array[pos2];
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void Reduction_Decomp(const __global uint* inArray, __global uint* outArray, uint N, __local uint* localBlock)
{
// Kernel decomposition: each work-group reduces its share of the input to one partial sum in local memory (first addition is done during the load)
int ls = get_local_size(0);
int gs = get_global_size(0);
int gid = get_global_id(0);
int lid = get_local_id(0);
int groupID = get_group_id(0);
localBlock[lid] = inArray[gid] + inArray[gid + gs];
barrier(CLK_LOCAL_MEM_FENCE);
for(uint stride = ls / 2; stride > 0; stride /= 2){
if(lid < stride){
localBlock[lid] = localBlock[lid] + localBlock[lid + stride];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lid == 0)
outArray[groupID] = localBlock[0];
}
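// Worked example of the kernel above for one work-group of size 4 with
// inArray = {1,2,3,4,5,6,7,8} and get_global_size(0) = 4 (illustrative values only):
//   load + first add:  localBlock = {1+5, 2+6, 3+7, 4+8} = {6, 8, 10, 12}
//   stride 2:          localBlock = {6+10, 8+12, 10, 12} = {16, 20, 10, 12}
//   stride 1:          localBlock = {16+20, ...}         = {36, ...}
// Work-item 0 then writes outArray[get_group_id(0)] = 36, the sum of all 8 inputs.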
......