scan work efficient

84bbc7a6 · Kai Westerkamp · 0d7ab823 · 84bbc7a6 · 84bbc7a6 · 84bbc7a6
Commit 84bbc7a6 authored Nov 08, 2016 by Kai Westerkamp
Hide whitespace changes
Inline Side-by-side

Showing with 90 additions and 34 deletions

CAssignment2.cpp Assignment2/Assignment2/CAssignment2.cpp +4 -4

CScanTask.cpp Assignment2/Assignment2/CScanTask.cpp +55 -18

Scan.cl Assignment2/Assignment2/Scan.cl +31 -12

No files found.
--- a/Assignment2/Assignment2/CAssignment2.cpp
+++ b/Assignment2/Assignment2/CAssignment2.cpp
@@ -22,8 +22,8 @@ bool CAssignment2::DoCompute()
 	cout<<"Running parallel reduction task..."<<endl<<endl;
 	{
 		size_t LocalWorkSize[3] = {256, 1, 1};
-		//CReductionTask reduction(1024 * 1024 * 16);
+		CReductionTask reduction(1024 * 1024 * 16);
-		//RunComputeTask(reduction, LocalWorkSize);
+		RunComputeTask(reduction, LocalWorkSize);
 	}
 	// Task 2: parallel prefix sum
@@ -32,8 +32,8 @@ bool CAssignment2::DoCompute()
 	{
 		size_t LocalWorkSize[3] = {256, 1, 1};
-		CScanTask scan(512, LocalWorkSize[0]);
+	//	CScanTask scan(1024, LocalWorkSize[0]);
-		RunComputeTask(scan, LocalWorkSize);
+	//	RunComputeTask(scan, LocalWorkSize);
 		CScanTask scan2(1024 * 1024 * 64, LocalWorkSize[0]);
 		RunComputeTask(scan2, LocalWorkSize);

--- a/Assignment2/Assignment2/CScanTask.cpp
+++ b/Assignment2/Assignment2/CScanTask.cpp
@@ -62,9 +62,8 @@ bool CScanTask::InitResources(cl_device_id Device, cl_context Context)
 	//fill the array with some values
 	for(unsigned int i = 0; i < m_N; i++)
-		m_hArray[i] = 1;			// Use this for debugging
+		//m_hArray[i] = 1;			// Use this for debugging
-		//m_hArray[i] = rand() & 15;
+		m_hArray[i] = rand() & 15;
-	cout << "Debug array" << endl;
 	//device resources
 	// ping-pong buffers
@@ -139,7 +138,7 @@ void CScanTask::ComputeGPU(cl_context Context, cl_command_queue CommandQueue, si
 	cout << endl;
 	TestPerformance(Context, CommandQueue, LocalWorkSize, 0);
-	//TestPerformance(Context, CommandQueue, LocalWorkSize, 1);
+	TestPerformance(Context, CommandQueue, LocalWorkSize, 1);
 	cout << endl;
 }
@@ -215,19 +214,61 @@ void CScanTask::Scan_WorkEfficient(cl_context Context, cl_command_queue CommandQ
 	cl_int clErr;
 	size_t globalWorkSize[1];
 	size_t localWorkSize[1];
+	size_t levelSize = m_N;
+	size_t *levelSizes = new size_t[m_nLevels];
+	//local PPS
+	for (size_t level = 0; level < m_nLevels - 1; level++) {
+		levelSizes[level] = levelSize;
+		localWorkSize[0] = min(levelSize / 2, LocalWorkSize[0]);
+		if (localWorkSize[0] == 0)
+			continue;
+		globalWorkSize[0] = CLUtil::GetGlobalWorkSize(levelSize/2, localWorkSize[0]);
+		clErr = clSetKernelArg(m_ScanWorkEfficientKernel, 0, sizeof(cl_mem), (void*)&m_dLevelArrays[level]);
+		clErr |= clSetKernelArg(m_ScanWorkEfficientKernel, 1, sizeof(cl_mem), (void*)&m_dLevelArrays[level+1]);
+		clErr |= clSetKernelArg(m_ScanWorkEfficientKernel, 2, localWorkSize[0] * 3 *sizeof(cl_uint), NULL);
+		V_RETURN_CL(clErr, "Error setting Kernel args3");
+		clErr = clEnqueueNDRangeKernel(CommandQueue, m_ScanWorkEfficientKernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
+		V_RETURN_CL(clErr, "Error running Kernel");
-	localWorkSize[0] = LocalWorkSize[0];
+		//cout << "Reading level " << levelSize << " Elements " << endl;
-	globalWorkSize[0] = CLUtil::GetGlobalWorkSize(m_N/2, localWorkSize[0]);
+		//V_RETURN_CL(clEnqueueReadBuffer(CommandQueue, m_dLevelArrays[level], CL_TRUE, 0, levelSize * sizeof(cl_uint), m_hResultGPU, 0, NULL, NULL), "Error reading data from device!");
+		//for (int j = 0; j < levelSize; j++)
+		//	printf("%2d, ", m_hResultGPU[j]);
+		//cout << endl;
+		levelSize /= (2 * LocalWorkSize[0]);
+		//cout << "Reading level +1 " << levelSize << " Elements " << endl;
+		//V_RETURN_CL(clEnqueueReadBuffer(CommandQueue, m_dLevelArrays[level+1], CL_TRUE, 0, levelSize * sizeof(cl_uint), m_hResultGPU, 0, NULL, NULL), "Error reading data from device!");
+		//for (int j = 0; j < levelSize; j++)
+		//	printf("%2d, ", m_hResultGPU[j]);
+		//cout << endl;
+	}
-	clErr = clSetKernelArg(m_ScanWorkEfficientKernel, 0, sizeof(cl_mem), (void*)&m_dLevelArrays[0]);
-	clErr |= clSetKernelArg(m_ScanWorkEfficientKernel, 1, sizeof(cl_mem), (void*)&m_dLevelArrays[1]);
-	clErr |= clSetKernelArg(m_ScanWorkEfficientKernel, 2, localWorkSize[0] * 2 *sizeof(cl_uint), NULL);
-	V_RETURN_CL(clErr, "Error setting Kernel args3");
-	clErr = clEnqueueNDRangeKernel(CommandQueue, m_ScanWorkEfficientKernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
-	V_RETURN_CL(clErr, "Error running Kernel");
-	swap(m_dPingArray, m_dPongArray);
+	for (size_t level = m_nLevels - 3; level < m_nLevels; level--) { // unsigned int -> -1 > m_nLevels
+		levelSize = levelSizes[level];
+		localWorkSize[0] = min(levelSize / 2, LocalWorkSize[0]);
+		if (localWorkSize[0] == 0)
+			continue;
+		globalWorkSize[0] = CLUtil::GetGlobalWorkSize(levelSize / 2, localWorkSize[0]);
+		clErr  = clSetKernelArg(m_ScanWorkEfficientAddKernel, 0, sizeof(cl_mem), (void*)&m_dLevelArrays[level + 1]);
+		V_RETURN_CL(clErr, "Failed to set kernel args1");
+		clErr |= clSetKernelArg(m_ScanWorkEfficientAddKernel, 1, sizeof(cl_mem), (void*)&m_dLevelArrays[level]);
+		V_RETURN_CL(clErr, "Failed to set kernel args2");
+		clErr |= clSetKernelArg(m_ScanWorkEfficientAddKernel, 2, localWorkSize[0] * 2 * sizeof(cl_uint), NULL);
+		V_RETURN_CL(clErr, "Failed to set kernel args3");
+		clErr = clEnqueueNDRangeKernel(CommandQueue, m_ScanWorkEfficientAddKernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
+		V_RETURN_CL(clErr, "Failed to execute kernel");
+	}
 	// TO DO: Implement efficient version of scan
@@ -254,10 +295,6 @@ void CScanTask::ValidateTask(cl_context Context, cl_command_queue CommandQueue, 
 	// validate results
 	m_bValidationResults[Task] =( memcmp(m_hResultCPU, m_hResultGPU, m_N * sizeof(unsigned int)) == 0);
-	//for (int j = 0; j < m_N; j++)
-	//	printf("%2d, ", m_hResultGPU[j]);
-	//cout << endl;
 }
 void CScanTask::TestPerformance(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3], unsigned int Task)
@@ -273,7 +310,7 @@ void CScanTask::TestPerformance(cl_context Context, cl_command_queue CommandQueu
 	timer.Start();
 	//run the kernel N times
-	unsigned int nIterations = 100;
+	unsigned int nIterations = 10;
 	for(unsigned int i = 0; i < nIterations; i++) {
 		//run selected task
 		switch (Task){

--- a/Assignment2/Assignment2/Scan.cl
+++ b/Assignment2/Assignment2/Scan.cl
@@ -31,6 +31,7 @@ __kernel void Scan_Naive(const __global uint* inArray, __global uint* outArray, 
 #define AVOID_BANK_CONFLICTS
 #ifdef AVOID_BANK_CONFLICTS
 	// TO DO: define your conflict-free macro here
+	#define OFFSET(A) ((A) + (A)/NUM_BANKS)
 #else
 	#define OFFSET(A) (A)
 #endif
@@ -47,8 +48,8 @@ __kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLeve
 	//read to local
 	uint inLeft, inRight;;
-	localBlock[lid2] = inLeft = array[gid2];
+	localBlock[OFFSET(lid2)] = inLeft = array[gid2];
-	localBlock[lid2+1] = inRight = array[gid2+1];
+	localBlock[OFFSET(lid2+1)] = inRight = array[gid2+1];
 	barrier(CLK_LOCAL_MEM_FENCE);
 	//Upsweep
@@ -57,8 +58,8 @@ __kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLeve
 		uint right = (ls2 - 1) - lid * stride * 2;
 		uint left = right - stride;
-		if (right < ls2) {
+		if (right < ls2 && left < ls2) {
-			localBlock[right] += localBlock[left];
+			localBlock[OFFSET(right)] += localBlock[OFFSET(left)];
 		}
 		barrier(CLK_LOCAL_MEM_FENCE);
@@ -66,7 +67,7 @@ __kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLeve
 	//letzten auf 0 setzen 
 	if(lid == 0)
-		localBlock[ls2 - 1] = 0;
+		localBlock[OFFSET(ls2 - 1)] = 0;
 	barrier(CLK_LOCAL_MEM_FENCE);
@@ -76,19 +77,19 @@ __kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLeve
 		uint right = (ls2 - 1) - lid * stride * 2;
 		uint left = right - stride;
-		if (right < ls2) {
+		if (right < ls2 && left < ls2) {
-			uint vright = localBlock[right];
+			uint vright = localBlock[OFFSET(right)];
-			uint vleft = localBlock[left];
+			uint vleft = localBlock[OFFSET(left)];
-			localBlock[left] = vright;
+			localBlock[OFFSET(left)] = vright;
-			localBlock[right] = vleft + vright;
+			localBlock[OFFSET(right)] = vleft + vright;
 		}
 		barrier(CLK_LOCAL_MEM_FENCE);
 	}
 	//write back
-	array[gid2] = localBlock[lid2] + inLeft;
+	array[gid2] = localBlock[OFFSET(lid2)] + inLeft;
-	uint last = localBlock[lid2 + 1] + inRight;
+	uint last = localBlock[OFFSET(lid2 + 1)] + inRight;
 	array[gid2 + 1] = last;
 	if (lid + 1 == ls) {
@@ -100,6 +101,23 @@ __kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLeve
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 __kernel void Scan_WorkEfficientAdd(__global uint* higherLevelArray, __global uint* array, __local uint* localBlock) 
 {
+	int ls2 = get_local_size(0) * 2;
+	int gs2 = get_global_size(0) * 2;
+	int lid = get_local_id(0);
+	int lid2 = lid *2;
+	uint id = (get_group_id(0) + 1) * ls2 + lid2;
+	uint add = higherLevelArray[get_group_id(0)];
+	if (id < gs2) {
+		array[id] += add;
+	}
+	if (id + 1 < gs2) {
+		array[id + 1] += add;
+	}
 	// TO DO: Kernel implementation (large arrays)
 	// Kernel that should add the group PPS to the local PPS (Figure 14)
 }
\ No newline at end of file