local

0d7ab823 · Kai Westerkamp · 5c5f4cb3 · 0d7ab823 · 0d7ab823 · 0d7ab823
Commit 0d7ab823 authored Nov 07, 2016 by Kai Westerkamp
Show whitespace changes
Inline Side-by-side

Showing with 122 additions and 9 deletions

CAssignment2.cpp Assignment2/Assignment2/CAssignment2.cpp +8 -4

CScanTask.cpp Assignment2/Assignment2/CScanTask.cpp +46 -3

Scan.cl Assignment2/Assignment2/Scan.cl +68 -2

No files found.
--- a/Assignment2/Assignment2/CAssignment2.cpp
+++ b/Assignment2/Assignment2/CAssignment2.cpp
@@ -22,8 +22,8 @@ bool CAssignment2::DoCompute()
 	cout<<"Running parallel reduction task..."<<endl<<endl;
 	{
 		size_t LocalWorkSize[3] = {256, 1, 1};
-		CReductionTask reduction(1024 * 1024 * 16);
+		//CReductionTask reduction(1024 * 1024 * 16);
-		RunComputeTask(reduction, LocalWorkSize);
+		//RunComputeTask(reduction, LocalWorkSize);
 	}
 	// Task 2: parallel prefix sum
@@ -31,8 +31,12 @@ bool CAssignment2::DoCompute()
 	cout<<"Running parallel prefix sum task..."<<endl<<endl;
 	{
 		size_t LocalWorkSize[3] = {256, 1, 1};
-		//CScanTask scan(1024 * 1024 * 64, LocalWorkSize[0]);
-		//RunComputeTask(scan, LocalWorkSize);
+		CScanTask scan(512, LocalWorkSize[0]);
+		RunComputeTask(scan, LocalWorkSize);
+		CScanTask scan2(1024 * 1024 * 64, LocalWorkSize[0]);
+		RunComputeTask(scan2, LocalWorkSize);
 	}

--- a/Assignment2/Assignment2/CScanTask.cpp
+++ b/Assignment2/Assignment2/CScanTask.cpp
@@ -62,8 +62,9 @@ bool CScanTask::InitResources(cl_device_id Device, cl_context Context)
 	//fill the array with some values
 	for(unsigned int i = 0; i < m_N; i++)
-		//m_hArray[i] = 1;			// Use this for debugging
+		m_hArray[i] = 1;			// Use this for debugging
-		m_hArray[i] = rand() & 15;
+		//m_hArray[i] = rand() & 15;
+	cout << "Debug array" << endl;
 	//device resources
 	// ping-pong buffers
@@ -138,7 +139,7 @@ void CScanTask::ComputeGPU(cl_context Context, cl_command_queue CommandQueue, si
 	cout << endl;
 	TestPerformance(Context, CommandQueue, LocalWorkSize, 0);
-	TestPerformance(Context, CommandQueue, LocalWorkSize, 1);
+	//TestPerformance(Context, CommandQueue, LocalWorkSize, 1);
 	cout << endl;
 }
@@ -178,6 +179,27 @@ bool CScanTask::ValidateResults()
 void CScanTask::Scan_Naive(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
 {
+	cl_int clErr;
+	size_t globalWorkSize[1];
+	size_t localWorkSize[1];
+	localWorkSize[0] = LocalWorkSize[0];
+	globalWorkSize[0] = CLUtil::GetGlobalWorkSize(m_N, localWorkSize[0]);
+	clErr = clSetKernelArg(m_ScanNaiveKernel, 2, sizeof(cl_uint), (void*)&m_N);
+	V_RETURN_CL(clErr, "Error setting Kernel Arg 3");
+	for (unsigned int offset = 1 ; offset <= m_N; offset *= 2) {
+		clErr = clSetKernelArg(m_ScanNaiveKernel, 0, sizeof(cl_mem), (void*)&m_dPingArray);
+		clErr |= clSetKernelArg(m_ScanNaiveKernel, 1, sizeof(cl_mem), (void*)&m_dPongArray);
+		clErr |= clSetKernelArg(m_ScanNaiveKernel, 3,  sizeof(cl_uint), (void*)&offset);
+		V_RETURN_CL(clErr, "Error setting Kernel args");
+		clErr = clEnqueueNDRangeKernel(CommandQueue, m_ScanNaiveKernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
+		V_RETURN_CL(clErr, "Error running Kernel");
+		swap(m_dPingArray, m_dPongArray);
+	}
 	// TO DO: Implement naive version of scan
@@ -190,6 +212,23 @@ void CScanTask::Scan_Naive(cl_context Context, cl_command_queue CommandQueue, si
 void CScanTask::Scan_WorkEfficient(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
 {
+	cl_int clErr;
+	size_t globalWorkSize[1];
+	size_t localWorkSize[1];
+	localWorkSize[0] = LocalWorkSize[0];
+	globalWorkSize[0] = CLUtil::GetGlobalWorkSize(m_N/2, localWorkSize[0]);
+	clErr = clSetKernelArg(m_ScanWorkEfficientKernel, 0, sizeof(cl_mem), (void*)&m_dLevelArrays[0]);
+	clErr |= clSetKernelArg(m_ScanWorkEfficientKernel, 1, sizeof(cl_mem), (void*)&m_dLevelArrays[1]);
+	clErr |= clSetKernelArg(m_ScanWorkEfficientKernel, 2, localWorkSize[0] * 2 *sizeof(cl_uint), NULL);
+	V_RETURN_CL(clErr, "Error setting Kernel args3");
+	clErr = clEnqueueNDRangeKernel(CommandQueue, m_ScanWorkEfficientKernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
+	V_RETURN_CL(clErr, "Error running Kernel");
+	swap(m_dPingArray, m_dPongArray);
 	// TO DO: Implement efficient version of scan
@@ -215,6 +254,10 @@ void CScanTask::ValidateTask(cl_context Context, cl_command_queue CommandQueue, 
 	// validate results
 	m_bValidationResults[Task] =( memcmp(m_hResultCPU, m_hResultGPU, m_N * sizeof(unsigned int)) == 0);
+	//for (int j = 0; j < m_N; j++)
+	//	printf("%2d, ", m_hResultGPU[j]);
+	//cout << endl;
 }
 void CScanTask::TestPerformance(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3], unsigned int Task)

--- a/Assignment2/Assignment2/Scan.cl
+++ b/Assignment2/Assignment2/Scan.cl
@@ -4,7 +4,18 @@
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 __kernel void Scan_Naive(const __global uint* inArray, __global uint* outArray, uint N, uint offset) 
 {
-	// TO DO: Kernel implementation
+	// One pass of the naive (Hillis-Steele) inclusive scan over N elements:
+	//   outArray[i] = inArray[i]                        for i <  offset
+	//   outArray[i] = inArray[i - offset] + inArray[i]  for offset <= i < N
+	// The host calls this repeatedly with offset = 1, 2, 4, ... and swaps the
+	// two buffers between passes.
+	const uint idx = get_global_id(0);
+	// The global size may be rounded up to a multiple of the work-group size,
+	// so surplus work items must not read or write past the end of the arrays.
+	if (idx >= N) {
+		return;
+	}
+	// Every in-range work item writes exactly its own element (gather form),
+	// which yields the same values as scattering to idx + offset.
+	uint sum = inArray[idx];
+	if (idx >= offset) {
+		sum += inArray[idx - offset];
+	}
+	outArray[idx] = sum;
 }
@@ -27,7 +38,62 @@ __kernel void Scan_Naive(const __global uint* inArray, __global uint* outArray, 
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 __kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLevelArray, __local uint* localBlock) 
 {
-	// TO DO: Kernel implementation
+	// Scans one block of 2 * get_local_size(0) consecutive elements of `array` in place
+	// (each work item owns two elements) with an up-sweep / down-sweep over `localBlock`.
+	// The values written back are the inclusive prefix sums within the block; the block's
+	// total is additionally stored in higherLevelArray[group id].
+	// localBlock is indexed up to 2 * local_size - 1, so it must hold that many uints.
+	// NOTE(review): there is no bounds check against the array length, and the stride
+	// loops look like they assume a power-of-two work-group size - confirm with the host code.
+	int ls = get_local_size(0);
+	int ls2 = ls * 2;
+	int gid2 = get_global_id(0) *2;
+	int lid = get_local_id(0);
+	int lid2 = lid *2;
+	// Load this work item's two elements into local memory; keep the originals for the write-back.
+	uint inLeft, inRight;;
+	localBlock[lid2] = inLeft = array[gid2];
+	localBlock[lid2+1] = inRight = array[gid2+1];
+	barrier(CLK_LOCAL_MEM_FENCE);
+	// Up-sweep: pairwise partial sums with doubling stride, anchored at the last element of the block.
+	for (uint stride = 1; stride <= ls; stride *= 2) {
+		// `stride` is unsigned, so this index arithmetic is unsigned: a work item with nothing
+		// to do at this stride gets a wrapped-around `right` that fails the range test below.
+		uint right = (ls2 - 1) - lid * stride * 2;
+		uint left = right - stride;
+		if (right < ls2) {
+			localBlock[right] += localBlock[left];
+		}
+		// Reached by every work item (outside the if) so each level is complete before the next starts.
+		barrier(CLK_LOCAL_MEM_FENCE);
+	}
+	// Set the last element to 0 before the down-sweep (the block total it held is discarded).
+	if(lid == 0)
+		localBlock[ls2 - 1] = 0;
+	barrier(CLK_LOCAL_MEM_FENCE);
+	// Down-sweep: with halving stride, the left slot takes the right value and the right slot takes left + right.
+	for (uint stride = ls; stride >= 1; stride /= 2) {
+		uint right = (ls2 - 1) - lid * stride * 2;
+		uint left = right - stride;
+		if (right < ls2) {
+			uint vright = localBlock[right];
+			uint vleft = localBlock[left];
+			localBlock[left] = vright;
+			localBlock[right] = vleft + vright;
+		}
+		barrier(CLK_LOCAL_MEM_FENCE);
+	}
+	// Write back: adding the original inputs to the down-sweep result gives the inclusive sums.
+	array[gid2] = localBlock[lid2] + inLeft;
+	uint last = localBlock[lid2 + 1] + inRight;
+	array[gid2 + 1] = last;
+	// The last work item of the group holds the block total; store it in this group's slot.
+	if (lid + 1 == ls) {
+		higherLevelArray[get_group_id(0)] = last;
+	}
 }