Commit 0d7ab823 by Kai Westerkamp

local

parent 5c5f4cb3
......@@ -22,8 +22,8 @@ bool CAssignment2::DoCompute()
cout<<"Running parallel reduction task..."<<endl<<endl;
{
size_t LocalWorkSize[3] = {256, 1, 1};
CReductionTask reduction(1024 * 1024 * 16);
RunComputeTask(reduction, LocalWorkSize);
//CReductionTask reduction(1024 * 1024 * 16);
//RunComputeTask(reduction, LocalWorkSize);
}
// Task 2: parallel prefix sum
......@@ -31,8 +31,12 @@ bool CAssignment2::DoCompute()
cout<<"Running parallel prefix sum task..."<<endl<<endl;
{
size_t LocalWorkSize[3] = {256, 1, 1};
//CScanTask scan(1024 * 1024 * 64, LocalWorkSize[0]);
//RunComputeTask(scan, LocalWorkSize);
CScanTask scan(512, LocalWorkSize[0]);
RunComputeTask(scan, LocalWorkSize);
CScanTask scan2(1024 * 1024 * 64, LocalWorkSize[0]);
RunComputeTask(scan2, LocalWorkSize);
}
......
......@@ -62,8 +62,9 @@ bool CScanTask::InitResources(cl_device_id Device, cl_context Context)
//fill the array with some values
for(unsigned int i = 0; i < m_N; i++)
//m_hArray[i] = 1; // Use this for debugging
m_hArray[i] = rand() & 15;
m_hArray[i] = 1; // Use this for debugging
//m_hArray[i] = rand() & 15;
cout << "Debug array" << endl;
//device resources
// ping-pong buffers
......@@ -138,7 +139,7 @@ void CScanTask::ComputeGPU(cl_context Context, cl_command_queue CommandQueue, si
cout << endl;
TestPerformance(Context, CommandQueue, LocalWorkSize, 0);
TestPerformance(Context, CommandQueue, LocalWorkSize, 1);
//TestPerformance(Context, CommandQueue, LocalWorkSize, 1);
cout << endl;
}
......@@ -178,6 +179,27 @@ bool CScanTask::ValidateResults()
void CScanTask::Scan_Naive(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
cl_int clErr;
size_t globalWorkSize[1];
size_t localWorkSize[1];
localWorkSize[0] = LocalWorkSize[0];
globalWorkSize[0] = CLUtil::GetGlobalWorkSize(m_N, localWorkSize[0]);
clErr = clSetKernelArg(m_ScanNaiveKernel, 2, sizeof(cl_uint), (void*)&m_N);
V_RETURN_CL(clErr, "Error setting Kernel Arg 3");
for (unsigned int offset = 1 ; offset <= m_N; offset *= 2) {
clErr = clSetKernelArg(m_ScanNaiveKernel, 0, sizeof(cl_mem), (void*)&m_dPingArray);
clErr |= clSetKernelArg(m_ScanNaiveKernel, 1, sizeof(cl_mem), (void*)&m_dPongArray);
clErr |= clSetKernelArg(m_ScanNaiveKernel, 3, sizeof(cl_uint), (void*)&offset);
V_RETURN_CL(clErr, "Error setting Kernel args");
clErr = clEnqueueNDRangeKernel(CommandQueue, m_ScanNaiveKernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
V_RETURN_CL(clErr, "Error running Kernel");
swap(m_dPingArray, m_dPongArray);
}
// TO DO: Implement naive version of scan
......@@ -190,6 +212,23 @@ void CScanTask::Scan_Naive(cl_context Context, cl_command_queue CommandQueue, si
void CScanTask::Scan_WorkEfficient(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
cl_int clErr;
size_t globalWorkSize[1];
size_t localWorkSize[1];
localWorkSize[0] = LocalWorkSize[0];
globalWorkSize[0] = CLUtil::GetGlobalWorkSize(m_N/2, localWorkSize[0]);
clErr = clSetKernelArg(m_ScanWorkEfficientKernel, 0, sizeof(cl_mem), (void*)&m_dLevelArrays[0]);
clErr |= clSetKernelArg(m_ScanWorkEfficientKernel, 1, sizeof(cl_mem), (void*)&m_dLevelArrays[1]);
clErr |= clSetKernelArg(m_ScanWorkEfficientKernel, 2, localWorkSize[0] * 2 *sizeof(cl_uint), NULL);
V_RETURN_CL(clErr, "Error setting Kernel args3");
clErr = clEnqueueNDRangeKernel(CommandQueue, m_ScanWorkEfficientKernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
V_RETURN_CL(clErr, "Error running Kernel");
swap(m_dPingArray, m_dPongArray);
// TO DO: Implement efficient version of scan
......@@ -215,6 +254,10 @@ void CScanTask::ValidateTask(cl_context Context, cl_command_queue CommandQueue,
// validate results
m_bValidationResults[Task] =( memcmp(m_hResultCPU, m_hResultGPU, m_N * sizeof(unsigned int)) == 0);
//for (int j = 0; j < m_N; j++)
// printf("%2d, ", m_hResultGPU[j]);
//cout << endl;
}
void CScanTask::TestPerformance(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3], unsigned int Task)
......
......@@ -4,7 +4,18 @@
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// One pass of a naive (step-doubling) scan.
//
// For every index i < N this pass produces
//     outArray[i] = inArray[i]                        if i <  offset
//     outArray[i] = inArray[i - offset] + inArray[i]  if i >= offset
// Work-item `left` copies its own element when it lies in the first `offset`
// entries and contributes its value to the element `offset` positions to the
// right. inArray and outArray must be distinct buffers because other
// work-items read inArray while this one writes outArray.
//
// inArray  - input values of this pass (read only)
// outArray - receives the result of this pass
// N        - number of valid elements in both buffers
// offset   - distance between the two summed elements in this pass
__kernel void Scan_Naive(const __global uint* inArray, __global uint* outArray, uint N, uint offset)
{
	const uint left = get_global_id(0);

	// Work-items with an id at or beyond N have no element of their own; they
	// exist when the global size was rounded up past N. Bail out before
	// touching memory so they never read past the end of inArray.
	if (left >= N) {
		return;
	}

	const uint value = inArray[left];

	// The first `offset` elements have no partner to their left: copy through.
	if (left < offset) {
		outArray[left] = value;
	}

	// Equivalent to `left + offset < N`, written so the sum cannot wrap around
	// (N - left is at least 1 here because left < N).
	if (offset < N - left) {
		const uint right = left + offset;
		outArray[right] = value + inArray[right];
	}
}
......@@ -27,7 +38,62 @@ __kernel void Scan_Naive(const __global uint* inArray, __global uint* outArray,
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Work-efficient scan of one block per work-group.
//
// Every work-item handles two consecutive elements, so a work-group processes
// 2 * get_local_size(0) elements. The block is copied to local memory, reduced
// in place (up-sweep), its last entry is cleared, and the partial sums are
// distributed back (down-sweep), which leaves an exclusive scan in localBlock.
// Adding the original inputs back yields the inclusive scan that is written to
// `array`. The inclusive value of the block's last element is additionally
// stored in higherLevelArray[group id].
//
// array            - scanned in place; indexed up to 2 * global id + 1 without
//                    any bounds check
// higherLevelArray - receives one value per work-group
// localBlock       - local scratch, indexed up to 2 * local size - 1
//
// NOTE(review): the stride pattern presumably expects a power-of-two local
// size - confirm against the host code.
__kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLevelArray, __local uint* localBlock)
{
	const int groupSize  = get_local_size(0);
	const int blockSize  = groupSize * 2;
	const int localId    = get_local_id(0);
	const int localPair  = localId * 2;
	const int globalPair = get_global_id(0) * 2;

	// Index of the last slot of the block; the sweeps are anchored there.
	const uint lastSlot = blockSize - 1;

	// Stage this work-item's two inputs in local memory and keep private
	// copies for the final exclusive -> inclusive conversion.
	const uint firstIn  = array[globalPair];
	const uint secondIn = array[globalPair + 1];
	localBlock[localPair]     = firstIn;
	localBlock[localPair + 1] = secondIn;
	barrier(CLK_LOCAL_MEM_FENCE);

	// Up-sweep: build partial sums towards the end of the block.
	for (uint step = 1; step <= groupSize; step <<= 1) {
		// Distance of this work-item's target slot from the end of the block.
		const uint reach = localId * step * 2;
		if (reach <= lastSlot) {
			const uint hi = lastSlot - reach;
			const uint lo = hi - step;
			localBlock[hi] += localBlock[lo];
		}
		barrier(CLK_LOCAL_MEM_FENCE);
	}

	// Set the last element to 0 before the down-sweep.
	if (localId == 0) {
		localBlock[lastSlot] = 0;
	}
	barrier(CLK_LOCAL_MEM_FENCE);

	// Down-sweep: pass each slot's value to its left partner and accumulate
	// the partner's old value on the right.
	for (uint step = groupSize; step >= 1; step >>= 1) {
		const uint reach = localId * step * 2;
		if (reach <= lastSlot) {
			const uint hi = lastSlot - reach;
			const uint lo = hi - step;
			const uint carried = localBlock[hi];
			const uint partner = localBlock[lo];
			localBlock[lo] = carried;
			localBlock[hi] = partner + carried;
		}
		barrier(CLK_LOCAL_MEM_FENCE);
	}

	// Write back: exclusive result plus the original input.
	const uint firstOut  = localBlock[localPair] + firstIn;
	const uint secondOut = localBlock[localPair + 1] + secondIn;
	array[globalPair]     = firstOut;
	array[globalPair + 1] = secondOut;

	// The last work-item of the group publishes the value of the block's final
	// element for the next level.
	if (localId + 1 == groupSize) {
		higherLevelArray[get_group_id(0)] = secondOut;
	}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment