Commit 84bbc7a6 by Kai Westerkamp

scan work efficient

parent 0d7ab823
...@@ -22,8 +22,8 @@ bool CAssignment2::DoCompute() ...@@ -22,8 +22,8 @@ bool CAssignment2::DoCompute()
cout<<"Running parallel reduction task..."<<endl<<endl; cout<<"Running parallel reduction task..."<<endl<<endl;
{ {
size_t LocalWorkSize[3] = {256, 1, 1}; size_t LocalWorkSize[3] = {256, 1, 1};
//CReductionTask reduction(1024 * 1024 * 16); CReductionTask reduction(1024 * 1024 * 16);
//RunComputeTask(reduction, LocalWorkSize); RunComputeTask(reduction, LocalWorkSize);
} }
// Task 2: parallel prefix sum // Task 2: parallel prefix sum
...@@ -32,8 +32,8 @@ bool CAssignment2::DoCompute() ...@@ -32,8 +32,8 @@ bool CAssignment2::DoCompute()
{ {
size_t LocalWorkSize[3] = {256, 1, 1}; size_t LocalWorkSize[3] = {256, 1, 1};
CScanTask scan(512, LocalWorkSize[0]); // CScanTask scan(1024, LocalWorkSize[0]);
RunComputeTask(scan, LocalWorkSize); // RunComputeTask(scan, LocalWorkSize);
CScanTask scan2(1024 * 1024 * 64, LocalWorkSize[0]); CScanTask scan2(1024 * 1024 * 64, LocalWorkSize[0]);
RunComputeTask(scan2, LocalWorkSize); RunComputeTask(scan2, LocalWorkSize);
......
...@@ -62,9 +62,8 @@ bool CScanTask::InitResources(cl_device_id Device, cl_context Context) ...@@ -62,9 +62,8 @@ bool CScanTask::InitResources(cl_device_id Device, cl_context Context)
//fill the array with some values //fill the array with some values
for(unsigned int i = 0; i < m_N; i++) for(unsigned int i = 0; i < m_N; i++)
m_hArray[i] = 1; // Use this for debugging //m_hArray[i] = 1; // Use this for debugging
//m_hArray[i] = rand() & 15; m_hArray[i] = rand() & 15;
cout << "Debug array" << endl;
//device resources //device resources
// ping-pong buffers // ping-pong buffers
...@@ -139,7 +138,7 @@ void CScanTask::ComputeGPU(cl_context Context, cl_command_queue CommandQueue, si ...@@ -139,7 +138,7 @@ void CScanTask::ComputeGPU(cl_context Context, cl_command_queue CommandQueue, si
cout << endl; cout << endl;
TestPerformance(Context, CommandQueue, LocalWorkSize, 0); TestPerformance(Context, CommandQueue, LocalWorkSize, 0);
//TestPerformance(Context, CommandQueue, LocalWorkSize, 1); TestPerformance(Context, CommandQueue, LocalWorkSize, 1);
cout << endl; cout << endl;
} }
...@@ -215,19 +214,61 @@ void CScanTask::Scan_WorkEfficient(cl_context Context, cl_command_queue CommandQ ...@@ -215,19 +214,61 @@ void CScanTask::Scan_WorkEfficient(cl_context Context, cl_command_queue CommandQ
cl_int clErr; cl_int clErr;
size_t globalWorkSize[1]; size_t globalWorkSize[1];
size_t localWorkSize[1]; size_t localWorkSize[1];
size_t levelSize = m_N;
size_t *levelSizes = new size_t[m_nLevels];
//local PPS
for (size_t level = 0; level < m_nLevels - 1; level++) {
levelSizes[level] = levelSize;
localWorkSize[0] = min(levelSize / 2, LocalWorkSize[0]);
if (localWorkSize[0] == 0)
continue;
globalWorkSize[0] = CLUtil::GetGlobalWorkSize(levelSize/2, localWorkSize[0]);
clErr = clSetKernelArg(m_ScanWorkEfficientKernel, 0, sizeof(cl_mem), (void*)&m_dLevelArrays[level]);
clErr |= clSetKernelArg(m_ScanWorkEfficientKernel, 1, sizeof(cl_mem), (void*)&m_dLevelArrays[level+1]);
clErr |= clSetKernelArg(m_ScanWorkEfficientKernel, 2, localWorkSize[0] * 3 *sizeof(cl_uint), NULL);
V_RETURN_CL(clErr, "Error setting Kernel args3");
clErr = clEnqueueNDRangeKernel(CommandQueue, m_ScanWorkEfficientKernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
V_RETURN_CL(clErr, "Error running Kernel");
localWorkSize[0] = LocalWorkSize[0]; //cout << "Reading level " << levelSize << " Elements " << endl;
globalWorkSize[0] = CLUtil::GetGlobalWorkSize(m_N/2, localWorkSize[0]); //V_RETURN_CL(clEnqueueReadBuffer(CommandQueue, m_dLevelArrays[level], CL_TRUE, 0, levelSize * sizeof(cl_uint), m_hResultGPU, 0, NULL, NULL), "Error reading data from device!");
//for (int j = 0; j < levelSize; j++)
// printf("%2d, ", m_hResultGPU[j]);
//cout << endl;
levelSize /= (2 * LocalWorkSize[0]);
//cout << "Reading level +1 " << levelSize << " Elements " << endl;
//V_RETURN_CL(clEnqueueReadBuffer(CommandQueue, m_dLevelArrays[level+1], CL_TRUE, 0, levelSize * sizeof(cl_uint), m_hResultGPU, 0, NULL, NULL), "Error reading data from device!");
//for (int j = 0; j < levelSize; j++)
// printf("%2d, ", m_hResultGPU[j]);
//cout << endl;
}
clErr = clSetKernelArg(m_ScanWorkEfficientKernel, 0, sizeof(cl_mem), (void*)&m_dLevelArrays[0]);
clErr |= clSetKernelArg(m_ScanWorkEfficientKernel, 1, sizeof(cl_mem), (void*)&m_dLevelArrays[1]);
clErr |= clSetKernelArg(m_ScanWorkEfficientKernel, 2, localWorkSize[0] * 2 *sizeof(cl_uint), NULL);
V_RETURN_CL(clErr, "Error setting Kernel args3");
clErr = clEnqueueNDRangeKernel(CommandQueue, m_ScanWorkEfficientKernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
V_RETURN_CL(clErr, "Error running Kernel");
swap(m_dPingArray, m_dPongArray); for (size_t level = m_nLevels - 3; level < m_nLevels; level--) { // unsigned int -> -1 > m_nLevels
levelSize = levelSizes[level];
localWorkSize[0] = min(levelSize / 2, LocalWorkSize[0]);
if (localWorkSize[0] == 0)
continue;
globalWorkSize[0] = CLUtil::GetGlobalWorkSize(levelSize / 2, localWorkSize[0]);
clErr = clSetKernelArg(m_ScanWorkEfficientAddKernel, 0, sizeof(cl_mem), (void*)&m_dLevelArrays[level + 1]);
V_RETURN_CL(clErr, "Failed to set kernel args1");
clErr |= clSetKernelArg(m_ScanWorkEfficientAddKernel, 1, sizeof(cl_mem), (void*)&m_dLevelArrays[level]);
V_RETURN_CL(clErr, "Failed to set kernel args2");
clErr |= clSetKernelArg(m_ScanWorkEfficientAddKernel, 2, localWorkSize[0] * 2 * sizeof(cl_uint), NULL);
V_RETURN_CL(clErr, "Failed to set kernel args3");
clErr = clEnqueueNDRangeKernel(CommandQueue, m_ScanWorkEfficientAddKernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
V_RETURN_CL(clErr, "Failed to execute kernel");
}
// TO DO: Implement efficient version of scan // TO DO: Implement efficient version of scan
...@@ -254,10 +295,6 @@ void CScanTask::ValidateTask(cl_context Context, cl_command_queue CommandQueue, ...@@ -254,10 +295,6 @@ void CScanTask::ValidateTask(cl_context Context, cl_command_queue CommandQueue,
// validate results // validate results
m_bValidationResults[Task] =( memcmp(m_hResultCPU, m_hResultGPU, m_N * sizeof(unsigned int)) == 0); m_bValidationResults[Task] =( memcmp(m_hResultCPU, m_hResultGPU, m_N * sizeof(unsigned int)) == 0);
//for (int j = 0; j < m_N; j++)
// printf("%2d, ", m_hResultGPU[j]);
//cout << endl;
} }
void CScanTask::TestPerformance(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3], unsigned int Task) void CScanTask::TestPerformance(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3], unsigned int Task)
...@@ -273,7 +310,7 @@ void CScanTask::TestPerformance(cl_context Context, cl_command_queue CommandQueu ...@@ -273,7 +310,7 @@ void CScanTask::TestPerformance(cl_context Context, cl_command_queue CommandQueu
timer.Start(); timer.Start();
//run the kernel N times //run the kernel N times
unsigned int nIterations = 100; unsigned int nIterations = 10;
for(unsigned int i = 0; i < nIterations; i++) { for(unsigned int i = 0; i < nIterations; i++) {
//run selected task //run selected task
switch (Task){ switch (Task){
......
...@@ -31,6 +31,7 @@ __kernel void Scan_Naive(const __global uint* inArray, __global uint* outArray, ...@@ -31,6 +31,7 @@ __kernel void Scan_Naive(const __global uint* inArray, __global uint* outArray,
#define AVOID_BANK_CONFLICTS #define AVOID_BANK_CONFLICTS
#ifdef AVOID_BANK_CONFLICTS #ifdef AVOID_BANK_CONFLICTS
// TO DO: define your conflict-free macro here // TO DO: define your conflict-free macro here
#define OFFSET(A) ((A) + (A)/NUM_BANKS)
#else #else
#define OFFSET(A) (A) #define OFFSET(A) (A)
#endif #endif
...@@ -47,8 +48,8 @@ __kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLeve ...@@ -47,8 +48,8 @@ __kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLeve
//read to local //read to local
uint inLeft, inRight;; uint inLeft, inRight;;
localBlock[lid2] = inLeft = array[gid2]; localBlock[OFFSET(lid2)] = inLeft = array[gid2];
localBlock[lid2+1] = inRight = array[gid2+1]; localBlock[OFFSET(lid2+1)] = inRight = array[gid2+1];
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
//Upsweep //Upsweep
...@@ -57,8 +58,8 @@ __kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLeve ...@@ -57,8 +58,8 @@ __kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLeve
uint right = (ls2 - 1) - lid * stride * 2; uint right = (ls2 - 1) - lid * stride * 2;
uint left = right - stride; uint left = right - stride;
if (right < ls2) { if (right < ls2 && left < ls2) {
localBlock[right] += localBlock[left]; localBlock[OFFSET(right)] += localBlock[OFFSET(left)];
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
...@@ -66,7 +67,7 @@ __kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLeve ...@@ -66,7 +67,7 @@ __kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLeve
//letzten auf 0 setzen //letzten auf 0 setzen
if(lid == 0) if(lid == 0)
localBlock[ls2 - 1] = 0; localBlock[OFFSET(ls2 - 1)] = 0;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
...@@ -76,19 +77,19 @@ __kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLeve ...@@ -76,19 +77,19 @@ __kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLeve
uint right = (ls2 - 1) - lid * stride * 2; uint right = (ls2 - 1) - lid * stride * 2;
uint left = right - stride; uint left = right - stride;
if (right < ls2) { if (right < ls2 && left < ls2) {
uint vright = localBlock[right]; uint vright = localBlock[OFFSET(right)];
uint vleft = localBlock[left]; uint vleft = localBlock[OFFSET(left)];
localBlock[left] = vright; localBlock[OFFSET(left)] = vright;
localBlock[right] = vleft + vright; localBlock[OFFSET(right)] = vleft + vright;
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
//write back //write back
array[gid2] = localBlock[lid2] + inLeft; array[gid2] = localBlock[OFFSET(lid2)] + inLeft;
uint last = localBlock[lid2 + 1] + inRight; uint last = localBlock[OFFSET(lid2 + 1)] + inRight;
array[gid2 + 1] = last; array[gid2 + 1] = last;
if (lid + 1 == ls) { if (lid + 1 == ls) {
...@@ -100,6 +101,23 @@ __kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLeve ...@@ -100,6 +101,23 @@ __kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLeve
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void Scan_WorkEfficientAdd(__global uint* higherLevelArray, __global uint* array, __local uint* localBlock) __kernel void Scan_WorkEfficientAdd(__global uint* higherLevelArray, __global uint* array, __local uint* localBlock)
{ {
int ls2 = get_local_size(0) * 2;
int gs2 = get_global_size(0) * 2;
int lid = get_local_id(0);
int lid2 = lid *2;
uint id = (get_group_id(0) + 1) * ls2 + lid2;
uint add = higherLevelArray[get_group_id(0)];
if (id < gs2) {
array[id] += add;
}
if (id + 1 < gs2) {
array[id + 1] += add;
}
// TO DO: Kernel implementation (large arrays) // TO DO: Kernel implementation (large arrays)
// Kernel that should add the group PPS to the local PPS (Figure 14) // Kernel that should add the group PPS to the local PPS (Figure 14)
} }
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment