Commit 5c5f4cb3 by Kai Westerkamp

Decomp Kernel

parent a752caf0
......@@ -42,8 +42,10 @@ bool CReductionTask::InitResources(cl_device_id Device, cl_context Context)
//fill the array with some values
for(unsigned int i = 0; i < m_N; i++)
m_hInput[i] = 1; // Use this for debugging
//m_hInput[i] = rand() & 15;
//m_hInput[i] = 1; // Use this for debugging
m_hInput[i] = rand() & 15;
//cout << "init are 1" << endl;
//device resources
cl_int clError, clError2;
......@@ -96,9 +98,13 @@ void CReductionTask::ReleaseResources()
void CReductionTask::ComputeGPU(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
ExecuteTask(Context, CommandQueue, LocalWorkSize, 0);
cout << "Task 1 Done"<< endl;
ExecuteTask(Context, CommandQueue, LocalWorkSize, 1);
cout << "Task 2 Done" << endl;
ExecuteTask(Context, CommandQueue, LocalWorkSize, 2);
cout << "Task 3 Done" << endl;
ExecuteTask(Context, CommandQueue, LocalWorkSize, 3);
cout << "Task 4 Done" << endl;
TestPerformance(Context, CommandQueue, LocalWorkSize, 0);
TestPerformance(Context, CommandQueue, LocalWorkSize, 1);
......@@ -192,10 +198,13 @@ void CReductionTask::Reduction_Decomp(cl_context Context, cl_command_queue Comma
cl_int clErr;
size_t globalWorkSize[1];
size_t localWorkSize[1];
localWorkSize[0] = LocalWorkSize[0];
//for (int j = 0; j < 32; j++)
// printf("%4d, ", m_hInput[j]);
//cout << endl;
for (unsigned int i = m_N / 2; i > 1; i /= 2 * localWorkSize[0]) {
localWorkSize[0] = min(localWorkSize[0], i);
localWorkSize[0] = min(LocalWorkSize[0], i);
clErr = clSetKernelArg(m_DecompKernel, 0, sizeof(cl_mem), (void*)&m_dPingArray);
clErr |= clSetKernelArg(m_DecompKernel, 1, sizeof(cl_mem), (void*)&m_dPongArray);
......@@ -209,6 +218,10 @@ void CReductionTask::Reduction_Decomp(cl_context Context, cl_command_queue Comma
swap(m_dPingArray, m_dPongArray);
//V_RETURN_CL(clEnqueueReadBuffer(CommandQueue, m_dPingArray, CL_TRUE, 0, m_N * sizeof(cl_uint), m_hDebug, 0, NULL, NULL), "error reading data from device!");
//for (int j = 0; j < 32; j++)
// printf("%4d, ", m_hDebug[j]);
//cout << endl;
}
......@@ -223,6 +236,35 @@ void CReductionTask::Reduction_Decomp(cl_context Context, cl_command_queue Comma
void CReductionTask::Reduction_DecompUnroll(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
cl_int clErr;
size_t globalWorkSize[1];
size_t localWorkSize[1];
//for (int j = 0; j < 32; j++)
// printf("%4d, ", m_hInput[j]);
//cout << endl;
for (unsigned int i = m_N / 2; i > 1; i /= 2 * localWorkSize[0]) {
localWorkSize[0] = min(LocalWorkSize[0], i);
clErr = clSetKernelArg(m_DecompUnrollKernel, 0, sizeof(cl_mem), (void*)&m_dPingArray);
clErr |= clSetKernelArg(m_DecompUnrollKernel, 1, sizeof(cl_mem), (void*)&m_dPongArray);
clErr |= clSetKernelArg(m_DecompUnrollKernel, 2, sizeof(cl_uint), (void*)&i);
clErr |= clSetKernelArg(m_DecompUnrollKernel, 3, localWorkSize[0] * sizeof(cl_uint), NULL);
V_RETURN_CL(clErr, "Error setting Kernel args");
globalWorkSize[0] = CLUtil::GetGlobalWorkSize(i, localWorkSize[0]);
clErr = clEnqueueNDRangeKernel(CommandQueue, m_DecompUnrollKernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
V_RETURN_CL(clErr, "Error running Kernel");
swap(m_dPingArray, m_dPongArray);
//V_RETURN_CL(clEnqueueReadBuffer(CommandQueue, m_dPingArray, CL_TRUE, 0, m_N * sizeof(cl_uint), m_hDebug, 0, NULL, NULL), "error reading data from device!");
//for (int j = 0; j < 32; j++)
// printf("%4d, ", m_hDebug[j]);
//cout << endl;
}
// TO DO: Implement reduction with loop unrolling
......
......@@ -28,22 +28,24 @@ __kernel void Reduction_Decomp(const __global uint* inArray, __global uint* outA
{
int ls = get_local_size(0);
int gs = get_global_size(0);
int gid = get_global_id(0);
int lid = get_local_id(0);
int groupID = get_group_id(0);
localBlock[lid] = inArray[gid] + inArray[gid + gs];
barrier(CLK_LOCAL_MEM_FENCE);
for(uint stride = ls / 2; stride > 0; ls /= 2){
for(uint stride = ls / 2; stride > 0; stride /= 2){
if(lid < stride){
localBlock[lid] = inArray[lid] + inArray[lid + stride];
localBlock[lid] = localBlock[lid] + localBlock[lid + stride];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lid = 0)
if(lid == 0)
outArray[groupID] = localBlock[0];
}
......@@ -51,5 +53,33 @@ __kernel void Reduction_Decomp(const __global uint* inArray, __global uint* outA
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void Reduction_DecompUnroll(const __global uint* inArray, __global uint* outArray, uint N, __local uint* localBlock)
{
// TO DO: Kernel implementation
int ls = get_local_size(0);
int gs = get_global_size(0);
int gid = get_global_id(0);
int lid = get_local_id(0);
int groupID = get_group_id(0);
localBlock[lid] = inArray[gid] + inArray[gid + gs];
barrier(CLK_LOCAL_MEM_FENCE);
for(uint stride = ls / 2; stride > 32; stride /= 2){
if(lid < stride){
localBlock[lid] = localBlock[lid] + localBlock[lid + stride];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (lid < 32) {
if (ls >= 64) localBlock[lid] += localBlock[lid + 32];
if (ls >= 32) localBlock[lid] += localBlock[lid + 16];
if (ls >= 16) localBlock[lid] += localBlock[lid + 8];
if (ls >= 8) localBlock[lid] += localBlock[lid + 4];
if (ls >= 4) localBlock[lid] += localBlock[lid + 2];
}
if(lid == 0)
outArray[groupID] = localBlock[0]+localBlock[1];
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment