Commit 5c5f4cb3 by Kai Westerkamp

Decomp Kernel

parent a752caf0
...@@ -42,8 +42,10 @@ bool CReductionTask::InitResources(cl_device_id Device, cl_context Context) ...@@ -42,8 +42,10 @@ bool CReductionTask::InitResources(cl_device_id Device, cl_context Context)
//fill the array with some values //fill the array with some values
for(unsigned int i = 0; i < m_N; i++) for(unsigned int i = 0; i < m_N; i++)
m_hInput[i] = 1; // Use this for debugging //m_hInput[i] = 1; // Use this for debugging
//m_hInput[i] = rand() & 15; m_hInput[i] = rand() & 15;
//cout << "init are 1" << endl;
//device resources //device resources
cl_int clError, clError2; cl_int clError, clError2;
...@@ -96,9 +98,13 @@ void CReductionTask::ReleaseResources() ...@@ -96,9 +98,13 @@ void CReductionTask::ReleaseResources()
void CReductionTask::ComputeGPU(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3]) void CReductionTask::ComputeGPU(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{ {
ExecuteTask(Context, CommandQueue, LocalWorkSize, 0); ExecuteTask(Context, CommandQueue, LocalWorkSize, 0);
cout << "Task 1 Done"<< endl;
ExecuteTask(Context, CommandQueue, LocalWorkSize, 1); ExecuteTask(Context, CommandQueue, LocalWorkSize, 1);
cout << "Task 2 Done" << endl;
ExecuteTask(Context, CommandQueue, LocalWorkSize, 2); ExecuteTask(Context, CommandQueue, LocalWorkSize, 2);
cout << "Task 3 Done" << endl;
ExecuteTask(Context, CommandQueue, LocalWorkSize, 3); ExecuteTask(Context, CommandQueue, LocalWorkSize, 3);
cout << "Task 4 Done" << endl;
TestPerformance(Context, CommandQueue, LocalWorkSize, 0); TestPerformance(Context, CommandQueue, LocalWorkSize, 0);
TestPerformance(Context, CommandQueue, LocalWorkSize, 1); TestPerformance(Context, CommandQueue, LocalWorkSize, 1);
...@@ -192,10 +198,13 @@ void CReductionTask::Reduction_Decomp(cl_context Context, cl_command_queue Comma ...@@ -192,10 +198,13 @@ void CReductionTask::Reduction_Decomp(cl_context Context, cl_command_queue Comma
cl_int clErr; cl_int clErr;
size_t globalWorkSize[1]; size_t globalWorkSize[1];
size_t localWorkSize[1]; size_t localWorkSize[1];
localWorkSize[0] = LocalWorkSize[0];
//for (int j = 0; j < 32; j++)
// printf("%4d, ", m_hInput[j]);
//cout << endl;
for (unsigned int i = m_N / 2; i > 1; i /= 2 * localWorkSize[0]) { for (unsigned int i = m_N / 2; i > 1; i /= 2 * localWorkSize[0]) {
localWorkSize[0] = min(localWorkSize[0], i); localWorkSize[0] = min(LocalWorkSize[0], i);
clErr = clSetKernelArg(m_DecompKernel, 0, sizeof(cl_mem), (void*)&m_dPingArray); clErr = clSetKernelArg(m_DecompKernel, 0, sizeof(cl_mem), (void*)&m_dPingArray);
clErr |= clSetKernelArg(m_DecompKernel, 1, sizeof(cl_mem), (void*)&m_dPongArray); clErr |= clSetKernelArg(m_DecompKernel, 1, sizeof(cl_mem), (void*)&m_dPongArray);
...@@ -209,6 +218,10 @@ void CReductionTask::Reduction_Decomp(cl_context Context, cl_command_queue Comma ...@@ -209,6 +218,10 @@ void CReductionTask::Reduction_Decomp(cl_context Context, cl_command_queue Comma
swap(m_dPingArray, m_dPongArray); swap(m_dPingArray, m_dPongArray);
//V_RETURN_CL(clEnqueueReadBuffer(CommandQueue, m_dPingArray, CL_TRUE, 0, m_N * sizeof(cl_uint), m_hDebug, 0, NULL, NULL), "error reading data from device!");
//for (int j = 0; j < 32; j++)
// printf("%4d, ", m_hDebug[j]);
//cout << endl;
} }
...@@ -223,6 +236,35 @@ void CReductionTask::Reduction_Decomp(cl_context Context, cl_command_queue Comma ...@@ -223,6 +236,35 @@ void CReductionTask::Reduction_Decomp(cl_context Context, cl_command_queue Comma
void CReductionTask::Reduction_DecompUnroll(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3]) void CReductionTask::Reduction_DecompUnroll(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{ {
cl_int clErr;
size_t globalWorkSize[1];
size_t localWorkSize[1];
//for (int j = 0; j < 32; j++)
// printf("%4d, ", m_hInput[j]);
//cout << endl;
for (unsigned int i = m_N / 2; i > 1; i /= 2 * localWorkSize[0]) {
localWorkSize[0] = min(LocalWorkSize[0], i);
clErr = clSetKernelArg(m_DecompUnrollKernel, 0, sizeof(cl_mem), (void*)&m_dPingArray);
clErr |= clSetKernelArg(m_DecompUnrollKernel, 1, sizeof(cl_mem), (void*)&m_dPongArray);
clErr |= clSetKernelArg(m_DecompUnrollKernel, 2, sizeof(cl_uint), (void*)&i);
clErr |= clSetKernelArg(m_DecompUnrollKernel, 3, localWorkSize[0] * sizeof(cl_uint), NULL);
V_RETURN_CL(clErr, "Error setting Kernel args");
globalWorkSize[0] = CLUtil::GetGlobalWorkSize(i, localWorkSize[0]);
clErr = clEnqueueNDRangeKernel(CommandQueue, m_DecompUnrollKernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
V_RETURN_CL(clErr, "Error running Kernel");
swap(m_dPingArray, m_dPongArray);
//V_RETURN_CL(clEnqueueReadBuffer(CommandQueue, m_dPingArray, CL_TRUE, 0, m_N * sizeof(cl_uint), m_hDebug, 0, NULL, NULL), "error reading data from device!");
//for (int j = 0; j < 32; j++)
// printf("%4d, ", m_hDebug[j]);
//cout << endl;
}
// TO DO: Implement reduction with loop unrolling // TO DO: Implement reduction with loop unrolling
......
...@@ -28,22 +28,24 @@ __kernel void Reduction_Decomp(const __global uint* inArray, __global uint* outA ...@@ -28,22 +28,24 @@ __kernel void Reduction_Decomp(const __global uint* inArray, __global uint* outA
{ {
int ls = get_local_size(0); int ls = get_local_size(0);
int gs = get_global_size(0); int gs = get_global_size(0);
int gid = get_global_id(0); int gid = get_global_id(0);
int lid = get_local_id(0); int lid = get_local_id(0);
int groupID = get_group_id(0); int groupID = get_group_id(0);
localBlock[lid] = inArray[gid] + inArray[gid + gs]; localBlock[lid] = inArray[gid] + inArray[gid + gs];
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for(uint stride = ls / 2; stride > 0; ls /= 2){ for(uint stride = ls / 2; stride > 0; stride /= 2){
if(lid < stride){ if(lid < stride){
localBlock[lid] = inArray[lid] + inArray[lid + stride]; localBlock[lid] = localBlock[lid] + localBlock[lid + stride];
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
if(lid = 0) if(lid == 0)
outArray[groupID] = localBlock[0]; outArray[groupID] = localBlock[0];
} }
...@@ -51,5 +53,33 @@ __kernel void Reduction_Decomp(const __global uint* inArray, __global uint* outA ...@@ -51,5 +53,33 @@ __kernel void Reduction_Decomp(const __global uint* inArray, __global uint* outA
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void Reduction_DecompUnroll(const __global uint* inArray, __global uint* outArray, uint N, __local uint* localBlock) __kernel void Reduction_DecompUnroll(const __global uint* inArray, __global uint* outArray, uint N, __local uint* localBlock)
{ {
// TO DO: Kernel implementation int ls = get_local_size(0);
int gs = get_global_size(0);
int gid = get_global_id(0);
int lid = get_local_id(0);
int groupID = get_group_id(0);
localBlock[lid] = inArray[gid] + inArray[gid + gs];
barrier(CLK_LOCAL_MEM_FENCE);
for(uint stride = ls / 2; stride > 32; stride /= 2){
if(lid < stride){
localBlock[lid] = localBlock[lid] + localBlock[lid + stride];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (lid < 32) {
if (ls >= 64) localBlock[lid] += localBlock[lid + 32];
if (ls >= 32) localBlock[lid] += localBlock[lid + 16];
if (ls >= 16) localBlock[lid] += localBlock[lid + 8];
if (ls >= 8) localBlock[lid] += localBlock[lid + 4];
if (ls >= 4) localBlock[lid] += localBlock[lid + 2];
}
if(lid == 0)
outArray[groupID] = localBlock[0]+localBlock[1];
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment