Commit a752caf0 by Kai Westerkamp

interleaved, + sequential

parent e1738542
@@ -31,8 +31,8 @@ bool CAssignment2::DoCompute()
 	cout<<"Running parallel prefix sum task..."<<endl<<endl;
 	{
 		size_t LocalWorkSize[3] = {256, 1, 1};
-		CScanTask scan(1024 * 1024 * 64, LocalWorkSize[0]);
-		RunComputeTask(scan, LocalWorkSize);
+		//CScanTask scan(1024 * 1024 * 64, LocalWorkSize[0]);
+		//RunComputeTask(scan, LocalWorkSize);
 	}
...
@@ -38,11 +38,12 @@ bool CReductionTask::InitResources(cl_device_id Device, cl_context Context)
 {
 	//CPU resources
 	m_hInput = new unsigned int[m_N];
+	m_hDebug = new unsigned int[m_N];

 	//fill the array with some values
 	for(unsigned int i = 0; i < m_N; i++)
-		//m_hInput[i] = 1; // Use this for debugging
-		m_hInput[i] = rand() & 15;
+		m_hInput[i] = 1; // Use this for debugging
+		//m_hInput[i] = rand() & 15;

 	//device resources
 	cl_int clError, clError2;
@@ -141,26 +142,75 @@ bool CReductionTask::ValidateResults()
 void CReductionTask::Reduction_InterleavedAddressing(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
 {
-	//cl_int clErr;
-	//size_t globalWorkSize[1];
-	//size_t localWorkSize[1];
-	//unsigned int stride = ...;
-
-	// TO DO: Implement reduction with interleaved addressing
-
-	//for (...) {
-	//}
+	cl_int clErr;
+	size_t globalWorkSize[1];
+	size_t localWorkSize[1];
+
+	clErr = clSetKernelArg(m_InterleavedAddressingKernel, 0, sizeof(cl_mem), (void*)&m_dPingArray);
+	V_RETURN_CL(clErr, "Error setting Kernel Arg 1");
+
+	unsigned int stride = 1;
+	for (unsigned int i = m_N/2; i > 0; i/=2) {
+		clErr |= clSetKernelArg(m_InterleavedAddressingKernel, 1, sizeof(cl_uint), (void*)&stride);
+		V_RETURN_CL(clErr, "Error setting Kernel Arg 2");
+
+		localWorkSize[0] = min(LocalWorkSize[0], i);
+		globalWorkSize[0] = CLUtil::GetGlobalWorkSize(i, localWorkSize[0]);
+		clErr = clEnqueueNDRangeKernel(CommandQueue, m_InterleavedAddressingKernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
+		V_RETURN_CL(clErr, "Error running Kernel");
+
+		stride *= 2;
+	}
 }
 void CReductionTask::Reduction_SequentialAddressing(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
 {
-	// TO DO: Implement reduction with sequential addressing
+	cl_int clErr;
+	size_t globalWorkSize[1];
+	size_t localWorkSize[1];
+
+	clErr = clSetKernelArg(m_SequentialAddressingKernel, 0, sizeof(cl_mem), (void*)&m_dPingArray);
+	V_RETURN_CL(clErr, "Error setting Kernel Arg 1");
+
+	for (unsigned int i = m_N / 2; i > 0; i /= 2) {
+		clErr |= clSetKernelArg(m_SequentialAddressingKernel, 1, sizeof(cl_uint), (void*)&i);
+		V_RETURN_CL(clErr, "Error setting Kernel Arg 2");
+
+		localWorkSize[0] = min(LocalWorkSize[0], i);
+		globalWorkSize[0] = CLUtil::GetGlobalWorkSize(i, localWorkSize[0]);
+		clErr = clEnqueueNDRangeKernel(CommandQueue, m_SequentialAddressingKernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
+		V_RETURN_CL(clErr, "Error running Kernel");
+	}
 }
 void CReductionTask::Reduction_Decomp(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
 {
+	cl_int clErr;
+	size_t globalWorkSize[1];
+	size_t localWorkSize[1];
+
+	localWorkSize[0] = LocalWorkSize[0];
+	for (unsigned int i = m_N / 2; i > 1; i /= 2 * localWorkSize[0]) {
+		localWorkSize[0] = min(localWorkSize[0], i);
+
+		clErr  = clSetKernelArg(m_DecompKernel, 0, sizeof(cl_mem), (void*)&m_dPingArray);
+		clErr |= clSetKernelArg(m_DecompKernel, 1, sizeof(cl_mem), (void*)&m_dPongArray);
+		clErr |= clSetKernelArg(m_DecompKernel, 2, sizeof(cl_uint), (void*)&i);
+		clErr |= clSetKernelArg(m_DecompKernel, 3, localWorkSize[0] * sizeof(cl_uint), NULL);
+		V_RETURN_CL(clErr, "Error setting Kernel args");
+
+		globalWorkSize[0] = CLUtil::GetGlobalWorkSize(i, localWorkSize[0]);
+		clErr = clEnqueueNDRangeKernel(CommandQueue, m_DecompKernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
+		V_RETURN_CL(clErr, "Error running Kernel");
+
+		swap(m_dPingArray, m_dPongArray);
+	}
+
 	// TO DO: Implement reduction with kernel decomposition
@@ -214,6 +264,8 @@ void CReductionTask::ExecuteTask(cl_context Context, cl_command_queue CommandQue
 void CReductionTask::TestPerformance(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3], unsigned int Task)
 {
+	//return; //TODO
+
 	cout << "Testing performance of task " << g_kernelNames[Task] << endl;

 	//write input data to the GPU
...
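Side note (not part of this commit): the hunks above only enqueue the kernels; the read-back and validation path (ValidateResults) lies outside the changed lines. Purely as an illustration, and assuming the final sum ends up in element 0 of m_dPingArray, a host-side sketch could look like the following. The helper name and its free-function form are mine, not the project's.

	#include <CL/cl.h>
	#include <cstddef>

	// Hedged sketch: read the reduced value back and compute a CPU reference sum.
	// Assumes the last kernel pass leaves the total in element 0 of the ping buffer.
	void ReadBackAndValidate(cl_command_queue CommandQueue, cl_mem dPingArray,
	                         const unsigned int* hInput, size_t N,
	                         unsigned int* resultGPU, unsigned int* resultCPU)
	{
	    // blocking read of a single uint from the start of the device buffer
	    cl_int clErr = clEnqueueReadBuffer(CommandQueue, dPingArray, CL_TRUE, 0,
	                                       sizeof(cl_uint), resultGPU, 0, NULL, NULL);
	    if (clErr != CL_SUCCESS)
	        return; // the project would use V_RETURN_CL here instead

	    // CPU reference: plain sequential sum of the input array
	    unsigned int sum = 0;
	    for (size_t i = 0; i < N; i++)
	        sum += hInput[i];
	    *resultCPU = sum;
	}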
@@ -59,6 +59,8 @@ protected:
 	// input data
 	unsigned int	*m_hInput;
+	unsigned int	*m_hDebug;

 	// results
 	unsigned int	m_resultCPU;
 	unsigned int	m_resultGPU[4];
...
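The new m_hDebug member is allocated in InitResources but never read or written in the hunks shown here. A plausible use, and this is an assumption on my part with a hypothetical helper name, is dumping a device buffer to the host between reduction passes for inspection:

	#include <CL/cl.h>
	#include <cstddef>

	// Hypothetical debugging helper (not in the commit): copy the current contents
	// of a device buffer into the host-side debug array for inspection.
	void DumpDeviceArray(cl_command_queue CommandQueue, cl_mem dArray,
	                     unsigned int* hDebug, size_t count)
	{
	    // blocking read so hDebug is valid as soon as the call returns
	    clEnqueueReadBuffer(CommandQueue, dArray, CL_TRUE, 0,
	                        count * sizeof(cl_uint), hDebug, 0, NULL, NULL);
	}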
@@ -2,21 +2,49 @@
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 __kernel void Reduction_InterleavedAddressing(__global uint* array, uint stride)
 {
-	// TO DO: Kernel implementation
+	int GID = get_global_id(0);
+	int pos1 = GID*2*stride;
+	int pos2 = pos1+stride;
+
+	array[pos1] = array[pos1] + array[pos2];
+	array[pos2] = 0;
 }

 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 __kernel void Reduction_SequentialAddressing(__global uint* array, uint stride)
 {
-	// TO DO: Kernel implementation
+	int GID = get_global_id(0);
+	int pos2 = GID + stride;
+
+	array[GID] = array[GID] + array[pos2];
 }
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 __kernel void Reduction_Decomp(const __global uint* inArray, __global uint* outArray, uint N, __local uint* localBlock)
 {
-	// TO DO: Kernel implementation
+	int ls = get_local_size(0);
+	int gs = get_global_size(0);
+	int gid = get_global_id(0);
+	int lid = get_local_id(0);
+	int groupID = get_group_id(0);
+
+	// each work item loads and adds two elements into local memory
+	localBlock[lid] = inArray[gid] + inArray[gid + gs];
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	// tree reduction within the work group; the stride is halved each step
+	for(uint stride = ls / 2; stride > 0; stride /= 2){
+		if(lid < stride){
+			localBlock[lid] = localBlock[lid] + localBlock[lid + stride];
+		}
+		barrier(CLK_LOCAL_MEM_FENCE);
+	}
+
+	// the first work item writes the group's partial sum
+	if(lid == 0)
+		outArray[groupID] = localBlock[0];
 }
...
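To make the two global-memory addressing schemes concrete, here is a small standalone CPU simulation of both index patterns on an 8-element array (illustration only, not part of the commit; the zeroing of the partner element in the interleaved kernel is omitted). Both variants leave the total sum in element 0.

	#include <cstdio>
	#include <vector>

	int main()
	{
	    // interleaved addressing: each pass combines elements 2*stride apart, stride = 1, 2, 4, ...
	    std::vector<unsigned int> a = {3, 1, 4, 1, 5, 9, 2, 6}; // sum = 31
	    for (unsigned int stride = 1; stride < a.size(); stride *= 2)
	        for (unsigned int gid = 0; gid < a.size() / (2 * stride); gid++) {
	            unsigned int pos1 = gid * 2 * stride;            // same index math as the kernel
	            unsigned int pos2 = pos1 + stride;
	            a[pos1] += a[pos2];
	        }

	    // sequential addressing: each pass adds the upper half onto the lower half, stride = N/2, N/4, ...
	    std::vector<unsigned int> b = {3, 1, 4, 1, 5, 9, 2, 6};
	    for (unsigned int stride = b.size() / 2; stride > 0; stride /= 2)
	        for (unsigned int gid = 0; gid < stride; gid++)
	            b[gid] += b[gid + stride];

	    printf("interleaved: %u, sequential: %u\n", a[0], b[0]); // both print 31
	    return 0;
	}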