Commit a752caf0 by Kai Westerkamp

interleaved, + sequential

parent e1738542
......@@ -31,8 +31,8 @@ bool CAssignment2::DoCompute()
cout<<"Running parallel prefix sum task..."<<endl<<endl;
{
size_t LocalWorkSize[3] = {256, 1, 1};
CScanTask scan(1024 * 1024 * 64, LocalWorkSize[0]);
RunComputeTask(scan, LocalWorkSize);
//CScanTask scan(1024 * 1024 * 64, LocalWorkSize[0]);
//RunComputeTask(scan, LocalWorkSize);
}
......
......@@ -38,11 +38,12 @@ bool CReductionTask::InitResources(cl_device_id Device, cl_context Context)
{
//CPU resources
m_hInput = new unsigned int[m_N];
m_hDebug = new unsigned int[m_N];
//fill the array with some values
for(unsigned int i = 0; i < m_N; i++)
//m_hInput[i] = 1; // Use this for debugging
m_hInput[i] = rand() & 15;
m_hInput[i] = 1; // Use this for debugging
//m_hInput[i] = rand() & 15;
//device resources
cl_int clError, clError2;
......@@ -141,26 +142,75 @@ bool CReductionTask::ValidateResults()
void CReductionTask::Reduction_InterleavedAddressing(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
cl_int clErr;
size_t globalWorkSize[1];
size_t localWorkSize[1];
clErr = clSetKernelArg(m_InterleavedAddressingKernel, 0, sizeof(cl_mem), (void*)&m_dPingArray);
V_RETURN_CL(clErr, "Error setting Kernel Arg 1");
unsigned int stride = 1;
for (unsigned int i = m_N/2; i > 0; i/=2) {
clErr |= clSetKernelArg(m_InterleavedAddressingKernel, 1, sizeof(cl_uint), (void*)&stride);
V_RETURN_CL(clErr, "Error setting Kernel Arg 2");
// one work-item per remaining addition in this pass; clamp the local size accordingly
localWorkSize[0] = min(LocalWorkSize[0], i);
globalWorkSize[0] = CLUtil::GetGlobalWorkSize(i, localWorkSize[0]);
clErr = clEnqueueNDRangeKernel(CommandQueue, m_InterleavedAddressingKernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
V_RETURN_CL(clErr, "Error running Kernel");
stride *= 2;
}
}
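// For reference, a minimal single-threaded sketch of the same interleaved-addressing
// scheme (illustration only; ReduceInterleavedCPU is a hypothetical helper, not part of
// this file). Each pass adds the element 'stride' slots away into the left element of the
// pair and doubles the stride, so after log2(N) passes the total sits in data[0].
// Assumes N is a power of two.
static unsigned int ReduceInterleavedCPU(unsigned int* data, unsigned int N)
{
    for (unsigned int stride = 1; stride < N; stride *= 2)
        for (unsigned int pos = 0; pos + stride < N; pos += 2 * stride)
            data[pos] += data[pos + stride];
    return data[0];
}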
void CReductionTask::Reduction_SequentialAddressing(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
// Reduction with sequential addressing: each pass folds the upper half of the active range onto the lower half
cl_int clErr;
size_t globalWorkSize[1];
size_t localWorkSize[1];
clErr = clSetKernelArg(m_SequentialAddressingKernel, 0, sizeof(cl_mem), (void*)&m_dPingArray);
V_RETURN_CL(clErr, "Error setting Kernel Arg 1");
for (unsigned int i = m_N / 2; i > 0; i /= 2) {
clErr |= clSetKernelArg(m_SequentialAddressingKernel, 1, sizeof(cl_uint), (void*)&i);
V_RETURN_CL(clErr, "Error setting Kernel Arg 2");
localWorkSize[0] = min(LocalWorkSize[0], i);
globalWorkSize[0] = CLUtil::GetGlobalWorkSize(i, localWorkSize[0]);
clErr = clEnqueueNDRangeKernel(CommandQueue, m_SequentialAddressingKernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
V_RETURN_CL(clErr, "Error running Kernel");
}
}
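// For reference, a minimal single-threaded sketch of the sequential-addressing scheme
// (illustration only; ReduceSequentialCPU is a hypothetical helper, not part of this
// file). Each pass adds element i + stride into element i for i in [0, stride), halving
// the active range until the sum sits in data[0]. Assumes N is a power of two.
static unsigned int ReduceSequentialCPU(unsigned int* data, unsigned int N)
{
    for (unsigned int stride = N / 2; stride > 0; stride /= 2)
        for (unsigned int i = 0; i < stride; i++)
            data[i] += data[i + stride];
    return data[0];
}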
void CReductionTask::Reduction_Decomp(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
cl_int clErr;
size_t globalWorkSize[1];
size_t localWorkSize[1];
localWorkSize[0] = LocalWorkSize[0];
// run one pass per iteration until a single element remains (i = number of additions, i.e. work-items, in the pass)
for (unsigned int i = m_N / 2; i > 0; i /= 2 * localWorkSize[0]) {
localWorkSize[0] = min(localWorkSize[0], i);
clErr = clSetKernelArg(m_DecompKernel, 0, sizeof(cl_mem), (void*)&m_dPingArray);
clErr |= clSetKernelArg(m_DecompKernel, 1, sizeof(cl_mem), (void*)&m_dPongArray);
clErr |= clSetKernelArg(m_DecompKernel, 2, sizeof(cl_uint), (void*)&i);
clErr |= clSetKernelArg(m_DecompKernel, 3, localWorkSize[0] * sizeof(cl_uint), NULL);
V_RETURN_CL(clErr, "Error setting Kernel args");
globalWorkSize[0] = CLUtil::GetGlobalWorkSize(i, localWorkSize[0]);
clErr = clEnqueueNDRangeKernel(CommandQueue, m_DecompKernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
V_RETURN_CL(clErr, "Error running Kernel");
swap(m_dPingArray, m_dPongArray);
}
// after the final swap the total ends up in m_dPingArray
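// For reference, a minimal host-side model of the pass sizes produced by the loop above
// (illustration only; PrintDecompPasses and the example numbers are assumptions, not part
// of this file). With nElements = 1 << 20 and localSize = 256 the passes leave 2048, 4
// and finally 1 partial sum(s): one partial sum per work-group, with a ping/pong swap
// after each launch.
static void PrintDecompPasses(unsigned int nElements, unsigned int localSize)
{
    for (unsigned int i = nElements / 2; i > 0; i /= 2 * localSize)
    {
        localSize = min(localSize, i); // clamp the work-group size for small passes
        cout << "pass: " << i << " work-items, " << (i / localSize) << " partial sum(s) remain" << endl;
    }
}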
......@@ -214,6 +264,8 @@ void CReductionTask::ExecuteTask(cl_context Context, cl_command_queue CommandQue
void CReductionTask::TestPerformance(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3], unsigned int Task)
{
//return; //TODO
cout << "Testing performance of task " << g_kernelNames[Task] << endl;
//write input data to the GPU
......
......@@ -59,6 +59,8 @@ protected:
// input data
unsigned int *m_hInput;
unsigned int *m_hDebug;
// results
unsigned int m_resultCPU;
unsigned int m_resultGPU[4];
......
......@@ -2,21 +2,49 @@
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void Reduction_InterleavedAddressing(__global uint* array, uint stride)
{
// Interleaved addressing: each work-item adds the element 'stride' slots away into its own (left) element
int GID = get_global_id(0);
int pos1 = GID*2*stride;
int pos2 = pos1+stride;
array[pos1] = array[pos1] + array[pos2];
array[pos2] = 0;
}
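// Worked example of the kernel above for 8 elements {1,2,3,4,5,6,7,8} (illustrative values
// only): with stride 1, work-items 0..3 produce {3,0,7,0,11,0,15,0}; with stride 2,
// work-items 0..1 produce {10,0,0,0,26,0,0,0}; with stride 4, work-item 0 produces
// {36,0,...}, i.e. the total in array[0]. The host halves the work-item count and doubles
// the stride between launches.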
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void Reduction_SequentialAddressing(__global uint* array, uint stride)
{
// Sequential addressing: the first 'stride' work-items fold the upper half of the active range onto the lower half
int GID = get_global_id(0);
int pos2 = GID + stride;
array[GID] = array[GID] + array[pos2];
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void Reduction_Decomp(const __global uint* inArray, __global uint* outArray, uint N, __local uint* localBlock)
{
// Kernel decomposition: each work-group reduces its share of the input to one partial sum in local memory (first addition is done during the load)
int ls = get_local_size(0);
int gs = get_global_size(0);
int gid = get_global_id(0);
int lid = get_local_id(0);
int groupID = get_group_id(0);
localBlock[lid] = inArray[gid] + inArray[gid + gs];
barrier(CLK_LOCAL_MEM_FENCE);
for(uint stride = ls / 2; stride > 0; stride /= 2){
if(lid < stride){
localBlock[lid] = localBlock[lid] + localBlock[lid + stride];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lid == 0)
outArray[groupID] = localBlock[0];
}
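// Worked example of the kernel above for one work-group of size 4 with
// inArray = {1,2,3,4,5,6,7,8} and get_global_size(0) = 4 (illustrative values only):
//   load + first add:  localBlock = {1+5, 2+6, 3+7, 4+8} = {6, 8, 10, 12}
//   stride 2:          localBlock = {6+10, 8+12, 10, 12} = {16, 20, 10, 12}
//   stride 1:          localBlock = {16+20, ...}         = {36, ...}
// Work-item 0 then writes outArray[get_group_id(0)] = 36, the sum of all 8 inputs.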
......