Commit 449a502c by Kai Westerkamp

A3

parent 84bbc7a6
File added
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CAssignment3.h"
#include "CConvolution3x3Task.h"
#include "CConvolutionSeparableTask.h"
#include "CConvolutionBilateralTask.h"
#include "CHistogramTask.h"
#include <iostream>
using namespace std;
///////////////////////////////////////////////////////////////////////////////
// CAssignment3
bool CAssignment3::DoCompute()
{
cout<<"########################################"<<endl;
cout<<"GPU Computing assignment 3"<<endl<<endl;
cout<<"IMPORTANT: Make sure you always check the difference images."<<endl;
cout<<"The CPU 'gold' test is only suitable to catch trivial errors,"<<endl;
cout<<"A low MSE (mean squared error) might still happen with a few corrupted pixels."<<endl;
cout<<"########################################"<<endl;
cout<<"Task 1: 3x3 convolution"<<endl<<endl;
{
size_t TileSize[2] = {32, 16};
float ConvKernel[3][3] = {
{ -1.0f / 8.0f, -1.0f / 8.0f, -1.0f / 8.0f },
{ -1.0f / 8.0f, 1.0f, -1.0f / 8.0f },
{ -1.0f / 8.0f, -1.0f / 8.0f, -1.0f / 8.0f },
};
CConvolution3x3Task convTask("Images/input.pfm", TileSize, ConvKernel, true, 0.0f);
RunComputeTask(convTask, TileSize);
}
cout<<endl<<"########################################"<<endl;
cout<<"Task 2: Separable convolution"<<endl<<endl;
{
size_t HGroupSize[2] = {32, 16};
size_t VGroupSize[2] = {32, 16};
{
//simple box filter
float ConvKernel[9];
for(int i = 0; i < 9; i++)
ConvKernel[i] = 1.0f / 9.0f;
CConvolutionSeparableTask convTask("box_4x4", "Images/input.pfm", HGroupSize, VGroupSize,
4, 4, 4, ConvKernel, ConvKernel);
// note: the last argument is ignored, but our framework requires it
// for the horizontal and vertical passes different local sizes might be used
RunComputeTask(convTask, HGroupSize);
}
{
//simple box filter
float ConvKernel[17];
for(int i = 0; i < 17; i++)
ConvKernel[i] = 1.0f / 17.0f;
CConvolutionSeparableTask convTask("box_8x8", "Images/input.pfm", HGroupSize, VGroupSize,
4, 4, 8, ConvKernel, ConvKernel);
RunComputeTask(convTask, HGroupSize);
}
{
// Gaussian blur
float ConvKernel[7] = {
0.000817774f, 0.0286433f, 0.235018f, 0.471041f, 0.235018f, 0.0286433f, 0.000817774f
};
CConvolutionSeparableTask convTask("gauss_3x3", "Images/input.pfm", HGroupSize, VGroupSize,
4, 4, 3, ConvKernel, ConvKernel);
RunComputeTask(convTask, HGroupSize);
}
}
cout<<endl<<"########################################"<<endl;
cout<<"Task 3: Separable bilateral convolution"<<endl<<endl;
{
size_t HGroupSize[2] = {32, 4};
size_t VGroupSize[2] = {32, 4};
float ConvKernel[9] = {0.010284844f, 0.0417071f, 0.113371652f, 0.206576619f, 0.252313252f, 0.206576619f, 0.113371652f, 0.0417071f, 0.010284844f};
CConvolutionBilateralTask convTask("Images/color.pfm", "Images/normals.pfm", "Images/depth.pfm", HGroupSize, VGroupSize,
4, 4, 4, ConvKernel, ConvKernel);
RunComputeTask(convTask, HGroupSize);
}
cout<<endl<<"########################################"<<endl;
cout<<"Task 4: Histogram"<<endl<<endl;
{
size_t group_size[2] = {16, 16};
{
CHistogramTask histogram(0.25f, 0.26f, false, "Images/input.pfm");
RunComputeTask(histogram, group_size);
}
{
CHistogramTask histogram(0.25f, 0.26f, true, "Images/input.pfm");
RunComputeTask(histogram, group_size);
}
}
return true;
}
///////////////////////////////////////////////////////////////////////////////
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _CASSIGNMENT2_H
#define _CASSIGNMENT2_H
#include "../Common/CAssignmentBase.h"
//! Assignment3 solution
class CAssignment3 : public CAssignmentBase
{
public:
virtual ~CAssignment3() {};
virtual bool DoCompute();
};
#endif // _CASSIGNMENT2_H
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CConvolution3x3Task.h"
#include "../Common/CLUtil.h"
#include "../Common/CTimer.h"
using namespace std;
///////////////////////////////////////////////////////////////////////////////
// CConvolution3x3Task
CConvolution3x3Task::
CConvolution3x3Task(
const std::string& FileName,
size_t TileSize[2],
float ConvKernel[3][3],
bool Monochrome,
float Offset
)
: CConvolutionTaskBase(FileName, Monochrome)
, m_Offset(Offset)
{
m_TileSize[0] = TileSize[0];
m_TileSize[1] = TileSize[1];
m_KernelWeight = 0;
for(int i = 0; i < 3; i++)
{
for(int j = 0; j < 3; j++)
{
m_hConvolutionKernel[i][j] = ConvKernel[i][j];
m_KernelWeight += ConvKernel[i][j];
}
}
if(m_KernelWeight > 0)
m_KernelWeight = 1.0f / m_KernelWeight;
else
m_KernelWeight = 1.0f;
m_FileNamePostfix = "3x3";
}
CConvolution3x3Task::~CConvolution3x3Task()
{
ReleaseResources();
}
bool CConvolution3x3Task::InitResources(cl_device_id Device, cl_context Context)
{
if(!CConvolutionTaskBase::InitResources(Device, Context))
return false;
//we can init the kernel buffer during creation as its contents will not change
cl_int clError;
cl_float kernelConstants[11];
for(int y = 0; y < 3; y++)
for(int x = 0; x < 3; x++)
kernelConstants[3*y + x] = m_hConvolutionKernel[y][x];
kernelConstants[9] = m_KernelWeight;
kernelConstants[10] = m_Offset;
m_dKernelConstants = clCreateBuffer(Context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 11 * sizeof(cl_float),
kernelConstants, &clError);
V_RETURN_FALSE_CL(clError, "Error allocating device kernel constants.");
string programCode;
CLUtil::LoadProgramSourceToMemory("Convolution3x3.cl", programCode);
m_Program = CLUtil::BuildCLProgramFromMemory(Device, Context, programCode);
if(m_Program == nullptr) return false;
//create kernel(s)
m_ConvolutionKernel = clCreateKernel(m_Program, "Convolution", &clError);
V_RETURN_FALSE_CL(clError, "Failed to create kernel.");
//bind kernel attributes
clError = clSetKernelArg(m_ConvolutionKernel, 2, sizeof(cl_mem), (void*)&m_dKernelConstants);
clError |= clSetKernelArg(m_ConvolutionKernel, 3, sizeof(cl_uint), (void*)&m_Width);
clError |= clSetKernelArg(m_ConvolutionKernel, 4, sizeof(cl_uint), (void*)&m_Height);
clError |= clSetKernelArg(m_ConvolutionKernel, 5, sizeof(cl_uint), (void*)&m_Pitch);
V_RETURN_FALSE_CL(clError, "Error setting kernel arguments");
return true;
}
void CConvolution3x3Task::ReleaseResources()
{
SAFE_RELEASE_MEMOBJECT(m_dKernelConstants);
SAFE_RELEASE_KERNEL(m_ConvolutionKernel);
SAFE_RELEASE_PROGRAM(m_Program);
CConvolutionTaskBase::ReleaseResources();
}
void CConvolution3x3Task::ComputeGPU(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
// This time we can take a bit less iterations than before, since the image processing itself
// is more time consuming than the previous tasks
const int nIterations = 1000;
//do 1 or 3 convolution steps, based on the number of color channels to process
unsigned int numChannels = m_Monochrome ? 1 : 3;
size_t dataSize = m_Pitch * m_Height * sizeof(cl_float);
//perform the convolution and measure the performance
double runTime = 0.0f;
for(unsigned int iChannel = 0; iChannel < numChannels; iChannel++)
runTime += ConvolutionChannelGPU(iChannel, Context, CommandQueue, nIterations);
cout<<" Average GPU time: "<<runTime<<" ms, throughput: "<< 1.0e-6 * m_Width * m_Height / runTime << " Gpixels/s" <<endl;
for(unsigned int iChannel = 0; iChannel < numChannels; iChannel++)
{
//copy the results back to the CPU
V_RETURN_CL( clEnqueueReadBuffer(CommandQueue, m_dResultChannels[iChannel], CL_TRUE, 0, dataSize,
m_hGPUResultChannels[iChannel], 0, NULL, NULL), "Error reading back results from the device!" );
}
SaveImage("Images/GPUResult3x3.pfm", m_hGPUResultChannels);
}
void CConvolution3x3Task::ComputeCPU()
{
//number of channels to compute
unsigned int numChannels = m_Monochrome ? 1 : 3;
double runTime = 0.0;
for(unsigned int iChannel = 0; iChannel < numChannels; iChannel++)
{
runTime += ConvolutionChannelCPU(iChannel);
}
cout<<" CPU time: "<<runTime<<" ms, throughput: "<< 1.0e-6 * m_Width * m_Height / runTime << " Gpixels/s" <<endl;
SaveImage("Images/CPUResult3x3.pfm", m_hCPUResultChannels);
}
double CConvolution3x3Task::ConvolutionChannelCPU(unsigned int Channel)
{
//also measure the time for the first channel
CTimer timer;
const int nIterations = 10;
timer.Start();
for(int iter = 0; iter < nIterations; iter++)
{
for(unsigned int y = 0; y < m_Height; y++)
{
for(unsigned int x = 0; x < m_Width; x++)
{
float value = 0;
//apply convolution kernel
for(int offsetY = -1; offsetY < 2; offsetY ++)
{
int sy = y + offsetY;
if(sy >= 0 && sy < int(m_Height))
for(int offsetX = -1; offsetX < 2; offsetX++)
{
int sx = x + offsetX;
if(sx >= 0 && sx < int(m_Width))
value += m_hSourceChannels[Channel][sy * m_Pitch + sx] * m_hConvolutionKernel[1 + offsetY][1 + offsetX];
}
}
m_hCPUResultChannels[Channel][y * m_Pitch + x] = value * m_KernelWeight + m_Offset;
}
}
}
timer.Stop();
return timer.GetElapsedMilliseconds();
}
double CConvolution3x3Task::ConvolutionChannelGPU(unsigned int Channel, cl_context Context,
cl_command_queue CommandQueue, int NIterations)
{
size_t globalWorkSize[2] = {CLUtil::GetGlobalWorkSize(m_Width, m_TileSize[0]), CLUtil::GetGlobalWorkSize(m_Height, m_TileSize[1])};
cl_int clErr;
clErr = clSetKernelArg(m_ConvolutionKernel, 0, sizeof(cl_mem), (void*)&m_dResultChannels[Channel]);
clErr |= clSetKernelArg(m_ConvolutionKernel, 1, sizeof(cl_mem), (void*)&m_dSourceChannels[Channel]);
V_RETURN_0_CL(clErr, "Error setting kernel arguments!");
return CLUtil::ProfileKernel(CommandQueue, m_ConvolutionKernel, 2, globalWorkSize, m_TileSize, NIterations);
}
///////////////////////////////////////////////////////////////////////////////
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _CCONVOLUTION_3X3_TASK_H
#define _CCONVOLUTION_3X3_TASK_H
#include "CConvolutionTaskBase.h"
#include <string>
//! A3 / T1 3x3 convolution
class CConvolution3x3Task : public CConvolutionTaskBase
{
public:
CConvolution3x3Task(
const std::string& FileName,
size_t TileSize[2],
float ConvKernel[3][3],
bool Monochrome,
float Offset);
virtual ~CConvolution3x3Task();
// IComputeTask
virtual bool InitResources(cl_device_id Device, cl_context Context);
virtual void ReleaseResources();
virtual void ComputeGPU(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3]);
virtual void ComputeCPU();
protected:
// the return value is the run time in milliseconds
double ConvolutionChannelCPU(unsigned int Channel);
//the last parameter is for timing, and the returned value is the average run time in milliseconds
double ConvolutionChannelGPU(unsigned int Channel, cl_context Context, cl_command_queue CommandQueue, int NIterations);
size_t m_TileSize[2];
// host data
float m_hConvolutionKernel[3][3];
float m_KernelWeight;
float m_Offset;
//kernel constants
cl_mem m_dKernelConstants = nullptr;
cl_program m_Program = nullptr;
cl_kernel m_ConvolutionKernel = nullptr;
};
#endif // _CCONVOLUTION_3X3_TASK_H
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _CCONVOLUTION_BILATERAL_TASK_H
#define _CCONVOLUTION_BILATERAL_TASK_H
#include "CConvolutionSeparableTask.h"
#include <string>
#include <cmath>
#define DEPTH_THRESHOLD 0.025f
#define NORM_THRESHOLD 0.9f
//! A3/T3 bilateral filter
/*!
This class implements a separable convolution filter, but
extended with a discontinuity detection pass. Therefore with
inherit it from the second task...
*/
class CConvolutionBilateralTask: public CConvolutionSeparableTask
{
public:
// Yes, we have more and more attributes :) But we did not want to complicate it with more "setter" methods...
CConvolutionBilateralTask(const std::string& FileName, const std::string& NormalFileName,
const std::string& DepthFileName, size_t LocalSizeHorizontal[2], size_t LocalSizeVertical[2],
int StepsHorizontal, int StepsVertical, int KernelRadius, float* pKernelHorizontal, float* pKernelVertical);
virtual ~CConvolutionBilateralTask();
// IComputeTask
virtual bool InitResources(cl_device_id Device, cl_context Context);
virtual bool InitKernels();
virtual void ReleaseResources();
virtual void ComputeGPU(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3]);
virtual void ComputeCPU();
protected:
// the return value is the run time in milliseconds
double ConvolutionChannelCPU(unsigned int Channel);
// the return value is the run time in milliseconds
double ConvolutionChannelGPU(unsigned int Channel, cl_context Context, cl_command_queue CommandQueue, int NIterations);
// These helper methods are used to build the discontinuity buffer
inline bool IsNormalDiscontinuity(const cl_float4 &n1, const cl_float4 &n2) {
return ::std::fabs(n1.s[0] * n2.s[0] + n1.s[1] * n2.s[1] + n1.s[2] * n2.s[2]) < NORM_THRESHOLD;
}
inline bool IsDepthDiscontinuity(float d1, float d2){
return ::std::fabs(d1 - d2) > DEPTH_THRESHOLD;
}
std::string m_NormalFileName;
std::string m_DepthFileName;
//host data
cl_float4* m_hNormDepthBuffer;
// discontinuity buffers
cl_int* m_hCPUDiscBuffer;
cl_int* m_hGPUDiscBuffer;
// device data
cl_mem m_dDiscBuffer;
cl_mem m_dNormDepthBuffer;
// kernels for discontinuity detection
cl_kernel m_HorizontalDiscKernel;
cl_kernel m_VerticalDiscKernel;
};
#endif // _CCONVOLUTION_BILATERAL_TASK_H
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CConvolutionSeparableTask.h"
#include "../Common/CLUtil.h"
#include "../Common/CTimer.h"
#include <sstream>
#include <cstring>
using namespace std;
///////////////////////////////////////////////////////////////////////////////
// CConvolutionSeparableTask
CConvolutionSeparableTask::CConvolutionSeparableTask(
const std::string& OutFileName,
const std::string& FileName,
size_t LocalSizeHorizontal[2],
size_t LocalSizeVertical[2],
int StepsHorizontal,
int StepsVertical,
int KernelRadius,
float* pKernelHorizontal,
float* pKernelVertical
)
: CConvolutionTaskBase(FileName, false)
, m_OutFileName(OutFileName)
, m_StepsHorizontal(StepsHorizontal)
, m_StepsVertical(StepsVertical)
, m_KernelRadius(KernelRadius)
{
m_LocalSizeHorizontal[0] = LocalSizeHorizontal[0];
m_LocalSizeHorizontal[1] = LocalSizeHorizontal[1];
m_LocalSizeVertical[0] = LocalSizeVertical[0];
m_LocalSizeVertical[1] = LocalSizeVertical[1];
const unsigned int kernelSize = 2 * m_KernelRadius + 1;
m_hKernelHorizontal = new float[kernelSize];
m_hKernelVertical = new float[kernelSize];
memcpy(m_hKernelHorizontal, pKernelHorizontal, kernelSize * sizeof(float));
memcpy(m_hKernelVertical, pKernelVertical, kernelSize * sizeof(float));
m_dGPUWorkingBuffer = nullptr;
m_hCPUWorkingBuffer = nullptr;
m_FileNamePostfix = "Separable_" + OutFileName;
m_ProgramName = "ConvolutionSeparable.cl";
}
CConvolutionSeparableTask::~CConvolutionSeparableTask()
{
delete [] m_hKernelHorizontal;
delete [] m_hKernelVertical;
ReleaseResources();
}
bool CConvolutionSeparableTask::InitResources(cl_device_id Device, cl_context Context)
{
if(!CConvolutionTaskBase::InitResources(Device, Context))
return false;
//create GPU resources
//we can init the kernel buffer during creation as its contents will not change
const unsigned int kernelSize = 2 * m_KernelRadius + 1;
cl_int clError = 0;
cl_int clErr;
m_dKernelHorizontal = clCreateBuffer(Context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, kernelSize * sizeof(cl_float),
m_hKernelHorizontal, &clErr);
clError |= clErr;
m_dKernelVertical = clCreateBuffer(Context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, kernelSize * sizeof(cl_float),
m_hKernelVertical, &clErr);
clError |= clErr;
V_RETURN_FALSE_CL(clError, "Error allocating device kernel constants.");
m_dGPUWorkingBuffer = clCreateBuffer(Context, CL_MEM_READ_WRITE, m_Pitch * m_Height * sizeof(cl_float), NULL, &clError);
V_RETURN_FALSE_CL(clError, "Error allocating device working array");
m_hCPUWorkingBuffer = new float[m_Height * m_Pitch];
string programCode;
CLUtil::LoadProgramSourceToMemory(m_ProgramName, programCode);
//This time we define several kernel-specific constants that we did not know during
//implementing the kernel, but we need to include during compile time.
stringstream compileOptions;
compileOptions<<"-cl-fast-relaxed-math"
<<" -D KERNEL_RADIUS="<<m_KernelRadius
<<" -D H_GROUPSIZE_X="<<m_LocalSizeHorizontal[0]<<" -D H_GROUPSIZE_Y="<<m_LocalSizeHorizontal[1]
<<" -D H_RESULT_STEPS="<<m_StepsHorizontal
<<" -D V_GROUPSIZE_X="<<m_LocalSizeVertical[0]<<" -D V_GROUPSIZE_Y="<<m_LocalSizeVertical[1]
<<" -D V_RESULT_STEPS="<<m_StepsVertical;
m_Program = CLUtil::BuildCLProgramFromMemory(Device, Context, programCode, compileOptions.str());
if(m_Program == nullptr) return false;
return InitKernels();
}
bool CConvolutionSeparableTask::InitKernels()
{
cl_int clError;
//create kernel(s)
m_HorizontalKernel = clCreateKernel(m_Program, "ConvHorizontal", &clError);
V_RETURN_FALSE_CL(clError, "Failed to create horizontal kernel.");
m_VerticalKernel = clCreateKernel(m_Program, "ConvVertical", &clError);
V_RETURN_FALSE_CL(clError, "Failed to create vertical kernel.");
//bind kernel attributes
//the resulting image will be in buffer 1
clError = clSetKernelArg(m_HorizontalKernel, 2, sizeof(cl_mem), (void*)&m_dKernelHorizontal);
clError |= clSetKernelArg(m_HorizontalKernel, 3, sizeof(cl_uint), (void*)&m_Width);
clError |= clSetKernelArg(m_HorizontalKernel, 4, sizeof(cl_uint), (void*)&m_Pitch);
V_RETURN_FALSE_CL(clError, "Error setting horizontal kernel arguments");
//the resulting image will be in buffer 0
clError = clSetKernelArg(m_VerticalKernel, 2, sizeof(cl_mem), (void*)&m_dKernelVertical);
clError |= clSetKernelArg(m_VerticalKernel, 3, sizeof(cl_uint), (void*)&m_Height);
clError |= clSetKernelArg(m_VerticalKernel, 4, sizeof(cl_uint), (void*)&m_Pitch);
V_RETURN_FALSE_CL(clError, "Error setting vertical kernel arguments");
return true;
}
void CConvolutionSeparableTask::ReleaseResources()
{
SAFE_DELETE_ARRAY( m_hCPUWorkingBuffer );
SAFE_RELEASE_MEMOBJECT(m_dGPUWorkingBuffer);
SAFE_RELEASE_MEMOBJECT(m_dKernelHorizontal);
SAFE_RELEASE_MEMOBJECT(m_dKernelVertical);
SAFE_RELEASE_KERNEL(m_HorizontalKernel);
SAFE_RELEASE_KERNEL(m_VerticalKernel);
SAFE_RELEASE_PROGRAM(m_Program);
}
void CConvolutionSeparableTask::ComputeGPU(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
size_t dataSize = m_Pitch * m_Height * sizeof(cl_float);
int nIterations = 100;
unsigned int numChannels = 3;
double runTime = 0.0f;
for(unsigned int iChannel = 0; iChannel < numChannels; iChannel++)
{
runTime += ConvolutionChannelGPU(iChannel, Context, CommandQueue, nIterations);
}
cout<<" Average GPU time: "<<runTime<<" ms, throughput: "<< 1.0e-6 * m_Width * m_Height / runTime << " Gpixels/s" <<endl;
for(unsigned int iChannel = 0; iChannel < numChannels; iChannel++)
{
//copy the results back to the CPU
//(this time the data is in the same buffer as the input was, because of the 2 convolution passes)
V_RETURN_CL( clEnqueueReadBuffer(CommandQueue, m_dResultChannels[iChannel], CL_TRUE, 0, dataSize,
m_hGPUResultChannels[iChannel], 0, NULL, NULL), "Error reading back results from the device!" );
}
SaveImage("Images/GPUResultSeparable_" + m_OutFileName + ".pfm", m_hGPUResultChannels);
}
void CConvolutionSeparableTask::ComputeCPU()
{
double runTime = 0.0;
for(unsigned int iChannel = 0; iChannel < 3; iChannel++)
{
runTime += ConvolutionChannelCPU(iChannel);
}
cout<<" CPU time: "<<runTime<<" ms, throughput: "<< 1.0e-6 * m_Width * m_Height / runTime << " Gpixels/s" <<endl;
SaveImage("Images/CPUResultSeparable_" + m_OutFileName + ".pfm", m_hCPUResultChannels);
}
double CConvolutionSeparableTask::ConvolutionChannelCPU(unsigned int Channel)
{
CTimer timer;
timer.Start();
//horizontal pass
for(int y = 0; y < (int)m_Height; y++)
for(int x = 0; x < (int)m_Width; x++)
{
float value = 0;
//apply horizontal kernel
for(int k = -m_KernelRadius; k <= m_KernelRadius; k++)
{
int sx = x + k;
if(sx >= 0 && sx < (int)m_Width)
value += m_hSourceChannels[Channel][y * m_Pitch + sx] * m_hKernelHorizontal[m_KernelRadius - k];
}
m_hCPUWorkingBuffer[y * m_Pitch + x] = value;
}
//vertical pass
for(int x = 0; x < (int)m_Width; x++)
for(int y = 0; y < (int)m_Height; y++)
{
float value = 0;
//apply horizontal kernel
for(int k = -m_KernelRadius; k <= m_KernelRadius; k++)
{
int sy = y + k;
if(sy >= 0 && sy < (int)m_Height)
value += m_hCPUWorkingBuffer[sy * m_Pitch + x] * m_hKernelVertical[m_KernelRadius - k];
}
m_hCPUResultChannels[Channel][y * m_Pitch + x] = value;
}
timer.Stop();
return timer.GetElapsedMilliseconds();
}
double CConvolutionSeparableTask::ConvolutionChannelGPU(unsigned int Channel, cl_context Context, cl_command_queue CommandQueue, int NIterations)
{
cl_int clErr;
clErr = clSetKernelArg(m_HorizontalKernel, 0, sizeof(cl_mem), (void*)&m_dGPUWorkingBuffer);
clErr |= clSetKernelArg(m_HorizontalKernel, 1, sizeof(cl_mem), (void*)&m_dSourceChannels[Channel]);
V_RETURN_0_CL(clErr, "Error setting horizontal kernel arguments");
clErr = clSetKernelArg(m_VerticalKernel, 0, sizeof(cl_mem), (void*)&m_dResultChannels[Channel]);
clErr |= clSetKernelArg(m_VerticalKernel, 1, sizeof(cl_mem), (void*)&m_dGPUWorkingBuffer);
V_RETURN_0_CL(clErr, "Error setting vertical kernel arguments");
double runTime;
size_t globalWorkSizeH[2] = {
CLUtil::GetGlobalWorkSize(m_Width / m_StepsHorizontal, m_LocalSizeHorizontal[0]),
CLUtil::GetGlobalWorkSize(m_Height, m_LocalSizeHorizontal[1])
};
runTime = CLUtil::ProfileKernel(CommandQueue, m_HorizontalKernel, 2, globalWorkSizeH, m_LocalSizeHorizontal, NIterations);
size_t globalWorkSizeV[2] = {
CLUtil::GetGlobalWorkSize(m_Width, m_LocalSizeVertical[0]),
CLUtil::GetGlobalWorkSize(m_Height / m_StepsVertical, m_LocalSizeVertical[1])
};
runTime += CLUtil::ProfileKernel(CommandQueue, m_VerticalKernel, 2, globalWorkSizeV, m_LocalSizeVertical, NIterations);
return runTime;
}
///////////////////////////////////////////////////////////////////////////////
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _CCONVOLUTION_SEPARABLE_TASK_H
#define _CCONVOLUTION_SEPARABLE_TASK_H
#include "CConvolutionTaskBase.h"
#include <string>
//! A3 / T2 separable convolution
class CConvolutionSeparableTask : public CConvolutionTaskBase
{
public:
CConvolutionSeparableTask(
const std::string& OutFileName,
const std::string& FileName,
size_t LocalSizeHorizontal[2],
size_t LocalSizeVertical[2],
int StepsHorizontal,
int StepsVertical,
int KernelRadius,
float* pKernelHorizontal,
float* pKernelVertical);
virtual ~CConvolutionSeparableTask();
// IComputeTask
virtual bool InitResources(cl_device_id Device, cl_context Context);
virtual bool InitKernels();
virtual void ReleaseResources();
virtual void ComputeGPU(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3]);
virtual void ComputeCPU();
protected:
// the return value is the run time in milliseconds
double ConvolutionChannelCPU(unsigned int Channel);
// the return value is the run time in milliseconds
double ConvolutionChannelGPU(unsigned int Channel, cl_context Context, cl_command_queue CommandQueue, int NIterations);
std::string m_OutFileName;
//we use different local work sizes during the two convolution kernels
size_t m_LocalSizeHorizontal[2];
size_t m_LocalSizeVertical[2];
//how many convolution steps a single thread executes
int m_StepsHorizontal = 0;
int m_StepsVertical = 0;
//host data
float* m_hKernelHorizontal = nullptr;
float* m_hKernelVertical = nullptr;
int m_KernelRadius = 0;
// device data
cl_mem m_dGPUWorkingBuffer;
float* m_hCPUWorkingBuffer;
//kernel coefficients
cl_mem m_dKernelHorizontal = nullptr;
cl_mem m_dKernelVertical = nullptr;
cl_program m_Program = nullptr;
std::string m_ProgramName;
//horizontal convolution pass
cl_kernel m_HorizontalKernel = nullptr;
//vertical convolution pass
cl_kernel m_VerticalKernel = nullptr;
};
#endif // _CCONVOLUTION_SEPARABLE_TASK_H
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CConvolutionTaskBase.h"
#include "../Common/CLUtil.h"
#include "Pfm.h"
#include <sstream>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <cstdint>
#include <vector>
using namespace std;
///////////////////////////////////////////////////////////////////////////////
// CConvolutionTaskBase
CConvolutionTaskBase::CConvolutionTaskBase(const std::string& FileName, bool Monochrome)
: m_FileName(FileName), m_Monochrome(Monochrome)
{
}
CConvolutionTaskBase::~CConvolutionTaskBase()
{
ReleaseResources();
}
bool CConvolutionTaskBase::InitResources(cl_device_id , cl_context Context)
{
PFM inputPfm;
if (!inputPfm.LoadRGB(m_FileName.c_str())) {
cerr<<"Error loading file: " << m_FileName.c_str() << "." << endl;
return false;
}
//internally, we convert the bitmap to floats, and execute the same convolution
//operation on its three channels separately
m_Height = inputPfm.height;
m_Width = inputPfm.width;
m_Pitch = m_Width;
if(m_Width % 32 != 0)
m_Pitch = m_Width + 32 - (m_Width % 32); //This will make sure that the data accesses are ALWAYS coalesced
cout<<"Size of image: "<<m_Width<<" x "<<m_Height<<endl;
//allocate data for the float channels
for(int i = 0; i < 3; i++)
{
m_hSourceChannels[i] = new float[m_Height * m_Pitch];
m_hCPUResultChannels[i] = new float[m_Height * m_Pitch];
m_hGPUResultChannels[i] = new float[m_Height * m_Pitch];
}
//extract R, G, B channels
unsigned int pixelOffset = 0;
unsigned int trippleOffset = 0;
for(unsigned int y = 0; y < m_Height; y++)
{
for(unsigned int x = 0; x < m_Width; x++)
{
m_hSourceChannels[0][pixelOffset] = inputPfm.pImg[trippleOffset ];
m_hSourceChannels[1][pixelOffset] = inputPfm.pImg[trippleOffset + 1];
m_hSourceChannels[2][pixelOffset] = inputPfm.pImg[trippleOffset + 2];
//monochrome: the data is converted to grayscale
if(m_Monochrome)
RGBToGrayScale( m_hSourceChannels[0][pixelOffset],
m_hSourceChannels[1][pixelOffset],
m_hSourceChannels[2][pixelOffset]);
pixelOffset++;
trippleOffset += 3;
}
//pad the image with zeros
for(unsigned int i = 0; i < m_Pitch - m_Width; i++)
{
m_hSourceChannels[0][pixelOffset + i] = 0.0f;
m_hSourceChannels[1][pixelOffset + i] = 0.0f;
m_hSourceChannels[2][pixelOffset + i] = 0.0f;
}
pixelOffset += m_Pitch - m_Width;
}
unsigned int dataSize = m_Pitch * m_Height * sizeof(cl_float);
cl_int clError;
for(int i = 0; i < 3; i++)
{
m_dSourceChannels[i] = clCreateBuffer(Context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, dataSize, m_hSourceChannels[i], &clError);
V_RETURN_FALSE_CL(clError, "Error allocating device input array");
m_dResultChannels[i] = clCreateBuffer(Context, CL_MEM_WRITE_ONLY, dataSize, NULL, &clError);
V_RETURN_FALSE_CL(clError, "Error allocating device output array");
}
return true;
}
void CConvolutionTaskBase::ReleaseResources()
{
for(int i = 0; i < 3; i++)
{
SAFE_DELETE_ARRAY( m_hSourceChannels[i] );
SAFE_DELETE_ARRAY( m_hCPUResultChannels[i] );
SAFE_DELETE_ARRAY( m_hGPUResultChannels[i] );
SAFE_RELEASE_MEMOBJECT( m_dSourceChannels[i] );
SAFE_RELEASE_MEMOBJECT( m_dResultChannels[i] );
}
}
bool CConvolutionTaskBase::ValidateResults()
{
//number of channels to compute
unsigned int numChannels = m_Monochrome ? 1 : 3;
//calculate the average squared difference
float avgError = 0;
float maxError = 0;
float numValues = float(numChannels * m_Width * (m_Height - 1));
float scaling = 1.0f / numValues;
for(unsigned int y = 0; y < m_Height; y++)
for(unsigned int x = 0; x < m_Width; x++)
for(unsigned int i = 0; i < numChannels; i++)
{
float L2Error = m_hCPUResultChannels[i][y * m_Pitch + x] - m_hGPUResultChannels[i][y * m_Pitch + x];
L2Error = L2Error * L2Error;
// Ignore the last line for the difference computations because we seem to have issues with NANs and other incorrect values in the last line with
// the current driver version (versions 344.75, 344.11 and 335.23) in the separable kernel exercise.
// This should be removed ASAP if the driver works again. You will also have to change the numValues initialization above.
if (y < m_Height - 1)
{
maxError = max(maxError, L2Error);
avgError += L2Error * scaling;
}
//to see the difference...
m_hCPUResultChannels[i][y * m_Pitch + x] = L2Error;
}
cout<<"Mean sq. error (MSE): "<<avgError<<endl;
cout<<"Maximum sq. error: "<<maxError<<endl;
//save difference image
std::stringstream strm;
strm<<"Images/DifferenceImage"<<m_FileNamePostfix<<".pfm";
SaveImage(strm.str().c_str(), m_hCPUResultChannels);
return (avgError < 1e-10f && maxError < 1e-8);
}
#ifdef HAVE_BIG_ENDIAN
# define SWAP_32(D) \
# ((D << 24) | ((D << 8) & 0x00FF0000) \
# | ((D >> 8) & 0x0000FF00) | (D >> 24))
# define SWAP_16(D) ((D >> 8) | (D << 8))
#else
# define SWAP_32(D) (D)
# define SWAP_16(D) (D)
#endif
void
save_image_bmp(const char *path, unsigned char *data, int width, int height)
{
FILE *f;
if(!(f = fopen(path, "wb"))) {
cerr << "Could not open \"" << path << "\"" << endl;
return;
}
int numPixels = width * height;
unsigned char header[] = {
0x42, 0x4D, 0, 0, 0, 0, 0, 0,
0, 0, 0x36, 0, 0, 0, 0x28, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0x18, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0
};
int32_t width_end = SWAP_32(width);
int32_t height_end = SWAP_32(height);
int32_t num_pix = numPixels * 3;
int32_t wtf = 54 + numPixels * 3 + (height) * ((width & 3));
memcpy(header + 18, &width_end, sizeof(width_end));
memcpy(header + 22, &height_end, sizeof(height_end));
memcpy(header + 34, &num_pix, sizeof(num_pix));
memcpy(header + 2, &wtf, sizeof(wtf));
size_t w;
w = fwrite(header, 54, 1, f);
assert(w == 1);
for(int y = height - 1; y >= 0; y--) {
for(int x = 0; x < width; x++) {
unsigned char* imgPos = data + 3 * (x + y * width);
for(int i = 2; i >= 0; i--) { // reverse for BGR
unsigned char c = *(imgPos + i);
w = fwrite(&c, 1, 1, f);
assert(w == 1);
}
}
char temp = 0;
for(int i = 0; i < (width) % 4; i++) {
w = fwrite(&temp, 1, 1, f); // padding
assert(w == 1);
}
}
fclose(f);
}
void CConvolutionTaskBase::SaveImage(const std::string& FileName, float* Channels[3])
{
// Save the result back to the disk
PFM resPfm;
resPfm.pImg = new float[m_Width * m_Height * 3];
unsigned int pfmOffset = 0;
unsigned int pixOffset = 0;
for(unsigned int y = 0; y < m_Height; y++)
{
for(unsigned int x = 0; x < m_Width; x++)
{
if(m_Monochrome)
{
resPfm.pImg[pfmOffset] = Channels[0][pixOffset];
resPfm.pImg[pfmOffset + 1] = Channels[0][pixOffset];
resPfm.pImg[pfmOffset + 2] = Channels[0][pixOffset];
}
else
{
resPfm.pImg[pfmOffset] = Channels[0][pixOffset];
resPfm.pImg[pfmOffset + 1] = Channels[1][pixOffset];
resPfm.pImg[pfmOffset + 2] = Channels[2][pixOffset];
}
pfmOffset += 3;
pixOffset++;
}
pixOffset += m_Pitch - m_Width;
}
resPfm.width = m_Width;
resPfm.height = m_Height;
if(!resPfm.SaveRGB(FileName.c_str()))
{
cerr<<"Error saving "<<FileName<<"."<<endl;
}
// stupid to do it here...
//
vector<uint8_t> img(m_Width * m_Height * 3);
for(size_t i = 0; i < img.size(); i++) {
img[i] = std::min<int>(0xff, std::max<int>(0, int(resPfm.pImg[i] * 0xff)));
}
string bmp_path = FileName;
bmp_path.replace(bmp_path.rfind(".pfm"), 4, ".bmp");
//save_image_bmp(bmp_path.c_str(), img.data(), m_Width, m_Height);
}
void CConvolutionTaskBase::SaveIntImage(const std::string& FileName, int* Channel)
{
// Write data to the disc
PFM resPfm;
resPfm.pImg = new float[m_Width * m_Height];
unsigned int pfmOffset = 0;
unsigned int pixOffset = 0;
for(unsigned int y = 0; y < m_Height; y++)
{
for(unsigned int x = 0; x < m_Width; x++)
{
resPfm.pImg[pfmOffset] = (float)Channel[pixOffset];
pfmOffset++;
pixOffset++;
}
pixOffset += m_Pitch - m_Width;
}
resPfm.width = m_Width;
resPfm.height = m_Height;
if(!resPfm.SaveGrayscale(FileName.c_str()))
{
cout<<"Error saving "<<FileName<<"."<<endl;
return;
}
}
float CConvolutionTaskBase::RGBToGrayScale(float R, float G, float B)
{
return 0.3f * R + 0.59f * G + 0.11f * B;
}
unsigned int CConvolutionTaskBase::To8BitChannel(float Value)
{
Value = Value * 255.0f;
if(Value > 255.0f)
Value = 255.0f;
else if(Value < 0.0f)
Value = 0;
return unsigned(Value);
}
///////////////////////////////////////////////////////////////////////////////
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _CCONVOLUTION_TASK_BASE_H
#define _CCONVOLUTION_TASK_BASE_H
#include "../Common/IComputeTask.h"
#include <string>
//! Abstract base class for all convolution tasks
/*!
This class does not handle any actual computation, but implements methods used by all
tasks such as loading and saving images and comparing GPU-CPU results.
*/
class CConvolutionTaskBase : public IComputeTask
{
public:
CConvolutionTaskBase(const std::string& FileName, bool Monochrome = false);
virtual ~CConvolutionTaskBase();
// IComputeTask
virtual bool InitResources(cl_device_id Device, cl_context Context);
virtual void ReleaseResources();
virtual bool ValidateResults();
protected:
void SaveImage(const std::string& FileName, float* Channels[3]);
void SaveIntImage(const std::string& FileName, int* Channel);
// helper functions:
// one grayscale floating point value out of RGB
float RGBToGrayScale(float R, float G, float B);
// quantize a floating point to a, 8 bit fixed point
unsigned int To8BitChannel(float Value);
std::string m_FileName;
//if true, only one channel is used
bool m_Monochrome;
// internally used, so different tasks can name their differece images
// uniquely
std::string m_FileNamePostfix;
unsigned int m_Height = 0;
unsigned int m_Width = 0;
unsigned int m_Pitch = 0;
float* m_hSourceChannels[3] /*= { nullptr, nullptr, nullptr }*/; //R, G, B channels
float* m_hCPUResultChannels[3] /*= { nullptr, nullptr, nullptr }*/; //the convolved image
float* m_hGPUResultChannels[3] /*= { nullptr, nullptr, nullptr }*/; //the convolved image
//we process exactly one channel on the GPU in the same time
cl_mem m_dSourceChannels[3] /*= { nullptr, nullptr, nullptr}*/;
cl_mem m_dResultChannels[3] /*= { nullptr, nullptr, nullptr}*/;
};
#endif // _CCONVOLUTION_TASK_BASE_H
#include "CHistogramTask.h"
#include "../Common/CLUtil.h"
#include "../Common/CTimer.h"
#include "Pfm.h"
#include <string.h>
#include <cassert>
CHistogramTask::
CHistogramTask(float min_val, float max_val, bool use_local_memory, const std::string &img_path)
: m_min_val(min_val)
, m_max_val(max_val)
, m_img_path(img_path)
, m_use_local_memory(use_local_memory)
{
}
CHistogramTask::
~CHistogramTask()
{
ReleaseResources();
}
bool CHistogramTask::
InitResources(cl_device_id dev, cl_context ctx)
{
cl_int err;
PFM img;
if(!img.LoadRGB(m_img_path.c_str())) {
std::cerr << "Error loading image: \"" << m_img_path << "\"!" << std::endl;
return false;
}
m_img_width = img.width;
m_img_height = img.height;
m_img_stride = img.width % 32 ? (img.width + 32 - img.width % 32) : img.width;
m_pixels.resize(m_img_stride * m_img_height, 0.0f);
for(int y = 0; y < m_img_height; y++) {
for(int x = 0; x < m_img_width; x++) {
auto &s = m_pixels[y * m_img_stride + x];
s = 0.0f;
s += img.pImg[(y * img.width + x) * 3 + 0] * 0.3f;
s += img.pImg[(y * img.width + x) * 3 + 1] * 0.59f;
s += img.pImg[(y * img.width + x) * 3 + 2] * 0.11f;
}
}
m_d_pixels = clCreateBuffer(ctx,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(float) * m_pixels.size(),
m_pixels.data(),
&err);
V_RETURN_FALSE_CL(err, "Failed to allocate device memory");
std::vector<int> zeroes(NUM_HIST_BINS, 0);
m_d_hist = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, NUM_HIST_BINS * sizeof(int),
zeroes.data(), &err);
V_RETURN_FALSE_CL(err, "Failed to allocate device memory");
std::string src;
if(!CLUtil::LoadProgramSourceToMemory("histogram.cl", src))
return false;
m_program = CLUtil::BuildCLProgramFromMemory(dev, ctx, src);
if(!m_program)
return false;
int num_hist_bins = NUM_HIST_BINS;
m_kernel_histogram = clCreateKernel(
m_program,
m_use_local_memory ? "compute_histogram_local_memory" : "compute_histogram",
&err);
V_RETURN_FALSE_CL(err, "Failed to create kernel: histogram");
err = clSetKernelArg(m_kernel_histogram, 0, sizeof(cl_mem), &m_d_hist);
V_RETURN_FALSE_CL(err, "Error setting kernel Arg 0");
err = clSetKernelArg(m_kernel_histogram, 1, sizeof(cl_mem), &m_d_pixels);
V_RETURN_FALSE_CL(err, "Error setting kernel Arg 1");
err = clSetKernelArg(m_kernel_histogram, 2, sizeof(int), &m_img_width);
V_RETURN_FALSE_CL(err, "Error setting kernel Arg 2");
err = clSetKernelArg(m_kernel_histogram, 3, sizeof(int), &m_img_height);
V_RETURN_FALSE_CL(err, "Error setting kernel Arg 3");
err = clSetKernelArg(m_kernel_histogram, 4, sizeof(int), &m_img_stride);
V_RETURN_FALSE_CL(err, "Error setting kernel Arg 4");
err = clSetKernelArg(m_kernel_histogram, 5, sizeof(int), &num_hist_bins);
V_RETURN_FALSE_CL(err, "Error setting kernel Arg 5");
if(m_use_local_memory) {
err = clSetKernelArg(m_kernel_histogram, 6, sizeof(int) * NUM_HIST_BINS, nullptr);
V_RETURN_FALSE_CL(err, "Error setting kernel Arg 6");
}
m_kernel_set_to_val = clCreateKernel(m_program, "set_array_to_constant", &err);
V_RETURN_FALSE_CL(err, "Failed to create kernel: set_array_to_constant");
err = clSetKernelArg(m_kernel_set_to_val, 0, sizeof(cl_mem), &m_d_hist);
V_RETURN_FALSE_CL(err, "Error setting kernel Arg 0");
err = clSetKernelArg(m_kernel_set_to_val, 1, sizeof(int), &num_hist_bins);
V_RETURN_FALSE_CL(err, "Error setting kernel Arg 1");
int zero = 0;
err = clSetKernelArg(m_kernel_set_to_val, 2, sizeof(int), &zero);
V_RETURN_FALSE_CL(err, "Error setting kernel Arg 2");
return true;
}
void CHistogramTask::
ReleaseResources()
{
SAFE_RELEASE_MEMOBJECT(m_d_pixels);
SAFE_RELEASE_MEMOBJECT(m_d_hist);
SAFE_RELEASE_KERNEL(m_kernel_histogram);
SAFE_RELEASE_KERNEL(m_kernel_set_to_val);
}
static void
print_histogram(const std::vector<int> &h)
{
int max_val = 0;
for(auto i: h)
max_val = std::max<int>(max_val, i);
std::cout << "+";
for(size_t i = 0; i < h.size(); i++)
std::cout << "-";
std::cout << "+\n";
const int max_height = 8;
for(int y = max_height - 1; y >= 0; y--) {
int val = (max_val * y) / max_height;
std::cout << "|";
for(auto i: h)
std::cout << (i >= val ? '#' : ' ');
std::cout << "|\n";
}
std::cout << "+";
for(size_t i = 0; i < h.size(); i++)
std::cout << "-";
std::cout << "+\n";
}
void CHistogramTask::
ComputeGPU(cl_context ctx, cl_command_queue cmdq, size_t lws[3])
{
size_t local_size_clear = 256;
size_t global_size_clear = ((NUM_HIST_BINS + local_size_clear - 1) / local_size_clear) * local_size_clear;
size_t global_size[2] = {
((m_img_width + lws[0] - 1) / lws[0]) * lws[0],
((m_img_height + lws[1] - 1) / lws[1]) * lws[1]
};
CTimer timer;
clFinish(cmdq);
timer.Start();
const int num_iterations = 100;
for(int i = 0; i < num_iterations; i++) {
clEnqueueNDRangeKernel(cmdq, m_kernel_set_to_val, 1, NULL, &global_size_clear, &local_size_clear, 0, NULL, NULL);
clEnqueueNDRangeKernel(cmdq, m_kernel_histogram, 2, NULL, global_size, lws, 0, NULL, NULL);
}
clFinish(cmdq);
timer.Stop();
const char *prefix = m_use_local_memory
? " Histogram GPU time (using local memory): "
: " Histogram GPU time (no local memory): ";
std::cout << prefix << timer.GetElapsedMilliseconds() / float(num_iterations) << " ms\n";
m_histogram_gpu.resize(NUM_HIST_BINS);
clEnqueueReadBuffer(cmdq, m_d_hist, CL_TRUE, 0, sizeof(int) * NUM_HIST_BINS,
m_histogram_gpu.data(), 0, nullptr, nullptr);
}
void CHistogramTask::
ComputeCPU()
{
m_histogram.assign(NUM_HIST_BINS, 0);
CTimer timer;
timer.Start();
for(int y = 0; y < m_img_height; y++) {
for(int x = 0; x < m_img_width; x++) {
float p = m_pixels[y * m_img_stride + x] * float(NUM_HIST_BINS);
int h_idx = std::min<int>(NUM_HIST_BINS - 1, std::max<int>(0, int(p)));
m_histogram[h_idx]++;
}
}
timer.Stop();
std::cout << " Histogram CPU time: " << timer.GetElapsedMilliseconds() << " ms\n";
}
bool CHistogramTask::
ValidateResults()
{
bool is_same = true;
assert(m_histogram.size() == m_histogram_gpu.size());
for(size_t i = 0; i < m_histogram.size(); i++) {
if(m_histogram[i] != m_histogram_gpu[i])
is_same = false;
}
if(is_same) {
print_histogram(m_histogram);
}
else {
std::cout << "Results do not match!" << std::endl;
std::cout << "Histogram CPU:" << std::endl;
print_histogram(m_histogram);
std::cout << "Histogram GPU:" << std::endl;
print_histogram(m_histogram_gpu);
std::cout << "CPU GPU" << std::endl;
for(size_t i = 0; i < m_histogram.size(); i++) {
std::cout << m_histogram[i] << " " << m_histogram_gpu[i] << std::endl;
}
}
return is_same;
}
#ifndef __CPIXELCOUNTTASK_H__
#define __CPIXELCOUNTTASK_H__
#include <string>
#include <vector>
#include "../Common/IComputeTask.h"
class CHistogramTask : public IComputeTask
{
public:
enum { NUM_HIST_BINS = 64 };
CHistogramTask(float min_val, float max_val, bool use_local_memory, const std::string &img_path);
virtual ~CHistogramTask();
virtual bool InitResources(cl_device_id Device, cl_context Context) override;
virtual void ReleaseResources() override;
virtual void ComputeGPU(cl_context ctx, cl_command_queue cmdq, size_t lws[3]) override;
virtual void ComputeCPU() override;
virtual bool ValidateResults() override;
protected:
float m_min_val = 0.0f, m_max_val = 1.0f;
const std::string m_img_path;
const bool m_use_local_memory;
int m_img_width = 0, m_img_height = 0, m_img_stride = 0;
cl_program m_program = nullptr;
cl_kernel m_kernel_histogram = nullptr, m_kernel_set_to_val = nullptr;
cl_mem m_d_pixels = nullptr;
cl_mem m_d_hist = nullptr;
std::vector<int> m_histogram, m_histogram_gpu;
std::vector<float> m_pixels;
};
#endif /*__CPIXELCOUNTTASK_H__*/
cmake_minimum_required (VERSION 2.8.3)
project (GPUComputing)
# Add our modules to the path
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/../cmake/")
include(CheckCXXCompilerFlag)
if (WIN32)
else (WIN32)
#set (EXTRA_COMPILE_FLAGS "-Wall -Werror")
set (EXTRA_COMPILE_FLAGS "-Wall")
CHECK_CXX_COMPILER_FLAG(-std=c++11 HAS_CXX_11)
if (HAS_CXX_11)
set(EXTRA_COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -std=c++11 -Wall")
message(STATUS "Enabling C++11 support")
else(HAS_CXX_11)
message(WARNING "No C++11 support detected, build will fail.")
endif()
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_COMPILE_FLAGS}")
endif (WIN32)
# Include support for changing the working directory in Visual Studio
include(ChangeWorkingDirectory)
# Search for OpenCL and add paths
find_package( OpenCL REQUIRED )
include_directories( ${OPENCL_INCLUDE_DIRS} )
# Include Common module
add_subdirectory (../Common ${CMAKE_BINARY_DIR}/Common)
# Define source files for this assignment
FILE(GLOB Sources *.cpp)
FILE(GLOB Headers *.h)
FILE(GLOB CLSources *.cl)
ADD_EXECUTABLE (Assignment
${Sources}
${Headers}
${CLSources}
)
# Link required libraries
target_link_libraries(Assignment ${OPENCL_LIBRARIES})
target_link_libraries(Assignment GPUCommon)
if (WIN32)
change_workingdir(Assignment ${CMAKE_SOURCE_DIR})
endif()
/*
We assume a 3x3 (radius: 1) convolution kernel, which is not separable.
Each work-group will process a (TILE_X x TILE_Y) tile of the image.
For coalescing, TILE_X should be multiple of 16.
Instead of examining the image border for each kernel, we recommend to pad the image
to be the multiple of the given tile-size.
*/
//should be multiple of 32 on Fermi and 16 on pre-Fermi...
#define TILE_X 32
#define TILE_Y 16
// d_Dst is the convolution of d_Src with the kernel c_Kernel
// c_Kernel is assumed to be a float[11] array of the 3x3 convolution constants, one multiplier (for normalization) and an offset (in this order!)
// With & Height are the image dimensions (should be multiple of the tile size)
__kernel __attribute__((reqd_work_group_size(TILE_X, TILE_Y, 1)))
void Convolution(
__global float* d_Dst,
__global const float* d_Src,
__constant float* c_Kernel,
uint Width, // Use width to check for image bounds
uint Height,
uint Pitch // Use pitch for offsetting between lines
)
{
// OpenCL allows to allocate the local memory from 'inside' the kernel (without using the clSetKernelArg() call)
// in a similar way to standard C.
// the size of the local memory necessary for the convolution is the tile size + the halo area
__local float tile[TILE_Y + 2][TILE_X + 2];
// TO DO...
// Fill the halo with zeros
// Load main filtered area from d_Src
// Load halo regions from d_Src (edges and corners separately), check for image bounds!
// Sync threads
// Perform the convolution and store the convolved signal to d_Dst.
}
\ No newline at end of file
#define KERNEL_LENGTH (2 * KERNEL_RADIUS + 1)
#define DEPTH_THRESHOLD 0.025f
#define NORM_THRESHOLD 0.9f
// These functions define discontinuities
bool IsNormalDiscontinuity(float4 n1, float4 n2){
return fabs(dot(n1, n2)) < NORM_THRESHOLD;
}
bool IsDepthDiscontinuity(float d1, float d2){
return fabs(d1 - d2) > DEPTH_THRESHOLD;
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Horizontal convolution filter
//require matching work-group size
__kernel __attribute__((reqd_work_group_size(H_GROUPSIZE_X, H_GROUPSIZE_Y, 1)))
void DiscontinuityHorizontal(
__global int* d_Disc,
__global const float4* d_NormDepth,
int Width,
int Height,
int Pitch
)
{
// TODO: Uncomment code and fill in the missing code.
// You don't have to follow the provided code. Feel free to adjust it if you want.
// The size of the local memory: one value for each work-item.
// We even load unused pixels to the halo area, to keep the code and local memory access simple.
// Since these loads are coalesced, they introduce no overhead, except for slightly redundant local memory allocation.
// Each work-item loads H_RESULT_STEPS values + 2 halo values
// We split the float4 (normal + depth) into an array of float3 and float to avoid bank conflicts.
//__local float tileNormX[H_GROUPSIZE_Y][(H_RESULT_STEPS + 2) * H_GROUPSIZE_X];
//__local float tileNormY[H_GROUPSIZE_Y][(H_RESULT_STEPS + 2) * H_GROUPSIZE_X];
//__local float tileNormZ[H_GROUPSIZE_Y][(H_RESULT_STEPS + 2) * H_GROUPSIZE_X];
//__local float tileDepth[H_GROUPSIZE_Y][(H_RESULT_STEPS + 2) * H_GROUPSIZE_X];
//const int baseX = ...
//const int baseY = ...
//const int offset = ...
//Load left halo (each thread loads exactly one)
//float4 nd = ...
//tileNormX[get_local_id(1)][get_local_id(0)] = nd.x;
//tileNormY[get_local_id(1)][get_local_id(0)] = nd.y;
//tileNormZ[get_local_id(1)][get_local_id(0)] = nd.z;
//tileDepth[get_local_id(1)][get_local_id(0)] = nd.w;
// Load main data + right halo
// pragma unroll is not necessary as the compiler should unroll the short loops by itself.
//#pragma unroll
//for(...) {
//float4 nd = ...
//tileNormX[get_local_id(1)][get_local_id(0) + i * H_GROUPSIZE_X] = nd.x;
//tileNormY[get_local_id(1)][get_local_id(0) + i * H_GROUPSIZE_X] = nd.y;
//tileNormZ[get_local_id(1)][get_local_id(0) + i * H_GROUPSIZE_X] = nd.z;
//tileDepth[get_local_id(1)][get_local_id(0) + i * H_GROUPSIZE_X] = nd.w;
//}
// Sync threads
// Identify discontinuities
//#pragma unroll
//for(...) {
// int flag = 0;
//float myDepth = ...
//float4 myNorm = ...
// Check the left neighbor
//float leftDepth = ...
//float4 leftNorm = ...
//if (IsDepthDiscontinuity(myDepth, leftDepth) || IsNormalDiscontinuity(myNorm, leftNorm))
// flag |= 1;
// Check the right neighbor
//float rightDepth = ...
//float4 rightNorm = ...
//if (IsDepthDiscontinuity(myDepth, rightDepth) || IsNormalDiscontinuity(myNorm, rightNorm))
// flag |= 2;
// Write the flag out
// d_Disc['index'] = flag;
//}
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Vertical convolution filter
//require matching work-group size
__kernel __attribute__((reqd_work_group_size(V_GROUPSIZE_X, V_GROUPSIZE_Y, 1)))
void DiscontinuityVertical(
__global int* d_Disc,
__global const float4* d_NormDepth,
int Width,
int Height,
int Pitch
)
{
// Comments in the DiscontinuityHorizontal should be enough.
// TODO
// WARNING: For profiling reasons, it might happen that the framework will run
// this kernel several times.
// You need to make sure that the output of this kernel DOES NOT influence the input.
// In this case, we are both reading and writing the d_Disc[] buffer.
// here is a proposed solution: use separate flags for the vertical discontinuity
// and merge this with the global discontinuity buffer, using bitwise OR.
// This way do do not depent on the number of kernel executions.
//int flag = 0;
// if there is a discontinuity:
// flag |= 4...
//d_Disc['index'] |= flag; // do NOT use '='
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Horizontal convolution filter
//require matching work-group size
__kernel __attribute__((reqd_work_group_size(H_GROUPSIZE_X, H_GROUPSIZE_Y, 1)))
void ConvHorizontal(
__global float* d_Dst,
__global const float* d_Src,
__global const int* d_Disc,
__constant float* c_Kernel,
int Width,
int Height,
int Pitch
)
{
// TODO
// This will be very similar to the separable convolution, except that you have
// also load the discontinuity buffer into the local memory
// Each work-item loads H_RESULT_STEPS values + 2 halo values
//__local float tile[H_GROUPSIZE_Y][(H_RESULT_STEPS + 2) * H_GROUPSIZE_X];
//__local int disc[H_GROUPSIZE_Y][(H_RESULT_STEPS + 2) * H_GROUPSIZE_X];
// Load data to the tile and disc local arrays
// During the convolution iterate inside-out from the center pixel towards the borders.
//for (...) // Iterate over tiles
// When you iterate to the left, check for 'left' discontinuities.
//for (... > -KERNEL_RADIUS...)
// If you find relevant discontinuity, stop iterating
// When iterating to the right, check for 'right' discontinuities.
//for (... < KERNEL_RADIUS...)
// If you find a relevant discontinuity, stop iterating
// Don't forget to accumulate the weights to normalize the kernel (divide the pixel value by the summed weights)
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Vertical convolution filter
//require matching work-group size
__kernel __attribute__((reqd_work_group_size(V_GROUPSIZE_X, V_GROUPSIZE_Y, 1)))
void ConvVertical(
__global float* d_Dst,
__global const float* d_Src,
__global const int* d_Disc,
__constant float* c_Kernel,
int Width,
int Height,
int Pitch
)
{
// TODO
}
//Each thread load exactly one halo pixel
//Thus, we assume that the halo size is not larger than the
//dimension of the work-group in the direction of the kernel
//to efficiently reduce the memory transfer overhead of the global memory
// (each pixel is lodaded multiple times at high overlaps)
// one work-item will compute RESULT_STEPS pixels
//for unrolling loops, these values have to be known at compile time
/* These macros will be defined dynamically during building the program
#define KERNEL_RADIUS 2
//horizontal kernel
#define H_GROUPSIZE_X 32
#define H_GROUPSIZE_Y 4
#define H_RESULT_STEPS 2
//vertical kernel
#define V_GROUPSIZE_X 32
#define V_GROUPSIZE_Y 16
#define V_RESULT_STEPS 3
*/
#define KERNEL_LENGTH (2 * KERNEL_RADIUS + 1)
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Horizontal convolution filter
/*
c_Kernel stores 2 * KERNEL_RADIUS + 1 weights, use these during the convolution
*/
//require matching work-group size
__kernel __attribute__((reqd_work_group_size(H_GROUPSIZE_X, H_GROUPSIZE_Y, 1)))
void ConvHorizontal(
__global float* d_Dst,
__global const float* d_Src,
__constant float* c_Kernel,
int Width,
int Pitch
)
{
//The size of the local memory: one value for each work-item.
//We even load unused pixels to the halo area, to keep the code and local memory access simple.
//Since these loads are coalesced, they introduce no overhead, except for slightly redundant local memory allocation.
//Each work-item loads H_RESULT_STEPS values + 2 halo values
__local float tile[H_GROUPSIZE_Y][(H_RESULT_STEPS + 2) * H_GROUPSIZE_X];
// TODO:
//const int baseX = ...
//const int baseY = ...
//const int offset = ...
// Load left halo (check for left bound)
// Load main data + right halo (check for right bound)
// for (int tileID = 1; tileID < ...)
// Sync the work-items after loading
// Convolve and store the result
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Vertical convolution filter
//require matching work-group size
__kernel __attribute__((reqd_work_group_size(V_GROUPSIZE_X, V_GROUPSIZE_Y, 1)))
void ConvVertical(
__global float* d_Dst,
__global const float* d_Src,
__constant float* c_Kernel,
int Height,
int Pitch
)
{
__local float tile[(V_RESULT_STEPS + 2) * V_GROUPSIZE_Y][V_GROUPSIZE_X];
//TO DO:
// Conceptually similar to ConvHorizontal
// Load top halo + main data + bottom halo
// Compute and store results
}
#include "Pfm.h"
#include <string.h>
#include <cstring>
#ifdef _MSC_VER
#pragma warning(disable: 4996) //fopen
#endif
//basic constructor
PFM::PFM(){
Reset();
}
//destructor
PFM::~PFM(){
Release();
}
//load a bitmap from a file and represent it correctly
//in memory
bool PFM::LoadRGB(const char *file) {
Release();
FILE *f = fopen( file, "rb" );
if ( !f ) {
fprintf( stderr, "PFM::Load: Error opening file '%s'\n", file );
return false;
}
char tmp[ 1024 ];
fscanf( f, "%s\n", tmp );
if ( strcmp( tmp, "PF" ) != 0 )
return false;
fscanf( f, "%d%d", &width, &height );
float sc;
fscanf( f, "%f", &sc );
pImg = new float[ width * height * 3];
fread( pImg, 1, 1, f);
fread( pImg, sizeof(float) * 3, width * height, f);
fclose( f );
return true;
}
bool PFM::SaveRGB(const char* file) {
FILE *f = fopen( file, "wb" );
if ( !f ) {
fprintf( stderr, "PFM::Save: Error opening file '%s'\n", file );
return false;
}
fprintf(f, "PF\n");
fprintf(f, "%d %d\n", width, height );
fprintf(f, "-1.0000000\n");
fwrite( pImg, sizeof( float ) * 3, width * height, f );
fclose( f );
return true;
}
//load a bitmap from a file and represent it correctly
//in memory
bool PFM::LoadGrayscale(const char *file) {
Release();
FILE *f = fopen( file, "rb" );
if ( !f ) {
fprintf( stderr, "PFM::Load: Error opening file '%s'\n", file );
return false;
}
char tmp[ 1024 ];
fscanf( f, "%s\n", tmp );
if ( strcmp( tmp, "Pf" ) != 0 )
return false;
fscanf( f, "%d%d", &width, &height );
float sc;
fscanf( f, "%f", &sc );
pImg = new float[ width * height];
fread( pImg, 1, 1, f);
fread( pImg, sizeof(float), width * height, f );
fclose( f );
return true;
}
bool PFM::SaveGrayscale(const char* file) {
FILE *f = fopen( file, "wb" );
if ( !f ) {
fprintf( stderr, "PFM::Save: Error opening file '%s'\n", file );
return false;
}
fprintf(f, "Pf\n");
fprintf(f, "%d %d\n", width, height );
fprintf(f, "-1.00000\n");
fwrite( pImg, sizeof(float), width * height, f );
fclose( f );
return true;
}
//function to set the inital values
void PFM::Reset(void) {
height = 0;
width = 0;
pImg = NULL;
}
void PFM::Release(void){
if (pImg)
delete [] pImg;
}
#ifndef PFM_H
#define PFM_H
#include <iostream>
#include <cstdio>
#include <string>
using namespace std;
class PFM {
public:
//variables
int width;
int height;
float *pImg;
//methods
PFM(void);
~PFM();
bool LoadRGB(const char *);
bool SaveRGB(const char*);
bool LoadGrayscale(const char *);
bool SaveGrayscale(const char*);
private:
//methods
void Reset(void);
void Release(void);
};
#endif //_BITMAP_H
__kernel void
set_array_to_constant(
__global int *array,
int num_elements,
int val
)
{
// There is no need to touch this kernel
if(get_global_id(0) < num_elements)
array[get_global_id(0)] = val;
}
__kernel void
compute_histogram(
__global int *histogram, // accumulate histogram here
__global const float *img, // input image
int width, // image width
int height, // image height
int pitch, // image pitch
int num_hist_bins // number of histogram bins
)
{
// Insert your kernel code here
}
__kernel void
compute_histogram_local_memory(
__global int *histogram, // accumulate histogram here
__global const float *img, // input image
int width, // image width
int height, // image height
int pitch, // image pitch
int num_hist_bins, // number of histogram bins
__local int *local_hist
)
{
// Insert your kernel code here
}
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CAssignment3.h"
#include <iostream>
using namespace std;
int main(int argc, char** argv)
{
CAssignment3 myAssignment;
myAssignment.EnterMainLoop(argc, argv);
#ifdef _MSC_VER
cout<<"Press any key..."<<endl;
cin.get();
#endif
}
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CAssignmentBase.h"
#include "CLUtil.h"
#include "CTimer.h"
#include <vector>
#include <iostream>
using namespace std;
#if defined (__APPLE__) || defined(MACOSX)
#define GL_SHARING_EXTENSION "cl_APPLE_gl_sharing"
#else
#define GL_SHARING_EXTENSION "cl_khr_gl_sharing"
#endif
// required for OpenGL interop
#ifdef _WIN32
#include <windows.h>
#endif
#ifdef linux
#if defined (__APPLE__) || defined(MACOSX)
#include <OpenGL/OpenGL.h>
#else
#include <GL/glx.h>
#endif
#endif
///////////////////////////////////////////////////////////////////////////////
// CAssignmentBase
CAssignmentBase::CAssignmentBase()
: m_CLPlatform(nullptr), m_CLDevice(nullptr), m_CLContext(nullptr), m_CLCommandQueue(nullptr)
{
}
CAssignmentBase::~CAssignmentBase()
{
ReleaseCLContext();
}
bool CAssignmentBase::EnterMainLoop(int, char**)
{
if(!InitCLContext())
return false;
bool success = DoCompute();
ReleaseCLContext();
return success;
}
#define PRINT_INFO(title, buffer, bufferSize, maxBufferSize, expr) { expr; buffer[bufferSize] = '\0'; std::cout << title << ": " << buffer << std::endl; }
bool CAssignmentBase::InitCLContext()
{
//////////////////////////////////////////////////////
//(Sect 4.3)
// 1. get all platform IDs
std::vector<cl_platform_id> platformIds;
const cl_uint c_MaxPlatforms = 16;
platformIds.resize(c_MaxPlatforms);
cl_uint countPlatforms;
V_RETURN_FALSE_CL(clGetPlatformIDs(c_MaxPlatforms, &platformIds[0], &countPlatforms), "Failed to get CL platform ID");
platformIds.resize(countPlatforms);
// 2. find all available GPU devices
std::vector<cl_device_id> deviceIds;
const int maxDevices = 16;
deviceIds.resize(maxDevices);
int countAllDevices = 0;
// Searching for the graphics device with the most dedicated video memory.
cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
cl_ulong maxGlobalMemorySize = 0;
cl_device_id bestDeviceId = NULL;
for (size_t i = 0; i < platformIds.size(); i++)
{
// Getting the available devices.
cl_uint countDevices;
auto res = clGetDeviceIDs(platformIds[i], deviceType, 1, &deviceIds[countAllDevices], &countDevices);
if(res != CL_SUCCESS) // Maybe there are no GPU devices and some poor implementation doesn't set count devices to zero and return CL_DEVICE_NOT_FOUND.
{
char buffer[1024];
clGetPlatformInfo(platformIds[i], CL_PLATFORM_NAME, 1024, buffer, nullptr);
printf("[WARNING]: clGetDeviceIDs() failed. Error type: %s, Platform name: %s!\n",
CLUtil::GetCLErrorString(res), buffer);
continue;
}
for (size_t j = 0; j < countDevices; j++)
{
cl_device_id currentDeviceId = deviceIds[countAllDevices + j];
cl_ulong globalMemorySize;
cl_bool isUsingUnifiedMemory;
clGetDeviceInfo(currentDeviceId, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &globalMemorySize, NULL);
clGetDeviceInfo(currentDeviceId, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), &isUsingUnifiedMemory, NULL);
if (!isUsingUnifiedMemory && globalMemorySize > maxGlobalMemorySize)
{
bestDeviceId = currentDeviceId;
maxGlobalMemorySize = globalMemorySize;
}
}
countAllDevices += countDevices;
}
deviceIds.resize(countAllDevices);
if (countAllDevices == 0)
{
std::cout << "No device of the selected type with OpenCL support was found.";
return false;
}
// No discrete graphics device was found: falling back to the first found device.
if (bestDeviceId == NULL)
{
bestDeviceId = deviceIds[0];
}
// Choosing the first available device.
m_CLDevice = bestDeviceId;
clGetDeviceInfo(m_CLDevice, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &m_CLPlatform, NULL);
// Printing platform and device data.
const int maxBufferSize = 1024;
char buffer[maxBufferSize];
size_t bufferSize;
std::cout << "OpenCL platform:" << std::endl << std::endl;
PRINT_INFO("Name", buffer, bufferSize, maxBufferSize, clGetPlatformInfo(m_CLPlatform, CL_PLATFORM_NAME, maxBufferSize, (void*)buffer, &bufferSize));
PRINT_INFO("Vendor", buffer, bufferSize, maxBufferSize, clGetPlatformInfo(m_CLPlatform, CL_PLATFORM_VENDOR, maxBufferSize, (void*)buffer, &bufferSize));
PRINT_INFO("Version", buffer, bufferSize, maxBufferSize, clGetPlatformInfo(m_CLPlatform, CL_PLATFORM_VERSION, maxBufferSize, (void*)buffer, &bufferSize));
PRINT_INFO("Profile", buffer, bufferSize, maxBufferSize, clGetPlatformInfo(m_CLPlatform, CL_PLATFORM_PROFILE, maxBufferSize, (void*)buffer, &bufferSize));
std::cout << std::endl << "Device:" << std::endl << std::endl;
PRINT_INFO("Name", buffer, bufferSize, maxBufferSize, clGetDeviceInfo(m_CLDevice, CL_DEVICE_NAME, maxBufferSize, (void*)buffer, &bufferSize));
PRINT_INFO("Vendor", buffer, bufferSize, maxBufferSize, clGetDeviceInfo(m_CLDevice, CL_DEVICE_VENDOR, maxBufferSize, (void*)buffer, &bufferSize));
PRINT_INFO("Driver version", buffer, bufferSize, maxBufferSize, clGetDeviceInfo(m_CLDevice, CL_DRIVER_VERSION, maxBufferSize, (void*)buffer, &bufferSize));
cl_ulong localMemorySize;
clGetDeviceInfo(m_CLDevice, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &localMemorySize, &bufferSize);
std::cout << "Local memory size: " << localMemorySize << " Byte" << std::endl;
std::cout << std::endl << "******************************" << std::endl << std::endl;
cl_int clError;
m_CLContext = clCreateContext(NULL, 1, &m_CLDevice, NULL, NULL, &clError);
V_RETURN_FALSE_CL(clError, "Failed to create OpenCL context.");
// Finally, create a command queue. All the asynchronous commands to the device will be issued
// from the CPU into this queue. This way the host program can continue the execution until some results
// from that device are needed.
m_CLCommandQueue = clCreateCommandQueue(m_CLContext, m_CLDevice, 0, &clError);
V_RETURN_FALSE_CL(clError, "Failed to create the command queue in the context");
return true;
}
void CAssignmentBase::ReleaseCLContext()
{
if (m_CLCommandQueue != nullptr)
{
clReleaseCommandQueue(m_CLCommandQueue);
m_CLCommandQueue = nullptr;
}
if (m_CLContext != nullptr)
{
clReleaseContext(m_CLContext);
m_CLContext = nullptr;
}
}
bool CAssignmentBase::RunComputeTask(IComputeTask& Task, size_t LocalWorkSize[3])
{
if(m_CLContext == nullptr)
{
std::cerr<<"Error: RunComputeTask() cannot execute because the OpenCL context has not been created first."<<endl;
}
if(!Task.InitResources(m_CLDevice, m_CLContext))
{
std::cerr << "Error during resource allocation. Aborting execution." <<endl;
Task.ReleaseResources();
return false;
}
// Compute the golden result.
cout << "Computing CPU reference result...";
Task.ComputeCPU();
cout << "DONE" << endl;
// Running the same task on the GPU.
cout << "Computing GPU result...";
// Runing the kernel N times. This make the measurement of the execution time more accurate.
Task.ComputeGPU(m_CLContext, m_CLCommandQueue, LocalWorkSize);
cout << "DONE" << endl;
// Validating results.
if (Task.ValidateResults())
{
cout << "GOLD TEST PASSED!" << endl;
}
else
{
cout << "INVALID RESULTS!" << endl;
}
// Cleaning up.
Task.ReleaseResources();
return true;
}
///////////////////////////////////////////////////////////////////////////////
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _CASSIGNMENT_BASE_H
#define _CASSIGNMENT_BASE_H
#include "IComputeTask.h"
#include "CommonDefs.h"
//! Base class for all assignments
/*!
Inherit a new class for each specific assignment.
This class is abstract.
Usage of class: from your main CPP you typically call
EnterMainLoop(). This returns when the assignment is finished.
Internally the assignment class should initialize the context,
run one or more compute tasks and then release the context.
*/
class CAssignmentBase
{
public:
CAssignmentBase();
virtual ~CAssignmentBase();
//! Main loop. You only need to overload this if you do some rendering in your assignment.
virtual bool EnterMainLoop(int argc, char** argv);
//! You need to overload this to define a specific behavior for your assignments
virtual bool DoCompute() = 0;
protected:
virtual bool InitCLContext();
virtual void ReleaseCLContext();
virtual bool RunComputeTask(IComputeTask& Task, size_t LocalWorkSize[3]);
cl_platform_id m_CLPlatform;
cl_device_id m_CLDevice;
cl_context m_CLContext;
cl_command_queue m_CLCommandQueue;
};
#endif // _CASSIGNMENT_BASE_H
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CLUtil.h"
#include "CTimer.h"
#include <iostream>
#include <fstream>
using namespace std;
///////////////////////////////////////////////////////////////////////////////
// CLUtil
size_t CLUtil::GetGlobalWorkSize(size_t DataElemCount, size_t LocalWorkSize)
{
size_t r = DataElemCount % LocalWorkSize;
if(r == 0)
return DataElemCount;
else
return DataElemCount + LocalWorkSize - r;
}
bool CLUtil::LoadProgramSourceToMemory(const std::string& Path, std::string& SourceCode)
{
ifstream sourceFile;
sourceFile.open(Path.c_str());
if (!sourceFile.is_open())
{
cerr << "Failed to open file '" << Path << "'." << endl;
return false;
}
// read the entire file into a string
sourceFile.seekg(0, ios::end);
ifstream::pos_type fileSize = sourceFile.tellg();
sourceFile.seekg(0, ios::beg);
SourceCode.clear();
SourceCode.resize((size_t)fileSize);
sourceFile.read(&SourceCode[0], fileSize);
return true;
}
cl_program CLUtil::BuildCLProgramFromMemory(cl_device_id Device, cl_context Context, const std::string& SourceCode, const std::string& CompileOptions)
{
// Ignore the last parameter CompileOptions in assignment 1
// This may be used later to pass flags and macro definitions to the OpenCL compiler
cl_program prog = nullptr;
string srcSolution = SourceCode;
const char* src = srcSolution.c_str();
size_t length = srcSolution.size();
cl_int clError;
prog = clCreateProgramWithSource(Context, 1, &src, &length, &clError);
if(CL_SUCCESS != clError)
{
cerr<<"Failed to create CL program from source.";
return nullptr;
}
// program created, now build it:
const char* pCompileOptions = CompileOptions.size() > 0 ? CompileOptions.c_str() : nullptr;
clError = clBuildProgram(prog, 1, &Device, pCompileOptions, NULL, NULL);
PrintBuildLog(prog, Device);
if(CL_SUCCESS != clError)
{
cerr<<"Failed to build CL program.";
SAFE_RELEASE_PROGRAM(prog);
return nullptr;
}
return prog;
}
void CLUtil::PrintBuildLog(cl_program Program, cl_device_id Device)
{
cl_build_status buildStatus;
clGetProgramBuildInfo(Program, Device, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &buildStatus, NULL);
// let's print out possible warnings even if the kernel compiled..
//if(buildStatus == CL_SUCCESS)
// return;
//there were some errors.
size_t logSize;
clGetProgramBuildInfo(Program, Device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
string buildLog(logSize, ' ');
clGetProgramBuildInfo(Program, Device, CL_PROGRAM_BUILD_LOG, logSize, &buildLog[0], NULL);
buildLog[logSize] = '\0';
if(buildStatus != CL_SUCCESS)
cout<<"There were build errors!"<<endl;
cout<<"Build log:"<<endl;
cout<<buildLog<<endl;
}
double CLUtil::ProfileKernel(cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint Dimensions,
const size_t* pGlobalWorkSize, const size_t* pLocalWorkSize, int NIterations)
{
CTimer timer;
cl_int clErr;
// wait until the command queue is empty...
// Should not be used in production code, but this synchronizes HOST and DEVICE
clErr = clFinish(CommandQueue);
timer.Start();
// run the kernel N times for better average accuracy
for(int i = 0; i < NIterations; i++)
{
clErr |= clEnqueueNDRangeKernel(CommandQueue, Kernel, Dimensions, NULL, pGlobalWorkSize, pLocalWorkSize, 0, NULL, NULL);
}
// wait again to sync
clErr |= clFinish(CommandQueue);
timer.Stop();
if(clErr != CL_SUCCESS)
{
string errorString = GetCLErrorString(clErr);
cerr<<"Kernel execution failure: "<<errorString<<endl;
}
return timer.GetElapsedMilliseconds() / double(NIterations);
}
#define CL_ERROR(x) case (x): return #x;
const char* CLUtil::GetCLErrorString(cl_int CLErrorCode)
{
switch(CLErrorCode)
{
CL_ERROR(CL_SUCCESS);
CL_ERROR(CL_DEVICE_NOT_FOUND);
CL_ERROR(CL_DEVICE_NOT_AVAILABLE);
CL_ERROR(CL_COMPILER_NOT_AVAILABLE);
CL_ERROR(CL_MEM_OBJECT_ALLOCATION_FAILURE);
CL_ERROR(CL_OUT_OF_RESOURCES);
CL_ERROR(CL_OUT_OF_HOST_MEMORY);
CL_ERROR(CL_PROFILING_INFO_NOT_AVAILABLE);
CL_ERROR(CL_MEM_COPY_OVERLAP);
CL_ERROR(CL_IMAGE_FORMAT_MISMATCH);
CL_ERROR(CL_IMAGE_FORMAT_NOT_SUPPORTED);
CL_ERROR(CL_BUILD_PROGRAM_FAILURE);
CL_ERROR(CL_MAP_FAILURE);
CL_ERROR(CL_INVALID_VALUE);
CL_ERROR(CL_INVALID_DEVICE_TYPE);
CL_ERROR(CL_INVALID_PLATFORM);
CL_ERROR(CL_INVALID_DEVICE);
CL_ERROR(CL_INVALID_CONTEXT);
CL_ERROR(CL_INVALID_QUEUE_PROPERTIES);
CL_ERROR(CL_INVALID_COMMAND_QUEUE);
CL_ERROR(CL_INVALID_HOST_PTR);
CL_ERROR(CL_INVALID_MEM_OBJECT);
CL_ERROR(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
CL_ERROR(CL_INVALID_IMAGE_SIZE);
CL_ERROR(CL_INVALID_SAMPLER);
CL_ERROR(CL_INVALID_BINARY);
CL_ERROR(CL_INVALID_BUILD_OPTIONS);
CL_ERROR(CL_INVALID_PROGRAM);
CL_ERROR(CL_INVALID_PROGRAM_EXECUTABLE);
CL_ERROR(CL_INVALID_KERNEL_NAME);
CL_ERROR(CL_INVALID_KERNEL_DEFINITION);
CL_ERROR(CL_INVALID_KERNEL);
CL_ERROR(CL_INVALID_ARG_INDEX);
CL_ERROR(CL_INVALID_ARG_VALUE);
CL_ERROR(CL_INVALID_ARG_SIZE);
CL_ERROR(CL_INVALID_KERNEL_ARGS);
CL_ERROR(CL_INVALID_WORK_DIMENSION);
CL_ERROR(CL_INVALID_WORK_GROUP_SIZE);
CL_ERROR(CL_INVALID_WORK_ITEM_SIZE);
CL_ERROR(CL_INVALID_GLOBAL_OFFSET);
CL_ERROR(CL_INVALID_EVENT_WAIT_LIST);
CL_ERROR(CL_INVALID_EVENT);
CL_ERROR(CL_INVALID_OPERATION);
CL_ERROR(CL_INVALID_GL_OBJECT);
CL_ERROR(CL_INVALID_BUFFER_SIZE);
CL_ERROR(CL_INVALID_MIP_LEVEL);
default:
return "Unknown error code";
}
}
///////////////////////////////////////////////////////////////////////////////
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef CL_UTIL_H
#define CL_UTIL_H
// All OpenCL headers
#if defined(WIN32)
#include <CL/opencl.h>
#elif defined (__APPLE__) || defined(MACOSX)
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#include "CommonDefs.h"
#include <string>
#include <iostream>
#include <algorithm>
//! Utility class for frequently-needed OpenCL tasks
// TO DO: replace this with a nicer OpenCL wrapper
class CLUtil
{
public:
//! Determines the OpenCL global work size given the number of data elements and threads per workgroup
static size_t GetGlobalWorkSize(size_t DataElemCount, size_t LocalWorkSize);
//! Loads a program source to memory as a string
static bool LoadProgramSourceToMemory(const std::string& Path, std::string& SourceCode);
//! Builds a CL program
static cl_program BuildCLProgramFromMemory(cl_device_id Device, cl_context Context, const std::string& SourceCode, const std::string& CompileOptions = "");
static void PrintBuildLog(cl_program Program, cl_device_id Device);
//! Measures the execution time of a kernel by executing it N times and returning the average time in milliseconds.
/*!
The scheduling cost of the kernel can be amortized if we enqueue
the kernel multiple times. If your kernel is simple and fast, use a high number of iterations!
*/
static double ProfileKernel(cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint Dimensions,
const size_t* pGlobalWorkSize, const size_t* pLocalWorkSize, int NIterations);
static const char* GetCLErrorString(cl_int CLErrorCode);
};
// Some useful shortcuts for handling pointers and validating function calls
#define V_RETURN_FALSE_CL(expr, errmsg) do {cl_int e=(expr);if(CL_SUCCESS!=e){std::cerr<<"Error: "<<errmsg<<" ["<<CLUtil::GetCLErrorString(e)<<"]"<<std::endl; return false; }} while(0)
#define V_RETURN_0_CL(expr, errmsg) do {cl_int e=(expr);if(CL_SUCCESS!=e){std::cerr<<"Error: "<<errmsg<<" ["<<CLUtil::GetCLErrorString(e)<<"]"<<std::endl; return 0; }} while(0)
#define V_RETURN_CL(expr, errmsg) do {cl_int e=(expr);if(CL_SUCCESS!=e){std::cerr<<"Error: "<<errmsg<<" ["<<CLUtil::GetCLErrorString(e)<<"]"<<std::endl; return; }} while(0)
#define SAFE_DELETE(ptr) do {if(ptr){ delete ptr; ptr = NULL; }} while(0)
#define SAFE_DELETE_ARRAY(x) do {if(x){delete [] x; x = NULL;}} while(0)
#define SAFE_RELEASE_KERNEL(ptr) do {if(ptr){ clReleaseKernel(ptr); ptr = NULL; }} while(0)
#define SAFE_RELEASE_PROGRAM(ptr) do {if(ptr){ clReleaseProgram(ptr); ptr = NULL; }} while(0)
#define SAFE_RELEASE_MEMOBJECT(ptr) do {if(ptr){ clReleaseMemObject(ptr); ptr = NULL; }} while(0)
#define SAFE_RELEASE_SAMPLER(ptr) do {if(ptr){ clReleaseSampler(ptr); ptr = NULL; }} while(0)
#define ARRAYLEN(a) (sizeof(a)/sizeof(a[0]))
#endif // CL_UTIL_H
FILE(GLOB CommonSources *.cpp)
FILE(GLOB CommonHeaders *.h)
add_library(GPUCommon
${CommonSources}
${CommonHeaders}
)
\ No newline at end of file
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CTimer.h"
///////////////////////////////////////////////////////////////////////////////
// CTimer
void CTimer::Start()
{
#ifdef _WIN32
QueryPerformanceCounter(&m_StartTime);
#else
gettimeofday(&m_StartTime, NULL);
#endif
}
void CTimer::Stop()
{
#ifdef _WIN32
QueryPerformanceCounter(&m_EndTime);
#else
gettimeofday(&m_EndTime, NULL);
#endif
}
double CTimer::GetElapsedMilliseconds()
{
#ifdef _WIN32
LARGE_INTEGER freq;
if(QueryPerformanceFrequency(&freq))
{
return 1000.0 * double(m_EndTime.QuadPart - m_StartTime.QuadPart) / double(freq.QuadPart);
}
else
{
return -1;
}
#else
double delta = ((double)m_EndTime.tv_sec + 1.0e-6 * (double)m_EndTime.tv_usec) -
((double)m_StartTime.tv_sec + 1.0e-6 * (double)m_StartTime.tv_usec);
return 1000.0 * delta;
#endif
}
///////////////////////////////////////////////////////////////////////////////
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _CTIMER_H
#define _CTIMER_H
//Simple wrapper class that can be used to measure time intervals
//using the built-in precision timer of the OS
// We reverted from std::chrono, because that timer implementation seems to be very imprecise
// (at least under windows)
#ifdef _WIN32
#include <Windows.h>
#elif defined (__APPLE__) || defined(MACOSX)
#include <sys/time.h>
#else
#include <sys/time.h>
#include <time.h>
#endif
//! Simple wrapper class for the measurement of time intervals
/*!
Use this timer to measure elapsed time on the HOST side.
Not suitable for measuring the execution of DEVICE code
without synchronization with the HOST.
NOTE: This class is not thread-safe (like most other classes in these
examples), but we are not doing CPU multithreading in the praktikum...
*/
class CTimer
{
public:
CTimer(){};
~CTimer(){};
void Start();
void Stop();
//! Returns the elapsed time between Start() and Stop() in ms.
double GetElapsedMilliseconds();
protected:
#ifdef WIN32
LARGE_INTEGER m_StartTime;
LARGE_INTEGER m_EndTime;
#else
struct timeval m_StartTime;
struct timeval m_EndTime;
#endif
};
#endif // _CTIMER_H
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _ICOMPUTE_TASK_H
#define _ICOMPUTE_TASK_H
// All OpenCL headers
#if defined(WIN32)
#include <CL/opencl.h>
#elif defined (__APPLE__) || defined(MACOSX)
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#include "CommonDefs.h"
//! Common interface for the tasks within the assignment.
/*!
Inherit a new class for each computing task.
(There are usually more tasks in each assignment).
*/
class IComputeTask
{
public:
virtual ~IComputeTask() {};
//! Init any resources specific to the current task
virtual bool InitResources(cl_device_id Device, cl_context Context) = 0;
//! Release everything allocated in InitResources()
virtual void ReleaseResources() = 0;
//! Perform calculations on the GPU
virtual void ComputeGPU(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3]) = 0;
//! Compute the "golden" solution on the CPU. The GPU results must be equal to this reference
virtual void ComputeCPU() = 0;
//! Compare the GPU solution to the "golden" solution
virtual bool ValidateResults() = 0;
};
#endif // _ICOMPUTE_TASK_H
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _IGUI_ENABLED_COMPUTE_TASK_H
#define _IGUI_ENABLED_COMPUTE_TASK_H
#include "IComputeTask.h"
//! Common interface for task that have and OpenGL UI
/*!
Currently we only use this interface in Assignment4
to perform GL rendering and respond to user input with keyboard and mouse.
*/
class IGUIEnabledComputeTask : public IComputeTask
{
public:
virtual ~IGUIEnabledComputeTask() {};
// OpenGL render callback
virtual void Render() = 0;
virtual void OnKeyboard(int Key, int Action) = 0;
virtual void OnMouse(int Button, int Action) = 0;
virtual void OnMouseMove(int X, int Y) = 0;
virtual void OnIdle(double Time, float ElapsedTime) = 0;
virtual void OnWindowResized(int Width, int Height) = 0;
};
#endif // _IGUI_ENABLED_COMPUTE_TASK_H
<?xml version="1.0" encoding="UTF-8"?>
<Project ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<CustomBuild Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CMakeLists.txt" />
</ItemGroup>
<ItemGroup>
</ItemGroup>
</Project>
<?xml version="1.0" encoding="UTF-8"?>
<Project ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CAssignment3.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CConvolution3x3Task.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CConvolutionBilateralTask.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CConvolutionSeparableTask.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CConvolutionTaskBase.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CHistogramTask.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment3\Assignment3\Pfm.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment3\Assignment3\main.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CAssignment3.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CConvolution3x3Task.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CConvolutionBilateralTask.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CConvolutionSeparableTask.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CConvolutionTaskBase.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CHistogramTask.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment3\Assignment3\Pfm.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CMakeLists.txt" />
</ItemGroup>
<ItemGroup>
<None Include="D:\Projekte\GPGPU\Assignment3\Assignment3\Convolution3x3.cl" />
<None Include="D:\Projekte\GPGPU\Assignment3\Assignment3\ConvolutionBilateral.cl" />
<None Include="D:\Projekte\GPGPU\Assignment3\Assignment3\ConvolutionSeparable.cl" />
<None Include="D:\Projekte\GPGPU\Assignment3\Assignment3\histogram.cl" />
</ItemGroup>
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{1F059F54-8912-3D39-B7D1-61ABB7E3B2C4}</UniqueIdentifier>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{565C5D3D-6819-3026-8DD3-1300B5D33518}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>
<?xml version="1.0" encoding="UTF-8"?>
<Project ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="D:\Projekte\GPGPU\Assignment3\Common\CAssignmentBase.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment3\Common\CLUtil.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment3\Common\CTimer.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="D:\Projekte\GPGPU\Assignment3\Common\CAssignmentBase.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment3\Common\CLUtil.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment3\Common\CTimer.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment3\Common\CommonDefs.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment3\Common\IComputeTask.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment3\Common\IGUIEnabledComputeTask.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="D:\Projekte\GPGPU\Assignment3\Common\CMakeLists.txt" />
</ItemGroup>
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{1F059F54-8912-3D39-B7D1-61ABB7E3B2C4}</UniqueIdentifier>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{565C5D3D-6819-3026-8DD3-1300B5D33518}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2013
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ALL_BUILD", "ALL_BUILD.vcxproj", "{45BADE66-E423-3F49-BF77-26616651EFA9}"
ProjectSection(ProjectDependencies) = postProject
{8F40C699-BA9A-3E1C-9CFB-8AF0F594211E} = {8F40C699-BA9A-3E1C-9CFB-8AF0F594211E}
{A0C096C5-40BE-3FD1-A8AB-50CB9B90C308} = {A0C096C5-40BE-3FD1-A8AB-50CB9B90C308}
{BAB7B38F-E09D-3B3D-BA67-8580A91B6514} = {BAB7B38F-E09D-3B3D-BA67-8580A91B6514}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Assignment", "Assignment.vcxproj", "{8F40C699-BA9A-3E1C-9CFB-8AF0F594211E}"
ProjectSection(ProjectDependencies) = postProject
{A0C096C5-40BE-3FD1-A8AB-50CB9B90C308} = {A0C096C5-40BE-3FD1-A8AB-50CB9B90C308}
{BAB7B38F-E09D-3B3D-BA67-8580A91B6514} = {BAB7B38F-E09D-3B3D-BA67-8580A91B6514}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "GPUCommon", "Common\GPUCommon.vcxproj", "{A0C096C5-40BE-3FD1-A8AB-50CB9B90C308}"
ProjectSection(ProjectDependencies) = postProject
{BAB7B38F-E09D-3B3D-BA67-8580A91B6514} = {BAB7B38F-E09D-3B3D-BA67-8580A91B6514}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ZERO_CHECK", "ZERO_CHECK.vcxproj", "{BAB7B38F-E09D-3B3D-BA67-8580A91B6514}"
ProjectSection(ProjectDependencies) = postProject
EndProjectSection
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
MinSizeRel|x64 = MinSizeRel|x64
RelWithDebInfo|x64 = RelWithDebInfo|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{45BADE66-E423-3F49-BF77-26616651EFA9}.Debug|x64.ActiveCfg = Debug|x64
{45BADE66-E423-3F49-BF77-26616651EFA9}.Release|x64.ActiveCfg = Release|x64
{45BADE66-E423-3F49-BF77-26616651EFA9}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{45BADE66-E423-3F49-BF77-26616651EFA9}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{8F40C699-BA9A-3E1C-9CFB-8AF0F594211E}.Debug|x64.ActiveCfg = Debug|x64
{8F40C699-BA9A-3E1C-9CFB-8AF0F594211E}.Debug|x64.Build.0 = Debug|x64
{8F40C699-BA9A-3E1C-9CFB-8AF0F594211E}.Release|x64.ActiveCfg = Release|x64
{8F40C699-BA9A-3E1C-9CFB-8AF0F594211E}.Release|x64.Build.0 = Release|x64
{8F40C699-BA9A-3E1C-9CFB-8AF0F594211E}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{8F40C699-BA9A-3E1C-9CFB-8AF0F594211E}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{8F40C699-BA9A-3E1C-9CFB-8AF0F594211E}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{8F40C699-BA9A-3E1C-9CFB-8AF0F594211E}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{A0C096C5-40BE-3FD1-A8AB-50CB9B90C308}.Debug|x64.ActiveCfg = Debug|x64
{A0C096C5-40BE-3FD1-A8AB-50CB9B90C308}.Debug|x64.Build.0 = Debug|x64
{A0C096C5-40BE-3FD1-A8AB-50CB9B90C308}.Release|x64.ActiveCfg = Release|x64
{A0C096C5-40BE-3FD1-A8AB-50CB9B90C308}.Release|x64.Build.0 = Release|x64
{A0C096C5-40BE-3FD1-A8AB-50CB9B90C308}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{A0C096C5-40BE-3FD1-A8AB-50CB9B90C308}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{A0C096C5-40BE-3FD1-A8AB-50CB9B90C308}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{A0C096C5-40BE-3FD1-A8AB-50CB9B90C308}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{BAB7B38F-E09D-3B3D-BA67-8580A91B6514}.Debug|x64.ActiveCfg = Debug|x64
{BAB7B38F-E09D-3B3D-BA67-8580A91B6514}.Debug|x64.Build.0 = Debug|x64
{BAB7B38F-E09D-3B3D-BA67-8580A91B6514}.Release|x64.ActiveCfg = Release|x64
{BAB7B38F-E09D-3B3D-BA67-8580A91B6514}.Release|x64.Build.0 = Release|x64
{BAB7B38F-E09D-3B3D-BA67-8580A91B6514}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{BAB7B38F-E09D-3B3D-BA67-8580A91B6514}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{BAB7B38F-E09D-3B3D-BA67-8580A91B6514}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{BAB7B38F-E09D-3B3D-BA67-8580A91B6514}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
EndGlobalSection
GlobalSection(ExtensibilityAddIns) = postSolution
EndGlobalSection
EndGlobal
<?xml version="1.0" encoding="UTF-8"?>
<Project ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<CustomBuild Include="D:\Projekte\GPGPU\Assignment3\buildVS13\CMakeFiles\bfc19d30c7fe5088d5f92035f4aa45d8\generate.stamp.rule">
<Filter>CMake Rules</Filter>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<Filter Include="CMake Rules">
<UniqueIdentifier>{E1CDE319-827C-3FCF-892D-63A5F7F2BA83}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>
#if defined(__arm__) || defined(__TARGET_ARCH_ARM)
#if defined(__ARM_ARCH_7__) \
|| defined(__ARM_ARCH_7A__) \
|| defined(__ARM_ARCH_7R__) \
|| defined(__ARM_ARCH_7M__) \
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 7)
#error cmake_ARCH armv7
#elif defined(__ARM_ARCH_6__) \
|| defined(__ARM_ARCH_6J__) \
|| defined(__ARM_ARCH_6T2__) \
|| defined(__ARM_ARCH_6Z__) \
|| defined(__ARM_ARCH_6K__) \
|| defined(__ARM_ARCH_6ZK__) \
|| defined(__ARM_ARCH_6M__) \
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 6)
#error cmake_ARCH armv6
#elif defined(__ARM_ARCH_5TEJ__) \
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 5)
#error cmake_ARCH armv5
#else
#error cmake_ARCH arm
#endif
#elif defined(__i386) || defined(__i386__) || defined(_M_IX86)
#error cmake_ARCH i386
#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64)
#error cmake_ARCH x86_64
#elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
#error cmake_ARCH ia64
#elif defined(__ppc__) || defined(__ppc) || defined(__powerpc__) \
|| defined(_ARCH_COM) || defined(_ARCH_PWR) || defined(_ARCH_PPC) \
|| defined(_M_MPPC) || defined(_M_PPC)
#if defined(__ppc64__) || defined(__powerpc64__) || defined(__64BIT__)
#error cmake_ARCH ppc64
#else
#error cmake_ARCH ppc
#endif
#endif
#error cmake_ARCH unknown
<?xml version="1.0" encoding="UTF-8"?>
<Project ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<CustomBuild Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CMakeLists.txt" />
</ItemGroup>
<ItemGroup>
</ItemGroup>
</Project>
<?xml version="1.0" encoding="UTF-8"?>
<Project ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CAssignment3.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CConvolution3x3Task.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CConvolutionBilateralTask.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CConvolutionSeparableTask.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CConvolutionTaskBase.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CHistogramTask.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment3\Assignment3\Pfm.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment3\Assignment3\main.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CAssignment3.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CConvolution3x3Task.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CConvolutionBilateralTask.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CConvolutionSeparableTask.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CConvolutionTaskBase.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CHistogramTask.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment3\Assignment3\Pfm.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="D:\Projekte\GPGPU\Assignment3\Assignment3\CMakeLists.txt" />
</ItemGroup>
<ItemGroup>
<None Include="D:\Projekte\GPGPU\Assignment3\Assignment3\Convolution3x3.cl" />
<None Include="D:\Projekte\GPGPU\Assignment3\Assignment3\ConvolutionBilateral.cl" />
<None Include="D:\Projekte\GPGPU\Assignment3\Assignment3\ConvolutionSeparable.cl" />
<None Include="D:\Projekte\GPGPU\Assignment3\Assignment3\histogram.cl" />
</ItemGroup>
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{2CFED976-7564-3CB4-AA6D-891BC9C8C699}</UniqueIdentifier>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{39163F82-DE1B-3A5D-A061-050924E730AC}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>
<?xml version="1.0" encoding="UTF-8"?>
<Project ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="D:\Projekte\GPGPU\Assignment3\Common\CAssignmentBase.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment3\Common\CLUtil.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment3\Common\CTimer.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="D:\Projekte\GPGPU\Assignment3\Common\CAssignmentBase.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment3\Common\CLUtil.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment3\Common\CTimer.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment3\Common\CommonDefs.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment3\Common\IComputeTask.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment3\Common\IGUIEnabledComputeTask.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="D:\Projekte\GPGPU\Assignment3\Common\CMakeLists.txt" />
</ItemGroup>
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{2CFED976-7564-3CB4-AA6D-891BC9C8C699}</UniqueIdentifier>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{39163F82-DE1B-3A5D-A061-050924E730AC}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 14
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ALL_BUILD", "ALL_BUILD.vcxproj", "{8D2E2BE9-4B33-3453-AFAF-87FD48B34C15}"
ProjectSection(ProjectDependencies) = postProject
{225E0437-A229-323E-A912-D7CB0149497E} = {225E0437-A229-323E-A912-D7CB0149497E}
{DA288E5D-EB80-3F4A-AF28-BA196006F7CB} = {DA288E5D-EB80-3F4A-AF28-BA196006F7CB}
{33C9CACF-A5F2-3251-B8F7-78A206336EF2} = {33C9CACF-A5F2-3251-B8F7-78A206336EF2}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Assignment", "Assignment.vcxproj", "{225E0437-A229-323E-A912-D7CB0149497E}"
ProjectSection(ProjectDependencies) = postProject
{DA288E5D-EB80-3F4A-AF28-BA196006F7CB} = {DA288E5D-EB80-3F4A-AF28-BA196006F7CB}
{33C9CACF-A5F2-3251-B8F7-78A206336EF2} = {33C9CACF-A5F2-3251-B8F7-78A206336EF2}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "GPUCommon", "Common\GPUCommon.vcxproj", "{DA288E5D-EB80-3F4A-AF28-BA196006F7CB}"
ProjectSection(ProjectDependencies) = postProject
{33C9CACF-A5F2-3251-B8F7-78A206336EF2} = {33C9CACF-A5F2-3251-B8F7-78A206336EF2}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ZERO_CHECK", "ZERO_CHECK.vcxproj", "{33C9CACF-A5F2-3251-B8F7-78A206336EF2}"
ProjectSection(ProjectDependencies) = postProject
EndProjectSection
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
MinSizeRel|x64 = MinSizeRel|x64
RelWithDebInfo|x64 = RelWithDebInfo|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{8D2E2BE9-4B33-3453-AFAF-87FD48B34C15}.Debug|x64.ActiveCfg = Debug|x64
{8D2E2BE9-4B33-3453-AFAF-87FD48B34C15}.Release|x64.ActiveCfg = Release|x64
{8D2E2BE9-4B33-3453-AFAF-87FD48B34C15}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{8D2E2BE9-4B33-3453-AFAF-87FD48B34C15}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{225E0437-A229-323E-A912-D7CB0149497E}.Debug|x64.ActiveCfg = Debug|x64
{225E0437-A229-323E-A912-D7CB0149497E}.Debug|x64.Build.0 = Debug|x64
{225E0437-A229-323E-A912-D7CB0149497E}.Release|x64.ActiveCfg = Release|x64
{225E0437-A229-323E-A912-D7CB0149497E}.Release|x64.Build.0 = Release|x64
{225E0437-A229-323E-A912-D7CB0149497E}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{225E0437-A229-323E-A912-D7CB0149497E}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{225E0437-A229-323E-A912-D7CB0149497E}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{225E0437-A229-323E-A912-D7CB0149497E}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{DA288E5D-EB80-3F4A-AF28-BA196006F7CB}.Debug|x64.ActiveCfg = Debug|x64
{DA288E5D-EB80-3F4A-AF28-BA196006F7CB}.Debug|x64.Build.0 = Debug|x64
{DA288E5D-EB80-3F4A-AF28-BA196006F7CB}.Release|x64.ActiveCfg = Release|x64
{DA288E5D-EB80-3F4A-AF28-BA196006F7CB}.Release|x64.Build.0 = Release|x64
{DA288E5D-EB80-3F4A-AF28-BA196006F7CB}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{DA288E5D-EB80-3F4A-AF28-BA196006F7CB}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{DA288E5D-EB80-3F4A-AF28-BA196006F7CB}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{DA288E5D-EB80-3F4A-AF28-BA196006F7CB}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{33C9CACF-A5F2-3251-B8F7-78A206336EF2}.Debug|x64.ActiveCfg = Debug|x64
{33C9CACF-A5F2-3251-B8F7-78A206336EF2}.Debug|x64.Build.0 = Debug|x64
{33C9CACF-A5F2-3251-B8F7-78A206336EF2}.Release|x64.ActiveCfg = Release|x64
{33C9CACF-A5F2-3251-B8F7-78A206336EF2}.Release|x64.Build.0 = Release|x64
{33C9CACF-A5F2-3251-B8F7-78A206336EF2}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{33C9CACF-A5F2-3251-B8F7-78A206336EF2}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{33C9CACF-A5F2-3251-B8F7-78A206336EF2}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{33C9CACF-A5F2-3251-B8F7-78A206336EF2}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
EndGlobalSection
GlobalSection(ExtensibilityAddIns) = postSolution
EndGlobalSection
EndGlobal
<?xml version="1.0" encoding="UTF-8"?>
<Project ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<CustomBuild Include="D:\Projekte\GPGPU\Assignment3\buildVS15\CMakeFiles\456f3ada6dab407626fd1c1f9bfa040f\generate.stamp.rule">
<Filter>CMake Rules</Filter>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<Filter Include="CMake Rules">
<UniqueIdentifier>{DA97A18F-C2F6-39A2-964A-725614315857}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>
#if defined(__arm__) || defined(__TARGET_ARCH_ARM)
#if defined(__ARM_ARCH_7__) \
|| defined(__ARM_ARCH_7A__) \
|| defined(__ARM_ARCH_7R__) \
|| defined(__ARM_ARCH_7M__) \
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 7)
#error cmake_ARCH armv7
#elif defined(__ARM_ARCH_6__) \
|| defined(__ARM_ARCH_6J__) \
|| defined(__ARM_ARCH_6T2__) \
|| defined(__ARM_ARCH_6Z__) \
|| defined(__ARM_ARCH_6K__) \
|| defined(__ARM_ARCH_6ZK__) \
|| defined(__ARM_ARCH_6M__) \
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 6)
#error cmake_ARCH armv6
#elif defined(__ARM_ARCH_5TEJ__) \
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 5)
#error cmake_ARCH armv5
#else
#error cmake_ARCH arm
#endif
#elif defined(__i386) || defined(__i386__) || defined(_M_IX86)
#error cmake_ARCH i386
#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64)
#error cmake_ARCH x86_64
#elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
#error cmake_ARCH ia64
#elif defined(__ppc__) || defined(__ppc) || defined(__powerpc__) \
|| defined(_ARCH_COM) || defined(_ARCH_PWR) || defined(_ARCH_PPC) \
|| defined(_M_MPPC) || defined(_M_PPC)
#if defined(__ppc64__) || defined(__powerpc64__) || defined(__64BIT__)
#error cmake_ARCH ppc64
#else
#error cmake_ARCH ppc
#endif
#endif
#error cmake_ARCH unknown
function(change_workingdir EXE WorkingDir)
#add a user file to auto config the working directory for debugging
if (MSVC)
set(Platform "Win32")
if (CMAKE_CL_64)
set(Platform "x64")
endif (CMAKE_CL_64)
configure_file (
${CMAKE_SOURCE_DIR}/../cmake/WorkingDirectory.vcxproj.user.in
${CMAKE_CURRENT_BINARY_DIR}/${EXE}.vcxproj.user
@ONLY
)
endif()
endfunction()
# - Try to find OpenCL
# This module tries to find an OpenCL implementation on your system. It supports
# AMD / ATI, Apple and NVIDIA implementations, but should work, too.
#
# To set manually the paths, define these environment variables:
# OpenCL_INCPATH - Include path (e.g. OpenCL_INCPATH=/opt/cuda/4.0/cuda/include)
# OpenCL_LIBPATH - Library path (e.h. OpenCL_LIBPATH=/usr/lib64/nvidia)
#
# Once done this will define
# OPENCL_FOUND - system has OpenCL
# OPENCL_INCLUDE_DIRS - the OpenCL include directory
# OPENCL_LIBRARIES - link these to use OpenCL
#
# WIN32 should work, but is untested
FIND_PACKAGE(PackageHandleStandardArgs)
SET (OPENCL_VERSION_STRING "0.1.0")
SET (OPENCL_VERSION_MAJOR 0)
SET (OPENCL_VERSION_MINOR 1)
SET (OPENCL_VERSION_PATCH 0)
include(${CMAKE_SOURCE_DIR}/../cmake/TargetArch.cmake)
target_architecture(TARGET_ARCH)
IF (APPLE)
FIND_LIBRARY(OPENCL_LIBRARIES OpenCL DOC "OpenCL lib for OSX")
FIND_PATH(OPENCL_INCLUDE_DIRS OpenCL/cl.h DOC "Include for OpenCL on OSX")
FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS OpenCL/cl.hpp DOC "Include for OpenCL CPP bindings on OSX")
ELSE (APPLE)
IF (WIN32)
FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h PATH_SUFFIXES include PATHS ENV AMDAPPSDKROOT ENV CUDA_PATH ENV INTELOCLSDKROOT)
FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp PATH_SUFFIXES include PATHS ENV AMDAPPSDKROOT ENV CUDA_PATH ENV INTELOCLSDKROOT)
# The AMD SDK currently installs both x86 and x86_64 libraries
# This is only a hack to find out architecture
# The same is true for CUDA SDK
IF( ${TARGET_ARCH} STREQUAL "x86_64" )
SET(OPENCL_AMD_LIB_DIR "$ENV{ATISTREAMSDKROOT}/lib/x86_64")
SET(OPENCL_NVIDIA_LIB_DIR "$ENV{CUDA_PATH}/lib/x64")
message(STATUS "Using 64bit libraries")
ELSE (${TARGET_ARCH} STREQUAL "x86_64")
SET(OPENCL_AMD_LIB_DIR "$ENV{ATISTREAMSDKROOT}/lib/x86")
SET(OPENCL_NVIDIA_LIB_DIR "$ENV{CUDA_PATH}/lib/Win32")
message(STATUS "Using 32bit libraries")
ENDIF( ${TARGET_ARCH} STREQUAL "x86_64" )
# Find library
FIND_LIBRARY(OPENCL_LIBRARIES OpenCL.lib PATHS ${OPENCL_AMD_LIB_DIR} ${OPENCL_NVIDIA_LIB_DIR} ENV OpenCL_LIBPATH )
GET_FILENAME_COMPONENT(_OPENCL_INC_CAND ${OPENCL_LIB_DIR}/../../include ABSOLUTE)
# On Win32 search relative to the library
FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h PATHS "${_OPENCL_INC_CAND}" ENV OpenCL_INCPATH)
FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp PATHS "${_OPENCL_INC_CAND}" ENV OpenCL_INCPATH)
ELSE (WIN32)
# Unix style platforms
IF( ${TARGET_ARCH} STREQUAL "x86_64" )
SET(OPENCL_NVIDIA_LIB_DIR /usr/local/cuda/lib64 /usr/lib64/nvidia-304xx )
SET(OPENCL_NVIDIA_ATIS_LIB_DIR /usr/lib64/nvidia)
message(STATUS "Using 64bit libraries")
ELSE (${TARGET_ARCH} STREQUAL "x86_64")
SET(OPENCL_NVIDIA_LIB_DIR /usr/local/cuda/lib)
SET(OPENCL_NVIDIA_ATIS_LIB_DIR /usr/lib/nvidia)
message(STATUS "Using 32bit libraries")
ENDIF( ${TARGET_ARCH} STREQUAL "x86_64" )
FIND_LIBRARY(OPENCL_LIBRARIES OpenCL
PATHS ENV LD_LIBRARY_PATH ENV OpenCL_LIBPATH ${OPENCL_NVIDIA_LIB_DIR} ${OPENCL_NVIDIA_ATIS_LIB_DIR}
)
# Alternatives (for ATIS pool)
FIND_LIBRARY(OPENCL_LIBRARIES libOpenCL.so.1
PATHS ENV LD_LIBRARY_PATH ENV OpenCL_LIBPATH ${OPENCL_NVIDIA_LIB_DIR} ${OPENCL_NVIDIA_ATIS_LIB_DIR}
)
GET_FILENAME_COMPONENT(OPENCL_LIB_DIR ${OPENCL_LIBRARIES} PATH)
GET_FILENAME_COMPONENT(_OPENCL_INC_CAND ${OPENCL_LIB_DIR}/../../include ABSOLUTE)
FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h PATH_SUFFIXES include PATHS ${_OPENCL_INC_CAND} /usr/ /usr/local/cuda/ /opt/AMDAPP/ /opt/cuda-5.0/ ENV OpenCL_INCPATH ENV INCLUDE)
FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp PATH_SUFFIXES include PATHS ${_OPENCL_INC_CAND} /usr/ /usr/local/cuda /opt/AMDAPP ENV OpenCL_INCPATH)
ENDIF (WIN32)
ENDIF (APPLE)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(OpenCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS)
IF(_OPENCL_CPP_INCLUDE_DIRS)
SET( OPENCL_HAS_CPP_BINDINGS TRUE )
LIST( APPEND OPENCL_INCLUDE_DIRS ${_OPENCL_CPP_INCLUDE_DIRS} )
# This is often the same, so clean up
LIST( REMOVE_DUPLICATES OPENCL_INCLUDE_DIRS )
ENDIF(_OPENCL_CPP_INCLUDE_DIRS)
MARK_AS_ADVANCED(
OPENCL_INCLUDE_DIRS
)
# Based on the Qt 5 processor detection code, so should be very accurate
# https://qt.gitorious.org/qt/qtbase/blobs/master/src/corelib/global/qprocessordetection.h
# Currently handles arm (v5, v6, v7), x86 (32/64), ia64, and ppc (32/64)
# Regarding POWER/PowerPC, just as is noted in the Qt source,
# "There are many more known variants/revisions that we do not handle/detect."
set(archdetect_c_code "
#if defined(__arm__) || defined(__TARGET_ARCH_ARM)
#if defined(__ARM_ARCH_7__) \\
|| defined(__ARM_ARCH_7A__) \\
|| defined(__ARM_ARCH_7R__) \\
|| defined(__ARM_ARCH_7M__) \\
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 7)
#error cmake_ARCH armv7
#elif defined(__ARM_ARCH_6__) \\
|| defined(__ARM_ARCH_6J__) \\
|| defined(__ARM_ARCH_6T2__) \\
|| defined(__ARM_ARCH_6Z__) \\
|| defined(__ARM_ARCH_6K__) \\
|| defined(__ARM_ARCH_6ZK__) \\
|| defined(__ARM_ARCH_6M__) \\
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 6)
#error cmake_ARCH armv6
#elif defined(__ARM_ARCH_5TEJ__) \\
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 5)
#error cmake_ARCH armv5
#else
#error cmake_ARCH arm
#endif
#elif defined(__i386) || defined(__i386__) || defined(_M_IX86)
#error cmake_ARCH i386
#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64)
#error cmake_ARCH x86_64
#elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
#error cmake_ARCH ia64
#elif defined(__ppc__) || defined(__ppc) || defined(__powerpc__) \\
|| defined(_ARCH_COM) || defined(_ARCH_PWR) || defined(_ARCH_PPC) \\
|| defined(_M_MPPC) || defined(_M_PPC)
#if defined(__ppc64__) || defined(__powerpc64__) || defined(__64BIT__)
#error cmake_ARCH ppc64
#else
#error cmake_ARCH ppc
#endif
#endif
#error cmake_ARCH unknown
")
# Set ppc_support to TRUE before including this file or ppc and ppc64
# will be treated as invalid architectures since they are no longer supported by Apple
function(target_architecture output_var)
if(APPLE AND CMAKE_OSX_ARCHITECTURES)
# On OS X we use CMAKE_OSX_ARCHITECTURES *if* it was set
# First let's normalize the order of the values
# Note that it's not possible to compile PowerPC applications if you are using
# the OS X SDK version 10.6 or later - you'll need 10.4/10.5 for that, so we
# disable it by default
# See this page for more information:
# http://stackoverflow.com/questions/5333490/how-can-we-restore-ppc-ppc64-as-well-as-full-10-4-10-5-sdk-support-to-xcode-4
# Architecture defaults to i386 or ppc on OS X 10.5 and earlier, depending on the CPU type detected at runtime.
# On OS X 10.6+ the default is x86_64 if the CPU supports it, i386 otherwise.
foreach(osx_arch ${CMAKE_OSX_ARCHITECTURES})
if("${osx_arch}" STREQUAL "ppc" AND ppc_support)
set(osx_arch_ppc TRUE)
elseif("${osx_arch}" STREQUAL "i386")
set(osx_arch_i386 TRUE)
elseif("${osx_arch}" STREQUAL "x86_64")
set(osx_arch_x86_64 TRUE)
elseif("${osx_arch}" STREQUAL "ppc64" AND ppc_support)
set(osx_arch_ppc64 TRUE)
else()
message(FATAL_ERROR "Invalid OS X arch name: ${osx_arch}")
endif()
endforeach()
# Now add all the architectures in our normalized order
if(osx_arch_ppc)
list(APPEND ARCH ppc)
endif()
if(osx_arch_i386)
list(APPEND ARCH i386)
endif()
if(osx_arch_x86_64)
list(APPEND ARCH x86_64)
endif()
if(osx_arch_ppc64)
list(APPEND ARCH ppc64)
endif()
else()
file(WRITE "${CMAKE_BINARY_DIR}/arch.c" "${archdetect_c_code}")
enable_language(C)
# Detect the architecture in a rather creative way...
# This compiles a small C program which is a series of ifdefs that selects a
# particular #error preprocessor directive whose message string contains the
# target architecture. The program will always fail to compile (both because
# file is not a valid C program, and obviously because of the presence of the
# #error preprocessor directives... but by exploiting the preprocessor in this
# way, we can detect the correct target architecture even when cross-compiling,
# since the program itself never needs to be run (only the compiler/preprocessor)
try_run(
run_result_unused
compile_result_unused
"${CMAKE_BINARY_DIR}"
"${CMAKE_BINARY_DIR}/arch.c"
COMPILE_OUTPUT_VARIABLE ARCH
CMAKE_FLAGS CMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES}
)
# Parse the architecture name from the compiler output
string(REGEX MATCH "cmake_ARCH ([a-zA-Z0-9_]+)" ARCH "${ARCH}")
# Get rid of the value marker leaving just the architecture name
string(REPLACE "cmake_ARCH " "" ARCH "${ARCH}")
# If we are compiling with an unknown architecture this variable should
# already be set to "unknown" but in the case that it's empty (i.e. due
# to a typo in the code), then set it to unknown
if (NOT ARCH)
set(ARCH unknown)
endif()
endif()
set(${output_var} "${ARCH}" PARENT_SCOPE)
endfunction()
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|@Platform@'">
<LocalDebuggerWorkingDirectory>@WorkingDir@</LocalDebuggerWorkingDirectory>
<DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|@Platform@'">
<LocalDebuggerWorkingDirectory>@WorkingDir@</LocalDebuggerWorkingDirectory>
<DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='RelWithDebInfo|@Platform@'">
<LocalDebuggerWorkingDirectory>@WorkingDir@</LocalDebuggerWorkingDirectory>
<DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='MinSizeRel|@Platform@'">
<LocalDebuggerWorkingDirectory>@WorkingDir@</LocalDebuggerWorkingDirectory>
<DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
</PropertyGroup>
</Project>
\ No newline at end of file
File added
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment