Commit 35e399a3 by Kai Westerkamp

A1

parent 276f62e3
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CAssignment1.h"
#include "CSimpleArraysTask.h"
#include "CMatrixRotateTask.h"
#include <iostream>
using namespace std;
///////////////////////////////////////////////////////////////////////////////
// CAssignment1
bool CAssignment1::DoCompute()
{
// Task 1: simple array addition.
cout <<"================================"<< endl << "Running vector addition example 1 ..." << endl << endl;
{
size_t localWorkSize[3] = {256, 1, 1};
CSimpleArraysTask task(1048576);
RunComputeTask(task, localWorkSize);
}
cout << "================================" << endl << "Running vector addition example 2 ..." << endl << endl;
{
size_t LocalWorkSize[3] = {512, 1, 1};
CSimpleArraysTask task(1048576);
RunComputeTask(task, LocalWorkSize);
}
// Task 2: matrix rotation.
std::cout << "================================"<< endl << "Running matrix rotation example..." << std::endl << std::endl;
{
size_t LocalWorkSize[3] = {32, 16, 1};
CMatrixRotateTask task(2048, 1025);
RunComputeTask(task, LocalWorkSize);
}
return true;
}
///////////////////////////////////////////////////////////////////////////////
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _CASSIGNMENT1_H
#define _CASSIGNMENT1_H
#include "../Common/CAssignmentBase.h"
//! Assignment1 solution
class CAssignment1 : public CAssignmentBase
{
public:
virtual ~CAssignment1() {};
//! This overloaded method contains the specific solution of A1
virtual bool DoCompute();
};
#endif // _CASSIGNMENT1_H
cmake_minimum_required (VERSION 2.8.3)
project (GPUComputing)
# Add our modules to the path
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/../cmake/")
include(CheckCXXCompilerFlag)
if (WIN32)
else (WIN32)
#set (EXTRA_COMPILE_FLAGS "-Wall -Werror")
set (EXTRA_COMPILE_FLAGS "-Wall")
CHECK_CXX_COMPILER_FLAG(-std=c++11 HAS_CXX_11)
if (HAS_CXX_11)
set(EXTRA_COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -std=c++11")
message(STATUS "Enabling C++11 support")
else(HAS_CXX_11)
message(WARNING "No C++11 support detected, build will fail.")
endif()
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_COMPILE_FLAGS}")
endif (WIN32)
# Include support for changing the working directory in Visual Studio
include(ChangeWorkingDirectory)
# Search for OpenCL and add paths
find_package( OpenCL REQUIRED )
include_directories( ${OPENCL_INCLUDE_DIRS} )
# Include Common module
add_subdirectory (../Common ${CMAKE_BINARY_DIR}/Common)
# Define source files for this assignment
FILE(GLOB Sources *.cpp)
FILE(GLOB Headers *.h)
FILE(GLOB CLSources *.cl)
ADD_EXECUTABLE (Assignment
${Sources}
${Headers}
${CLSources}
)
# Link required libraries
target_link_libraries(Assignment ${OPENCL_LIBRARIES})
target_link_libraries(Assignment GPUCommon)
if (WIN32)
change_workingdir(Assignment ${CMAKE_SOURCE_DIR})
endif()
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CMatrixRotateTask.h"
#include "../Common/CLUtil.h"
#include <string.h>
using namespace std;
///////////////////////////////////////////////////////////////////////////////
// CMatrixRotateTask
CMatrixRotateTask::CMatrixRotateTask(size_t SizeX, size_t SizeY)
:m_SizeX(static_cast<unsigned>(SizeX)), m_SizeY(static_cast<unsigned>(SizeY)), m_hM(NULL), m_hMR(NULL), m_dM(NULL),
m_dMR(NULL), m_hGPUResultNaive(NULL), m_hGPUResultOpt(NULL), m_Program(NULL),
m_NaiveKernel(NULL), m_OptimizedKernel(NULL)
{
}
CMatrixRotateTask::~CMatrixRotateTask()
{
ReleaseResources();
}
bool CMatrixRotateTask::InitResources(cl_device_id Device, cl_context Context)
{
//CPU resources
m_hM = new float[m_SizeX * m_SizeY];
m_hMR = new float[m_SizeX * m_SizeY];
m_hGPUResultNaive = new float[m_SizeX * m_SizeY];
m_hGPUResultOpt = new float[m_SizeX * m_SizeY];
//fill the matrix with random floats
for(unsigned int i = 0; i < m_SizeX * m_SizeY; i++)
{
m_hM[i] = float(rand()) / float(RAND_MAX);
}
// TO DO: allocate all device resources here
cl_int clError;
m_dM = clCreateBuffer(Context, CL_MEM_READ_ONLY, sizeof(cl_float) * m_SizeX * m_SizeY, NULL, &clError);
V_RETURN_FALSE_CL(clError, "Error creating buffer m_dM");
m_dMR = clCreateBuffer(Context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * m_SizeX * m_SizeY, NULL, &clError);
V_RETURN_FALSE_CL(clError, "Error creating buffer m_dMR");
//load and compile kernels
size_t programSize = 0;
string programCode;
//create program object (this may contain multiple kernel entry points)
if (!CLUtil::LoadProgramSourceToMemory("MatrixRot.cl", programCode))
{
return false;
}
m_Program = CLUtil::BuildCLProgramFromMemory(Device, Context, programCode);
if (m_Program == nullptr) return false;
//Naive Kernel
m_NaiveKernel = clCreateKernel(m_Program, "MatrixRotNaive", &clError);
V_RETURN_FALSE_CL(clError, "Failed to create kernel: MatrixRotNaive");
clError = clSetKernelArg(m_NaiveKernel, 0, sizeof(cl_mem), (void*)&m_dM);
clError |= clSetKernelArg(m_NaiveKernel, 1, sizeof(cl_mem), (void*)&m_dMR);
clError |= clSetKernelArg(m_NaiveKernel, 2, sizeof(cl_uint), (void*)&m_SizeX);
clError |= clSetKernelArg(m_NaiveKernel, 3, sizeof(cl_uint), (void*)&m_SizeY);
V_RETURN_FALSE_CL(clError, "Failed to set kernel args: MatrixRotNaive");
//Optimized kernel
m_OptimizedKernel = clCreateKernel(m_Program, "MatrixRotOptimized", &clError);
V_RETURN_FALSE_CL(clError, "Failed to create kernel: MatrixRotOptimized");
clError = clSetKernelArg(m_OptimizedKernel, 0, sizeof(cl_mem), (void*)&m_dM);
clError |= clSetKernelArg(m_OptimizedKernel, 1, sizeof(cl_mem), (void*)&m_dMR);
clError |= clSetKernelArg(m_OptimizedKernel, 2, sizeof(cl_int), (void*)&m_SizeX);
clError |= clSetKernelArg(m_OptimizedKernel, 3, sizeof(cl_int), (void*)&m_SizeY);
V_RETURN_FALSE_CL(clError, "Failed to set kernel arhs: MatrixRotOptimized");
return true;
}
void CMatrixRotateTask::ReleaseResources()
{
//CPU resources
SAFE_DELETE_ARRAY(m_hM);
SAFE_DELETE_ARRAY(m_hMR);
SAFE_DELETE_ARRAY(m_hGPUResultNaive);
SAFE_DELETE_ARRAY(m_hGPUResultOpt);
// TO DO: release device resources
SAFE_RELEASE_MEMOBJECT(m_dM);
SAFE_RELEASE_MEMOBJECT(m_dMR);
SAFE_RELEASE_PROGRAM(m_Program);
SAFE_RELEASE_KERNEL(m_NaiveKernel);
SAFE_RELEASE_KERNEL(m_OptimizedKernel);
}
void CMatrixRotateTask::ComputeGPU(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
// TO DO: write input data to the GPU
cl_int clError;
clError = clEnqueueWriteBuffer(CommandQueue, m_dM, CL_FALSE, 0, m_SizeX * m_SizeY * sizeof(float), m_hM, 0, NULL, NULL);
V_RETURN_CL(clError, "Error copying data from host to device!");
//launch kernels
// TO DO: determine the necessary number of global work items
size_t globalWorkSize[2];
size_t nGroups[2];
globalWorkSize[0] = CLUtil::GetGlobalWorkSize(m_SizeX, LocalWorkSize[0]);
globalWorkSize[1] = CLUtil::GetGlobalWorkSize(m_SizeY, LocalWorkSize[1]);
nGroups[0] = globalWorkSize[0] / LocalWorkSize[0];
nGroups[1] = globalWorkSize[1] / LocalWorkSize[1];
cout << endl << "Executing (" << globalWorkSize[0] << "/" << globalWorkSize[1] << ") threads in " <<
"(" << nGroups[0] << "/" << nGroups[1] << ") groups of size " <<
"(" << LocalWorkSize[0] << "/" << LocalWorkSize[1] << ")" << endl;
//naive kernel
// TO DO: time = CLUtil::ProfileKernel...
double time = CLUtil::ProfileKernel(CommandQueue, m_NaiveKernel, 2, globalWorkSize, LocalWorkSize, 10000);
cout << "Executed naive kernel in " << time << " ms." << endl;
// TO DO: read back the results synchronously.
//this command has to be blocking, since we want to check the valid data
clError = clEnqueueReadBuffer(CommandQueue, m_dMR, CL_TRUE, 0, m_SizeX * m_SizeY * sizeof(float), m_hGPUResultNaive, 0, NULL, NULL);
V_RETURN_CL(clError, "Error reading Back naive Data!");
//optimized kernel
// TO DO: allocate shared (local) memory for the kernel
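// Passing NULL as the argument value together with a size makes OpenCL reserve that many bytes of
// __local memory per work-group (here: one float for every work-item of the tile).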
clError = clSetKernelArg(m_OptimizedKernel, 4, LocalWorkSize[0] * LocalWorkSize[1] * sizeof(float), NULL);
V_RETURN_CL(clError, "Error allocationg shared memory!");
// run kernel
// TO DO: time = CLUtil::ProfileKernel...
time = CLUtil::ProfileKernel(CommandQueue, m_OptimizedKernel, 2, globalWorkSize, LocalWorkSize, 10000);
cout << "Executed optimized kernel in " << time << " ms." << endl;
// TO DO: read back the data to the host
clError = clEnqueueReadBuffer(CommandQueue, m_dMR, CL_TRUE, 0, m_SizeX * m_SizeY * sizeof(float), m_hGPUResultOpt, 0, NULL, NULL);
V_RETURN_CL(clError, "Error reading Back naive Data!");
}
void CMatrixRotateTask::ComputeCPU()
{
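// 90 degree clockwise rotation: element (x, y) of the SizeX x SizeY input M ends up in
// row x, column (SizeY - 1 - y) of the SizeY-wide result MR.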
for(unsigned int x = 0; x < m_SizeX; x++)
{
for(unsigned int y = 0; y < m_SizeY; y++)
{
m_hMR[ x * m_SizeY + (m_SizeY - y - 1) ] = m_hM[ y * m_SizeX + x ];
}
}
}
bool CMatrixRotateTask::ValidateResults()
{
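// A bitwise comparison is sufficient: the kernels only move values and do no floating-point
// arithmetic, so the GPU results must match the CPU reference exactly.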
if(!(memcmp(m_hMR, m_hGPUResultNaive, m_SizeX * m_SizeY * sizeof(float)) == 0))
{
cout<<"Results of the naive kernel are incorrect!"<<endl;
return false;
}
if(!(memcmp(m_hMR, m_hGPUResultOpt, m_SizeX * m_SizeY * sizeof(float)) == 0))
{
cout<<"Results of the optimized kernel are incorrect!"<<endl;
return false;
}
return true;
}
///////////////////////////////////////////////////////////////////////////////
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _CMATRIX_ROTATE_TASK_H
#define _CMATRIX_ROTATE_TASK_H
#include "../Common/IComputeTask.h"
//! A1/T2: Matrix rotation
class CMatrixRotateTask : public IComputeTask
{
public:
CMatrixRotateTask(size_t SizeX, size_t SizeY);
virtual ~CMatrixRotateTask();
// IComputeTask
virtual bool InitResources(cl_device_id Device, cl_context Context);
virtual void ReleaseResources();
virtual void ComputeGPU(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3]);
virtual void ComputeCPU();
virtual bool ValidateResults();
protected:
//NOTE: we have two memory address spaces, so we mark pointers with a prefix
//to avoid confusions: 'h' - host, 'd' - device
unsigned int m_SizeX;
unsigned int m_SizeY;
//float data on the CPU
//M: original matrix, MR: rotated matrix
float *m_hM, *m_hMR;
//pointers on the GPU
//(result buffers for both kernels)
cl_mem m_dM, m_dMR;
//(..and a pointer to read back the result)
float *m_hGPUResultNaive, *m_hGPUResultOpt;
//OpenCL program and kernels
cl_program m_Program;
cl_kernel m_NaiveKernel;
cl_kernel m_OptimizedKernel;
};
#endif // _CMATRIX_ROTATE_TASK_H
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CSimpleArraysTask.h"
#include "../Common/CLUtil.h"
#include <string.h>
using namespace std;
///////////////////////////////////////////////////////////////////////////////
// CSimpleArraysTask
CSimpleArraysTask::CSimpleArraysTask(size_t ArraySize)
: m_ArraySize(ArraySize)
{
}
CSimpleArraysTask::~CSimpleArraysTask()
{
ReleaseResources();
}
bool CSimpleArraysTask::InitResources(cl_device_id Device, cl_context Context)
{
//CPU resources
m_hA = new int[m_ArraySize];
m_hB = new int[m_ArraySize];
m_hC = new int[m_ArraySize];
m_hGPUResult = new int[m_ArraySize];
//fill A and B with random integers
for(unsigned int i = 0; i < m_ArraySize; i++)
{
m_hA[i] = rand() % 1024;
m_hB[i] = rand() % 1024;
}
//device resources
/////////////////////////////////////////
// Sect. 4.5
//TO DO: allocate arrays!
cl_int clError;
m_dA = clCreateBuffer(Context, CL_MEM_READ_ONLY, sizeof(cl_int) * m_ArraySize, NULL, &clError);
V_RETURN_FALSE_CL(clError, "Error creating buffer A");
m_dB = clCreateBuffer(Context, CL_MEM_READ_ONLY, sizeof(cl_int) * m_ArraySize, NULL, &clError);
V_RETURN_FALSE_CL(clError, "Error creating buffer B");
m_dC = clCreateBuffer(Context, CL_MEM_WRITE_ONLY, sizeof(cl_int) * m_ArraySize, NULL, &clError);
V_RETURN_FALSE_CL(clError, "Error creating buffer C");
/////////////////////////////////////////
// Sect. 4.6.
//TO DO: load and compile kernels
size_t programSize = 0;
string programCode;
//create program object (this may contain multiple kernel entry points)
if (!CLUtil::LoadProgramSourceToMemory("VectorAdd.cl", programCode))
{
cout << "Failed to load ProgramScourceToMemory" << endl;;
return false;
}
if (Device == NULL || Context == NULL) {
cout << "Setup error: invalid device or context" << endl;
}
m_Program = CLUtil::BuildCLProgramFromMemory(Device, Context, programCode);
if (m_Program == nullptr) {
cout << "Failed to build OpenCL Program" << endl<< programCode;;
return false;
}
m_Kernel = clCreateKernel(m_Program, "VecAdd", &clError);
V_RETURN_FALSE_CL(clError, "Failed to create kernel: VecAdd");
//TO DO: bind kernel arguments
clError = clSetKernelArg(m_Kernel, 0, sizeof(cl_mem), (void*)&m_dA);
clError |= clSetKernelArg(m_Kernel, 1, sizeof(cl_mem), (void*)&m_dB);
clError |= clSetKernelArg(m_Kernel, 2, sizeof(cl_mem), (void*)&m_dC);
cl_int numElements = (cl_int)m_ArraySize;
clError |= clSetKernelArg(m_Kernel, 3, sizeof(cl_int), (void*)&numElements);
V_RETURN_FALSE_CL(clError, "Failed to set kernel args: VecAdd");
return true;
}
void CSimpleArraysTask::ReleaseResources()
{
//CPU resources
SAFE_DELETE_ARRAY(m_hA);
SAFE_DELETE_ARRAY(m_hB);
SAFE_DELETE_ARRAY(m_hC);
SAFE_DELETE_ARRAY(m_hGPUResult);
/////////////////////////////////////////////////
// Sect. 4.5., 4.6.
SAFE_RELEASE_MEMOBJECT(m_dA);
SAFE_RELEASE_MEMOBJECT(m_dB);
SAFE_RELEASE_MEMOBJECT(m_dC);
SAFE_RELEASE_PROGRAM(m_Program);
SAFE_RELEASE_KERNEL(m_Kernel);
// TO DO: free resources on the GPU
}
void CSimpleArraysTask::ComputeCPU()
{
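// The reference intentionally adds b in reverse order; the VecAdd kernel does the same,
// so the GPU result can be compared element by element.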
for(unsigned int i = 0; i < m_ArraySize; i++)
{
m_hC[i] = m_hA[i] + m_hB[m_ArraySize - i - 1];
}
}
void CSimpleArraysTask::ComputeGPU(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
/////////////////////////////////////////////////
// Sect. 4.5
// TO DO: Write input data to the GPU
cl_int clError;
clError = clEnqueueWriteBuffer(CommandQueue, m_dA, CL_FALSE, 0, m_ArraySize * sizeof(int), m_hA, 0, NULL, NULL);
clError |= clEnqueueWriteBuffer(CommandQueue, m_dB, CL_FALSE, 0, m_ArraySize * sizeof(int), m_hB, 0, NULL, NULL);
V_RETURN_CL(clError, "Error copying data from host to device!");
/////////////////////////////////////////
// Sect. 4.6.
//execute the kernel: one thread for each element!
// Sect. 4.7.: rewrite the kernel call to use our ProfileKernel()
// utility function to measure execution time.
// Also print out the execution time.
// TO DO: Determine number of thread groups and launch kernel
size_t globalWorksize = CLUtil::GetGlobalWorkSize(m_ArraySize, LocalWorkSize[0]);
size_t nGroups = globalWorksize / LocalWorkSize[0];
cout << "Executing " << globalWorksize << " threads in " << nGroups << " groups of size " << LocalWorkSize[0] << endl;
double time = CLUtil::ProfileKernel(CommandQueue, m_Kernel, 1, &globalWorksize, LocalWorkSize, 10000);
cout << "Average kernel Time: " << time << endl;
// TO DO: read back results synchronously.
//This command has to be blocking, since we need the data
clError = clEnqueueReadBuffer(CommandQueue, m_dC, CL_TRUE, 0, m_ArraySize * sizeof(int), m_hGPUResult, 0, NULL, NULL);
V_RETURN_CL(clError, "Error reading Back Data!");
}
bool CSimpleArraysTask::ValidateResults()
{
return (memcmp(m_hC, m_hGPUResult, m_ArraySize * sizeof(int)) == 0);
}
///////////////////////////////////////////////////////////////////////////////
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _CSIMPLE_ARRAYS_TASK_H
#define _CSIMPLE_ARRAYS_TASK_H
#include "../Common/IComputeTask.h"
//! A1/T1: Simple vector addition
class CSimpleArraysTask : public IComputeTask
{
public:
CSimpleArraysTask(size_t ArraySize);
virtual ~CSimpleArraysTask();
// IComputeTask
virtual bool InitResources(cl_device_id Device, cl_context Context);
virtual void ReleaseResources();
virtual void ComputeGPU(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3]);
virtual void ComputeCPU();
virtual bool ValidateResults();
protected:
//NOTE: we have two memory address spaces, so we mark pointers with a prefix
//to avoid confusions: 'h' - host, 'd' - device
//number of array elements
size_t m_ArraySize = 0;
//integer arrays on the CPU
int *m_hA = nullptr, *m_hB = nullptr, *m_hC = nullptr;
//integer arrays on the GPU (and a buffer to read the result back to the host)
cl_mem m_dA = nullptr, m_dB = nullptr, m_dC = nullptr;
int *m_hGPUResult = nullptr;
//OpenCL program and kernels
cl_program m_Program = nullptr;
cl_kernel m_Kernel = nullptr;
};
#endif // _CSIMPLE_ARRAYS_TASK_H
// Rotate the matrix CLOCKWISE
//naive implementation: move the elements of the matrix directly to their destinations
//this will cause uncoalesced memory accesses which - as we will see - should be avoided on the GPU
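// (consecutive work-items read consecutive elements of M, but their writes to MR are SizeY floats apart)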
__kernel void MatrixRotNaive(__global const float* M, __global float* MR, uint SizeX, uint SizeY)
{
int2 GID;
GID.x = get_global_id(0);
GID.y = get_global_id(1);
if (GID.x < SizeX && GID.x >= 0
&& GID.y < SizeY && GID.y >= 0){
MR[GID.x * SizeY + (SizeY - GID.y - 1)] = M[GID.y * SizeX + GID.x];
}
}
//this kernel does the same thing; however, local memory is used to
//stage a small tile of the matrix locally,
//which is then written back after synchronization in a coalesced access pattern
__kernel void MatrixRotOptimized(__global const float* M, __global float* MR, uint SizeX, uint SizeY,
__local float* block)
{
int2 GID;
GID.x = get_global_id(0);
GID.y = get_global_id(1);
int2 LID;
LID.x = get_local_id(0);
LID.y = get_local_id(1);
if (GID.x < SizeX && GID.x >= 0
&& GID.y < SizeY && GID.y >= 0){
int blockindex = LID.y * get_local_size(0) + LID.x;
block[blockindex] = M[GID.y * SizeX + GID.x];
barrier(CLK_LOCAL_MEM_FENCE);
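// Re-index the work-items within the tile: consecutive work-items now walk down a column of the tile
// (newLID.y changes fastest), so the writes to MR below hit adjacent addresses and are coalesced.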
int2 newLID;
newLID.x = blockindex / get_local_size(1);
newLID.y = blockindex % get_local_size(1);
int2 newGID = GID - LID + newLID;
if (newGID.x < SizeX && newGID.x >= 0
&& newGID.y < SizeY && newGID.y >= 0){
MR[newGID.x * SizeY + (SizeY - newGID.y - 1)] = block[newLID.y * get_local_size(0) + newLID.x];
}
}
}
// TO DO: Add kernel code function
__kernel void VecAdd(__global const int* a, __global const int* b, __global int* c, int numElements){
int GID = get_global_id(0);
if (GID < numElements && GID >= 0){
c[GID] = a[GID] + b[numElements - GID-1];
}
}
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CAssignment1.h"
#include <iostream>
using namespace std;
int main(int argc, char** argv)
{
CAssignment1 myAssignment;
myAssignment.EnterMainLoop(argc, argv);
#ifdef _MSC_VER
cout << "Press 'Enter'..." << endl;
cin.get();
#endif
}
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CAssignmentBase.h"
#include "CLUtil.h"
#include "CTimer.h"
#include <vector>
using namespace std;
#if defined (__APPLE__) || defined(MACOSX)
#define GL_SHARING_EXTENSION "cl_APPLE_gl_sharing"
#else
#define GL_SHARING_EXTENSION "cl_khr_gl_sharing"
#endif
// required for OpenGL interop
#ifdef _WIN32
#include <windows.h>
#endif
#ifdef linux
#if defined (__APPLE__) || defined(MACOSX)
#include <OpenGL/OpenGL.h>
#else
#include <GL/glx.h>
#endif
#endif
///////////////////////////////////////////////////////////////////////////////
// CAssignmentBase
CAssignmentBase::CAssignmentBase()
: m_CLPlatform(nullptr), m_CLDevice(nullptr), m_CLContext(nullptr), m_CLCommandQueue(nullptr)
{
}
CAssignmentBase::~CAssignmentBase()
{
ReleaseCLContext();
}
bool CAssignmentBase::EnterMainLoop(int, char**)
{
if(!InitCLContext())
return false;
bool success = DoCompute();
ReleaseCLContext();
return success;
}
#define PRINT_INFO(title, buffer, bufferSize, maxBufferSize, expr) { expr; buffer[bufferSize] = '\0'; std::cout << title << ": " << buffer << std::endl; }
bool CAssignmentBase::InitCLContext()
{
//////////////////////////////////////////////////////
//(Sect 4.3)
// 1. get all platform IDs
std::vector<cl_platform_id> platformIds;
const cl_uint c_MaxPlatforms = 16;
platformIds.resize(c_MaxPlatforms);
cl_uint countPlatforms;
V_RETURN_FALSE_CL(clGetPlatformIDs(c_MaxPlatforms, &platformIds[0], &countPlatforms), "Failed to get CL platform ID");
platformIds.resize(countPlatforms);
// 2. find all available GPU devices
std::vector<cl_device_id> deviceIds;
const int maxDevices = 16;
deviceIds.resize(maxDevices);
int countAllDevices = 0;
// Searching for the graphics device with the most dedicated video memory.
cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
cl_ulong maxGlobalMemorySize = 0;
cl_device_id bestDeviceId = NULL;
for (size_t i = 0; i < platformIds.size(); i++)
{
// Getting the available devices.
cl_uint countDevices = 0;
clGetDeviceIDs(platformIds[i], deviceType, (cl_uint)(maxDevices - countAllDevices), &deviceIds[countAllDevices], &countDevices);
for (size_t j = 0; j < countDevices; j++)
{
cl_device_id currentDeviceId = deviceIds[countAllDevices + j];
cl_ulong globalMemorySize;
cl_bool isUsingUnifiedMemory;
clGetDeviceInfo(currentDeviceId, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &globalMemorySize, NULL);
clGetDeviceInfo(currentDeviceId, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), &isUsingUnifiedMemory, NULL);
if (!isUsingUnifiedMemory && globalMemorySize > maxGlobalMemorySize)
{
bestDeviceId = currentDeviceId;
maxGlobalMemorySize = globalMemorySize;
}
}
countAllDevices += countDevices;
}
deviceIds.resize(countAllDevices);
if (countAllDevices == 0)
{
std::cout << "No device of the selected type with OpenCL support was found.";
return false;
}
// No discrete graphics device was found: falling back to the first found device.
if (bestDeviceId == NULL)
{
bestDeviceId = deviceIds[0];
}
// Choosing the first available device.
m_CLDevice = bestDeviceId;
clGetDeviceInfo(m_CLDevice, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &m_CLPlatform, NULL);
// Printing platform and device data.
const int maxBufferSize = 1024;
char buffer[maxBufferSize];
size_t bufferSize;
std::cout << "OpenCL platform:" << std::endl << std::endl;
PRINT_INFO("Name", buffer, bufferSize, maxBufferSize, clGetPlatformInfo(m_CLPlatform, CL_PLATFORM_NAME, maxBufferSize, (void*)buffer, &bufferSize));
PRINT_INFO("Vendor", buffer, bufferSize, maxBufferSize, clGetPlatformInfo(m_CLPlatform, CL_PLATFORM_VENDOR, maxBufferSize, (void*)buffer, &bufferSize));
PRINT_INFO("Version", buffer, bufferSize, maxBufferSize, clGetPlatformInfo(m_CLPlatform, CL_PLATFORM_VERSION, maxBufferSize, (void*)buffer, &bufferSize));
PRINT_INFO("Profile", buffer, bufferSize, maxBufferSize, clGetPlatformInfo(m_CLPlatform, CL_PLATFORM_PROFILE, maxBufferSize, (void*)buffer, &bufferSize));
std::cout << std::endl << "Device:" << std::endl << std::endl;
PRINT_INFO("Name", buffer, bufferSize, maxBufferSize, clGetDeviceInfo(m_CLDevice, CL_DEVICE_NAME, maxBufferSize, (void*)buffer, &bufferSize));
PRINT_INFO("Vendor", buffer, bufferSize, maxBufferSize, clGetDeviceInfo(m_CLDevice, CL_DEVICE_VENDOR, maxBufferSize, (void*)buffer, &bufferSize));
PRINT_INFO("Driver version", buffer, bufferSize, maxBufferSize, clGetDeviceInfo(m_CLDevice, CL_DRIVER_VERSION, maxBufferSize, (void*)buffer, &bufferSize));
cl_ulong localMemorySize;
clGetDeviceInfo(m_CLDevice, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &localMemorySize, &bufferSize);
std::cout << "Local memory size: " << localMemorySize << " Byte" << std::endl;
std::cout << std::endl << "******************************" << std::endl << std::endl;
// TO DO:
// Create a new OpenCL context on the selected device.
cl_int clError;
m_CLContext = clCreateContext(0, 1, &m_CLDevice, NULL, NULL, &clError);
V_RETURN_FALSE_CL(clError, "Failed to create OpenCL context.");
// Finally, create a command queue. All the asynchronous commands to the device will be issued
// from the CPU into this queue. This way the host program can continue the execution until some results
// from that device are needed.
// TODO : Create command queue
m_CLCommandQueue = clCreateCommandQueue(m_CLContext, m_CLDevice, 0, &clError);
V_RETURN_FALSE_CL(clError, "Failed to create the command queue in the context");
cout << "OpenCL Context Created" << endl;
return true;
}
void CAssignmentBase::ReleaseCLContext()
{
// TO DO: release the command queue and the context!
cout << "OpenCL Context Cleanup" << endl;
if (m_CLCommandQueue != nullptr)
{
clReleaseCommandQueue(m_CLCommandQueue);
m_CLCommandQueue = nullptr;
}
if (m_CLContext != nullptr)
{
clReleaseContext(m_CLContext);
m_CLContext = nullptr;
}
}
bool CAssignmentBase::RunComputeTask(IComputeTask& Task, size_t LocalWorkSize[3])
{
if(m_CLContext == nullptr)
{
std::cerr << "Error: RunComputeTask() cannot execute because the OpenCL context has not been created first." << endl;
return false;
}
if(!Task.InitResources(m_CLDevice, m_CLContext))
{
std::cerr << "Error during resource allocation. Aborting execution." <<endl;
Task.ReleaseResources();
return false;
}
// Compute the golden result.
cout << "Computing CPU reference result...";
Task.ComputeCPU();
cout << "DONE" << endl;
// Running the same task on the GPU.
cout << "Computing GPU result...";
// Running the kernel N times makes the measurement of the execution time more accurate.
Task.ComputeGPU(m_CLContext, m_CLCommandQueue, LocalWorkSize);
cout << "DONE" << endl;
// Validating results.
if (Task.ValidateResults())
{
cout << "GOLD TEST PASSED!" << endl;
}
else
{
cout << "INVALID RESULTS!" << endl;
}
// Cleaning up.
Task.ReleaseResources();
return true;
}
///////////////////////////////////////////////////////////////////////////////
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _CASSIGNMENT_BASE_H
#define _CASSIGNMENT_BASE_H
#include "IComputeTask.h"
#include "CommonDefs.h"
//! Base class for all assignments
/*!
Inherit a new class for each specific assignment.
This class is abstract.
Usage of class: from your main CPP you typically call
EnterMainLoop(). This returns when the assignment is finished.
Internally the assignment class should initialize the context,
run one or more compute tasks and then release the context.
*/
class CAssignmentBase
{
public:
CAssignmentBase();
virtual ~CAssignmentBase();
//! Main loop. You only need to overload this if you do some rendering in your assignment.
virtual bool EnterMainLoop(int argc, char** argv);
//! You need to overload this to define a specific behavior for your assignments
virtual bool DoCompute() = 0;
protected:
virtual bool InitCLContext();
virtual void ReleaseCLContext();
virtual bool RunComputeTask(IComputeTask& Task, size_t LocalWorkSize[3]);
cl_platform_id m_CLPlatform;
cl_device_id m_CLDevice;
cl_context m_CLContext;
cl_command_queue m_CLCommandQueue;
};
#endif // _CASSIGNMENT_BASE_H
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CLUtil.h"
#include "CTimer.h"
#include <iostream>
#include <fstream>
using namespace std;
///////////////////////////////////////////////////////////////////////////////
// CLUtil
size_t CLUtil::GetGlobalWorkSize(size_t DataElemCount, size_t LocalWorkSize)
{
// TO DO: replace with correct work group sizing code here (Sec. 4.6)
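// Round DataElemCount up to the next multiple of LocalWorkSize so the global work size splits into
// full work-groups (the kernels guard against the extra work-items).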
size_t r = DataElemCount % LocalWorkSize;
if (r == 0) {
return DataElemCount;
}
else
{
return DataElemCount + LocalWorkSize - r;
}
}
bool CLUtil::LoadProgramSourceToMemory(const std::string& Path, std::string& SourceCode)
{
ifstream sourceFile;
sourceFile.open(Path.c_str());
if (!sourceFile.is_open())
{
cerr << "Failed to open file '" << Path << "'." << endl;
return false;
}
// read the entire file into a string
sourceFile.seekg(0, ios::end);
ifstream::pos_type fileSize = sourceFile.tellg();
sourceFile.seekg(0, ios::beg);
SourceCode.clear();
SourceCode.resize((size_t)fileSize);
sourceFile.read(&SourceCode[0], fileSize);
return true;
}
cl_program CLUtil::BuildCLProgramFromMemory(cl_device_id Device, cl_context Context, const std::string& SourceCode, const std::string& CompileOptions)
{
// Ignore the last parameter CompileOptions in assignment 1
// This may be used later to pass flags and macro definitions to the OpenCL compiler
CTimer timer;
timer.Start();
const char* src = SourceCode.c_str();
size_t length = SourceCode.size();
cl_int clError;
cl_program prog = clCreateProgramWithSource(Context, 1, &src, &length, &clError);
if (CL_SUCCESS != clError)
{
cerr << "Faild to create CL programm from scource.";
return nullptr;
}
clError = clBuildProgram(prog, 1, &Device, NULL, NULL, NULL);
if (CL_SUCCESS != clError)
{
PrintBuildLog(prog, Device);
cerr << "Faild to build CL program.";
SAFE_RELEASE_PROGRAM(prog);
return nullptr;
}
timer.Stop();
cout << "Build completed in " << timer.GetElapsedMilliseconds() << "ms" << endl << endl;
return prog;
}
void CLUtil::PrintBuildLog(cl_program Program, cl_device_id Device)
{
cl_build_status buildStatus;
clGetProgramBuildInfo(Program, Device, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &buildStatus, NULL);
// let's print out possible warnings even if the kernel compiled..
//if(buildStatus == CL_SUCCESS)
// return;
//query and print the build log (it may contain warnings even if the build succeeded)
size_t logSize;
clGetProgramBuildInfo(Program, Device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
string buildLog(logSize, ' ');
clGetProgramBuildInfo(Program, Device, CL_PROGRAM_BUILD_LOG, logSize, &buildLog[0], NULL);
// the returned log already contains its null terminator within logSize
if(buildStatus != CL_SUCCESS)
cout<<"There were build errors!"<<endl;
cout<<"Build log:"<<endl;
cout<<buildLog<<endl;
}
double CLUtil::ProfileKernel(cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint Dimensions,
const size_t* pGlobalWorkSize, const size_t* pLocalWorkSize, int NIterations)
{
CTimer timer;
cl_int clError;
clError = clFinish(CommandQueue);
if (clError != CL_SUCCESS) {
cout << "Finish Commad Queue error: " << CLUtil::GetCLErrorString(clError) << endl;
return -1;
}
timer.Start();
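// Enqueue the kernel NIterations times without waiting in between; the clFinish below blocks until
// all launches have completed, so the timer measures the accumulated device execution time.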
for (int i = 0; i < NIterations; i++)
{
clError |= clEnqueueNDRangeKernel(CommandQueue, Kernel, Dimensions, NULL, pGlobalWorkSize, pLocalWorkSize, 0, NULL, NULL);
}
if (clError != CL_SUCCESS) {
cout << "Kernel execution failure: " << CLUtil::GetCLErrorString(clError) << endl;
return -1;
}
clError = clFinish(CommandQueue);
timer.Stop();
if (clError != CL_SUCCESS) {
cout << "Finish Commad Queue 2 error:" << CLUtil::GetCLErrorString(clError) << endl;
return -1;
}
return (timer.GetElapsedMilliseconds() / double(NIterations));
}
#define CL_ERROR(x) case (x): return #x;
const char* CLUtil::GetCLErrorString(cl_int CLErrorCode)
{
switch(CLErrorCode)
{
CL_ERROR(CL_SUCCESS);
CL_ERROR(CL_DEVICE_NOT_FOUND);
CL_ERROR(CL_DEVICE_NOT_AVAILABLE);
CL_ERROR(CL_COMPILER_NOT_AVAILABLE);
CL_ERROR(CL_MEM_OBJECT_ALLOCATION_FAILURE);
CL_ERROR(CL_OUT_OF_RESOURCES);
CL_ERROR(CL_OUT_OF_HOST_MEMORY);
CL_ERROR(CL_PROFILING_INFO_NOT_AVAILABLE);
CL_ERROR(CL_MEM_COPY_OVERLAP);
CL_ERROR(CL_IMAGE_FORMAT_MISMATCH);
CL_ERROR(CL_IMAGE_FORMAT_NOT_SUPPORTED);
CL_ERROR(CL_BUILD_PROGRAM_FAILURE);
CL_ERROR(CL_MAP_FAILURE);
CL_ERROR(CL_INVALID_VALUE);
CL_ERROR(CL_INVALID_DEVICE_TYPE);
CL_ERROR(CL_INVALID_PLATFORM);
CL_ERROR(CL_INVALID_DEVICE);
CL_ERROR(CL_INVALID_CONTEXT);
CL_ERROR(CL_INVALID_QUEUE_PROPERTIES);
CL_ERROR(CL_INVALID_COMMAND_QUEUE);
CL_ERROR(CL_INVALID_HOST_PTR);
CL_ERROR(CL_INVALID_MEM_OBJECT);
CL_ERROR(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
CL_ERROR(CL_INVALID_IMAGE_SIZE);
CL_ERROR(CL_INVALID_SAMPLER);
CL_ERROR(CL_INVALID_BINARY);
CL_ERROR(CL_INVALID_BUILD_OPTIONS);
CL_ERROR(CL_INVALID_PROGRAM);
CL_ERROR(CL_INVALID_PROGRAM_EXECUTABLE);
CL_ERROR(CL_INVALID_KERNEL_NAME);
CL_ERROR(CL_INVALID_KERNEL_DEFINITION);
CL_ERROR(CL_INVALID_KERNEL);
CL_ERROR(CL_INVALID_ARG_INDEX);
CL_ERROR(CL_INVALID_ARG_VALUE);
CL_ERROR(CL_INVALID_ARG_SIZE);
CL_ERROR(CL_INVALID_KERNEL_ARGS);
CL_ERROR(CL_INVALID_WORK_DIMENSION);
CL_ERROR(CL_INVALID_WORK_GROUP_SIZE);
CL_ERROR(CL_INVALID_WORK_ITEM_SIZE);
CL_ERROR(CL_INVALID_GLOBAL_OFFSET);
CL_ERROR(CL_INVALID_EVENT_WAIT_LIST);
CL_ERROR(CL_INVALID_EVENT);
CL_ERROR(CL_INVALID_OPERATION);
CL_ERROR(CL_INVALID_GL_OBJECT);
CL_ERROR(CL_INVALID_BUFFER_SIZE);
CL_ERROR(CL_INVALID_MIP_LEVEL);
default:
return "Unknown error code";
}
}
///////////////////////////////////////////////////////////////////////////////
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef CL_UTIL_H
#define CL_UTIL_H
// All OpenCL headers
#if defined(WIN32)
#include <CL/opencl.h>
#elif defined (__APPLE__) || defined(MACOSX)
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#include "CommonDefs.h"
#include <string>
#include <iostream>
#include <algorithm>
//! Utility class for frequently-needed OpenCL tasks
// TO DO: replace this with a nicer OpenCL wrapper
class CLUtil
{
public:
//! Determines the OpenCL global work size given the number of data elements and threads per workgroup
static size_t GetGlobalWorkSize(size_t DataElemCount, size_t LocalWorkSize);
//! Loads a program source to memory as a string
static bool LoadProgramSourceToMemory(const std::string& Path, std::string& SourceCode);
//! Builds a CL program
static cl_program BuildCLProgramFromMemory(cl_device_id Device, cl_context Context, const std::string& SourceCode, const std::string& CompileOptions = "");
static void PrintBuildLog(cl_program Program, cl_device_id Device);
//! Measures the execution time of a kernel by executing it N times and returning the average time in milliseconds.
/*!
The scheduling cost of the kernel can be amortized if we enqueue
the kernel multiple times. If your kernel is simple and fast, use a high number of iterations!
*/
static double ProfileKernel(cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint Dimensions,
const size_t* pGlobalWorkSize, const size_t* pLocalWorkSize, int NIterations);
static const char* GetCLErrorString(cl_int CLErrorCode);
};
// Some useful shortcuts for handling pointers and validating function calls
#define V_RETURN_FALSE_CL(expr, errmsg) do {cl_int e=(expr);if(CL_SUCCESS!=e){std::cerr<<"Error: "<<errmsg<<" ["<<CLUtil::GetCLErrorString(e)<<"]"<<std::endl; return false; }} while(0)
#define V_RETURN_0_CL(expr, errmsg) do {cl_int e=(expr);if(CL_SUCCESS!=e){std::cerr<<"Error: "<<errmsg<<" ["<<CLUtil::GetCLErrorString(e)<<"]"<<std::endl; return 0; }} while(0)
#define V_RETURN_CL(expr, errmsg) do {cl_int e=(expr);if(CL_SUCCESS!=e){std::cerr<<"Error: "<<errmsg<<" ["<<CLUtil::GetCLErrorString(e)<<"]"<<std::endl; return; }} while(0)
#define SAFE_DELETE(ptr) do {if(ptr){ delete ptr; ptr = NULL; }} while(0)
#define SAFE_DELETE_ARRAY(x) do {if(x){delete [] x; x = NULL;}} while(0)
#define SAFE_RELEASE_KERNEL(ptr) do {if(ptr){ clReleaseKernel(ptr); ptr = NULL; }} while(0)
#define SAFE_RELEASE_PROGRAM(ptr) do {if(ptr){ clReleaseProgram(ptr); ptr = NULL; }} while(0)
#define SAFE_RELEASE_MEMOBJECT(ptr) do {if(ptr){ clReleaseMemObject(ptr); ptr = NULL; }} while(0)
#define SAFE_RELEASE_SAMPLER(ptr) do {if(ptr){ clReleaseSampler(ptr); ptr = NULL; }} while(0)
#define ARRAYLEN(a) (sizeof(a)/sizeof(a[0]))
#endif // CL_UTIL_H
FILE(GLOB CommonSources *.cpp)
FILE(GLOB CommonHeaders *.h)
add_library(GPUCommon
${CommonSources}
${CommonHeaders}
)
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CTimer.h"
///////////////////////////////////////////////////////////////////////////////
// CTimer
void CTimer::Start()
{
#ifdef _WIN32
QueryPerformanceCounter(&m_StartTime);
#else
gettimeofday(&m_StartTime, NULL);
#endif
}
void CTimer::Stop()
{
#ifdef _WIN32
QueryPerformanceCounter(&m_EndTime);
#else
gettimeofday(&m_EndTime, NULL);
#endif
}
double CTimer::GetElapsedMilliseconds()
{
#ifdef _WIN32
LARGE_INTEGER freq;
if(QueryPerformanceFrequency(&freq))
{
return 1000.0 * double(m_EndTime.QuadPart - m_StartTime.QuadPart) / double(freq.QuadPart);
}
else
{
return -1;
}
#else
double delta = ((double)m_EndTime.tv_sec + 1.0e-6 * (double)m_EndTime.tv_usec) -
((double)m_StartTime.tv_sec + 1.0e-6 * (double)m_StartTime.tv_usec);
return 1000.0 * delta;
#endif
}
///////////////////////////////////////////////////////////////////////////////
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _CTIMER_H
#define _CTIMER_H
//Simple wrapper class that can be used to measure time intervals
//using the built-in precision timer of the OS
// We reverted from std::chrono, because that timer implementation seems to be very imprecise
// (at least under windows)
#ifdef _WIN32
#include <Windows.h>
#elif defined (__APPLE__) || defined(MACOSX)
#include <sys/time.h>
#else
#include <sys/time.h>
#include <time.h>
#endif
//! Simple wrapper class for the measurement of time intervals
/*!
Use this timer to measure elapsed time on the HOST side.
Not suitable for measuring the execution of DEVICE code
without synchronization with the HOST.
NOTE: This class is not thread-safe (like most other classes in these
examples), but we are not doing CPU multithreading in the praktikum...
*/
class CTimer
{
public:
CTimer(){};
~CTimer(){};
void Start();
void Stop();
//! Returns the elapsed time between Start() and Stop() in ms.
double GetElapsedMilliseconds();
protected:
#ifdef _WIN32
LARGE_INTEGER m_StartTime;
LARGE_INTEGER m_EndTime;
#else
struct timeval m_StartTime;
struct timeval m_EndTime;
#endif
};
#endif // _CTIMER_H
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _ICOMPUTE_TASK_H
#define _ICOMPUTE_TASK_H
// All OpenCL headers
#if defined(WIN32)
#include <CL/opencl.h>
#elif defined (__APPLE__) || defined(MACOSX)
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#include "CommonDefs.h"
//! Common interface for the tasks within the assignment.
/*!
Inherit a new class for each computing task.
(There are usually more tasks in each assignment).
*/
class IComputeTask
{
public:
virtual ~IComputeTask() {};
//! Init any resources specific to the current task
virtual bool InitResources(cl_device_id Device, cl_context Context) = 0;
//! Release everything allocated in InitResources()
virtual void ReleaseResources() = 0;
//! Perform calculations on the GPU
virtual void ComputeGPU(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3]) = 0;
//! Compute the "golden" solution on the CPU. The GPU results must be equal to this reference
virtual void ComputeCPU() = 0;
//! Compare the GPU solution to the "golden" solution
virtual bool ValidateResults() = 0;
};
#endif // _ICOMPUTE_TASK_H
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _IGUI_ENABLED_COMPUTE_TASK_H
#define _IGUI_ENABLED_COMPUTE_TASK_H
#include "IComputeTask.h"
//! Common interface for tasks that have an OpenGL UI
/*!
Currently we only use this interface in Assignment4
to perform GL rendering and respond to user input with keyboard and mouse.
*/
class IGUIEnabledComputeTask : public IComputeTask
{
public:
virtual ~IGUIEnabledComputeTask() {};
// OpenGL render callback
virtual void Render() = 0;
virtual void OnKeyboard(int Key, int Action) = 0;
virtual void OnMouse(int Button, int Action) = 0;
virtual void OnMouseMove(int X, int Y) = 0;
virtual void OnIdle(double Time, float ElapsedTime) = 0;
virtual void OnWindowResized(int Width, int Height) = 0;
};
#endif // _IGUI_ENABLED_COMPUTE_TASK_H
<?xml version="1.0" encoding="UTF-8"?>
<Project ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<CustomBuild Include="D:\Projekte\GPGPU\Assignment1\Assignment1\CMakeLists.txt" />
</ItemGroup>
<ItemGroup>
</ItemGroup>
</Project>
<?xml version="1.0" encoding="UTF-8"?>
<Project ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="D:\Projekte\GPGPU\Assignment1\Assignment1\CAssignment1.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment1\Assignment1\CMatrixRotateTask.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment1\Assignment1\CSimpleArraysTask.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment1\Assignment1\main.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="D:\Projekte\GPGPU\Assignment1\Assignment1\CAssignment1.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment1\Assignment1\CMatrixRotateTask.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment1\Assignment1\CSimpleArraysTask.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="D:\Projekte\GPGPU\Assignment1\Assignment1\CMakeLists.txt" />
</ItemGroup>
<ItemGroup>
<None Include="D:\Projekte\GPGPU\Assignment1\Assignment1\MatrixRot.cl" />
<None Include="D:\Projekte\GPGPU\Assignment1\Assignment1\VectorAdd.cl" />
</ItemGroup>
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{CE4C157A-7EF7-35EC-A10B-2C11A3407C33}</UniqueIdentifier>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{C051DAFD-DC0D-31B5-9E03-42F156CAF70F}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>
<?xml version="1.0" encoding="UTF-8"?>
<Project ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="D:\Projekte\GPGPU\Assignment1\Common\CAssignmentBase.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment1\Common\CLUtil.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment1\Common\CTimer.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="D:\Projekte\GPGPU\Assignment1\Common\CAssignmentBase.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment1\Common\CLUtil.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment1\Common\CTimer.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment1\Common\CommonDefs.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment1\Common\IComputeTask.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment1\Common\IGUIEnabledComputeTask.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="D:\Projekte\GPGPU\Assignment1\Common\CMakeLists.txt" />
</ItemGroup>
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{CE4C157A-7EF7-35EC-A10B-2C11A3407C33}</UniqueIdentifier>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{C051DAFD-DC0D-31B5-9E03-42F156CAF70F}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 14
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ALL_BUILD", "ALL_BUILD.vcxproj", "{C32994AE-6437-3C3C-B8D8-301A1C2D51F2}"
ProjectSection(ProjectDependencies) = postProject
{1B53DB6D-1FC8-374D-86DB-9A5FAF6BEA3F} = {1B53DB6D-1FC8-374D-86DB-9A5FAF6BEA3F}
{741C1C53-4D7B-37C0-A073-FA93D361DCCB} = {741C1C53-4D7B-37C0-A073-FA93D361DCCB}
{74CEE071-D828-365A-9299-E7A045EDA812} = {74CEE071-D828-365A-9299-E7A045EDA812}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Assignment", "Assignment.vcxproj", "{1B53DB6D-1FC8-374D-86DB-9A5FAF6BEA3F}"
ProjectSection(ProjectDependencies) = postProject
{741C1C53-4D7B-37C0-A073-FA93D361DCCB} = {741C1C53-4D7B-37C0-A073-FA93D361DCCB}
{74CEE071-D828-365A-9299-E7A045EDA812} = {74CEE071-D828-365A-9299-E7A045EDA812}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "GPUCommon", "Common\GPUCommon.vcxproj", "{741C1C53-4D7B-37C0-A073-FA93D361DCCB}"
ProjectSection(ProjectDependencies) = postProject
{74CEE071-D828-365A-9299-E7A045EDA812} = {74CEE071-D828-365A-9299-E7A045EDA812}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ZERO_CHECK", "ZERO_CHECK.vcxproj", "{74CEE071-D828-365A-9299-E7A045EDA812}"
ProjectSection(ProjectDependencies) = postProject
EndProjectSection
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
MinSizeRel|x64 = MinSizeRel|x64
RelWithDebInfo|x64 = RelWithDebInfo|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{C32994AE-6437-3C3C-B8D8-301A1C2D51F2}.Debug|x64.ActiveCfg = Debug|x64
{C32994AE-6437-3C3C-B8D8-301A1C2D51F2}.Release|x64.ActiveCfg = Release|x64
{C32994AE-6437-3C3C-B8D8-301A1C2D51F2}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{C32994AE-6437-3C3C-B8D8-301A1C2D51F2}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{1B53DB6D-1FC8-374D-86DB-9A5FAF6BEA3F}.Debug|x64.ActiveCfg = Debug|x64
{1B53DB6D-1FC8-374D-86DB-9A5FAF6BEA3F}.Debug|x64.Build.0 = Debug|x64
{1B53DB6D-1FC8-374D-86DB-9A5FAF6BEA3F}.Release|x64.ActiveCfg = Release|x64
{1B53DB6D-1FC8-374D-86DB-9A5FAF6BEA3F}.Release|x64.Build.0 = Release|x64
{1B53DB6D-1FC8-374D-86DB-9A5FAF6BEA3F}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{1B53DB6D-1FC8-374D-86DB-9A5FAF6BEA3F}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{1B53DB6D-1FC8-374D-86DB-9A5FAF6BEA3F}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{1B53DB6D-1FC8-374D-86DB-9A5FAF6BEA3F}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{741C1C53-4D7B-37C0-A073-FA93D361DCCB}.Debug|x64.ActiveCfg = Debug|x64
{741C1C53-4D7B-37C0-A073-FA93D361DCCB}.Debug|x64.Build.0 = Debug|x64
{741C1C53-4D7B-37C0-A073-FA93D361DCCB}.Release|x64.ActiveCfg = Release|x64
{741C1C53-4D7B-37C0-A073-FA93D361DCCB}.Release|x64.Build.0 = Release|x64
{741C1C53-4D7B-37C0-A073-FA93D361DCCB}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{741C1C53-4D7B-37C0-A073-FA93D361DCCB}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{741C1C53-4D7B-37C0-A073-FA93D361DCCB}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{741C1C53-4D7B-37C0-A073-FA93D361DCCB}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{74CEE071-D828-365A-9299-E7A045EDA812}.Debug|x64.ActiveCfg = Debug|x64
{74CEE071-D828-365A-9299-E7A045EDA812}.Debug|x64.Build.0 = Debug|x64
{74CEE071-D828-365A-9299-E7A045EDA812}.Release|x64.ActiveCfg = Release|x64
{74CEE071-D828-365A-9299-E7A045EDA812}.Release|x64.Build.0 = Release|x64
{74CEE071-D828-365A-9299-E7A045EDA812}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{74CEE071-D828-365A-9299-E7A045EDA812}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{74CEE071-D828-365A-9299-E7A045EDA812}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{74CEE071-D828-365A-9299-E7A045EDA812}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
EndGlobalSection
GlobalSection(ExtensibilityAddIns) = postSolution
EndGlobalSection
EndGlobal
<?xml version="1.0" encoding="UTF-8"?>
<Project ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<CustomBuild Include="D:\Projekte\GPGPU\Assignment1\build\CMakeFiles\f165d8b59505599d15f3128f188e69ce\generate.stamp.rule">
<Filter>CMake Rules</Filter>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<Filter Include="CMake Rules">
<UniqueIdentifier>{6664886B-6441-3DF0-A4E9-836DB02EDD54}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>
#if defined(__arm__) || defined(__TARGET_ARCH_ARM)
#if defined(__ARM_ARCH_7__) \
|| defined(__ARM_ARCH_7A__) \
|| defined(__ARM_ARCH_7R__) \
|| defined(__ARM_ARCH_7M__) \
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 7)
#error cmake_ARCH armv7
#elif defined(__ARM_ARCH_6__) \
|| defined(__ARM_ARCH_6J__) \
|| defined(__ARM_ARCH_6T2__) \
|| defined(__ARM_ARCH_6Z__) \
|| defined(__ARM_ARCH_6K__) \
|| defined(__ARM_ARCH_6ZK__) \
|| defined(__ARM_ARCH_6M__) \
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 6)
#error cmake_ARCH armv6
#elif defined(__ARM_ARCH_5TEJ__) \
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 5)
#error cmake_ARCH armv5
#else
#error cmake_ARCH arm
#endif
#elif defined(__i386) || defined(__i386__) || defined(_M_IX86)
#error cmake_ARCH i386
#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64)
#error cmake_ARCH x86_64
#elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
#error cmake_ARCH ia64
#elif defined(__ppc__) || defined(__ppc) || defined(__powerpc__) \
|| defined(_ARCH_COM) || defined(_ARCH_PWR) || defined(_ARCH_PPC) \
|| defined(_M_MPPC) || defined(_M_PPC)
#if defined(__ppc64__) || defined(__powerpc64__) || defined(__64BIT__)
#error cmake_ARCH ppc64
#else
#error cmake_ARCH ppc
#endif
#endif
#error cmake_ARCH unknown
function(change_workingdir EXE WorkingDir)
#add a user file to auto config the working directory for debugging
if (MSVC)
set(Platform "Win32")
if (CMAKE_CL_64)
set(Platform "x64")
endif (CMAKE_CL_64)
configure_file (
${CMAKE_SOURCE_DIR}/../cmake/WorkingDirectory.vcxproj.user.in
${CMAKE_CURRENT_BINARY_DIR}/${EXE}.vcxproj.user
@ONLY
)
endif()
endfunction()
# - Try to find OpenCL
# This module tries to find an OpenCL implementation on your system. It supports
# AMD / ATI, Apple and NVIDIA implementations, but other implementations should work, too.
#
# To set manually the paths, define these environment variables:
# OpenCL_INCPATH - Include path (e.g. OpenCL_INCPATH=/opt/cuda/4.0/cuda/include)
# OpenCL_LIBPATH - Library path (e.g. OpenCL_LIBPATH=/usr/lib64/nvidia)
#
# Once done this will define
# OPENCL_FOUND - system has OpenCL
# OPENCL_INCLUDE_DIRS - the OpenCL include directory
# OPENCL_LIBRARIES - link these to use OpenCL
#
# WIN32 should work, but is untested
FIND_PACKAGE(PackageHandleStandardArgs)
SET (OPENCL_VERSION_STRING "0.1.0")
SET (OPENCL_VERSION_MAJOR 0)
SET (OPENCL_VERSION_MINOR 1)
SET (OPENCL_VERSION_PATCH 0)
include(${CMAKE_SOURCE_DIR}/../cmake/TargetArch.cmake)
target_architecture(TARGET_ARCH)
IF (APPLE)
FIND_LIBRARY(OPENCL_LIBRARIES OpenCL DOC "OpenCL lib for OSX")
FIND_PATH(OPENCL_INCLUDE_DIRS OpenCL/cl.h DOC "Include for OpenCL on OSX")
FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS OpenCL/cl.hpp DOC "Include for OpenCL CPP bindings on OSX")
ELSE (APPLE)
IF (WIN32)
FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h PATH_SUFFIXES include PATHS ENV AMDAPPSDKROOT ENV CUDA_PATH ENV INTELOCLSDKROOT)
FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp PATH_SUFFIXES include PATHS ENV AMDAPPSDKROOT ENV CUDA_PATH ENV INTELOCLSDKROOT)
# The AMD SDK currently installs both x86 and x86_64 libraries
# This is only a hack to find out architecture
# The same is true for CUDA SDK
IF( ${TARGET_ARCH} STREQUAL "x86_64" )
SET(OPENCL_AMD_LIB_DIR "$ENV{ATISTREAMSDKROOT}/lib/x86_64")
SET(OPENCL_NVIDIA_LIB_DIR "$ENV{CUDA_PATH}/lib/x64")
message(STATUS "Using 64bit libraries")
ELSE (${TARGET_ARCH} STREQUAL "x86_64")
SET(OPENCL_AMD_LIB_DIR "$ENV{ATISTREAMSDKROOT}/lib/x86")
SET(OPENCL_NVIDIA_LIB_DIR "$ENV{CUDA_PATH}/lib/Win32")
message(STATUS "Using 32bit libraries")
ENDIF( ${TARGET_ARCH} STREQUAL "x86_64" )
# Find library
FIND_LIBRARY(OPENCL_LIBRARIES OpenCL.lib PATHS ${OPENCL_AMD_LIB_DIR} ${OPENCL_NVIDIA_LIB_DIR} ENV OpenCL_LIBPATH )
GET_FILENAME_COMPONENT(_OPENCL_INC_CAND ${OPENCL_LIB_DIR}/../../include ABSOLUTE)
# On Win32 search relative to the library
FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h PATHS "${_OPENCL_INC_CAND}" ENV OpenCL_INCPATH)
FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp PATHS "${_OPENCL_INC_CAND}" ENV OpenCL_INCPATH)
ELSE (WIN32)
# Unix style platforms
IF( ${TARGET_ARCH} STREQUAL "x86_64" )
SET(OPENCL_NVIDIA_LIB_DIR /usr/local/cuda/lib64 /usr/lib64/nvidia-304xx )
SET(OPENCL_NVIDIA_ATIS_LIB_DIR /usr/lib64/nvidia)
message(STATUS "Using 64bit libraries")
ELSE (${TARGET_ARCH} STREQUAL "x86_64")
SET(OPENCL_NVIDIA_LIB_DIR /usr/local/cuda/lib)
SET(OPENCL_NVIDIA_ATIS_LIB_DIR /usr/lib/nvidia)
message(STATUS "Using 32bit libraries")
ENDIF( ${TARGET_ARCH} STREQUAL "x86_64" )
FIND_LIBRARY(OPENCL_LIBRARIES OpenCL
PATHS ENV LD_LIBRARY_PATH ENV OpenCL_LIBPATH ${OPENCL_NVIDIA_LIB_DIR} ${OPENCL_NVIDIA_ATIS_LIB_DIR}
)
# Alternatives (for ATIS pool)
FIND_LIBRARY(OPENCL_LIBRARIES libOpenCL.so.1
PATHS ENV LD_LIBRARY_PATH ENV OpenCL_LIBPATH ${OPENCL_NVIDIA_LIB_DIR} ${OPENCL_NVIDIA_ATIS_LIB_DIR}
)
GET_FILENAME_COMPONENT(OPENCL_LIB_DIR ${OPENCL_LIBRARIES} PATH)
GET_FILENAME_COMPONENT(_OPENCL_INC_CAND ${OPENCL_LIB_DIR}/../../include ABSOLUTE)
FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h PATH_SUFFIXES include PATHS ${_OPENCL_INC_CAND} /usr/ /usr/local/cuda/ /opt/AMDAPP/ /opt/cuda-5.0/ ENV OpenCL_INCPATH ENV INCLUDE)
FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp PATH_SUFFIXES include PATHS ${_OPENCL_INC_CAND} /usr/ /usr/local/cuda /opt/AMDAPP ENV OpenCL_INCPATH)
ENDIF (WIN32)
ENDIF (APPLE)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(OpenCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS)
IF(_OPENCL_CPP_INCLUDE_DIRS)
SET( OPENCL_HAS_CPP_BINDINGS TRUE )
LIST( APPEND OPENCL_INCLUDE_DIRS ${_OPENCL_CPP_INCLUDE_DIRS} )
# This is often the same, so clean up
LIST( REMOVE_DUPLICATES OPENCL_INCLUDE_DIRS )
ENDIF(_OPENCL_CPP_INCLUDE_DIRS)
MARK_AS_ADVANCED(
OPENCL_INCLUDE_DIRS
)
# Based on the Qt 5 processor detection code, so should be very accurate
# https://qt.gitorious.org/qt/qtbase/blobs/master/src/corelib/global/qprocessordetection.h
# Currently handles arm (v5, v6, v7), x86 (32/64), ia64, and ppc (32/64)
# Regarding POWER/PowerPC, just as is noted in the Qt source,
# "There are many more known variants/revisions that we do not handle/detect."
set(archdetect_c_code "
#if defined(__arm__) || defined(__TARGET_ARCH_ARM)
#if defined(__ARM_ARCH_7__) \\
|| defined(__ARM_ARCH_7A__) \\
|| defined(__ARM_ARCH_7R__) \\
|| defined(__ARM_ARCH_7M__) \\
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 7)
#error cmake_ARCH armv7
#elif defined(__ARM_ARCH_6__) \\
|| defined(__ARM_ARCH_6J__) \\
|| defined(__ARM_ARCH_6T2__) \\
|| defined(__ARM_ARCH_6Z__) \\
|| defined(__ARM_ARCH_6K__) \\
|| defined(__ARM_ARCH_6ZK__) \\
|| defined(__ARM_ARCH_6M__) \\
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 6)
#error cmake_ARCH armv6
#elif defined(__ARM_ARCH_5TEJ__) \\
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 5)
#error cmake_ARCH armv5
#else
#error cmake_ARCH arm
#endif
#elif defined(__i386) || defined(__i386__) || defined(_M_IX86)
#error cmake_ARCH i386
#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64)
#error cmake_ARCH x86_64
#elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
#error cmake_ARCH ia64
#elif defined(__ppc__) || defined(__ppc) || defined(__powerpc__) \\
|| defined(_ARCH_COM) || defined(_ARCH_PWR) || defined(_ARCH_PPC) \\
|| defined(_M_MPPC) || defined(_M_PPC)
#if defined(__ppc64__) || defined(__powerpc64__) || defined(__64BIT__)
#error cmake_ARCH ppc64
#else
#error cmake_ARCH ppc
#endif
#endif
#error cmake_ARCH unknown
")
# Set ppc_support to TRUE before including this file or ppc and ppc64
# will be treated as invalid architectures since they are no longer supported by Apple
function(target_architecture output_var)
if(APPLE AND CMAKE_OSX_ARCHITECTURES)
# On OS X we use CMAKE_OSX_ARCHITECTURES *if* it was set
# First let's normalize the order of the values
# Note that it's not possible to compile PowerPC applications if you are using
# the OS X SDK version 10.6 or later - you'll need 10.4/10.5 for that, so we
# disable it by default
# See this page for more information:
# http://stackoverflow.com/questions/5333490/how-can-we-restore-ppc-ppc64-as-well-as-full-10-4-10-5-sdk-support-to-xcode-4
# Architecture defaults to i386 or ppc on OS X 10.5 and earlier, depending on the CPU type detected at runtime.
# On OS X 10.6+ the default is x86_64 if the CPU supports it, i386 otherwise.
foreach(osx_arch ${CMAKE_OSX_ARCHITECTURES})
if("${osx_arch}" STREQUAL "ppc" AND ppc_support)
set(osx_arch_ppc TRUE)
elseif("${osx_arch}" STREQUAL "i386")
set(osx_arch_i386 TRUE)
elseif("${osx_arch}" STREQUAL "x86_64")
set(osx_arch_x86_64 TRUE)
elseif("${osx_arch}" STREQUAL "ppc64" AND ppc_support)
set(osx_arch_ppc64 TRUE)
else()
message(FATAL_ERROR "Invalid OS X arch name: ${osx_arch}")
endif()
endforeach()
# Now add all the architectures in our normalized order
if(osx_arch_ppc)
list(APPEND ARCH ppc)
endif()
if(osx_arch_i386)
list(APPEND ARCH i386)
endif()
if(osx_arch_x86_64)
list(APPEND ARCH x86_64)
endif()
if(osx_arch_ppc64)
list(APPEND ARCH ppc64)
endif()
else()
file(WRITE "${CMAKE_BINARY_DIR}/arch.c" "${archdetect_c_code}")
enable_language(C)
# Detect the architecture in a rather creative way...
# This compiles a small C program which is a series of ifdefs that selects a
# particular #error preprocessor directive whose message string contains the
# target architecture. The program will always fail to compile (both because
# file is not a valid C program, and obviously because of the presence of the
# #error preprocessor directives... but by exploiting the preprocessor in this
# way, we can detect the correct target architecture even when cross-compiling,
# since the program itself never needs to be run (only the compiler/preprocessor)
try_run(
run_result_unused
compile_result_unused
"${CMAKE_BINARY_DIR}"
"${CMAKE_BINARY_DIR}/arch.c"
COMPILE_OUTPUT_VARIABLE ARCH
CMAKE_FLAGS CMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES}
)
# Parse the architecture name from the compiler output
string(REGEX MATCH "cmake_ARCH ([a-zA-Z0-9_]+)" ARCH "${ARCH}")
# Get rid of the value marker leaving just the architecture name
string(REPLACE "cmake_ARCH " "" ARCH "${ARCH}")
# If we are compiling with an unknown architecture this variable should
# already be set to "unknown" but in the case that it's empty (i.e. due
# to a typo in the code), then set it to unknown
if (NOT ARCH)
set(ARCH unknown)
endif()
endif()
set(${output_var} "${ARCH}" PARENT_SCOPE)
endfunction()
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|@Platform@'">
<LocalDebuggerWorkingDirectory>@WorkingDir@</LocalDebuggerWorkingDirectory>
<DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|@Platform@'">
<LocalDebuggerWorkingDirectory>@WorkingDir@</LocalDebuggerWorkingDirectory>
<DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='RelWithDebInfo|@Platform@'">
<LocalDebuggerWorkingDirectory>@WorkingDir@</LocalDebuggerWorkingDirectory>
<DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='MinSizeRel|@Platform@'">
<LocalDebuggerWorkingDirectory>@WorkingDir@</LocalDebuggerWorkingDirectory>
<DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
</PropertyGroup>
</Project>