Commit e1738542 by Kai Westerkamp

A2 base

parent 95b044bd
@@ -27,7 +27,7 @@ bool CAssignment1::DoCompute()
 }
 cout << "================================" << endl << "Running vector addition example 2 ..." << endl << endl;
 {
-size_t LocalWorkSize[3] = {512, 1, 1};
+size_t LocalWorkSize[3] = {1028, 1, 1};
 CSimpleArraysTask task(1048576);
 RunComputeTask(task, LocalWorkSize);
 }
@@ -36,7 +36,7 @@ bool CAssignment1::DoCompute()
 std::cout << "================================"<< endl << "Running matrix rotation example..." << std::endl << std::endl;
 {
 size_t LocalWorkSize[3] = {32, 16, 1};
-CMatrixRotateTask task(2048, 1025);
+CMatrixRotateTask task(2049, 1025);
 RunComputeTask(task, LocalWorkSize);
 }
@@ -132,7 +132,7 @@ void CMatrixRotateTask::ComputeGPU(cl_context Context, cl_command_queue CommandQ
 //naive kernel
 // TO DO: time = CLUtil::ProfileKernel...
-double time = CLUtil::ProfileKernel(CommandQueue, m_NaiveKernel, 2, globalWorkSize, LocalWorkSize, 10000);
+double time = CLUtil::ProfileKernel(CommandQueue, m_NaiveKernel, 2, globalWorkSize, LocalWorkSize, 1000);
 cout << "Executed naive kernel in " << time << " ms." << endl;
 // TO DO: read back the results synchronously.
@@ -4,7 +4,7 @@
 __kernel void VecAdd(__global const int* a, __global const int* b, __global int* c, int numElements){
 int GID = get_global_id(0);
-if (GID < numElements && GID >= 0){
+if (GID < numElements){
 c[GID] = a[GID] + b[numElements - GID-1];
 }
 }
File added
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CAssignment2.h"
#include "CReductionTask.h"
#include "CScanTask.h"
#include <iostream>
using namespace std;
///////////////////////////////////////////////////////////////////////////////
// CAssignment2
bool CAssignment2::DoCompute()
{
// Task 1: parallel reduction
cout<<"########################################"<<endl;
cout<<"Running parallel reduction task..."<<endl<<endl;
{
size_t LocalWorkSize[3] = {256, 1, 1};
CReductionTask reduction(1024 * 1024 * 16);
RunComputeTask(reduction, LocalWorkSize);
}
// Task 2: parallel prefix sum
cout<<"########################################"<<endl;
cout<<"Running parallel prefix sum task..."<<endl<<endl;
{
size_t LocalWorkSize[3] = {256, 1, 1};
CScanTask scan(1024 * 1024 * 64, LocalWorkSize[0]);
RunComputeTask(scan, LocalWorkSize);
}
return true;
}
///////////////////////////////////////////////////////////////////////////////
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _CASSIGNMENT2_H
#define _CASSIGNMENT2_H
#include "../Common/CAssignmentBase.h"
//! Assignment2 solution
class CAssignment2 : public CAssignmentBase
{
public:
virtual ~CAssignment2() {};
//! This overloaded method contains the specific solution of A2
virtual bool DoCompute();
};
#endif // _CASSIGNMENT2_H
cmake_minimum_required (VERSION 2.8.3)
project (GPUComputing)
# Add our modules to the path
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/../cmake/")
include(CheckCXXCompilerFlag)
if (WIN32)
else (WIN32)
#set (EXTRA_COMPILE_FLAGS "-Wall -Werror")
set (EXTRA_COMPILE_FLAGS "-Wall")
CHECK_CXX_COMPILER_FLAG(-std=c++11 HAS_CXX_11)
if (HAS_CXX_11)
set(EXTRA_COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -std=c++11")
message(STATUS "Enabling C++11 support")
else(HAS_CXX_11)
message(WARNING "No C++11 support detected, build will fail.")
endif()
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_COMPILE_FLAGS}")
endif (WIN32)
# Include support for changing the working directory in Visual Studio
include(ChangeWorkingDirectory)
# Search for OpenCL and add paths
find_package( OpenCL REQUIRED )
include_directories( ${OPENCL_INCLUDE_DIRS} )
# Include Common module
add_subdirectory (../Common ${CMAKE_BINARY_DIR}/Common)
# Define source files for this assignment
FILE(GLOB Sources *.cpp)
FILE(GLOB Headers *.h)
FILE(GLOB CLSources *.cl)
ADD_EXECUTABLE (Assignment
${Sources}
${Headers}
${CLSources}
)
# Link required libraries
target_link_libraries(Assignment ${OPENCL_LIBRARIES})
target_link_libraries(Assignment GPUCommon)
if (WIN32)
change_workingdir(Assignment ${CMAKE_SOURCE_DIR})
endif()
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CReductionTask.h"
#include "../Common/CLUtil.h"
#include "../Common/CTimer.h"
using namespace std;
///////////////////////////////////////////////////////////////////////////////
// CReductionTask
string g_kernelNames[4] = {
"interleavedAddressing",
"sequentialAddressing",
"kernelDecomposition",
"kernelDecompositionUnroll"
};
CReductionTask::CReductionTask(size_t ArraySize)
: m_N(ArraySize), m_hInput(NULL),
m_dPingArray(NULL),
m_dPongArray(NULL),
m_Program(NULL),
m_InterleavedAddressingKernel(NULL), m_SequentialAddressingKernel(NULL), m_DecompKernel(NULL), m_DecompUnrollKernel(NULL)
{
}
CReductionTask::~CReductionTask()
{
ReleaseResources();
}
bool CReductionTask::InitResources(cl_device_id Device, cl_context Context)
{
//CPU resources
m_hInput = new unsigned int[m_N];
//fill the array with some values
for(unsigned int i = 0; i < m_N; i++)
//m_hInput[i] = 1; // Use this for debugging
m_hInput[i] = rand() & 15;
//device resources
cl_int clError, clError2;
m_dPingArray = clCreateBuffer(Context, CL_MEM_READ_WRITE, sizeof(cl_uint) * m_N, NULL, &clError2);
clError = clError2;
m_dPongArray = clCreateBuffer(Context, CL_MEM_READ_WRITE, sizeof(cl_uint) * m_N, NULL, &clError2);
clError |= clError2;
V_RETURN_FALSE_CL(clError, "Error allocating device arrays");
//load and compile kernels
string programCode;
CLUtil::LoadProgramSourceToMemory("Reduction.cl", programCode);
m_Program = CLUtil::BuildCLProgramFromMemory(Device, Context, programCode);
if(m_Program == nullptr) return false;
//create kernels
m_InterleavedAddressingKernel = clCreateKernel(m_Program, "Reduction_InterleavedAddressing", &clError);
V_RETURN_FALSE_CL(clError, "Failed to create kernel: Reduction_InterleavedAddressing.");
m_SequentialAddressingKernel = clCreateKernel(m_Program, "Reduction_SequentialAddressing", &clError);
V_RETURN_FALSE_CL(clError, "Failed to create kernel: Reduction_SequentialAddressing.");
m_DecompKernel = clCreateKernel(m_Program, "Reduction_Decomp", &clError);
V_RETURN_FALSE_CL(clError, "Failed to create kernel: Reduction_Decomp.");
m_DecompUnrollKernel = clCreateKernel(m_Program, "Reduction_DecompUnroll", &clError);
V_RETURN_FALSE_CL(clError, "Failed to create kernel: Reduction_DecompUnroll.");
return true;
}
void CReductionTask::ReleaseResources()
{
// host resources
SAFE_DELETE_ARRAY(m_hInput);
// device resources
SAFE_RELEASE_MEMOBJECT(m_dPingArray);
SAFE_RELEASE_MEMOBJECT(m_dPongArray);
SAFE_RELEASE_KERNEL(m_InterleavedAddressingKernel);
SAFE_RELEASE_KERNEL(m_SequentialAddressingKernel);
SAFE_RELEASE_KERNEL(m_DecompKernel);
SAFE_RELEASE_KERNEL(m_DecompUnrollKernel);
SAFE_RELEASE_PROGRAM(m_Program);
}
void CReductionTask::ComputeGPU(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
ExecuteTask(Context, CommandQueue, LocalWorkSize, 0);
ExecuteTask(Context, CommandQueue, LocalWorkSize, 1);
ExecuteTask(Context, CommandQueue, LocalWorkSize, 2);
ExecuteTask(Context, CommandQueue, LocalWorkSize, 3);
TestPerformance(Context, CommandQueue, LocalWorkSize, 0);
TestPerformance(Context, CommandQueue, LocalWorkSize, 1);
TestPerformance(Context, CommandQueue, LocalWorkSize, 2);
TestPerformance(Context, CommandQueue, LocalWorkSize, 3);
}
void CReductionTask::ComputeCPU()
{
CTimer timer;
timer.Start();
unsigned int nIterations = 10;
for(unsigned int j = 0; j < nIterations; j++) {
m_resultCPU = m_hInput[0];
for(unsigned int i = 1; i < m_N; i++) {
m_resultCPU += m_hInput[i];
}
}
timer.Stop();
double ms = timer.GetElapsedMilliseconds() / double(nIterations);
cout << " average time: " << ms << " ms, throughput: " << 1.0e-6 * (double)m_N / ms << " Gelem/s" <<endl;
}
bool CReductionTask::ValidateResults()
{
bool success = true;
for(int i = 0; i < 4; i++)
if(m_resultGPU[i] != m_resultCPU)
{
cout<<"Validation of reduction kernel "<<g_kernelNames[i]<<" failed." << endl;
success = false;
}
return success;
}
void CReductionTask::Reduction_InterleavedAddressing(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
//cl_int clErr;
//size_t globalWorkSize[1];
//size_t localWorkSize[1];
//unsigned int stride = ...;
// TO DO: Implement reduction with interleaved addressing
//for (...) {
//}
}
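// ---------------------------------------------------------------------------
// Sketch only (not the official solution): a possible host-side loop for this
// pass. It assumes the kernel lets work item i compute
// array[2*stride*i] += array[2*stride*i + stride], and that m_N is a power of
// two (it is 1024*1024*16 here), so every launch size divides evenly.
/*
	cl_int clErr = clSetKernelArg(m_InterleavedAddressingKernel, 0, sizeof(cl_mem), (void*)&m_dPingArray);
	// one pass per stride: 1, 2, 4, ..., m_N/2; each pass halves the active elements
	for (unsigned int stride = 1; stride < m_N; stride *= 2) {
		clErr |= clSetKernelArg(m_InterleavedAddressingKernel, 1, sizeof(cl_uint), (void*)&stride);
		size_t globalSize = m_N / (2 * stride);                  // one work item per pairwise add
		size_t localSize  = min(LocalWorkSize[0], globalSize);   // shrink the group for the last passes
		clErr |= clEnqueueNDRangeKernel(CommandQueue, m_InterleavedAddressingKernel, 1, NULL,
			&globalSize, &localSize, 0, NULL, NULL);
	}
	V_RETURN_CL(clErr, "Error executing Reduction_InterleavedAddressing kernel");
	// the total ends up in m_dPingArray[0], which ExecuteTask() reads back
*/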
void CReductionTask::Reduction_SequentialAddressing(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
// TO DO: Implement reduction with sequential addressing
}
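// ---------------------------------------------------------------------------
// Sketch only: same dispatch structure as the interleaved variant, but the
// stride is assumed to start at m_N/2 and halve each pass, with work item i
// computing array[i] += array[i + stride], so the active items stay contiguous.
/*
	cl_int clErr = clSetKernelArg(m_SequentialAddressingKernel, 0, sizeof(cl_mem), (void*)&m_dPingArray);
	for (unsigned int stride = m_N / 2; stride >= 1; stride /= 2) {
		clErr |= clSetKernelArg(m_SequentialAddressingKernel, 1, sizeof(cl_uint), (void*)&stride);
		size_t globalSize = stride;                              // one work item per pairwise add
		size_t localSize  = min(LocalWorkSize[0], globalSize);
		clErr |= clEnqueueNDRangeKernel(CommandQueue, m_SequentialAddressingKernel, 1, NULL,
			&globalSize, &localSize, 0, NULL, NULL);
	}
	V_RETURN_CL(clErr, "Error executing Reduction_SequentialAddressing kernel");
*/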
void CReductionTask::Reduction_Decomp(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
// TO DO: Implement reduction with kernel decomposition
// NOTE: make sure that the final result is always in the variable m_dPingArray
// as this is read back for the correctness check
// (CReductionTask::ExecuteTask)
//
// hint: for example, you can use swap(m_dPingArray, m_dPongArray) at the end of your for loop...
}
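// ---------------------------------------------------------------------------
// Sketch only: assumes each work group reduces 2 * LocalWorkSize[0] input
// elements (guarded by N inside the kernel) into one partial sum in outArray,
// so every pass shrinks the problem by that factor until one value is left.
/*
	cl_int clErr = CL_SUCCESS;
	size_t localSize = LocalWorkSize[0];
	unsigned int N = m_N;
	while (N > 1) {
		unsigned int nGroups = max(1u, N / (unsigned int)(2 * localSize));
		size_t globalSize = nGroups * localSize;
		clErr |= clSetKernelArg(m_DecompKernel, 0, sizeof(cl_mem),  (void*)&m_dPingArray);
		clErr |= clSetKernelArg(m_DecompKernel, 1, sizeof(cl_mem),  (void*)&m_dPongArray);
		clErr |= clSetKernelArg(m_DecompKernel, 2, sizeof(cl_uint), (void*)&N);
		clErr |= clSetKernelArg(m_DecompKernel, 3, localSize * sizeof(cl_uint), NULL); // local scratch
		clErr |= clEnqueueNDRangeKernel(CommandQueue, m_DecompKernel, 1, NULL,
			&globalSize, &localSize, 0, NULL, NULL);
		swap(m_dPingArray, m_dPongArray);   // partial sums become the next pass' input
		N = nGroups;
	}
	V_RETURN_CL(clErr, "Error executing Reduction_Decomp kernel");
	// after the final swap the single sum sits in m_dPingArray[0]
*/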
void CReductionTask::Reduction_DecompUnroll(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
// TO DO: Implement reduction with loop unrolling
// NOTE: make sure that the final result is always in the variable m_dPingArray
// as this is read back for the correctness check
// (CReductionTask::ExecuteTask)
//
// hint: for example, you can use swap(m_dPingArray, m_dPongArray) at the end of your for loop...
}
void CReductionTask::ExecuteTask(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3], unsigned int Task)
{
//write input data to the GPU
V_RETURN_CL(clEnqueueWriteBuffer(CommandQueue, m_dPingArray, CL_FALSE, 0, m_N * sizeof(cl_uint), m_hInput, 0, NULL, NULL), "Error copying data from host to device!");
//run selected task
switch (Task){
case 0:
Reduction_InterleavedAddressing(Context, CommandQueue, LocalWorkSize);
break;
case 1:
Reduction_SequentialAddressing(Context, CommandQueue, LocalWorkSize);
break;
case 2:
Reduction_Decomp(Context, CommandQueue, LocalWorkSize);
break;
case 3:
Reduction_DecompUnroll(Context, CommandQueue, LocalWorkSize);
break;
}
//read back the results synchronously.
m_resultGPU[Task] = 0;
V_RETURN_CL(clEnqueueReadBuffer(CommandQueue, m_dPingArray, CL_TRUE, 0, 1 * sizeof(cl_uint), &m_resultGPU[Task], 0, NULL, NULL), "Error reading data from device!");
}
void CReductionTask::TestPerformance(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3], unsigned int Task)
{
cout << "Testing performance of task " << g_kernelNames[Task] << endl;
//write input data to the GPU
V_RETURN_CL(clEnqueueWriteBuffer(CommandQueue, m_dPingArray, CL_FALSE, 0, m_N * sizeof(cl_uint), m_hInput, 0, NULL, NULL), "Error copying data from host to device!");
//finish all before we start measuring the time
V_RETURN_CL(clFinish(CommandQueue), "Error finishing the queue!");
CTimer timer;
timer.Start();
//run the kernel N times
unsigned int nIterations = 100;
for(unsigned int i = 0; i < nIterations; i++) {
//run selected task
switch (Task){
case 0:
Reduction_InterleavedAddressing(Context, CommandQueue, LocalWorkSize);
break;
case 1:
Reduction_SequentialAddressing(Context, CommandQueue, LocalWorkSize);
break;
case 2:
Reduction_Decomp(Context, CommandQueue, LocalWorkSize);
break;
case 3:
Reduction_DecompUnroll(Context, CommandQueue, LocalWorkSize);
break;
}
}
//wait until the command queue is empty again
V_RETURN_CL(clFinish(CommandQueue), "Error finishing the queue!");
timer.Stop();
double ms = timer.GetElapsedMilliseconds() / double(nIterations);
cout << " average time: " << ms << " ms, throughput: " << 1.0e-6 * (double)m_N / ms << " Gelem/s" <<endl;
}
///////////////////////////////////////////////////////////////////////////////
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _CREDUCTION_TASK_H
#define _CREDUCTION_TASK_H
#include "../Common/IComputeTask.h"
//! A2/T1: Parallel reduction
class CReductionTask : public IComputeTask
{
public:
CReductionTask(size_t ArraySize);
virtual ~CReductionTask();
// IComputeTask
virtual bool InitResources(cl_device_id Device, cl_context Context);
virtual void ReleaseResources();
virtual void ComputeGPU(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3]);
virtual void ComputeCPU();
virtual bool ValidateResults();
protected:
void Reduction_InterleavedAddressing(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3]);
void Reduction_SequentialAddressing(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3]);
void Reduction_Decomp(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3]);
void Reduction_DecompUnroll(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3]);
void ExecuteTask(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3], unsigned int task);
void TestPerformance(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3], unsigned int task);
//NOTE: we have two memory address spaces, so we mark pointers with a prefix
//to avoid confusions: 'h' - host, 'd' - device
unsigned int m_N;
// input data
unsigned int *m_hInput;
// results
unsigned int m_resultCPU;
unsigned int m_resultGPU[4];
cl_mem m_dPingArray;
cl_mem m_dPongArray;
//OpenCL program and kernels
cl_program m_Program;
cl_kernel m_InterleavedAddressingKernel;
cl_kernel m_SequentialAddressingKernel;
cl_kernel m_DecompKernel;
cl_kernel m_DecompUnrollKernel;
};
#endif // _CREDUCTION_TASK_H
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CScanTask.h"
#include "../Common/CLUtil.h"
#include "../Common/CTimer.h"
#include <string.h>
using namespace std;
// number of banks in the local memory. This can be used to avoid bank conflicts
// but we also need to allocate more local memory for that.
#define NUM_BANKS 32
///////////////////////////////////////////////////////////////////////////////
// CScanTask
// only useful for debug info
const string g_kernelNames[2] =
{
"scanNaive",
"scanWorkEfficient"
};
CScanTask::CScanTask(size_t ArraySize, size_t MinLocalWorkSize)
: m_N(ArraySize), m_hArray(NULL), m_hResultCPU(NULL), m_hResultGPU(NULL),
m_dPingArray(NULL), m_dPongArray(NULL),
m_Program(NULL),
m_ScanNaiveKernel(NULL), m_ScanWorkEfficientKernel(NULL), m_ScanWorkEfficientAddKernel(NULL)
{
// compute the number of levels that we need for the work-efficient algorithm
m_MinLocalWorkSize = MinLocalWorkSize;
m_nLevels = 1;
size_t N = ArraySize;
while (N > 0){
N /= 2 * m_MinLocalWorkSize;
m_nLevels++;
}
// Reset validation results
for (int i = 0; i < (int)ARRAYLEN(m_bValidationResults); i++)
m_bValidationResults[i] = false;
}
CScanTask::~CScanTask()
{
ReleaseResources();
}
bool CScanTask::InitResources(cl_device_id Device, cl_context Context)
{
//CPU resources
m_hArray = new unsigned int[m_N];
m_hResultCPU = new unsigned int[m_N];
m_hResultGPU = new unsigned int[m_N];
//fill the array with some values
for(unsigned int i = 0; i < m_N; i++)
//m_hArray[i] = 1; // Use this for debugging
m_hArray[i] = rand() & 15;
//device resources
// ping-pong buffers
cl_int clError, clError2;
m_dPingArray = clCreateBuffer(Context, CL_MEM_READ_WRITE, sizeof(cl_uint) * m_N, NULL, &clError2);
clError = clError2;
m_dPongArray = clCreateBuffer(Context, CL_MEM_READ_WRITE, sizeof(cl_uint) * m_N, NULL, &clError2);
clError |= clError2;
// level buffer
m_dLevelArrays = new cl_mem[m_nLevels];
unsigned int N = m_N;
for (unsigned int i = 0; i < m_nLevels; i++) {
m_dLevelArrays[i] = clCreateBuffer(Context, CL_MEM_READ_WRITE, sizeof(cl_uint) * N, NULL, &clError2);
clError |= clError2;
N = max(N / (2 * m_MinLocalWorkSize), m_MinLocalWorkSize);
}
V_RETURN_FALSE_CL(clError, "Error allocating device arrays");
//load and compile kernels
string programCode;
CLUtil::LoadProgramSourceToMemory("Scan.cl", programCode);
m_Program = CLUtil::BuildCLProgramFromMemory(Device, Context, programCode);
if(m_Program == nullptr) return false;
//create kernels
m_ScanNaiveKernel = clCreateKernel(m_Program, "Scan_Naive", &clError);
V_RETURN_FALSE_CL(clError, "Failed to create kernel.");
m_ScanWorkEfficientKernel = clCreateKernel(m_Program, "Scan_WorkEfficient", &clError);
V_RETURN_FALSE_CL(clError, "Failed to create kernel.");
m_ScanWorkEfficientAddKernel = clCreateKernel(m_Program, "Scan_WorkEfficientAdd", &clError);
V_RETURN_FALSE_CL(clError, "Failed to create kernel.");
return true;
}
void CScanTask::ReleaseResources()
{
// host resources
SAFE_DELETE_ARRAY(m_hArray);
SAFE_DELETE_ARRAY(m_hResultCPU);
SAFE_DELETE_ARRAY(m_hResultGPU);
// device resources
SAFE_RELEASE_MEMOBJECT(m_dPingArray);
SAFE_RELEASE_MEMOBJECT(m_dPongArray);
if(m_dLevelArrays)
for (unsigned int i = 0; i < m_nLevels; i++) {
SAFE_RELEASE_MEMOBJECT(m_dLevelArrays[i]);
}
SAFE_DELETE_ARRAY(m_dLevelArrays);
SAFE_RELEASE_KERNEL(m_ScanNaiveKernel);
SAFE_RELEASE_KERNEL(m_ScanWorkEfficientKernel);
SAFE_RELEASE_KERNEL(m_ScanWorkEfficientAddKernel);
SAFE_RELEASE_PROGRAM(m_Program);
}
void CScanTask::ComputeGPU(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
cout << endl;
ValidateTask(Context, CommandQueue, LocalWorkSize, 0);
ValidateTask(Context, CommandQueue, LocalWorkSize, 1);
cout << endl;
TestPerformance(Context, CommandQueue, LocalWorkSize, 0);
TestPerformance(Context, CommandQueue, LocalWorkSize, 1);
cout << endl;
}
void CScanTask::ComputeCPU()
{
CTimer timer;
timer.Start();
unsigned int nIterations = 1;
for(unsigned int j = 0; j < nIterations; j++) {
unsigned int sum = 0;
for(unsigned int i = 0; i < m_N; i++) {
sum += m_hArray[i];
m_hResultCPU[i] = sum;
}
}
timer.Stop();
double ms = timer.GetElapsedMilliseconds() / double(nIterations);
cout << " average time: " << ms << " ms, throughput: " << 1.0e-6 * (double)m_N / ms << " Gelem/s" <<endl;
}
bool CScanTask::ValidateResults()
{
bool success = true;
for(int i = 0; i < 2; i++)
if(!m_bValidationResults[i])
{
cout<<"Validation of reduction kernel "<<g_kernelNames[i]<<" failed." << endl;
success = false;
}
return success;
}
void CScanTask::Scan_Naive(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
// TO DO: Implement naive version of scan
// NOTE: make sure that the final result is always in the variable m_dPingArray
// as this is read back for the correctness check
// (CReductionTask::ValidateTask)
//
// hint: for example, you can use swap(m_dPingArray, m_dPongArray) at the end of your for loop...
}
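// ---------------------------------------------------------------------------
// Sketch only: a possible host loop for the naive (Hillis-Steele) scan,
// assuming the kernel computes outArray[i] = inArray[i] + inArray[i - offset]
// for i >= offset and just copies inArray[i] otherwise. The buffers are
// ping-ponged after every pass, so the final result ends up in m_dPingArray.
/*
	cl_int clErr = CL_SUCCESS;
	size_t localSize  = LocalWorkSize[0];
	size_t globalSize = CLUtil::GetGlobalWorkSize(m_N, localSize);
	for (unsigned int offset = 1; offset < m_N; offset *= 2) {
		clErr |= clSetKernelArg(m_ScanNaiveKernel, 0, sizeof(cl_mem),  (void*)&m_dPingArray);
		clErr |= clSetKernelArg(m_ScanNaiveKernel, 1, sizeof(cl_mem),  (void*)&m_dPongArray);
		clErr |= clSetKernelArg(m_ScanNaiveKernel, 2, sizeof(cl_uint), (void*)&m_N);
		clErr |= clSetKernelArg(m_ScanNaiveKernel, 3, sizeof(cl_uint), (void*)&offset);
		clErr |= clEnqueueNDRangeKernel(CommandQueue, m_ScanNaiveKernel, 1, NULL,
			&globalSize, &localSize, 0, NULL, NULL);
		swap(m_dPingArray, m_dPongArray);   // output of this pass is the input of the next
	}
	V_RETURN_CL(clErr, "Error executing Scan_Naive kernel");
*/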
void CScanTask::Scan_WorkEfficient(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
// TO DO: Implement efficient version of scan
// Make sure that the local prefix sum works before you start experimenting with large arrays
}
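// ---------------------------------------------------------------------------
// Sketch only: one possible multi-level dispatch, assuming Scan_WorkEfficient
// scans each block of 2 * localSize elements of a level in place and writes the
// block totals to the next level, and Scan_WorkEfficientAdd adds the scanned
// totals back onto the finer level (Figure 14). Assumes m_N is a power of two.
/*
	cl_int clErr = CL_SUCCESS;
	size_t localSize = LocalWorkSize[0];
	size_t localMemSize = (2 * localSize + 2 * localSize / NUM_BANKS) * sizeof(cl_uint); // padded for bank-conflict avoidance

	unsigned int* levelSize = new unsigned int[m_nLevels];
	unsigned int nScanned = 0;

	// down phase: per-block scan of each level; block totals feed the next level
	unsigned int N = m_N;
	for (unsigned int level = 0; level + 1 < m_nLevels && N > 1; level++) {
		levelSize[nScanned++] = N;
		size_t blockSize  = min(localSize, (size_t)N / 2);    // last level may be smaller than one full block
		size_t nGroups    = N / (2 * blockSize);
		size_t globalSize = nGroups * blockSize;
		clErr |= clSetKernelArg(m_ScanWorkEfficientKernel, 0, sizeof(cl_mem), (void*)&m_dLevelArrays[level]);
		clErr |= clSetKernelArg(m_ScanWorkEfficientKernel, 1, sizeof(cl_mem), (void*)&m_dLevelArrays[level + 1]);
		clErr |= clSetKernelArg(m_ScanWorkEfficientKernel, 2, localMemSize, NULL);
		clErr |= clEnqueueNDRangeKernel(CommandQueue, m_ScanWorkEfficientKernel, 1, NULL,
			&globalSize, &blockSize, 0, NULL, NULL);
		N = (unsigned int)nGroups;
	}

	// up phase: add each group's scanned total back onto the finer level
	for (int level = (int)nScanned - 2; level >= 0; level--) {
		size_t blockSize  = min(localSize, (size_t)levelSize[level] / 2);
		size_t nGroups    = levelSize[level] / (2 * blockSize);
		size_t globalSize = nGroups * blockSize;
		clErr |= clSetKernelArg(m_ScanWorkEfficientAddKernel, 0, sizeof(cl_mem), (void*)&m_dLevelArrays[level + 1]);
		clErr |= clSetKernelArg(m_ScanWorkEfficientAddKernel, 1, sizeof(cl_mem), (void*)&m_dLevelArrays[level]);
		clErr |= clSetKernelArg(m_ScanWorkEfficientAddKernel, 2, localMemSize, NULL);
		clErr |= clEnqueueNDRangeKernel(CommandQueue, m_ScanWorkEfficientAddKernel, 1, NULL,
			&globalSize, &blockSize, 0, NULL, NULL);
	}
	delete [] levelSize;
	V_RETURN_CL(clErr, "Error executing Scan_WorkEfficient kernels");
	// ValidateTask() reads the fully scanned result from m_dLevelArrays[0]
*/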
void CScanTask::ValidateTask(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3], unsigned int Task)
{
//run selected task
switch (Task){
case 0:
V_RETURN_CL(clEnqueueWriteBuffer(CommandQueue, m_dPingArray, CL_FALSE, 0, m_N * sizeof(cl_uint), m_hArray, 0, NULL, NULL), "Error copying data from host to device!");
Scan_Naive(Context, CommandQueue, LocalWorkSize);
V_RETURN_CL(clEnqueueReadBuffer(CommandQueue, m_dPingArray, CL_TRUE, 0, m_N * sizeof(cl_uint), m_hResultGPU, 0, NULL, NULL), "Error reading data from device!");
break;
case 1:
V_RETURN_CL(clEnqueueWriteBuffer(CommandQueue, m_dLevelArrays[0], CL_FALSE, 0, m_N * sizeof(cl_uint), m_hArray, 0, NULL, NULL), "Error copying data from host to device!");
Scan_WorkEfficient(Context, CommandQueue, LocalWorkSize);
V_RETURN_CL(clEnqueueReadBuffer(CommandQueue, m_dLevelArrays[0], CL_TRUE, 0, m_N * sizeof(cl_uint), m_hResultGPU, 0, NULL, NULL), "Error reading data from device!");
break;
}
// validate results
m_bValidationResults[Task] =( memcmp(m_hResultCPU, m_hResultGPU, m_N * sizeof(unsigned int)) == 0);
}
void CScanTask::TestPerformance(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3], unsigned int Task)
{
cout << "Testing performance of task " << g_kernelNames[Task] << endl;
//write input data to the GPU
V_RETURN_CL(clEnqueueWriteBuffer(CommandQueue, m_dPingArray, CL_FALSE, 0, m_N * sizeof(cl_uint), m_hArray, 0, NULL, NULL), "Error copying data from host to device!");
//finish all before we start measuring the time
V_RETURN_CL(clFinish(CommandQueue), "Error finishing the queue!");
CTimer timer;
timer.Start();
//run the kernel N times
unsigned int nIterations = 100;
for(unsigned int i = 0; i < nIterations; i++) {
//run selected task
switch (Task){
case 0:
Scan_Naive(Context, CommandQueue, LocalWorkSize);
break;
case 1:
Scan_WorkEfficient(Context, CommandQueue, LocalWorkSize);
break;
}
}
//wait until the command queue is empty again
V_RETURN_CL(clFinish(CommandQueue), "Error finishing the queue!");
timer.Stop();
double ms = timer.GetElapsedMilliseconds() / double(nIterations);
cout << " average time: " << ms << " ms, throughput: " << 1.0e-6 * (double)m_N / ms << " Gelem/s" <<endl;
}
///////////////////////////////////////////////////////////////////////////////
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _CSCAN_TASK_H
#define _CSCAN_TASK_H
#include "../Common/IComputeTask.h"
//! A2 / T2 Parallel prefix sum (scan)
class CScanTask : public IComputeTask
{
public:
//! The second parameter is necessary to pre-allocate the multi-level arrays
CScanTask(size_t ArraySize, size_t MinLocalWorkSize);
virtual ~CScanTask();
// IComputeTask
virtual bool InitResources(cl_device_id Device, cl_context Context);
virtual void ReleaseResources();
virtual void ComputeGPU(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3]);
virtual void ComputeCPU();
virtual bool ValidateResults();
protected:
void Scan_Naive(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3]);
void Scan_WorkEfficient(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3]);
void ValidateTask(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3], unsigned int Task);
void TestPerformance(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3], unsigned int Task);
unsigned int m_N;
//float data on the CPU
unsigned int *m_hArray;
unsigned int *m_hResultCPU;
unsigned int *m_hResultGPU;
bool m_bValidationResults[2];
// ping-pong arrays for the naive scan
cl_mem m_dPingArray;
cl_mem m_dPongArray;
// arrays for each level of the work-efficient scan
size_t m_MinLocalWorkSize;
unsigned int m_nLevels;
cl_mem *m_dLevelArrays;
//OpenCL program and kernels
cl_program m_Program;
cl_kernel m_ScanNaiveKernel;
cl_kernel m_ScanWorkEfficientKernel;
cl_kernel m_ScanWorkEfficientAddKernel;
};
#endif // _CSCAN_TASK_H
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void Reduction_InterleavedAddressing(__global uint* array, uint stride)
{
// TO DO: Kernel implementation
}
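// Sketch only (matches the host-side assumption in CReductionTask.cpp): work
// item i adds the element 'stride' away into its own element; the host launches
// exactly N / (2 * stride) work items per pass, so no bounds check is needed here.
/*
	uint GID  = get_global_id(0);
	uint base = 2 * stride * GID;        // first element of this work item's pair
	array[base] += array[base + stride];
*/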
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void Reduction_SequentialAddressing(__global uint* array, uint stride)
{
// TO DO: Kernel implementation
}
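// Sketch only: with sequential addressing the active work items read and write a
// contiguous range, avoiding the divergent, strided accesses of the interleaved
// variant. The host is assumed to launch exactly 'stride' work items per pass,
// so again no bounds check is needed.
/*
	uint GID = get_global_id(0);
	array[GID] += array[GID + stride];
*/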
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void Reduction_Decomp(const __global uint* inArray, __global uint* outArray, uint N, __local uint* localBlock)
{
// TO DO: Kernel implementation
}
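// Sketch only: each work group loads 2 * get_local_size(0) elements (guarded by
// N) into localBlock, reduces them with sequential addressing in local memory,
// and work item 0 writes the group's partial sum to outArray[get_group_id(0)].
// localBlock is assumed to hold get_local_size(0) entries.
/*
	uint LID = get_local_id(0);
	uint LS  = get_local_size(0);
	uint GID = get_group_id(0) * LS * 2 + LID;

	localBlock[LID] = (GID < N ? inArray[GID] : 0)
	                + (GID + LS < N ? inArray[GID + LS] : 0);
	barrier(CLK_LOCAL_MEM_FENCE);

	for (uint stride = LS / 2; stride > 0; stride >>= 1) {
		if (LID < stride)
			localBlock[LID] += localBlock[LID + stride];
		barrier(CLK_LOCAL_MEM_FENCE);
	}

	if (LID == 0)
		outArray[get_group_id(0)] = localBlock[0];
*/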
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void Reduction_DecompUnroll(const __global uint* inArray, __global uint* outArray, uint N, __local uint* localBlock)
{
// TO DO: Kernel implementation
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void Scan_Naive(const __global uint* inArray, __global uint* outArray, uint N, uint offset)
{
// TO DO: Kernel implementation
}
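// Sketch only: one Hillis-Steele pass over the whole array; the host ping-pongs
// inArray/outArray and doubles 'offset' between passes.
/*
	uint GID = get_global_id(0);
	if (GID >= N)
		return;
	outArray[GID] = inArray[GID] + (GID >= offset ? inArray[GID - offset] : 0);
*/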
// Why did we not have conflicts in the Reduction? Because of the sequential addressing (here we use interleaved => we have conflicts).
#define UNROLL
#define NUM_BANKS 32
#define NUM_BANKS_LOG 5
#define SIMD_GROUP_SIZE 32
// Bank conflicts
#define AVOID_BANK_CONFLICTS
#ifdef AVOID_BANK_CONFLICTS
// TO DO: define your conflict-free macro here
#else
#define OFFSET(A) (A)
#endif
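// Sketch only: a common conflict-free mapping adds one padding word per NUM_BANKS
// elements, so that the strided accesses of the tree phases hit different banks.
// The host must then allocate the extra padding words of local memory as well.
//#define OFFSET(A) ((A) + ((A) >> NUM_BANKS_LOG))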
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLevelArray, __local uint* localBlock)
{
// TO DO: Kernel implementation
}
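// Sketch only (one possible block-level implementation, matching the host
// assumptions in CScanTask.cpp): a Blelloch up-sweep/down-sweep over
// 2 * get_local_size(0) elements in localBlock (indexed through OFFSET()),
// converted to an inclusive scan on write-back; work item 0 stores the block
// total in higherLevelArray[get_group_id(0)].
/*
	const uint LID = get_local_id(0);
	const uint LS  = get_local_size(0);
	const uint n   = 2 * LS;
	const uint groupOffset = get_group_id(0) * n;

	// keep private copies for the exclusive -> inclusive conversion at the end
	uint val0 = array[groupOffset + LID];
	uint val1 = array[groupOffset + LID + LS];
	localBlock[OFFSET(LID)]      = val0;
	localBlock[OFFSET(LID + LS)] = val1;

	// up-sweep (reduce)
	uint offset = 1;
	for (uint d = LS; d > 0; d >>= 1) {
		barrier(CLK_LOCAL_MEM_FENCE);
		if (LID < d) {
			uint ai = offset * (2 * LID + 1) - 1;
			uint bi = offset * (2 * LID + 2) - 1;
			localBlock[OFFSET(bi)] += localBlock[OFFSET(ai)];
		}
		offset <<= 1;
	}

	// store the block total for the next level, then clear the root
	if (LID == 0) {
		higherLevelArray[get_group_id(0)] = localBlock[OFFSET(n - 1)];
		localBlock[OFFSET(n - 1)] = 0;
	}

	// down-sweep: produces an exclusive scan in localBlock
	for (uint d = 1; d <= LS; d <<= 1) {
		offset >>= 1;
		barrier(CLK_LOCAL_MEM_FENCE);
		if (LID < d) {
			uint ai = offset * (2 * LID + 1) - 1;
			uint bi = offset * (2 * LID + 2) - 1;
			uint t = localBlock[OFFSET(ai)];
			localBlock[OFFSET(ai)]  = localBlock[OFFSET(bi)];
			localBlock[OFFSET(bi)] += t;
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);

	// write back as an inclusive scan: exclusive result + original element
	array[groupOffset + LID]      = localBlock[OFFSET(LID)] + val0;
	array[groupOffset + LID + LS] = localBlock[OFFSET(LID + LS)] + val1;
*/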
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void Scan_WorkEfficientAdd(__global uint* higherLevelArray, __global uint* array, __local uint* localBlock)
{
// TO DO: Kernel implementation (large arrays)
// Kernel that should add the group PPS to the local PPS (Figure 14)
}
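// Sketch only: higherLevelArray is assumed to hold the inclusive scan of the
// block totals, so every block except the first adds the scanned total of all
// preceding blocks to its 2 * get_local_size(0) elements. localBlock is unused
// here; it is only part of the signature.
/*
	const uint LID = get_local_id(0);
	const uint LS  = get_local_size(0);
	const uint GRP = get_group_id(0);
	const uint groupOffset = GRP * LS * 2;

	if (GRP > 0) {
		uint add = higherLevelArray[GRP - 1];
		array[groupOffset + LID]      += add;
		array[groupOffset + LID + LS] += add;
	}
*/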
\ No newline at end of file
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CAssignment2.h"
#include <iostream>
using namespace std;
int main(int argc, char** argv)
{
CAssignment2 myAssignment;
auto success = myAssignment.EnterMainLoop(argc, argv);
#ifdef _MSC_VER
cout<<"Press any key..."<<endl;
cin.get();
#endif
return success ? 0 : 1;
}
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CAssignmentBase.h"
#include "CLUtil.h"
#include "CTimer.h"
#include <vector>
#include <iostream>
using namespace std;
#if defined (__APPLE__) || defined(MACOSX)
#define GL_SHARING_EXTENSION "cl_APPLE_gl_sharing"
#else
#define GL_SHARING_EXTENSION "cl_khr_gl_sharing"
#endif
// required for OpenGL interop
#ifdef _WIN32
#include <windows.h>
#endif
#ifdef linux
#if defined (__APPLE__) || defined(MACOSX)
#include <OpenGL/OpenGL.h>
#else
#include <GL/glx.h>
#endif
#endif
///////////////////////////////////////////////////////////////////////////////
// CAssignmentBase
CAssignmentBase::CAssignmentBase()
: m_CLPlatform(nullptr), m_CLDevice(nullptr), m_CLContext(nullptr), m_CLCommandQueue(nullptr)
{
}
CAssignmentBase::~CAssignmentBase()
{
ReleaseCLContext();
}
bool CAssignmentBase::EnterMainLoop(int, char**)
{
if(!InitCLContext())
return false;
bool success = DoCompute();
ReleaseCLContext();
return success;
}
#define PRINT_INFO(title, buffer, bufferSize, maxBufferSize, expr) { expr; buffer[bufferSize] = '\0'; std::cout << title << ": " << buffer << std::endl; }
bool CAssignmentBase::InitCLContext()
{
//////////////////////////////////////////////////////
//(Sect 4.3)
// 1. get all platform IDs
std::vector<cl_platform_id> platformIds;
const cl_uint c_MaxPlatforms = 16;
platformIds.resize(c_MaxPlatforms);
cl_uint countPlatforms;
V_RETURN_FALSE_CL(clGetPlatformIDs(c_MaxPlatforms, &platformIds[0], &countPlatforms), "Failed to get CL platform ID");
platformIds.resize(countPlatforms);
// 2. find all available GPU devices
std::vector<cl_device_id> deviceIds;
const int maxDevices = 16;
deviceIds.resize(maxDevices);
int countAllDevices = 0;
// Searching for the graphics device with the most dedicated video memory.
cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
cl_ulong maxGlobalMemorySize = 0;
cl_device_id bestDeviceId = NULL;
for (size_t i = 0; i < platformIds.size(); i++)
{
// Getting the available devices.
cl_uint countDevices;
auto res = clGetDeviceIDs(platformIds[i], deviceType, 1, &deviceIds[countAllDevices], &countDevices);
if(res != CL_SUCCESS) // Maybe there are no GPU devices, and some poor implementation doesn't set countDevices to zero and return CL_DEVICE_NOT_FOUND.
{
char buffer[1024];
clGetPlatformInfo(platformIds[i], CL_PLATFORM_NAME, 1024, buffer, nullptr);
printf("[WARNING]: clGetDeviceIDs() failed. Error type: %s, Platform name: %s!\n",
CLUtil::GetCLErrorString(res), buffer);
continue;
}
for (size_t j = 0; j < countDevices; j++)
{
cl_device_id currentDeviceId = deviceIds[countAllDevices + j];
cl_ulong globalMemorySize;
cl_bool isUsingUnifiedMemory;
clGetDeviceInfo(currentDeviceId, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &globalMemorySize, NULL);
clGetDeviceInfo(currentDeviceId, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), &isUsingUnifiedMemory, NULL);
if (!isUsingUnifiedMemory && globalMemorySize > maxGlobalMemorySize)
{
bestDeviceId = currentDeviceId;
maxGlobalMemorySize = globalMemorySize;
}
}
countAllDevices += countDevices;
}
deviceIds.resize(countAllDevices);
if (countAllDevices == 0)
{
std::cout << "No device of the selected type with OpenCL support was found.";
return false;
}
// No discrete graphics device was found: falling back to the first found device.
if (bestDeviceId == NULL)
{
bestDeviceId = deviceIds[0];
}
// Choosing the first available device.
m_CLDevice = bestDeviceId;
clGetDeviceInfo(m_CLDevice, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &m_CLPlatform, NULL);
// Printing platform and device data.
const int maxBufferSize = 1024;
char buffer[maxBufferSize];
size_t bufferSize;
std::cout << "OpenCL platform:" << std::endl << std::endl;
PRINT_INFO("Name", buffer, bufferSize, maxBufferSize, clGetPlatformInfo(m_CLPlatform, CL_PLATFORM_NAME, maxBufferSize, (void*)buffer, &bufferSize));
PRINT_INFO("Vendor", buffer, bufferSize, maxBufferSize, clGetPlatformInfo(m_CLPlatform, CL_PLATFORM_VENDOR, maxBufferSize, (void*)buffer, &bufferSize));
PRINT_INFO("Version", buffer, bufferSize, maxBufferSize, clGetPlatformInfo(m_CLPlatform, CL_PLATFORM_VERSION, maxBufferSize, (void*)buffer, &bufferSize));
PRINT_INFO("Profile", buffer, bufferSize, maxBufferSize, clGetPlatformInfo(m_CLPlatform, CL_PLATFORM_PROFILE, maxBufferSize, (void*)buffer, &bufferSize));
std::cout << std::endl << "Device:" << std::endl << std::endl;
PRINT_INFO("Name", buffer, bufferSize, maxBufferSize, clGetDeviceInfo(m_CLDevice, CL_DEVICE_NAME, maxBufferSize, (void*)buffer, &bufferSize));
PRINT_INFO("Vendor", buffer, bufferSize, maxBufferSize, clGetDeviceInfo(m_CLDevice, CL_DEVICE_VENDOR, maxBufferSize, (void*)buffer, &bufferSize));
PRINT_INFO("Driver version", buffer, bufferSize, maxBufferSize, clGetDeviceInfo(m_CLDevice, CL_DRIVER_VERSION, maxBufferSize, (void*)buffer, &bufferSize));
cl_ulong localMemorySize;
clGetDeviceInfo(m_CLDevice, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &localMemorySize, &bufferSize);
std::cout << "Local memory size: " << localMemorySize << " Byte" << std::endl;
std::cout << std::endl << "******************************" << std::endl << std::endl;
cl_int clError;
m_CLContext = clCreateContext(NULL, 1, &m_CLDevice, NULL, NULL, &clError);
V_RETURN_FALSE_CL(clError, "Failed to create OpenCL context.");
// Finally, create a command queue. All the asynchronous commands to the device will be issued
// from the CPU into this queue. This way the host program can continue the execution until some results
// from that device are needed.
m_CLCommandQueue = clCreateCommandQueue(m_CLContext, m_CLDevice, 0, &clError);
V_RETURN_FALSE_CL(clError, "Failed to create the command queue in the context");
return true;
}
void CAssignmentBase::ReleaseCLContext()
{
if (m_CLCommandQueue != nullptr)
{
clReleaseCommandQueue(m_CLCommandQueue);
m_CLCommandQueue = nullptr;
}
if (m_CLContext != nullptr)
{
clReleaseContext(m_CLContext);
m_CLContext = nullptr;
}
}
bool CAssignmentBase::RunComputeTask(IComputeTask& Task, size_t LocalWorkSize[3])
{
if(m_CLContext == nullptr)
{
std::cerr<<"Error: RunComputeTask() cannot execute because the OpenCL context has not been created first."<<endl;
}
if(!Task.InitResources(m_CLDevice, m_CLContext))
{
std::cerr << "Error during resource allocation. Aborting execution." <<endl;
Task.ReleaseResources();
return false;
}
// Compute the golden result.
cout << "Computing CPU reference result...";
Task.ComputeCPU();
cout << "DONE" << endl;
// Running the same task on the GPU.
cout << "Computing GPU result...";
// Running the kernel N times makes the measurement of the execution time more accurate.
Task.ComputeGPU(m_CLContext, m_CLCommandQueue, LocalWorkSize);
cout << "DONE" << endl;
// Validating results.
if (Task.ValidateResults())
{
cout << "GOLD TEST PASSED!" << endl;
}
else
{
cout << "INVALID RESULTS!" << endl;
}
// Cleaning up.
Task.ReleaseResources();
return true;
}
///////////////////////////////////////////////////////////////////////////////
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _CASSIGNMENT_BASE_H
#define _CASSIGNMENT_BASE_H
#include "IComputeTask.h"
#include "CommonDefs.h"
//! Base class for all assignments
/*!
Inherit a new class for each specific assignment.
This class is abstract.
Usage of class: from your main CPP you typically call
EnterMainLoop(). This returns when the assignment is finished.
Internally the assignment class should initialize the context,
run one or more compute tasks and then release the context.
*/
class CAssignmentBase
{
public:
CAssignmentBase();
virtual ~CAssignmentBase();
//! Main loop. You only need to overload this if you do some rendering in your assignment.
virtual bool EnterMainLoop(int argc, char** argv);
//! You need to overload this to define a specific behavior for your assignments
virtual bool DoCompute() = 0;
protected:
virtual bool InitCLContext();
virtual void ReleaseCLContext();
virtual bool RunComputeTask(IComputeTask& Task, size_t LocalWorkSize[3]);
cl_platform_id m_CLPlatform;
cl_device_id m_CLDevice;
cl_context m_CLContext;
cl_command_queue m_CLCommandQueue;
};
#endif // _CASSIGNMENT_BASE_H
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CLUtil.h"
#include "CTimer.h"
#include <iostream>
#include <fstream>
using namespace std;
///////////////////////////////////////////////////////////////////////////////
// CLUtil
size_t CLUtil::GetGlobalWorkSize(size_t DataElemCount, size_t LocalWorkSize)
{
size_t r = DataElemCount % LocalWorkSize;
if(r == 0)
return DataElemCount;
else
return DataElemCount + LocalWorkSize - r;
}
bool CLUtil::LoadProgramSourceToMemory(const std::string& Path, std::string& SourceCode)
{
ifstream sourceFile;
sourceFile.open(Path.c_str());
if (!sourceFile.is_open())
{
cerr << "Failed to open file '" << Path << "'." << endl;
return false;
}
// read the entire file into a string
sourceFile.seekg(0, ios::end);
ifstream::pos_type fileSize = sourceFile.tellg();
sourceFile.seekg(0, ios::beg);
SourceCode.clear();
SourceCode.resize((size_t)fileSize);
sourceFile.read(&SourceCode[0], fileSize);
return true;
}
cl_program CLUtil::BuildCLProgramFromMemory(cl_device_id Device, cl_context Context, const std::string& SourceCode, const std::string& CompileOptions)
{
// Ignore the last parameter CompileOptions in assignment 1
// This may be used later to pass flags and macro definitions to the OpenCL compiler
cl_program prog = nullptr;
string srcSolution = SourceCode;
const char* src = srcSolution.c_str();
size_t length = srcSolution.size();
cl_int clError;
prog = clCreateProgramWithSource(Context, 1, &src, &length, &clError);
if(CL_SUCCESS != clError)
{
cerr<<"Failed to create CL program from source.";
return nullptr;
}
// program created, now build it:
const char* pCompileOptions = CompileOptions.size() > 0 ? CompileOptions.c_str() : nullptr;
clError = clBuildProgram(prog, 1, &Device, pCompileOptions, NULL, NULL);
PrintBuildLog(prog, Device);
if(CL_SUCCESS != clError)
{
cerr<<"Failed to build CL program.";
SAFE_RELEASE_PROGRAM(prog);
return nullptr;
}
return prog;
}
void CLUtil::PrintBuildLog(cl_program Program, cl_device_id Device)
{
cl_build_status buildStatus;
clGetProgramBuildInfo(Program, Device, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &buildStatus, NULL);
// let's print out possible warnings even if the kernel compiled..
//if(buildStatus == CL_SUCCESS)
// return;
//there were some errors.
size_t logSize;
clGetProgramBuildInfo(Program, Device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
string buildLog(logSize, ' ');
clGetProgramBuildInfo(Program, Device, CL_PROGRAM_BUILD_LOG, logSize, &buildLog[0], NULL);
buildLog[logSize] = '\0';
if(buildStatus != CL_SUCCESS)
cout<<"There were build errors!"<<endl;
cout<<"Build log:"<<endl;
cout<<buildLog<<endl;
}
double CLUtil::ProfileKernel(cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint Dimensions,
const size_t* pGlobalWorkSize, const size_t* pLocalWorkSize, int NIterations)
{
CTimer timer;
cl_int clErr;
// wait until the command queue is empty...
// Should not be used in production code, but this synchronizes HOST and DEVICE
clErr = clFinish(CommandQueue);
timer.Start();
// run the kernel N times for better average accuracy
for(int i = 0; i < NIterations; i++)
{
clErr |= clEnqueueNDRangeKernel(CommandQueue, Kernel, Dimensions, NULL, pGlobalWorkSize, pLocalWorkSize, 0, NULL, NULL);
}
// wait again to sync
clErr |= clFinish(CommandQueue);
timer.Stop();
if(clErr != CL_SUCCESS)
{
string errorString = GetCLErrorString(clErr);
cerr<<"Kernel execution failure: "<<errorString<<endl;
}
return timer.GetElapsedMilliseconds() / double(NIterations);
}
#define CL_ERROR(x) case (x): return #x;
const char* CLUtil::GetCLErrorString(cl_int CLErrorCode)
{
switch(CLErrorCode)
{
CL_ERROR(CL_SUCCESS);
CL_ERROR(CL_DEVICE_NOT_FOUND);
CL_ERROR(CL_DEVICE_NOT_AVAILABLE);
CL_ERROR(CL_COMPILER_NOT_AVAILABLE);
CL_ERROR(CL_MEM_OBJECT_ALLOCATION_FAILURE);
CL_ERROR(CL_OUT_OF_RESOURCES);
CL_ERROR(CL_OUT_OF_HOST_MEMORY);
CL_ERROR(CL_PROFILING_INFO_NOT_AVAILABLE);
CL_ERROR(CL_MEM_COPY_OVERLAP);
CL_ERROR(CL_IMAGE_FORMAT_MISMATCH);
CL_ERROR(CL_IMAGE_FORMAT_NOT_SUPPORTED);
CL_ERROR(CL_BUILD_PROGRAM_FAILURE);
CL_ERROR(CL_MAP_FAILURE);
CL_ERROR(CL_INVALID_VALUE);
CL_ERROR(CL_INVALID_DEVICE_TYPE);
CL_ERROR(CL_INVALID_PLATFORM);
CL_ERROR(CL_INVALID_DEVICE);
CL_ERROR(CL_INVALID_CONTEXT);
CL_ERROR(CL_INVALID_QUEUE_PROPERTIES);
CL_ERROR(CL_INVALID_COMMAND_QUEUE);
CL_ERROR(CL_INVALID_HOST_PTR);
CL_ERROR(CL_INVALID_MEM_OBJECT);
CL_ERROR(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
CL_ERROR(CL_INVALID_IMAGE_SIZE);
CL_ERROR(CL_INVALID_SAMPLER);
CL_ERROR(CL_INVALID_BINARY);
CL_ERROR(CL_INVALID_BUILD_OPTIONS);
CL_ERROR(CL_INVALID_PROGRAM);
CL_ERROR(CL_INVALID_PROGRAM_EXECUTABLE);
CL_ERROR(CL_INVALID_KERNEL_NAME);
CL_ERROR(CL_INVALID_KERNEL_DEFINITION);
CL_ERROR(CL_INVALID_KERNEL);
CL_ERROR(CL_INVALID_ARG_INDEX);
CL_ERROR(CL_INVALID_ARG_VALUE);
CL_ERROR(CL_INVALID_ARG_SIZE);
CL_ERROR(CL_INVALID_KERNEL_ARGS);
CL_ERROR(CL_INVALID_WORK_DIMENSION);
CL_ERROR(CL_INVALID_WORK_GROUP_SIZE);
CL_ERROR(CL_INVALID_WORK_ITEM_SIZE);
CL_ERROR(CL_INVALID_GLOBAL_OFFSET);
CL_ERROR(CL_INVALID_EVENT_WAIT_LIST);
CL_ERROR(CL_INVALID_EVENT);
CL_ERROR(CL_INVALID_OPERATION);
CL_ERROR(CL_INVALID_GL_OBJECT);
CL_ERROR(CL_INVALID_BUFFER_SIZE);
CL_ERROR(CL_INVALID_MIP_LEVEL);
default:
return "Unknown error code";
}
}
///////////////////////////////////////////////////////////////////////////////
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef CL_UTIL_H
#define CL_UTIL_H
// All OpenCL headers
#if defined(WIN32)
#include <CL/opencl.h>
#elif defined (__APPLE__) || defined(MACOSX)
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#include "CommonDefs.h"
#include <string>
#include <iostream>
#include <algorithm>
//! Utility class for frequently-needed OpenCL tasks
// TO DO: replace this with a nicer OpenCL wrapper
class CLUtil
{
public:
//! Determines the OpenCL global work size given the number of data elements and threads per workgroup
static size_t GetGlobalWorkSize(size_t DataElemCount, size_t LocalWorkSize);
//! Loads a program source to memory as a string
static bool LoadProgramSourceToMemory(const std::string& Path, std::string& SourceCode);
//! Builds a CL program
static cl_program BuildCLProgramFromMemory(cl_device_id Device, cl_context Context, const std::string& SourceCode, const std::string& CompileOptions = "");
static void PrintBuildLog(cl_program Program, cl_device_id Device);
//! Measures the execution time of a kernel by executing it N times and returning the average time in milliseconds.
/*!
The scheduling cost of the kernel can be amortized if we enqueue
the kernel multiple times. If your kernel is simple and fast, use a high number of iterations!
*/
static double ProfileKernel(cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint Dimensions,
const size_t* pGlobalWorkSize, const size_t* pLocalWorkSize, int NIterations);
static const char* GetCLErrorString(cl_int CLErrorCode);
};
// Some useful shortcuts for handling pointers and validating function calls
#define V_RETURN_FALSE_CL(expr, errmsg) do {cl_int e=(expr);if(CL_SUCCESS!=e){std::cerr<<"Error: "<<errmsg<<" ["<<CLUtil::GetCLErrorString(e)<<"]"<<std::endl; return false; }} while(0)
#define V_RETURN_0_CL(expr, errmsg) do {cl_int e=(expr);if(CL_SUCCESS!=e){std::cerr<<"Error: "<<errmsg<<" ["<<CLUtil::GetCLErrorString(e)<<"]"<<std::endl; return 0; }} while(0)
#define V_RETURN_CL(expr, errmsg) do {cl_int e=(expr);if(CL_SUCCESS!=e){std::cerr<<"Error: "<<errmsg<<" ["<<CLUtil::GetCLErrorString(e)<<"]"<<std::endl; return; }} while(0)
#define SAFE_DELETE(ptr) do {if(ptr){ delete ptr; ptr = NULL; }} while(0)
#define SAFE_DELETE_ARRAY(x) do {if(x){delete [] x; x = NULL;}} while(0)
#define SAFE_RELEASE_KERNEL(ptr) do {if(ptr){ clReleaseKernel(ptr); ptr = NULL; }} while(0)
#define SAFE_RELEASE_PROGRAM(ptr) do {if(ptr){ clReleaseProgram(ptr); ptr = NULL; }} while(0)
#define SAFE_RELEASE_MEMOBJECT(ptr) do {if(ptr){ clReleaseMemObject(ptr); ptr = NULL; }} while(0)
#define SAFE_RELEASE_SAMPLER(ptr) do {if(ptr){ clReleaseSampler(ptr); ptr = NULL; }} while(0)
#define ARRAYLEN(a) (sizeof(a)/sizeof(a[0]))
#endif // CL_UTIL_H
FILE(GLOB CommonSources *.cpp)
FILE(GLOB CommonHeaders *.h)
add_library(GPUCommon
${CommonSources}
${CommonHeaders}
)
\ No newline at end of file
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CTimer.h"
///////////////////////////////////////////////////////////////////////////////
// CTimer
void CTimer::Start()
{
#ifdef _WIN32
QueryPerformanceCounter(&m_StartTime);
#else
gettimeofday(&m_StartTime, NULL);
#endif
}
void CTimer::Stop()
{
#ifdef _WIN32
QueryPerformanceCounter(&m_EndTime);
#else
gettimeofday(&m_EndTime, NULL);
#endif
}
double CTimer::GetElapsedMilliseconds()
{
#ifdef _WIN32
LARGE_INTEGER freq;
if(QueryPerformanceFrequency(&freq))
{
return 1000.0 * double(m_EndTime.QuadPart - m_StartTime.QuadPart) / double(freq.QuadPart);
}
else
{
return -1;
}
#else
double delta = ((double)m_EndTime.tv_sec + 1.0e-6 * (double)m_EndTime.tv_usec) -
((double)m_StartTime.tv_sec + 1.0e-6 * (double)m_StartTime.tv_usec);
return 1000.0 * delta;
#endif
}
///////////////////////////////////////////////////////////////////////////////
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _CTIMER_H
#define _CTIMER_H
//Simple wrapper class that can be used to measure time intervals
//using the built-in precision timer of the OS
// We reverted from std::chrono, because that timer implementation seems to be very imprecise
// (at least under Windows)
#ifdef _WIN32
#include <Windows.h>
#elif defined (__APPLE__) || defined(MACOSX)
#include <sys/time.h>
#else
#include <sys/time.h>
#include <time.h>
#endif
//! Simple wrapper class for the measurement of time intervals
/*!
Use this timer to measure elapsed time on the HOST side.
Not suitable for measuring the execution of DEVICE code
without synchronization with the HOST.
NOTE: This class is not thread-safe (like most other classes in these
examples), but we are not doing CPU multithreading in the praktikum...
*/
class CTimer
{
public:
CTimer(){};
~CTimer(){};
void Start();
void Stop();
//! Returns the elapsed time between Start() and Stop() in ms.
double GetElapsedMilliseconds();
protected:
#ifdef WIN32
LARGE_INTEGER m_StartTime;
LARGE_INTEGER m_EndTime;
#else
struct timeval m_StartTime;
struct timeval m_EndTime;
#endif
};
#endif // _CTIMER_H
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _ICOMPUTE_TASK_H
#define _ICOMPUTE_TASK_H
// All OpenCL headers
#if defined(WIN32)
#include <CL/opencl.h>
#elif defined (__APPLE__) || defined(MACOSX)
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#include "CommonDefs.h"
//! Common interface for the tasks within the assignment.
/*!
Inherit a new class for each computing task.
(There are usually more tasks in each assignment).
*/
class IComputeTask
{
public:
virtual ~IComputeTask() {};
//! Init any resources specific to the current task
virtual bool InitResources(cl_device_id Device, cl_context Context) = 0;
//! Release everything allocated in InitResources()
virtual void ReleaseResources() = 0;
//! Perform calculations on the GPU
virtual void ComputeGPU(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3]) = 0;
//! Compute the "golden" solution on the CPU. The GPU results must be equal to this reference
virtual void ComputeCPU() = 0;
//! Compare the GPU solution to the "golden" solution
virtual bool ValidateResults() = 0;
};
#endif // _ICOMPUTE_TASK_H
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _IGUI_ENABLED_COMPUTE_TASK_H
#define _IGUI_ENABLED_COMPUTE_TASK_H
#include "IComputeTask.h"
//! Common interface for tasks that have an OpenGL UI
/*!
Currently we only use this interface in Assignment4
to perform GL rendering and respond to user input with keyboard and mouse.
*/
class IGUIEnabledComputeTask : public IComputeTask
{
public:
virtual ~IGUIEnabledComputeTask() {};
// OpenGL render callback
virtual void Render() = 0;
virtual void OnKeyboard(int Key, int Action) = 0;
virtual void OnMouse(int Button, int Action) = 0;
virtual void OnMouseMove(int X, int Y) = 0;
virtual void OnIdle(double Time, float ElapsedTime) = 0;
virtual void OnWindowResized(int Width, int Height) = 0;
};
#endif // _IGUI_ENABLED_COMPUTE_TASK_H
<?xml version="1.0" encoding="UTF-8"?>
<Project ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<CustomBuild Include="D:\Projekte\GPGPU\Assignment2\Assignment2\CMakeLists.txt" />
</ItemGroup>
<ItemGroup>
</ItemGroup>
</Project>
<?xml version="1.0" encoding="UTF-8"?>
<Project ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="D:\Projekte\GPGPU\Assignment2\Assignment2\CAssignment2.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment2\Assignment2\CReductionTask.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment2\Assignment2\CScanTask.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment2\Assignment2\main.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="D:\Projekte\GPGPU\Assignment2\Assignment2\CAssignment2.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment2\Assignment2\CReductionTask.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment2\Assignment2\CScanTask.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="D:\Projekte\GPGPU\Assignment2\Assignment2\CMakeLists.txt" />
</ItemGroup>
<ItemGroup>
<None Include="D:\Projekte\GPGPU\Assignment2\Assignment2\Reduction.cl" />
<None Include="D:\Projekte\GPGPU\Assignment2\Assignment2\Scan.cl" />
</ItemGroup>
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{F6E304CD-0084-3FF4-B85F-1EAEDA42C629}</UniqueIdentifier>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{B2679C4E-9B86-304A-AA4C-74F70A0EAC52}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>
<?xml version="1.0" encoding="UTF-8"?>
<Project ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="D:\Projekte\GPGPU\Assignment2\Common\CAssignmentBase.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment2\Common\CLUtil.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment2\Common\CTimer.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="D:\Projekte\GPGPU\Assignment2\Common\CAssignmentBase.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment2\Common\CLUtil.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment2\Common\CTimer.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment2\Common\CommonDefs.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment2\Common\IComputeTask.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment2\Common\IGUIEnabledComputeTask.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="D:\Projekte\GPGPU\Assignment2\Common\CMakeLists.txt" />
</ItemGroup>
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{F6E304CD-0084-3FF4-B85F-1EAEDA42C629}</UniqueIdentifier>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{B2679C4E-9B86-304A-AA4C-74F70A0EAC52}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 14
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ALL_BUILD", "ALL_BUILD.vcxproj", "{8CE65BA2-73EA-3BC7-8DD4-C4CF32B6EF3F}"
ProjectSection(ProjectDependencies) = postProject
{17879C2F-1E39-30B7-AD1F-1D74A7912DEE} = {17879C2F-1E39-30B7-AD1F-1D74A7912DEE}
{7ACA3E34-73C3-3EB4-A2A4-0DA32162BA5F} = {7ACA3E34-73C3-3EB4-A2A4-0DA32162BA5F}
{B3F98345-39AE-339D-9A68-0AFAC3D8FE84} = {B3F98345-39AE-339D-9A68-0AFAC3D8FE84}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Assignment", "Assignment.vcxproj", "{17879C2F-1E39-30B7-AD1F-1D74A7912DEE}"
ProjectSection(ProjectDependencies) = postProject
{7ACA3E34-73C3-3EB4-A2A4-0DA32162BA5F} = {7ACA3E34-73C3-3EB4-A2A4-0DA32162BA5F}
{B3F98345-39AE-339D-9A68-0AFAC3D8FE84} = {B3F98345-39AE-339D-9A68-0AFAC3D8FE84}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "GPUCommon", "Common\GPUCommon.vcxproj", "{7ACA3E34-73C3-3EB4-A2A4-0DA32162BA5F}"
ProjectSection(ProjectDependencies) = postProject
{B3F98345-39AE-339D-9A68-0AFAC3D8FE84} = {B3F98345-39AE-339D-9A68-0AFAC3D8FE84}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ZERO_CHECK", "ZERO_CHECK.vcxproj", "{B3F98345-39AE-339D-9A68-0AFAC3D8FE84}"
ProjectSection(ProjectDependencies) = postProject
EndProjectSection
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
MinSizeRel|x64 = MinSizeRel|x64
RelWithDebInfo|x64 = RelWithDebInfo|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{8CE65BA2-73EA-3BC7-8DD4-C4CF32B6EF3F}.Debug|x64.ActiveCfg = Debug|x64
{8CE65BA2-73EA-3BC7-8DD4-C4CF32B6EF3F}.Release|x64.ActiveCfg = Release|x64
{8CE65BA2-73EA-3BC7-8DD4-C4CF32B6EF3F}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{8CE65BA2-73EA-3BC7-8DD4-C4CF32B6EF3F}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{17879C2F-1E39-30B7-AD1F-1D74A7912DEE}.Debug|x64.ActiveCfg = Debug|x64
{17879C2F-1E39-30B7-AD1F-1D74A7912DEE}.Debug|x64.Build.0 = Debug|x64
{17879C2F-1E39-30B7-AD1F-1D74A7912DEE}.Release|x64.ActiveCfg = Release|x64
{17879C2F-1E39-30B7-AD1F-1D74A7912DEE}.Release|x64.Build.0 = Release|x64
{17879C2F-1E39-30B7-AD1F-1D74A7912DEE}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{17879C2F-1E39-30B7-AD1F-1D74A7912DEE}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{17879C2F-1E39-30B7-AD1F-1D74A7912DEE}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{17879C2F-1E39-30B7-AD1F-1D74A7912DEE}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{7ACA3E34-73C3-3EB4-A2A4-0DA32162BA5F}.Debug|x64.ActiveCfg = Debug|x64
{7ACA3E34-73C3-3EB4-A2A4-0DA32162BA5F}.Debug|x64.Build.0 = Debug|x64
{7ACA3E34-73C3-3EB4-A2A4-0DA32162BA5F}.Release|x64.ActiveCfg = Release|x64
{7ACA3E34-73C3-3EB4-A2A4-0DA32162BA5F}.Release|x64.Build.0 = Release|x64
{7ACA3E34-73C3-3EB4-A2A4-0DA32162BA5F}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{7ACA3E34-73C3-3EB4-A2A4-0DA32162BA5F}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{7ACA3E34-73C3-3EB4-A2A4-0DA32162BA5F}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{7ACA3E34-73C3-3EB4-A2A4-0DA32162BA5F}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{B3F98345-39AE-339D-9A68-0AFAC3D8FE84}.Debug|x64.ActiveCfg = Debug|x64
{B3F98345-39AE-339D-9A68-0AFAC3D8FE84}.Debug|x64.Build.0 = Debug|x64
{B3F98345-39AE-339D-9A68-0AFAC3D8FE84}.Release|x64.ActiveCfg = Release|x64
{B3F98345-39AE-339D-9A68-0AFAC3D8FE84}.Release|x64.Build.0 = Release|x64
{B3F98345-39AE-339D-9A68-0AFAC3D8FE84}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{B3F98345-39AE-339D-9A68-0AFAC3D8FE84}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{B3F98345-39AE-339D-9A68-0AFAC3D8FE84}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{B3F98345-39AE-339D-9A68-0AFAC3D8FE84}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
EndGlobalSection
GlobalSection(ExtensibilityAddIns) = postSolution
EndGlobalSection
EndGlobal
<?xml version="1.0" encoding="UTF-8"?>
<Project ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<CustomBuild Include="D:\Projekte\GPGPU\Assignment2\buildVS15\CMakeFiles\348703b5db2c0151a605f81047f5b4d1\generate.stamp.rule">
<Filter>CMake Rules</Filter>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<Filter Include="CMake Rules">
<UniqueIdentifier>{D943AC8E-153A-31B2-98BF-B32B53A81E2C}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>
#if defined(__arm__) || defined(__TARGET_ARCH_ARM)
#if defined(__ARM_ARCH_7__) \
|| defined(__ARM_ARCH_7A__) \
|| defined(__ARM_ARCH_7R__) \
|| defined(__ARM_ARCH_7M__) \
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 7)
#error cmake_ARCH armv7
#elif defined(__ARM_ARCH_6__) \
|| defined(__ARM_ARCH_6J__) \
|| defined(__ARM_ARCH_6T2__) \
|| defined(__ARM_ARCH_6Z__) \
|| defined(__ARM_ARCH_6K__) \
|| defined(__ARM_ARCH_6ZK__) \
|| defined(__ARM_ARCH_6M__) \
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 6)
#error cmake_ARCH armv6
#elif defined(__ARM_ARCH_5TEJ__) \
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 5)
#error cmake_ARCH armv5
#else
#error cmake_ARCH arm
#endif
#elif defined(__i386) || defined(__i386__) || defined(_M_IX86)
#error cmake_ARCH i386
#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64)
#error cmake_ARCH x86_64
#elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
#error cmake_ARCH ia64
#elif defined(__ppc__) || defined(__ppc) || defined(__powerpc__) \
|| defined(_ARCH_COM) || defined(_ARCH_PWR) || defined(_ARCH_PPC) \
|| defined(_M_MPPC) || defined(_M_PPC)
#if defined(__ppc64__) || defined(__powerpc64__) || defined(__64BIT__)
#error cmake_ARCH ppc64
#else
#error cmake_ARCH ppc
#endif
#endif
#error cmake_ARCH unknown
function(change_workingdir EXE WorkingDir)
# add a user file to automatically configure the working directory for debugging
if (MSVC)
set(Platform "Win32")
if (CMAKE_CL_64)
set(Platform "x64")
endif (CMAKE_CL_64)
configure_file (
${CMAKE_SOURCE_DIR}/../cmake/WorkingDirectory.vcxproj.user.in
${CMAKE_CURRENT_BINARY_DIR}/${EXE}.vcxproj.user
@ONLY
)
endif()
endfunction()
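# Illustrative usage sketch (commented out; the target name "Assignment" and the
# source file name are assumptions, not taken from this commit). After defining an
# executable target, change_workingdir generates <target>.vcxproj.user from
# WorkingDirectory.vcxproj.user.in so the Visual Studio debugger starts in the
# given directory:
#
#   add_executable(Assignment main.cpp)
#   change_workingdir(Assignment "${CMAKE_SOURCE_DIR}")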
# - Try to find OpenCL
# This module tries to find an OpenCL implementation on your system. It supports
# the AMD / ATI, Apple and NVIDIA implementations, but other implementations should work, too.
#
# To set the paths manually, define these environment variables:
# OpenCL_INCPATH - Include path (e.g. OpenCL_INCPATH=/opt/cuda/4.0/cuda/include)
# OpenCL_LIBPATH - Library path (e.g. OpenCL_LIBPATH=/usr/lib64/nvidia)
#
# Once done this will define
# OPENCL_FOUND - system has OpenCL
# OPENCL_INCLUDE_DIRS - the OpenCL include directory
# OPENCL_LIBRARIES - link these to use OpenCL
#
# WIN32 should work, but is untested
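#
# Example usage (an illustrative, commented-out sketch; the target name
# "Assignment" is an assumption, not taken from this module):
#
#   find_package(OpenCL REQUIRED)
#   include_directories(${OPENCL_INCLUDE_DIRS})
#   target_link_libraries(Assignment ${OPENCL_LIBRARIES})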
FIND_PACKAGE(PackageHandleStandardArgs)
SET (OPENCL_VERSION_STRING "0.1.0")
SET (OPENCL_VERSION_MAJOR 0)
SET (OPENCL_VERSION_MINOR 1)
SET (OPENCL_VERSION_PATCH 0)
include(${CMAKE_SOURCE_DIR}/../cmake/TargetArch.cmake)
target_architecture(TARGET_ARCH)
IF (APPLE)
FIND_LIBRARY(OPENCL_LIBRARIES OpenCL DOC "OpenCL lib for OSX")
FIND_PATH(OPENCL_INCLUDE_DIRS OpenCL/cl.h DOC "Include for OpenCL on OSX")
FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS OpenCL/cl.hpp DOC "Include for OpenCL CPP bindings on OSX")
ELSE (APPLE)
IF (WIN32)
FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h PATH_SUFFIXES include PATHS ENV AMDAPPSDKROOT ENV CUDA_PATH ENV INTELOCLSDKROOT)
FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp PATH_SUFFIXES include PATHS ENV AMDAPPSDKROOT ENV CUDA_PATH ENV INTELOCLSDKROOT)
# The AMD SDK currently installs both x86 and x86_64 libraries
# This is only a hack to find out the architecture
# The same is true for CUDA SDK
IF( ${TARGET_ARCH} STREQUAL "x86_64" )
SET(OPENCL_AMD_LIB_DIR "$ENV{ATISTREAMSDKROOT}/lib/x86_64")
SET(OPENCL_NVIDIA_LIB_DIR "$ENV{CUDA_PATH}/lib/x64")
message(STATUS "Using 64bit libraries")
ELSE (${TARGET_ARCH} STREQUAL "x86_64")
SET(OPENCL_AMD_LIB_DIR "$ENV{ATISTREAMSDKROOT}/lib/x86")
SET(OPENCL_NVIDIA_LIB_DIR "$ENV{CUDA_PATH}/lib/Win32")
message(STATUS "Using 32bit libraries")
ENDIF( ${TARGET_ARCH} STREQUAL "x86_64" )
# Find library
FIND_LIBRARY(OPENCL_LIBRARIES OpenCL.lib PATHS ${OPENCL_AMD_LIB_DIR} ${OPENCL_NVIDIA_LIB_DIR} ENV OpenCL_LIBPATH )
# Derive the library directory from the found library first; OPENCL_LIB_DIR is not set yet on this branch
GET_FILENAME_COMPONENT(OPENCL_LIB_DIR ${OPENCL_LIBRARIES} PATH)
GET_FILENAME_COMPONENT(_OPENCL_INC_CAND ${OPENCL_LIB_DIR}/../../include ABSOLUTE)
# On Win32 search relative to the library
FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h PATHS "${_OPENCL_INC_CAND}" ENV OpenCL_INCPATH)
FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp PATHS "${_OPENCL_INC_CAND}" ENV OpenCL_INCPATH)
ELSE (WIN32)
# Unix style platforms
IF( ${TARGET_ARCH} STREQUAL "x86_64" )
SET(OPENCL_NVIDIA_LIB_DIR /usr/local/cuda/lib64 /usr/lib64/nvidia-304xx )
SET(OPENCL_NVIDIA_ATIS_LIB_DIR /usr/lib64/nvidia)
message(STATUS "Using 64bit libraries")
ELSE (${TARGET_ARCH} STREQUAL "x86_64")
SET(OPENCL_NVIDIA_LIB_DIR /usr/local/cuda/lib)
SET(OPENCL_NVIDIA_ATIS_LIB_DIR /usr/lib/nvidia)
message(STATUS "Using 32bit libraries")
ENDIF( ${TARGET_ARCH} STREQUAL "x86_64" )
FIND_LIBRARY(OPENCL_LIBRARIES OpenCL
PATHS ENV LD_LIBRARY_PATH ENV OpenCL_LIBPATH ${OPENCL_NVIDIA_LIB_DIR} ${OPENCL_NVIDIA_ATIS_LIB_DIR}
)
# Alternatives (for ATIS pool)
FIND_LIBRARY(OPENCL_LIBRARIES libOpenCL.so.1
PATHS ENV LD_LIBRARY_PATH ENV OpenCL_LIBPATH ${OPENCL_NVIDIA_LIB_DIR} ${OPENCL_NVIDIA_ATIS_LIB_DIR}
)
GET_FILENAME_COMPONENT(OPENCL_LIB_DIR ${OPENCL_LIBRARIES} PATH)
GET_FILENAME_COMPONENT(_OPENCL_INC_CAND ${OPENCL_LIB_DIR}/../../include ABSOLUTE)
FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h PATH_SUFFIXES include PATHS ${_OPENCL_INC_CAND} /usr/ /usr/local/cuda/ /opt/AMDAPP/ /opt/cuda-5.0/ ENV OpenCL_INCPATH ENV INCLUDE)
FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp PATH_SUFFIXES include PATHS ${_OPENCL_INC_CAND} /usr/ /usr/local/cuda /opt/AMDAPP ENV OpenCL_INCPATH)
ENDIF (WIN32)
ENDIF (APPLE)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(OpenCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS)
IF(_OPENCL_CPP_INCLUDE_DIRS)
SET( OPENCL_HAS_CPP_BINDINGS TRUE )
LIST( APPEND OPENCL_INCLUDE_DIRS ${_OPENCL_CPP_INCLUDE_DIRS} )
# This is often the same, so clean up
LIST( REMOVE_DUPLICATES OPENCL_INCLUDE_DIRS )
ENDIF(_OPENCL_CPP_INCLUDE_DIRS)
MARK_AS_ADVANCED(
OPENCL_INCLUDE_DIRS
)
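# Illustrative follow-up sketch (commented out): callers can branch on the
# variables this module defines, e.g. to report whether the C++ bindings were
# found alongside the C headers.
#
#   if(OPENCL_FOUND AND OPENCL_HAS_CPP_BINDINGS)
#     message(STATUS "OpenCL found with C++ bindings in ${OPENCL_INCLUDE_DIRS}")
#   elseif(OPENCL_FOUND)
#     message(STATUS "OpenCL found (C headers only)")
#   endif()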
# Based on the Qt 5 processor detection code, so should be very accurate
# https://qt.gitorious.org/qt/qtbase/blobs/master/src/corelib/global/qprocessordetection.h
# Currently handles arm (v5, v6, v7), x86 (32/64), ia64, and ppc (32/64)
# Regarding POWER/PowerPC, just as is noted in the Qt source,
# "There are many more known variants/revisions that we do not handle/detect."
set(archdetect_c_code "
#if defined(__arm__) || defined(__TARGET_ARCH_ARM)
#if defined(__ARM_ARCH_7__) \\
|| defined(__ARM_ARCH_7A__) \\
|| defined(__ARM_ARCH_7R__) \\
|| defined(__ARM_ARCH_7M__) \\
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 7)
#error cmake_ARCH armv7
#elif defined(__ARM_ARCH_6__) \\
|| defined(__ARM_ARCH_6J__) \\
|| defined(__ARM_ARCH_6T2__) \\
|| defined(__ARM_ARCH_6Z__) \\
|| defined(__ARM_ARCH_6K__) \\
|| defined(__ARM_ARCH_6ZK__) \\
|| defined(__ARM_ARCH_6M__) \\
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 6)
#error cmake_ARCH armv6
#elif defined(__ARM_ARCH_5TEJ__) \\
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 5)
#error cmake_ARCH armv5
#else
#error cmake_ARCH arm
#endif
#elif defined(__i386) || defined(__i386__) || defined(_M_IX86)
#error cmake_ARCH i386
#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64)
#error cmake_ARCH x86_64
#elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
#error cmake_ARCH ia64
#elif defined(__ppc__) || defined(__ppc) || defined(__powerpc__) \\
|| defined(_ARCH_COM) || defined(_ARCH_PWR) || defined(_ARCH_PPC) \\
|| defined(_M_MPPC) || defined(_M_PPC)
#if defined(__ppc64__) || defined(__powerpc64__) || defined(__64BIT__)
#error cmake_ARCH ppc64
#else
#error cmake_ARCH ppc
#endif
#endif
#error cmake_ARCH unknown
")
# Set ppc_support to TRUE before including this file or ppc and ppc64
# will be treated as invalid architectures since they are no longer supported by Apple
function(target_architecture output_var)
if(APPLE AND CMAKE_OSX_ARCHITECTURES)
# On OS X we use CMAKE_OSX_ARCHITECTURES *if* it was set
# First let's normalize the order of the values
# Note that it's not possible to compile PowerPC applications if you are using
# the OS X SDK version 10.6 or later - you'll need 10.4/10.5 for that, so we
# disable it by default
# See this page for more information:
# http://stackoverflow.com/questions/5333490/how-can-we-restore-ppc-ppc64-as-well-as-full-10-4-10-5-sdk-support-to-xcode-4
# Architecture defaults to i386 or ppc on OS X 10.5 and earlier, depending on the CPU type detected at runtime.
# On OS X 10.6+ the default is x86_64 if the CPU supports it, i386 otherwise.
foreach(osx_arch ${CMAKE_OSX_ARCHITECTURES})
if("${osx_arch}" STREQUAL "ppc" AND ppc_support)
set(osx_arch_ppc TRUE)
elseif("${osx_arch}" STREQUAL "i386")
set(osx_arch_i386 TRUE)
elseif("${osx_arch}" STREQUAL "x86_64")
set(osx_arch_x86_64 TRUE)
elseif("${osx_arch}" STREQUAL "ppc64" AND ppc_support)
set(osx_arch_ppc64 TRUE)
else()
message(FATAL_ERROR "Invalid OS X arch name: ${osx_arch}")
endif()
endforeach()
# Now add all the architectures in our normalized order
if(osx_arch_ppc)
list(APPEND ARCH ppc)
endif()
if(osx_arch_i386)
list(APPEND ARCH i386)
endif()
if(osx_arch_x86_64)
list(APPEND ARCH x86_64)
endif()
if(osx_arch_ppc64)
list(APPEND ARCH ppc64)
endif()
else()
file(WRITE "${CMAKE_BINARY_DIR}/arch.c" "${archdetect_c_code}")
enable_language(C)
# Detect the architecture in a rather creative way...
# This compiles a small C program which is a series of ifdefs that selects a
# particular #error preprocessor directive whose message string contains the
# target architecture. The program will always fail to compile (both because the
# file is not a valid C program, and obviously because of the presence of the
# #error preprocessor directives), but by exploiting the preprocessor in this
# way we can detect the correct target architecture even when cross-compiling,
# since the program itself never needs to be run (only the compiler/preprocessor is invoked).
try_run(
run_result_unused
compile_result_unused
"${CMAKE_BINARY_DIR}"
"${CMAKE_BINARY_DIR}/arch.c"
COMPILE_OUTPUT_VARIABLE ARCH
CMAKE_FLAGS CMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES}
)
# Parse the architecture name from the compiler output
string(REGEX MATCH "cmake_ARCH ([a-zA-Z0-9_]+)" ARCH "${ARCH}")
# Get rid of the value marker leaving just the architecture name
string(REPLACE "cmake_ARCH " "" ARCH "${ARCH}")
# If we are compiling with an unknown architecture this variable should
# already be set to "unknown" but in the case that it's empty (i.e. due
# to a typo in the code), then set it to unknown
if (NOT ARCH)
set(ARCH unknown)
endif()
endif()
set(${output_var} "${ARCH}" PARENT_SCOPE)
endfunction()
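# Illustrative usage sketch (commented out; it mirrors the call already made in
# FindOpenCL.cmake above). Set ppc_support before inclusion only if PowerPC
# architectures should be accepted on OS X:
#
#   # set(ppc_support TRUE)
#   include(${CMAKE_SOURCE_DIR}/../cmake/TargetArch.cmake)
#   target_architecture(TARGET_ARCH)
#   message(STATUS "Detected target architecture: ${TARGET_ARCH}")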
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|@Platform@'">
<LocalDebuggerWorkingDirectory>@WorkingDir@</LocalDebuggerWorkingDirectory>
<DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|@Platform@'">
<LocalDebuggerWorkingDirectory>@WorkingDir@</LocalDebuggerWorkingDirectory>
<DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='RelWithDebInfo|@Platform@'">
<LocalDebuggerWorkingDirectory>@WorkingDir@</LocalDebuggerWorkingDirectory>
<DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='MinSizeRel|@Platform@'">
<LocalDebuggerWorkingDirectory>@WorkingDir@</LocalDebuggerWorkingDirectory>
<DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
</PropertyGroup>
</Project>
\ No newline at end of file
File added