Commit e1738542 by Kai Westerkamp

A2 base

parent 95b044bd
@@ -27,7 +27,7 @@ bool CAssignment1::DoCompute()
 }
 cout << "================================" << endl << "Running vector addition example 2 ..." << endl << endl;
 {
-size_t LocalWorkSize[3] = {512, 1, 1};
+size_t LocalWorkSize[3] = {1028, 1, 1};
 CSimpleArraysTask task(1048576);
 RunComputeTask(task, LocalWorkSize);
 }
@@ -36,7 +36,7 @@ bool CAssignment1::DoCompute()
 std::cout << "================================"<< endl << "Running matrix rotation example..." << std::endl << std::endl;
 {
 size_t LocalWorkSize[3] = {32, 16, 1};
-CMatrixRotateTask task(2048, 1025);
+CMatrixRotateTask task(2049, 1025);
 RunComputeTask(task, LocalWorkSize);
 }
@@ -132,7 +132,7 @@ void CMatrixRotateTask::ComputeGPU(cl_context Context, cl_command_queue CommandQ
 //naive kernel
 // TO DO: time = CLUtil::ProfileKernel...
-double time = CLUtil::ProfileKernel(CommandQueue, m_NaiveKernel, 2, globalWorkSize, LocalWorkSize, 10000);
+double time = CLUtil::ProfileKernel(CommandQueue, m_NaiveKernel, 2, globalWorkSize, LocalWorkSize, 1000);
 cout << "Executed naive kernel in " << time << " ms." << endl;
 // TO DO: read back the results synchronously.
@@ -4,7 +4,7 @@
 __kernel void VecAdd(__global const int* a, __global const int* b, __global int* c, int numElements){
 int GID = get_global_id(0);
-if (GID < numElements && GID >= 0){
+if (GID < numElements){
 c[GID] = a[GID] + b[numElements - GID-1];
 }
 }
File added
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CAssignment2.h"
#include "CReductionTask.h"
#include "CScanTask.h"
#include <iostream>
using namespace std;
///////////////////////////////////////////////////////////////////////////////
// CAssignment2
bool CAssignment2::DoCompute()
{
// Task 1: parallel reduction
cout<<"########################################"<<endl;
cout<<"Running parallel reduction task..."<<endl<<endl;
{
size_t LocalWorkSize[3] = {256, 1, 1};
CReductionTask reduction(1024 * 1024 * 16);
RunComputeTask(reduction, LocalWorkSize);
}
// Task 2: parallel prefix sum
cout<<"########################################"<<endl;
cout<<"Running parallel prefix sum task..."<<endl<<endl;
{
size_t LocalWorkSize[3] = {256, 1, 1};
CScanTask scan(1024 * 1024 * 64, LocalWorkSize[0]);
RunComputeTask(scan, LocalWorkSize);
}
return true;
}
///////////////////////////////////////////////////////////////////////////////
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _CASSIGNMENT2_H
#define _CASSIGNMENT2_H
#include "../Common/CAssignmentBase.h"
//! Assignment2 solution
class CAssignment2 : public CAssignmentBase
{
public:
virtual ~CAssignment2() {};
//! This overloaded method contains the specific solution of A2
virtual bool DoCompute();
};
#endif // _CASSIGNMENT2_H
cmake_minimum_required (VERSION 2.8.3)
project (GPUComputing)
# Add our modules to the path
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/../cmake/")
include(CheckCXXCompilerFlag)
if (WIN32)
else (WIN32)
#set (EXTRA_COMPILE_FLAGS "-Wall -Werror")
set (EXTRA_COMPILE_FLAGS "-Wall")
CHECK_CXX_COMPILER_FLAG(-std=c++11 HAS_CXX_11)
if (HAS_CXX_11)
set(EXTRA_COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -std=c++11")
message(STATUS "Enabling C++11 support")
else(HAS_CXX_11)
message(WARNING "No C++11 support detected, build will fail.")
endif()
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_COMPILE_FLAGS}")
endif (WIN32)
# Include support for changing the working directory in Visual Studio
include(ChangeWorkingDirectory)
# Search for OpenCL and add paths
find_package( OpenCL REQUIRED )
include_directories( ${OPENCL_INCLUDE_DIRS} )
# Include Common module
add_subdirectory (../Common ${CMAKE_BINARY_DIR}/Common)
# Define source files for this assignment
FILE(GLOB Sources *.cpp)
FILE(GLOB Headers *.h)
FILE(GLOB CLSources *.cl)
ADD_EXECUTABLE (Assignment
${Sources}
${Headers}
${CLSources}
)
# Link required libraries
target_link_libraries(Assignment ${OPENCL_LIBRARIES})
target_link_libraries(Assignment GPUCommon)
if (WIN32)
change_workingdir(Assignment ${CMAKE_SOURCE_DIR})
endif()
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CReductionTask.h"
#include "../Common/CLUtil.h"
#include "../Common/CTimer.h"
using namespace std;
///////////////////////////////////////////////////////////////////////////////
// CReductionTask
string g_kernelNames[4] = {
"interleavedAddressing",
"sequentialAddressing",
"kernelDecomposition",
"kernelDecompositionUnroll"
};
CReductionTask::CReductionTask(size_t ArraySize)
: m_N(ArraySize), m_hInput(NULL),
m_dPingArray(NULL),
m_dPongArray(NULL),
m_Program(NULL),
m_InterleavedAddressingKernel(NULL), m_SequentialAddressingKernel(NULL), m_DecompKernel(NULL), m_DecompUnrollKernel(NULL)
{
}
CReductionTask::~CReductionTask()
{
ReleaseResources();
}
bool CReductionTask::InitResources(cl_device_id Device, cl_context Context)
{
//CPU resources
m_hInput = new unsigned int[m_N];
//fill the array with some values
for(unsigned int i = 0; i < m_N; i++)
//m_hInput[i] = 1; // Use this for debugging
m_hInput[i] = rand() & 15;
//device resources
cl_int clError, clError2;
m_dPingArray = clCreateBuffer(Context, CL_MEM_READ_WRITE, sizeof(cl_uint) * m_N, NULL, &clError2);
clError = clError2;
m_dPongArray = clCreateBuffer(Context, CL_MEM_READ_WRITE, sizeof(cl_uint) * m_N, NULL, &clError2);
clError |= clError2;
V_RETURN_FALSE_CL(clError, "Error allocating device arrays");
//load and compile kernels
string programCode;
CLUtil::LoadProgramSourceToMemory("Reduction.cl", programCode);
m_Program = CLUtil::BuildCLProgramFromMemory(Device, Context, programCode);
if(m_Program == nullptr) return false;
//create kernels
m_InterleavedAddressingKernel = clCreateKernel(m_Program, "Reduction_InterleavedAddressing", &clError);
V_RETURN_FALSE_CL(clError, "Failed to create kernel: Reduction_InterleavedAddressing.");
m_SequentialAddressingKernel = clCreateKernel(m_Program, "Reduction_SequentialAddressing", &clError);
V_RETURN_FALSE_CL(clError, "Failed to create kernel: Reduction_SequentialAddressing.");
m_DecompKernel = clCreateKernel(m_Program, "Reduction_Decomp", &clError);
V_RETURN_FALSE_CL(clError, "Failed to create kernel: Reduction_Decomp.");
m_DecompUnrollKernel = clCreateKernel(m_Program, "Reduction_DecompUnroll", &clError);
V_RETURN_FALSE_CL(clError, "Failed to create kernel: Reduction_DecompUnroll.");
return true;
}
void CReductionTask::ReleaseResources()
{
// host resources
SAFE_DELETE_ARRAY(m_hInput);
// device resources
SAFE_RELEASE_MEMOBJECT(m_dPingArray);
SAFE_RELEASE_MEMOBJECT(m_dPongArray);
SAFE_RELEASE_KERNEL(m_InterleavedAddressingKernel);
SAFE_RELEASE_KERNEL(m_SequentialAddressingKernel);
SAFE_RELEASE_KERNEL(m_DecompKernel);
SAFE_RELEASE_KERNEL(m_DecompUnrollKernel);
SAFE_RELEASE_PROGRAM(m_Program);
}
void CReductionTask::ComputeGPU(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
ExecuteTask(Context, CommandQueue, LocalWorkSize, 0);
ExecuteTask(Context, CommandQueue, LocalWorkSize, 1);
ExecuteTask(Context, CommandQueue, LocalWorkSize, 2);
ExecuteTask(Context, CommandQueue, LocalWorkSize, 3);
TestPerformance(Context, CommandQueue, LocalWorkSize, 0);
TestPerformance(Context, CommandQueue, LocalWorkSize, 1);
TestPerformance(Context, CommandQueue, LocalWorkSize, 2);
TestPerformance(Context, CommandQueue, LocalWorkSize, 3);
}
void CReductionTask::ComputeCPU()
{
CTimer timer;
timer.Start();
unsigned int nIterations = 10;
for(unsigned int j = 0; j < nIterations; j++) {
m_resultCPU = m_hInput[0];
for(unsigned int i = 1; i < m_N; i++) {
m_resultCPU += m_hInput[i];
}
}
timer.Stop();
double ms = timer.GetElapsedMilliseconds() / double(nIterations);
cout << " average time: " << ms << " ms, throughput: " << 1.0e-6 * (double)m_N / ms << " Gelem/s" <<endl;
}
bool CReductionTask::ValidateResults()
{
bool success = true;
for(int i = 0; i < 4; i++)
if(m_resultGPU[i] != m_resultCPU)
{
cout<<"Validation of reduction kernel "<<g_kernelNames[i]<<" failed." << endl;
success = false;
}
return success;
}
void CReductionTask::Reduction_InterleavedAddressing(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
//cl_int clErr;
//size_t globalWorkSize[1];
//size_t localWorkSize[1];
//unsigned int stride = ...;
// TO DO: Implement reduction with interleaved addressing
//for (...) {
//}
}
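// ---------------------------------------------------------------------------
// Sketch only (not the official solution): a possible host-side loop for this
// pass. It assumes the kernel lets work item i compute
// array[2*stride*i] += array[2*stride*i + stride], and that m_N is a power of
// two (it is 1024*1024*16 here), so every launch size divides evenly.
/*
	cl_int clErr = clSetKernelArg(m_InterleavedAddressingKernel, 0, sizeof(cl_mem), (void*)&m_dPingArray);
	// one pass per stride: 1, 2, 4, ..., m_N/2; each pass halves the active elements
	for (unsigned int stride = 1; stride < m_N; stride *= 2) {
		clErr |= clSetKernelArg(m_InterleavedAddressingKernel, 1, sizeof(cl_uint), (void*)&stride);
		size_t globalSize = m_N / (2 * stride);                  // one work item per pairwise add
		size_t localSize  = min(LocalWorkSize[0], globalSize);   // shrink the group for the last passes
		clErr |= clEnqueueNDRangeKernel(CommandQueue, m_InterleavedAddressingKernel, 1, NULL,
			&globalSize, &localSize, 0, NULL, NULL);
	}
	V_RETURN_CL(clErr, "Error executing Reduction_InterleavedAddressing kernel");
	// the total ends up in m_dPingArray[0], which ExecuteTask() reads back
*/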
void CReductionTask::Reduction_SequentialAddressing(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
// TO DO: Implement reduction with sequential addressing
}
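// ---------------------------------------------------------------------------
// Sketch only: same dispatch structure as the interleaved variant, but the
// stride is assumed to start at m_N/2 and halve each pass, with work item i
// computing array[i] += array[i + stride], so the active items stay contiguous.
/*
	cl_int clErr = clSetKernelArg(m_SequentialAddressingKernel, 0, sizeof(cl_mem), (void*)&m_dPingArray);
	for (unsigned int stride = m_N / 2; stride >= 1; stride /= 2) {
		clErr |= clSetKernelArg(m_SequentialAddressingKernel, 1, sizeof(cl_uint), (void*)&stride);
		size_t globalSize = stride;                              // one work item per pairwise add
		size_t localSize  = min(LocalWorkSize[0], globalSize);
		clErr |= clEnqueueNDRangeKernel(CommandQueue, m_SequentialAddressingKernel, 1, NULL,
			&globalSize, &localSize, 0, NULL, NULL);
	}
	V_RETURN_CL(clErr, "Error executing Reduction_SequentialAddressing kernel");
*/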
void CReductionTask::Reduction_Decomp(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
// TO DO: Implement reduction with kernel decomposition
// NOTE: make sure that the final result is always in the variable m_dPingArray
// as this is read back for the correctness check
// (CReductionTask::ExecuteTask)
//
// hint: for example, you can use swap(m_dPingArray, m_dPongArray) at the end of your for loop...
}
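// ---------------------------------------------------------------------------
// Sketch only: assumes each work group reduces 2 * LocalWorkSize[0] input
// elements (guarded by N inside the kernel) into one partial sum in outArray,
// so every pass shrinks the problem by that factor until one value is left.
/*
	cl_int clErr = CL_SUCCESS;
	size_t localSize = LocalWorkSize[0];
	unsigned int N = m_N;
	while (N > 1) {
		unsigned int nGroups = max(1u, N / (unsigned int)(2 * localSize));
		size_t globalSize = nGroups * localSize;
		clErr |= clSetKernelArg(m_DecompKernel, 0, sizeof(cl_mem),  (void*)&m_dPingArray);
		clErr |= clSetKernelArg(m_DecompKernel, 1, sizeof(cl_mem),  (void*)&m_dPongArray);
		clErr |= clSetKernelArg(m_DecompKernel, 2, sizeof(cl_uint), (void*)&N);
		clErr |= clSetKernelArg(m_DecompKernel, 3, localSize * sizeof(cl_uint), NULL); // local scratch
		clErr |= clEnqueueNDRangeKernel(CommandQueue, m_DecompKernel, 1, NULL,
			&globalSize, &localSize, 0, NULL, NULL);
		swap(m_dPingArray, m_dPongArray);   // partial sums become the next pass' input
		N = nGroups;
	}
	V_RETURN_CL(clErr, "Error executing Reduction_Decomp kernel");
	// after the final swap the single sum sits in m_dPingArray[0]
*/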
void CReductionTask::Reduction_DecompUnroll(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
// TO DO: Implement reduction with loop unrolling
// NOTE: make sure that the final result is always in the variable m_dPingArray
// as this is read back for the correctness check
// (CReductionTask::ExecuteTask)
//
// hint: for example, you can use swap(m_dPingArray, m_dPongArray) at the end of your for loop...
}
void CReductionTask::ExecuteTask(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3], unsigned int Task)
{
//write input data to the GPU
V_RETURN_CL(clEnqueueWriteBuffer(CommandQueue, m_dPingArray, CL_FALSE, 0, m_N * sizeof(cl_uint), m_hInput, 0, NULL, NULL), "Error copying data from host to device!");
//run selected task
switch (Task){
case 0:
Reduction_InterleavedAddressing(Context, CommandQueue, LocalWorkSize);
break;
case 1:
Reduction_SequentialAddressing(Context, CommandQueue, LocalWorkSize);
break;
case 2:
Reduction_Decomp(Context, CommandQueue, LocalWorkSize);
break;
case 3:
Reduction_DecompUnroll(Context, CommandQueue, LocalWorkSize);
break;
}
//read back the results synchronously.
m_resultGPU[Task] = 0;
V_RETURN_CL(clEnqueueReadBuffer(CommandQueue, m_dPingArray, CL_TRUE, 0, 1 * sizeof(cl_uint), &m_resultGPU[Task], 0, NULL, NULL), "Error reading data from device!");
}
void CReductionTask::TestPerformance(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3], unsigned int Task)
{
cout << "Testing performance of task " << g_kernelNames[Task] << endl;
//write input data to the GPU
V_RETURN_CL(clEnqueueWriteBuffer(CommandQueue, m_dPingArray, CL_FALSE, 0, m_N * sizeof(cl_uint), m_hInput, 0, NULL, NULL), "Error copying data from host to device!");
//finish all before we start measuring the time
V_RETURN_CL(clFinish(CommandQueue), "Error finishing the queue!");
CTimer timer;
timer.Start();
//run the kernel N times
unsigned int nIterations = 100;
for(unsigned int i = 0; i < nIterations; i++) {
//run selected task
switch (Task){
case 0:
Reduction_InterleavedAddressing(Context, CommandQueue, LocalWorkSize);
break;
case 1:
Reduction_SequentialAddressing(Context, CommandQueue, LocalWorkSize);
break;
case 2:
Reduction_Decomp(Context, CommandQueue, LocalWorkSize);
break;
case 3:
Reduction_DecompUnroll(Context, CommandQueue, LocalWorkSize);
break;
}
}
//wait until the command queue is empty again
V_RETURN_CL(clFinish(CommandQueue), "Error finishing the queue!");
timer.Stop();
double ms = timer.GetElapsedMilliseconds() / double(nIterations);
cout << " average time: " << ms << " ms, throughput: " << 1.0e-6 * (double)m_N / ms << " Gelem/s" <<endl;
}
///////////////////////////////////////////////////////////////////////////////
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _CREDUCTION_TASK_H
#define _CREDUCTION_TASK_H
#include "../Common/IComputeTask.h"
//! A2/T1: Parallel reduction
class CReductionTask : public IComputeTask
{
public:
CReductionTask(size_t ArraySize);
virtual ~CReductionTask();
// IComputeTask
virtual bool InitResources(cl_device_id Device, cl_context Context);
virtual void ReleaseResources();
virtual void ComputeGPU(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3]);
virtual void ComputeCPU();
virtual bool ValidateResults();
protected:
void Reduction_InterleavedAddressing(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3]);
void Reduction_SequentialAddressing(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3]);
void Reduction_Decomp(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3]);
void Reduction_DecompUnroll(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3]);
void ExecuteTask(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3], unsigned int task);
void TestPerformance(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3], unsigned int task);
//NOTE: we have two memory address spaces, so we mark pointers with a prefix
//to avoid confusions: 'h' - host, 'd' - device
unsigned int m_N;
// input data
unsigned int *m_hInput;
// results
unsigned int m_resultCPU;
unsigned int m_resultGPU[4];
cl_mem m_dPingArray;
cl_mem m_dPongArray;
//OpenCL program and kernels
cl_program m_Program;
cl_kernel m_InterleavedAddressingKernel;
cl_kernel m_SequentialAddressingKernel;
cl_kernel m_DecompKernel;
cl_kernel m_DecompUnrollKernel;
};
#endif // _CREDUCTION_TASK_H
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CScanTask.h"
#include "../Common/CLUtil.h"
#include "../Common/CTimer.h"
#include <string.h>
using namespace std;
// number of banks in the local memory. This can be used to avoid bank conflicts
// but we also need to allocate more local memory for that.
#define NUM_BANKS 32
///////////////////////////////////////////////////////////////////////////////
// CScanTask
// only useful for debug info
const string g_kernelNames[2] =
{
"scanNaive",
"scanWorkEfficient"
};
CScanTask::CScanTask(size_t ArraySize, size_t MinLocalWorkSize)
: m_N(ArraySize), m_hArray(NULL), m_hResultCPU(NULL), m_hResultGPU(NULL),
m_dPingArray(NULL), m_dPongArray(NULL),
m_Program(NULL),
m_ScanNaiveKernel(NULL), m_ScanWorkEfficientKernel(NULL), m_ScanWorkEfficientAddKernel(NULL)
{
// compute the number of levels that we need for the work-efficient algorithm
m_MinLocalWorkSize = MinLocalWorkSize;
m_nLevels = 1;
size_t N = ArraySize;
while (N > 0){
N /= 2 * m_MinLocalWorkSize;
m_nLevels++;
}
// Reset validation results
for (int i = 0; i < (int)ARRAYLEN(m_bValidationResults); i++)
m_bValidationResults[i] = false;
}
CScanTask::~CScanTask()
{
ReleaseResources();
}
bool CScanTask::InitResources(cl_device_id Device, cl_context Context)
{
//CPU resources
m_hArray = new unsigned int[m_N];
m_hResultCPU = new unsigned int[m_N];
m_hResultGPU = new unsigned int[m_N];
//fill the array with some values
for(unsigned int i = 0; i < m_N; i++)
//m_hArray[i] = 1; // Use this for debugging
m_hArray[i] = rand() & 15;
//device resources
// ping-pong buffers
cl_int clError, clError2;
m_dPingArray = clCreateBuffer(Context, CL_MEM_READ_WRITE, sizeof(cl_uint) * m_N, NULL, &clError2);
clError = clError2;
m_dPongArray = clCreateBuffer(Context, CL_MEM_READ_WRITE, sizeof(cl_uint) * m_N, NULL, &clError2);
clError |= clError2;
// level buffer
m_dLevelArrays = new cl_mem[m_nLevels];
unsigned int N = m_N;
for (unsigned int i = 0; i < m_nLevels; i++) {
m_dLevelArrays[i] = clCreateBuffer(Context, CL_MEM_READ_WRITE, sizeof(cl_uint) * N, NULL, &clError2);
clError |= clError2;
N = max(N / (2 * m_MinLocalWorkSize), m_MinLocalWorkSize);
}
V_RETURN_FALSE_CL(clError, "Error allocating device arrays");
//load and compile kernels
string programCode;
CLUtil::LoadProgramSourceToMemory("Scan.cl", programCode);
m_Program = CLUtil::BuildCLProgramFromMemory(Device, Context, programCode);
if(m_Program == nullptr) return false;
//create kernels
m_ScanNaiveKernel = clCreateKernel(m_Program, "Scan_Naive", &clError);
V_RETURN_FALSE_CL(clError, "Failed to create kernel.");
m_ScanWorkEfficientKernel = clCreateKernel(m_Program, "Scan_WorkEfficient", &clError);
V_RETURN_FALSE_CL(clError, "Failed to create kernel.");
m_ScanWorkEfficientAddKernel = clCreateKernel(m_Program, "Scan_WorkEfficientAdd", &clError);
V_RETURN_FALSE_CL(clError, "Failed to create kernel.");
return true;
}
void CScanTask::ReleaseResources()
{
// host resources
SAFE_DELETE_ARRAY(m_hArray);
SAFE_DELETE_ARRAY(m_hResultCPU);
SAFE_DELETE_ARRAY(m_hResultGPU);
// device resources
SAFE_RELEASE_MEMOBJECT(m_dPingArray);
SAFE_RELEASE_MEMOBJECT(m_dPongArray);
if(m_dLevelArrays)
for (unsigned int i = 0; i < m_nLevels; i++) {
SAFE_RELEASE_MEMOBJECT(m_dLevelArrays[i]);
}
SAFE_DELETE_ARRAY(m_dLevelArrays);
SAFE_RELEASE_KERNEL(m_ScanNaiveKernel);
SAFE_RELEASE_KERNEL(m_ScanWorkEfficientKernel);
SAFE_RELEASE_KERNEL(m_ScanWorkEfficientAddKernel);
SAFE_RELEASE_PROGRAM(m_Program);
}
void CScanTask::ComputeGPU(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
cout << endl;
ValidateTask(Context, CommandQueue, LocalWorkSize, 0);
ValidateTask(Context, CommandQueue, LocalWorkSize, 1);
cout << endl;
TestPerformance(Context, CommandQueue, LocalWorkSize, 0);
TestPerformance(Context, CommandQueue, LocalWorkSize, 1);
cout << endl;
}
void CScanTask::ComputeCPU()
{
CTimer timer;
timer.Start();
unsigned int nIterations = 1;
for(unsigned int j = 0; j < nIterations; j++) {
unsigned int sum = 0;
for(unsigned int i = 0; i < m_N; i++) {
sum += m_hArray[i];
m_hResultCPU[i] = sum;
}
}
timer.Stop();
double ms = timer.GetElapsedMilliseconds() / double(nIterations);
cout << " average time: " << ms << " ms, throughput: " << 1.0e-6 * (double)m_N / ms << " Gelem/s" <<endl;
}
bool CScanTask::ValidateResults()
{
bool success = true;
for(int i = 0; i < 2; i++)
if(!m_bValidationResults[i])
{
cout<<"Validation of reduction kernel "<<g_kernelNames[i]<<" failed." << endl;
success = false;
}
return success;
}
void CScanTask::Scan_Naive(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
// TO DO: Implement naive version of scan
// NOTE: make sure that the final result is always in the variable m_dPingArray
// as this is read back for the correctness check
// (CReductionTask::ValidateTask)
//
// hint: for example, you can use swap(m_dPingArray, m_dPongArray) at the end of your for loop...
}
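// ---------------------------------------------------------------------------
// Sketch only: a possible host loop for the naive (Hillis-Steele) scan,
// assuming the kernel computes outArray[i] = inArray[i] + inArray[i - offset]
// for i >= offset and just copies inArray[i] otherwise. The buffers are
// ping-ponged after every pass, so the final result ends up in m_dPingArray.
/*
	cl_int clErr = CL_SUCCESS;
	size_t localSize  = LocalWorkSize[0];
	size_t globalSize = CLUtil::GetGlobalWorkSize(m_N, localSize);
	for (unsigned int offset = 1; offset < m_N; offset *= 2) {
		clErr |= clSetKernelArg(m_ScanNaiveKernel, 0, sizeof(cl_mem),  (void*)&m_dPingArray);
		clErr |= clSetKernelArg(m_ScanNaiveKernel, 1, sizeof(cl_mem),  (void*)&m_dPongArray);
		clErr |= clSetKernelArg(m_ScanNaiveKernel, 2, sizeof(cl_uint), (void*)&m_N);
		clErr |= clSetKernelArg(m_ScanNaiveKernel, 3, sizeof(cl_uint), (void*)&offset);
		clErr |= clEnqueueNDRangeKernel(CommandQueue, m_ScanNaiveKernel, 1, NULL,
			&globalSize, &localSize, 0, NULL, NULL);
		swap(m_dPingArray, m_dPongArray);   // output of this pass is the input of the next
	}
	V_RETURN_CL(clErr, "Error executing Scan_Naive kernel");
*/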
void CScanTask::Scan_WorkEfficient(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3])
{
// TO DO: Implement efficient version of scan
// Make sure that the local prefix sum works before you start experimenting with large arrays
}
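// ---------------------------------------------------------------------------
// Sketch only: one possible multi-level dispatch, assuming Scan_WorkEfficient
// scans each block of 2 * localSize elements of a level in place and writes the
// block totals to the next level, and Scan_WorkEfficientAdd adds the scanned
// totals back onto the finer level (Figure 14). Assumes m_N is a power of two.
/*
	cl_int clErr = CL_SUCCESS;
	size_t localSize = LocalWorkSize[0];
	size_t localMemSize = (2 * localSize + 2 * localSize / NUM_BANKS) * sizeof(cl_uint); // padded for bank-conflict avoidance

	unsigned int* levelSize = new unsigned int[m_nLevels];
	unsigned int nScanned = 0;

	// down phase: per-block scan of each level; block totals feed the next level
	unsigned int N = m_N;
	for (unsigned int level = 0; level + 1 < m_nLevels && N > 1; level++) {
		levelSize[nScanned++] = N;
		size_t blockSize  = min(localSize, (size_t)N / 2);    // last level may be smaller than one full block
		size_t nGroups    = N / (2 * blockSize);
		size_t globalSize = nGroups * blockSize;
		clErr |= clSetKernelArg(m_ScanWorkEfficientKernel, 0, sizeof(cl_mem), (void*)&m_dLevelArrays[level]);
		clErr |= clSetKernelArg(m_ScanWorkEfficientKernel, 1, sizeof(cl_mem), (void*)&m_dLevelArrays[level + 1]);
		clErr |= clSetKernelArg(m_ScanWorkEfficientKernel, 2, localMemSize, NULL);
		clErr |= clEnqueueNDRangeKernel(CommandQueue, m_ScanWorkEfficientKernel, 1, NULL,
			&globalSize, &blockSize, 0, NULL, NULL);
		N = (unsigned int)nGroups;
	}

	// up phase: add each group's scanned total back onto the finer level
	for (int level = (int)nScanned - 2; level >= 0; level--) {
		size_t blockSize  = min(localSize, (size_t)levelSize[level] / 2);
		size_t nGroups    = levelSize[level] / (2 * blockSize);
		size_t globalSize = nGroups * blockSize;
		clErr |= clSetKernelArg(m_ScanWorkEfficientAddKernel, 0, sizeof(cl_mem), (void*)&m_dLevelArrays[level + 1]);
		clErr |= clSetKernelArg(m_ScanWorkEfficientAddKernel, 1, sizeof(cl_mem), (void*)&m_dLevelArrays[level]);
		clErr |= clSetKernelArg(m_ScanWorkEfficientAddKernel, 2, localMemSize, NULL);
		clErr |= clEnqueueNDRangeKernel(CommandQueue, m_ScanWorkEfficientAddKernel, 1, NULL,
			&globalSize, &blockSize, 0, NULL, NULL);
	}
	delete [] levelSize;
	V_RETURN_CL(clErr, "Error executing Scan_WorkEfficient kernels");
	// ValidateTask() reads the fully scanned result from m_dLevelArrays[0]
*/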
void CScanTask::ValidateTask(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3], unsigned int Task)
{
//run selected task
switch (Task){
case 0:
V_RETURN_CL(clEnqueueWriteBuffer(CommandQueue, m_dPingArray, CL_FALSE, 0, m_N * sizeof(cl_uint), m_hArray, 0, NULL, NULL), "Error copying data from host to device!");
Scan_Naive(Context, CommandQueue, LocalWorkSize);
V_RETURN_CL(clEnqueueReadBuffer(CommandQueue, m_dPingArray, CL_TRUE, 0, m_N * sizeof(cl_uint), m_hResultGPU, 0, NULL, NULL), "Error reading data from device!");
break;
case 1:
V_RETURN_CL(clEnqueueWriteBuffer(CommandQueue, m_dLevelArrays[0], CL_FALSE, 0, m_N * sizeof(cl_uint), m_hArray, 0, NULL, NULL), "Error copying data from host to device!");
Scan_WorkEfficient(Context, CommandQueue, LocalWorkSize);
V_RETURN_CL(clEnqueueReadBuffer(CommandQueue, m_dLevelArrays[0], CL_TRUE, 0, m_N * sizeof(cl_uint), m_hResultGPU, 0, NULL, NULL), "Error reading data from device!");
break;
}
// validate results
m_bValidationResults[Task] =( memcmp(m_hResultCPU, m_hResultGPU, m_N * sizeof(unsigned int)) == 0);
}
void CScanTask::TestPerformance(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3], unsigned int Task)
{
cout << "Testing performance of task " << g_kernelNames[Task] << endl;
//write input data to the GPU
V_RETURN_CL(clEnqueueWriteBuffer(CommandQueue, m_dPingArray, CL_FALSE, 0, m_N * sizeof(cl_uint), m_hArray, 0, NULL, NULL), "Error copying data from host to device!");
//finish all before we start measuring the time
V_RETURN_CL(clFinish(CommandQueue), "Error finishing the queue!");
CTimer timer;
timer.Start();
//run the kernel N times
unsigned int nIterations = 100;
for(unsigned int i = 0; i < nIterations; i++) {
//run selected task
switch (Task){
case 0:
Scan_Naive(Context, CommandQueue, LocalWorkSize);
break;
case 1:
Scan_WorkEfficient(Context, CommandQueue, LocalWorkSize);
break;
}
}
//wait until the command queue is empty again
V_RETURN_CL(clFinish(CommandQueue), "Error finishing the queue!");
timer.Stop();
double ms = timer.GetElapsedMilliseconds() / double(nIterations);
cout << " average time: " << ms << " ms, throughput: " << 1.0e-6 * (double)m_N / ms << " Gelem/s" <<endl;
}
///////////////////////////////////////////////////////////////////////////////
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _CSCAN_TASK_H
#define _CSCAN_TASK_H
#include "../Common/IComputeTask.h"
//! A2 / T2 Parallel prefix sum (scan)
class CScanTask : public IComputeTask
{
public:
//! The second parameter is necessary to pre-allocate the multi-level arrays
CScanTask(size_t ArraySize, size_t MinLocalWorkSize);
virtual ~CScanTask();
// IComputeTask
virtual bool InitResources(cl_device_id Device, cl_context Context);
virtual void ReleaseResources();
virtual void ComputeGPU(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3]);
virtual void ComputeCPU();
virtual bool ValidateResults();
protected:
void Scan_Naive(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3]);
void Scan_WorkEfficient(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3]);
void ValidateTask(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3], unsigned int Task);
void TestPerformance(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3], unsigned int Task);
unsigned int m_N;
//float data on the CPU
unsigned int *m_hArray;
unsigned int *m_hResultCPU;
unsigned int *m_hResultGPU;
bool m_bValidationResults[2];
// ping-pong arrays for the naive scan
cl_mem m_dPingArray;
cl_mem m_dPongArray;
// arrays for each level of the work-efficient scan
size_t m_MinLocalWorkSize;
unsigned int m_nLevels;
cl_mem *m_dLevelArrays;
//OpenCL program and kernels
cl_program m_Program;
cl_kernel m_ScanNaiveKernel;
cl_kernel m_ScanWorkEfficientKernel;
cl_kernel m_ScanWorkEfficientAddKernel;
};
#endif // _CSCAN_TASK_H
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void Reduction_InterleavedAddressing(__global uint* array, uint stride)
{
// TO DO: Kernel implementation
}
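// Sketch only (matches the host-side assumption in CReductionTask.cpp): work
// item i adds the element 'stride' away into its own element; the host launches
// exactly N / (2 * stride) work items per pass, so no bounds check is needed here.
/*
	uint GID  = get_global_id(0);
	uint base = 2 * stride * GID;        // first element of this work item's pair
	array[base] += array[base + stride];
*/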
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void Reduction_SequentialAddressing(__global uint* array, uint stride)
{
// TO DO: Kernel implementation
}
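// Sketch only: with sequential addressing the active work items read and write a
// contiguous range, avoiding the divergent, strided accesses of the interleaved
// variant. The host is assumed to launch exactly 'stride' work items per pass,
// so again no bounds check is needed.
/*
	uint GID = get_global_id(0);
	array[GID] += array[GID + stride];
*/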
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void Reduction_Decomp(const __global uint* inArray, __global uint* outArray, uint N, __local uint* localBlock)
{
// TO DO: Kernel implementation
}
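// Sketch only: each work group loads 2 * get_local_size(0) elements (guarded by
// N) into localBlock, reduces them with sequential addressing in local memory,
// and work item 0 writes the group's partial sum to outArray[get_group_id(0)].
// localBlock is assumed to hold get_local_size(0) entries.
/*
	uint LID = get_local_id(0);
	uint LS  = get_local_size(0);
	uint GID = get_group_id(0) * LS * 2 + LID;

	localBlock[LID] = (GID < N ? inArray[GID] : 0)
	                + (GID + LS < N ? inArray[GID + LS] : 0);
	barrier(CLK_LOCAL_MEM_FENCE);

	for (uint stride = LS / 2; stride > 0; stride >>= 1) {
		if (LID < stride)
			localBlock[LID] += localBlock[LID + stride];
		barrier(CLK_LOCAL_MEM_FENCE);
	}

	if (LID == 0)
		outArray[get_group_id(0)] = localBlock[0];
*/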
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void Reduction_DecompUnroll(const __global uint* inArray, __global uint* outArray, uint N, __local uint* localBlock)
{
// TO DO: Kernel implementation
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void Scan_Naive(const __global uint* inArray, __global uint* outArray, uint N, uint offset)
{
// TO DO: Kernel implementation
}
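// Sketch only: one Hillis-Steele pass over the whole array; the host ping-pongs
// inArray/outArray and doubles 'offset' between passes.
/*
	uint GID = get_global_id(0);
	if (GID >= N)
		return;
	outArray[GID] = inArray[GID] + (GID >= offset ? inArray[GID - offset] : 0);
*/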
// Why did we not have conflicts in the Reduction? Because of the sequential addressing (here we use interleaved => we have conflicts).
#define UNROLL
#define NUM_BANKS 32
#define NUM_BANKS_LOG 5
#define SIMD_GROUP_SIZE 32
// Bank conflicts
#define AVOID_BANK_CONFLICTS
#ifdef AVOID_BANK_CONFLICTS
// TO DO: define your conflict-free macro here
#else
#define OFFSET(A) (A)
#endif
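// Sketch only: a common conflict-free mapping adds one padding word per NUM_BANKS
// elements, so that the strided accesses of the tree phases hit different banks.
// The host must then allocate the extra padding words of local memory as well.
//#define OFFSET(A) ((A) + ((A) >> NUM_BANKS_LOG))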
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLevelArray, __local uint* localBlock)
{
// TO DO: Kernel implementation
}
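// Sketch only (one possible block-level implementation, matching the host
// assumptions in CScanTask.cpp): a Blelloch up-sweep/down-sweep over
// 2 * get_local_size(0) elements in localBlock (indexed through OFFSET()),
// converted to an inclusive scan on write-back; work item 0 stores the block
// total in higherLevelArray[get_group_id(0)].
/*
	const uint LID = get_local_id(0);
	const uint LS  = get_local_size(0);
	const uint n   = 2 * LS;
	const uint groupOffset = get_group_id(0) * n;

	// keep private copies for the exclusive -> inclusive conversion at the end
	uint val0 = array[groupOffset + LID];
	uint val1 = array[groupOffset + LID + LS];
	localBlock[OFFSET(LID)]      = val0;
	localBlock[OFFSET(LID + LS)] = val1;

	// up-sweep (reduce)
	uint offset = 1;
	for (uint d = LS; d > 0; d >>= 1) {
		barrier(CLK_LOCAL_MEM_FENCE);
		if (LID < d) {
			uint ai = offset * (2 * LID + 1) - 1;
			uint bi = offset * (2 * LID + 2) - 1;
			localBlock[OFFSET(bi)] += localBlock[OFFSET(ai)];
		}
		offset <<= 1;
	}

	// store the block total for the next level, then clear the root
	if (LID == 0) {
		higherLevelArray[get_group_id(0)] = localBlock[OFFSET(n - 1)];
		localBlock[OFFSET(n - 1)] = 0;
	}

	// down-sweep: produces an exclusive scan in localBlock
	for (uint d = 1; d <= LS; d <<= 1) {
		offset >>= 1;
		barrier(CLK_LOCAL_MEM_FENCE);
		if (LID < d) {
			uint ai = offset * (2 * LID + 1) - 1;
			uint bi = offset * (2 * LID + 2) - 1;
			uint t = localBlock[OFFSET(ai)];
			localBlock[OFFSET(ai)]  = localBlock[OFFSET(bi)];
			localBlock[OFFSET(bi)] += t;
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);

	// write back as an inclusive scan: exclusive result + original element
	array[groupOffset + LID]      = localBlock[OFFSET(LID)] + val0;
	array[groupOffset + LID + LS] = localBlock[OFFSET(LID + LS)] + val1;
*/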
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void Scan_WorkEfficientAdd(__global uint* higherLevelArray, __global uint* array, __local uint* localBlock)
{
// TO DO: Kernel implementation (large arrays)
// Kernel that should add the group PPS to the local PPS (Figure 14)
}
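// Sketch only: higherLevelArray is assumed to hold the inclusive scan of the
// block totals, so every block except the first adds the scanned total of all
// preceding blocks to its 2 * get_local_size(0) elements. localBlock is unused
// here; it is only part of the signature.
/*
	const uint LID = get_local_id(0);
	const uint LS  = get_local_size(0);
	const uint GRP = get_group_id(0);
	const uint groupOffset = GRP * LS * 2;

	if (GRP > 0) {
		uint add = higherLevelArray[GRP - 1];
		array[groupOffset + LID]      += add;
		array[groupOffset + LID + LS] += add;
	}
*/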
\ No newline at end of file
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CAssignment2.h"
#include <iostream>
using namespace std;
int main(int argc, char** argv)
{
CAssignment2 myAssignment;
auto success = myAssignment.EnterMainLoop(argc, argv);
#ifdef _MSC_VER
cout<<"Press any key..."<<endl;
cin.get();
#endif
return success ? 0 : 1;
}
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CAssignmentBase.h"
#include "CLUtil.h"
#include "CTimer.h"
#include <vector>
#include <iostream>
using namespace std;
#if defined (__APPLE__) || defined(MACOSX)
#define GL_SHARING_EXTENSION "cl_APPLE_gl_sharing"
#else
#define GL_SHARING_EXTENSION "cl_khr_gl_sharing"
#endif
// required for OpenGL interop
#ifdef _WIN32
#include <windows.h>
#endif
#ifdef linux
#if defined (__APPLE__) || defined(MACOSX)
#include <OpenGL/OpenGL.h>
#else
#include <GL/glx.h>
#endif
#endif
///////////////////////////////////////////////////////////////////////////////
// CAssignmentBase
CAssignmentBase::CAssignmentBase()
: m_CLPlatform(nullptr), m_CLDevice(nullptr), m_CLContext(nullptr), m_CLCommandQueue(nullptr)
{
}
CAssignmentBase::~CAssignmentBase()
{
ReleaseCLContext();
}
bool CAssignmentBase::EnterMainLoop(int, char**)
{
if(!InitCLContext())
return false;
bool success = DoCompute();
ReleaseCLContext();
return success;
}
#define PRINT_INFO(title, buffer, bufferSize, maxBufferSize, expr) { expr; buffer[bufferSize] = '\0'; std::cout << title << ": " << buffer << std::endl; }
bool CAssignmentBase::InitCLContext()
{
//////////////////////////////////////////////////////
//(Sect 4.3)
// 1. get all platform IDs
std::vector<cl_platform_id> platformIds;
const cl_uint c_MaxPlatforms = 16;
platformIds.resize(c_MaxPlatforms);
cl_uint countPlatforms;
V_RETURN_FALSE_CL(clGetPlatformIDs(c_MaxPlatforms, &platformIds[0], &countPlatforms), "Failed to get CL platform ID");
platformIds.resize(countPlatforms);
// 2. find all available GPU devices
std::vector<cl_device_id> deviceIds;
const int maxDevices = 16;
deviceIds.resize(maxDevices);
int countAllDevices = 0;
// Searching for the graphics device with the most dedicated video memory.
cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
cl_ulong maxGlobalMemorySize = 0;
cl_device_id bestDeviceId = NULL;
for (size_t i = 0; i < platformIds.size(); i++)
{
// Getting the available devices.
cl_uint countDevices;
auto res = clGetDeviceIDs(platformIds[i], deviceType, 1, &deviceIds[countAllDevices], &countDevices);
if(res != CL_SUCCESS) // Maybe there are no GPU devices, and some poor implementation doesn't set countDevices to zero and return CL_DEVICE_NOT_FOUND.
{
char buffer[1024];
clGetPlatformInfo(platformIds[i], CL_PLATFORM_NAME, 1024, buffer, nullptr);
printf("[WARNING]: clGetDeviceIDs() failed. Error type: %s, Platform name: %s!\n",
CLUtil::GetCLErrorString(res), buffer);
continue;
}
for (size_t j = 0; j < countDevices; j++)
{
cl_device_id currentDeviceId = deviceIds[countAllDevices + j];
cl_ulong globalMemorySize;
cl_bool isUsingUnifiedMemory;
clGetDeviceInfo(currentDeviceId, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &globalMemorySize, NULL);
clGetDeviceInfo(currentDeviceId, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), &isUsingUnifiedMemory, NULL);
if (!isUsingUnifiedMemory && globalMemorySize > maxGlobalMemorySize)
{
bestDeviceId = currentDeviceId;
maxGlobalMemorySize = globalMemorySize;
}
}
countAllDevices += countDevices;
}
deviceIds.resize(countAllDevices);
if (countAllDevices == 0)
{
std::cout << "No device of the selected type with OpenCL support was found.";
return false;
}
// No discrete graphics device was found: falling back to the first found device.
if (bestDeviceId == NULL)
{
bestDeviceId = deviceIds[0];
}
// Choosing the first available device.
m_CLDevice = bestDeviceId;
clGetDeviceInfo(m_CLDevice, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &m_CLPlatform, NULL);
// Printing platform and device data.
const int maxBufferSize = 1024;
char buffer[maxBufferSize];
size_t bufferSize;
std::cout << "OpenCL platform:" << std::endl << std::endl;
PRINT_INFO("Name", buffer, bufferSize, maxBufferSize, clGetPlatformInfo(m_CLPlatform, CL_PLATFORM_NAME, maxBufferSize, (void*)buffer, &bufferSize));
PRINT_INFO("Vendor", buffer, bufferSize, maxBufferSize, clGetPlatformInfo(m_CLPlatform, CL_PLATFORM_VENDOR, maxBufferSize, (void*)buffer, &bufferSize));
PRINT_INFO("Version", buffer, bufferSize, maxBufferSize, clGetPlatformInfo(m_CLPlatform, CL_PLATFORM_VERSION, maxBufferSize, (void*)buffer, &bufferSize));
PRINT_INFO("Profile", buffer, bufferSize, maxBufferSize, clGetPlatformInfo(m_CLPlatform, CL_PLATFORM_PROFILE, maxBufferSize, (void*)buffer, &bufferSize));
std::cout << std::endl << "Device:" << std::endl << std::endl;
PRINT_INFO("Name", buffer, bufferSize, maxBufferSize, clGetDeviceInfo(m_CLDevice, CL_DEVICE_NAME, maxBufferSize, (void*)buffer, &bufferSize));
PRINT_INFO("Vendor", buffer, bufferSize, maxBufferSize, clGetDeviceInfo(m_CLDevice, CL_DEVICE_VENDOR, maxBufferSize, (void*)buffer, &bufferSize));
PRINT_INFO("Driver version", buffer, bufferSize, maxBufferSize, clGetDeviceInfo(m_CLDevice, CL_DRIVER_VERSION, maxBufferSize, (void*)buffer, &bufferSize));
cl_ulong localMemorySize;
clGetDeviceInfo(m_CLDevice, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &localMemorySize, &bufferSize);
std::cout << "Local memory size: " << localMemorySize << " Byte" << std::endl;
std::cout << std::endl << "******************************" << std::endl << std::endl;
cl_int clError;
m_CLContext = clCreateContext(NULL, 1, &m_CLDevice, NULL, NULL, &clError);
V_RETURN_FALSE_CL(clError, "Failed to create OpenCL context.");
// Finally, create a command queue. All the asynchronous commands to the device will be issued
// from the CPU into this queue. This way the host program can continue the execution until some results
// from that device are needed.
m_CLCommandQueue = clCreateCommandQueue(m_CLContext, m_CLDevice, 0, &clError);
V_RETURN_FALSE_CL(clError, "Failed to create the command queue in the context");
return true;
}
void CAssignmentBase::ReleaseCLContext()
{
if (m_CLCommandQueue != nullptr)
{
clReleaseCommandQueue(m_CLCommandQueue);
m_CLCommandQueue = nullptr;
}
if (m_CLContext != nullptr)
{
clReleaseContext(m_CLContext);
m_CLContext = nullptr;
}
}
bool CAssignmentBase::RunComputeTask(IComputeTask& Task, size_t LocalWorkSize[3])
{
if(m_CLContext == nullptr)
{
std::cerr<<"Error: RunComputeTask() cannot execute because the OpenCL context has not been created first."<<endl;
}
if(!Task.InitResources(m_CLDevice, m_CLContext))
{
std::cerr << "Error during resource allocation. Aborting execution." <<endl;
Task.ReleaseResources();
return false;
}
// Compute the golden result.
cout << "Computing CPU reference result...";
Task.ComputeCPU();
cout << "DONE" << endl;
// Running the same task on the GPU.
cout << "Computing GPU result...";
// Running the kernel N times makes the measurement of the execution time more accurate.
Task.ComputeGPU(m_CLContext, m_CLCommandQueue, LocalWorkSize);
cout << "DONE" << endl;
// Validating results.
if (Task.ValidateResults())
{
cout << "GOLD TEST PASSED!" << endl;
}
else
{
cout << "INVALID RESULTS!" << endl;
}
// Cleaning up.
Task.ReleaseResources();
return true;
}
///////////////////////////////////////////////////////////////////////////////
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _CASSIGNMENT_BASE_H
#define _CASSIGNMENT_BASE_H
#include "IComputeTask.h"
#include "CommonDefs.h"
//! Base class for all assignments
/*!
Inherit a new class for each specific assignment.
This class is abstract.
Usage of class: from your main CPP you typically call
EnterMainLoop(). This returns when the assignment is finished.
Internally the assignment class should initialize the context,
run one or more compute tasks and then release the context.
*/
class CAssignmentBase
{
public:
CAssignmentBase();
virtual ~CAssignmentBase();
//! Main loop. You only need to overload this if you do some rendering in your assignment.
virtual bool EnterMainLoop(int argc, char** argv);
//! You need to overload this to define a specific behavior for your assignments
virtual bool DoCompute() = 0;
protected:
virtual bool InitCLContext();
virtual void ReleaseCLContext();
virtual bool RunComputeTask(IComputeTask& Task, size_t LocalWorkSize[3]);
cl_platform_id m_CLPlatform;
cl_device_id m_CLDevice;
cl_context m_CLContext;
cl_command_queue m_CLCommandQueue;
};
#endif // _CASSIGNMENT_BASE_H
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CLUtil.h"
#include "CTimer.h"
#include <iostream>
#include <fstream>
using namespace std;
///////////////////////////////////////////////////////////////////////////////
// CLUtil
size_t CLUtil::GetGlobalWorkSize(size_t DataElemCount, size_t LocalWorkSize)
{
size_t r = DataElemCount % LocalWorkSize;
if(r == 0)
return DataElemCount;
else
return DataElemCount + LocalWorkSize - r;
}
bool CLUtil::LoadProgramSourceToMemory(const std::string& Path, std::string& SourceCode)
{
ifstream sourceFile;
sourceFile.open(Path.c_str());
if (!sourceFile.is_open())
{
cerr << "Failed to open file '" << Path << "'." << endl;
return false;
}
// read the entire file into a string
sourceFile.seekg(0, ios::end);
ifstream::pos_type fileSize = sourceFile.tellg();
sourceFile.seekg(0, ios::beg);
SourceCode.clear();
SourceCode.resize((size_t)fileSize);
sourceFile.read(&SourceCode[0], fileSize);
return true;
}
cl_program CLUtil::BuildCLProgramFromMemory(cl_device_id Device, cl_context Context, const std::string& SourceCode, const std::string& CompileOptions)
{
// Ignore the last parameter CompileOptions in assignment 1
// This may be used later to pass flags and macro definitions to the OpenCL compiler
cl_program prog = nullptr;
string srcSolution = SourceCode;
const char* src = srcSolution.c_str();
size_t length = srcSolution.size();
cl_int clError;
prog = clCreateProgramWithSource(Context, 1, &src, &length, &clError);
if(CL_SUCCESS != clError)
{
cerr<<"Failed to create CL program from source.";
return nullptr;
}
// program created, now build it:
const char* pCompileOptions = CompileOptions.size() > 0 ? CompileOptions.c_str() : nullptr;
clError = clBuildProgram(prog, 1, &Device, pCompileOptions, NULL, NULL);
PrintBuildLog(prog, Device);
if(CL_SUCCESS != clError)
{
cerr<<"Failed to build CL program.";
SAFE_RELEASE_PROGRAM(prog);
return nullptr;
}
return prog;
}
void CLUtil::PrintBuildLog(cl_program Program, cl_device_id Device)
{
cl_build_status buildStatus;
clGetProgramBuildInfo(Program, Device, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &buildStatus, NULL);
// let's print out possible warnings even if the kernel compiled..
//if(buildStatus == CL_SUCCESS)
// return;
//there were some errors.
size_t logSize;
clGetProgramBuildInfo(Program, Device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
string buildLog(logSize, ' ');
clGetProgramBuildInfo(Program, Device, CL_PROGRAM_BUILD_LOG, logSize, &buildLog[0], NULL);
buildLog[logSize] = '\0';
if(buildStatus != CL_SUCCESS)
cout<<"There were build errors!"<<endl;
cout<<"Build log:"<<endl;
cout<<buildLog<<endl;
}
double CLUtil::ProfileKernel(cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint Dimensions,
const size_t* pGlobalWorkSize, const size_t* pLocalWorkSize, int NIterations)
{
CTimer timer;
cl_int clErr;
// wait until the command queue is empty...
// Should not be used in production code, but this synchronizes HOST and DEVICE
clErr = clFinish(CommandQueue);
timer.Start();
// run the kernel N times for better average accuracy
for(int i = 0; i < NIterations; i++)
{
clErr |= clEnqueueNDRangeKernel(CommandQueue, Kernel, Dimensions, NULL, pGlobalWorkSize, pLocalWorkSize, 0, NULL, NULL);
}
// wait again to sync
clErr |= clFinish(CommandQueue);
timer.Stop();
if(clErr != CL_SUCCESS)
{
string errorString = GetCLErrorString(clErr);
cerr<<"Kernel execution failure: "<<errorString<<endl;
}
return timer.GetElapsedMilliseconds() / double(NIterations);
}
#define CL_ERROR(x) case (x): return #x;
const char* CLUtil::GetCLErrorString(cl_int CLErrorCode)
{
switch(CLErrorCode)
{
CL_ERROR(CL_SUCCESS);
CL_ERROR(CL_DEVICE_NOT_FOUND);
CL_ERROR(CL_DEVICE_NOT_AVAILABLE);
CL_ERROR(CL_COMPILER_NOT_AVAILABLE);
CL_ERROR(CL_MEM_OBJECT_ALLOCATION_FAILURE);
CL_ERROR(CL_OUT_OF_RESOURCES);
CL_ERROR(CL_OUT_OF_HOST_MEMORY);
CL_ERROR(CL_PROFILING_INFO_NOT_AVAILABLE);
CL_ERROR(CL_MEM_COPY_OVERLAP);
CL_ERROR(CL_IMAGE_FORMAT_MISMATCH);
CL_ERROR(CL_IMAGE_FORMAT_NOT_SUPPORTED);
CL_ERROR(CL_BUILD_PROGRAM_FAILURE);
CL_ERROR(CL_MAP_FAILURE);
CL_ERROR(CL_INVALID_VALUE);
CL_ERROR(CL_INVALID_DEVICE_TYPE);
CL_ERROR(CL_INVALID_PLATFORM);
CL_ERROR(CL_INVALID_DEVICE);
CL_ERROR(CL_INVALID_CONTEXT);
CL_ERROR(CL_INVALID_QUEUE_PROPERTIES);
CL_ERROR(CL_INVALID_COMMAND_QUEUE);
CL_ERROR(CL_INVALID_HOST_PTR);
CL_ERROR(CL_INVALID_MEM_OBJECT);
CL_ERROR(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
CL_ERROR(CL_INVALID_IMAGE_SIZE);
CL_ERROR(CL_INVALID_SAMPLER);
CL_ERROR(CL_INVALID_BINARY);
CL_ERROR(CL_INVALID_BUILD_OPTIONS);
CL_ERROR(CL_INVALID_PROGRAM);
CL_ERROR(CL_INVALID_PROGRAM_EXECUTABLE);
CL_ERROR(CL_INVALID_KERNEL_NAME);
CL_ERROR(CL_INVALID_KERNEL_DEFINITION);
CL_ERROR(CL_INVALID_KERNEL);
CL_ERROR(CL_INVALID_ARG_INDEX);
CL_ERROR(CL_INVALID_ARG_VALUE);
CL_ERROR(CL_INVALID_ARG_SIZE);
CL_ERROR(CL_INVALID_KERNEL_ARGS);
CL_ERROR(CL_INVALID_WORK_DIMENSION);
CL_ERROR(CL_INVALID_WORK_GROUP_SIZE);
CL_ERROR(CL_INVALID_WORK_ITEM_SIZE);
CL_ERROR(CL_INVALID_GLOBAL_OFFSET);
CL_ERROR(CL_INVALID_EVENT_WAIT_LIST);
CL_ERROR(CL_INVALID_EVENT);
CL_ERROR(CL_INVALID_OPERATION);
CL_ERROR(CL_INVALID_GL_OBJECT);
CL_ERROR(CL_INVALID_BUFFER_SIZE);
CL_ERROR(CL_INVALID_MIP_LEVEL);
default:
return "Unknown error code";
}
}
///////////////////////////////////////////////////////////////////////////////
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef CL_UTIL_H
#define CL_UTIL_H
// All OpenCL headers
#if defined(WIN32)
#include <CL/opencl.h>
#elif defined (__APPLE__) || defined(MACOSX)
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#include "CommonDefs.h"
#include <string>
#include <iostream>
#include <algorithm>
//! Utility class for frequently-needed OpenCL tasks
// TO DO: replace this with a nicer OpenCL wrapper
class CLUtil
{
public:
//! Determines the OpenCL global work size given the number of data elements and threads per workgroup
static size_t GetGlobalWorkSize(size_t DataElemCount, size_t LocalWorkSize);
//! Loads a program source to memory as a string
static bool LoadProgramSourceToMemory(const std::string& Path, std::string& SourceCode);
//! Builds a CL program
static cl_program BuildCLProgramFromMemory(cl_device_id Device, cl_context Context, const std::string& SourceCode, const std::string& CompileOptions = "");
static void PrintBuildLog(cl_program Program, cl_device_id Device);
//! Measures the execution time of a kernel by executing it N times and returning the average time in milliseconds.
/*!
The scheduling cost of the kernel can be amortized if we enqueue
the kernel multiple times. If your kernel is simple and fast, use a high number of iterations!
*/
static double ProfileKernel(cl_command_queue CommandQueue, cl_kernel Kernel, cl_uint Dimensions,
const size_t* pGlobalWorkSize, const size_t* pLocalWorkSize, int NIterations);
static const char* GetCLErrorString(cl_int CLErrorCode);
};
// Some useful shortcuts for handling pointers and validating function calls
#define V_RETURN_FALSE_CL(expr, errmsg) do {cl_int e=(expr);if(CL_SUCCESS!=e){std::cerr<<"Error: "<<errmsg<<" ["<<CLUtil::GetCLErrorString(e)<<"]"<<std::endl; return false; }} while(0)
#define V_RETURN_0_CL(expr, errmsg) do {cl_int e=(expr);if(CL_SUCCESS!=e){std::cerr<<"Error: "<<errmsg<<" ["<<CLUtil::GetCLErrorString(e)<<"]"<<std::endl; return 0; }} while(0)
#define V_RETURN_CL(expr, errmsg) do {cl_int e=(expr);if(CL_SUCCESS!=e){std::cerr<<"Error: "<<errmsg<<" ["<<CLUtil::GetCLErrorString(e)<<"]"<<std::endl; return; }} while(0)
#define SAFE_DELETE(ptr) do {if(ptr){ delete ptr; ptr = NULL; }} while(0)
#define SAFE_DELETE_ARRAY(x) do {if(x){delete [] x; x = NULL;}} while(0)
#define SAFE_RELEASE_KERNEL(ptr) do {if(ptr){ clReleaseKernel(ptr); ptr = NULL; }} while(0)
#define SAFE_RELEASE_PROGRAM(ptr) do {if(ptr){ clReleaseProgram(ptr); ptr = NULL; }} while(0)
#define SAFE_RELEASE_MEMOBJECT(ptr) do {if(ptr){ clReleaseMemObject(ptr); ptr = NULL; }} while(0)
#define SAFE_RELEASE_SAMPLER(ptr) do {if(ptr){ clReleaseSampler(ptr); ptr = NULL; }} while(0)
#define ARRAYLEN(a) (sizeof(a)/sizeof(a[0]))
#endif // CL_UTIL_H
FILE(GLOB CommonSources *.cpp)
FILE(GLOB CommonHeaders *.h)
add_library(GPUCommon
${CommonSources}
${CommonHeaders}
)
\ No newline at end of file
/******************************************************************************
GPU Computing / GPGPU Praktikum source code.
******************************************************************************/
#include "CTimer.h"
///////////////////////////////////////////////////////////////////////////////
// CTimer
void CTimer::Start()
{
#ifdef _WIN32
QueryPerformanceCounter(&m_StartTime);
#else
gettimeofday(&m_StartTime, NULL);
#endif
}
void CTimer::Stop()
{
#ifdef _WIN32
QueryPerformanceCounter(&m_EndTime);
#else
gettimeofday(&m_EndTime, NULL);
#endif
}
double CTimer::GetElapsedMilliseconds()
{
#ifdef _WIN32
LARGE_INTEGER freq;
if(QueryPerformanceFrequency(&freq))
{
return 1000.0 * double(m_EndTime.QuadPart - m_StartTime.QuadPart) / double(freq.QuadPart);
}
else
{
return -1;
}
#else
double delta = ((double)m_EndTime.tv_sec + 1.0e-6 * (double)m_EndTime.tv_usec) -
((double)m_StartTime.tv_sec + 1.0e-6 * (double)m_StartTime.tv_usec);
return 1000.0 * delta;
#endif
}
///////////////////////////////////////////////////////////////////////////////
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _CTIMER_H
#define _CTIMER_H
//Simple wrapper class that can be used to measure time intervals
//using the built-in precision timer of the OS
// We reverted from std::chrono, because that timer implementation seems to be very imprecise
// (at least under Windows)
#ifdef _WIN32
#include <Windows.h>
#elif defined (__APPLE__) || defined(MACOSX)
#include <sys/time.h>
#else
#include <sys/time.h>
#include <time.h>
#endif
//! Simple wrapper class for the measurement of time intervals
/*!
Use this timer to measure elapsed time on the HOST side.
Not suitable for measuring the execution of DEVICE code
without synchronization with the HOST.
NOTE: This class is not thread-safe (like most other classes in these
examples), but we are not doing CPU multithreading in the praktikum...
*/
class CTimer
{
public:
CTimer(){};
~CTimer(){};
void Start();
void Stop();
//! Returns the elapsed time between Start() and Stop() in ms.
double GetElapsedMilliseconds();
protected:
#ifdef WIN32
LARGE_INTEGER m_StartTime;
LARGE_INTEGER m_EndTime;
#else
struct timeval m_StartTime;
struct timeval m_EndTime;
#endif
};
#endif // _CTIMER_H
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _ICOMPUTE_TASK_H
#define _ICOMPUTE_TASK_H
// All OpenCL headers
#if defined(WIN32)
#include <CL/opencl.h>
#elif defined (__APPLE__) || defined(MACOSX)
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#include "CommonDefs.h"
//! Common interface for the tasks within the assignment.
/*!
Inherit a new class for each computing task.
(There are usually more tasks in each assignment).
*/
class IComputeTask
{
public:
virtual ~IComputeTask() {};
//! Init any resources specific to the current task
virtual bool InitResources(cl_device_id Device, cl_context Context) = 0;
//! Release everything allocated in InitResources()
virtual void ReleaseResources() = 0;
//! Perform calculations on the GPU
virtual void ComputeGPU(cl_context Context, cl_command_queue CommandQueue, size_t LocalWorkSize[3]) = 0;
//! Compute the "golden" solution on the CPU. The GPU results must be equal to this reference
virtual void ComputeCPU() = 0;
//! Compare the GPU solution to the "golden" solution
virtual bool ValidateResults() = 0;
};
#endif // _ICOMPUTE_TASK_H
/******************************************************************************
.88888. 888888ba dP dP
d8' `88 88 `8b 88 88
88 a88aaaa8P' 88 88
88 YP88 88 88 88
Y8. .88 88 Y8. .8P
`88888' dP `Y88888P'
a88888b. dP oo
d8' `88 88
88 .d8888b. 88d8b.d8b. 88d888b. dP dP d8888P dP 88d888b. .d8888b.
88 88' `88 88'`88'`88 88' `88 88 88 88 88 88' `88 88' `88
Y8. .88 88. .88 88 88 88 88. .88 88. .88 88 88 88 88 88. .88
Y88888P' `88888P' dP dP dP 88Y888P' `88888P' dP dP dP dP `8888P88
88 .88
dP d8888P
******************************************************************************/
#ifndef _IGUI_ENABLED_COMPUTE_TASK_H
#define _IGUI_ENABLED_COMPUTE_TASK_H
#include "IComputeTask.h"
//! Common interface for tasks that have an OpenGL UI
/*!
Currently we only use this interface in Assignment4
to perform GL rendering and respond to user input with keyboard and mouse.
*/
class IGUIEnabledComputeTask : public IComputeTask
{
public:
virtual ~IGUIEnabledComputeTask() {};
// OpenGL render callback
virtual void Render() = 0;
virtual void OnKeyboard(int Key, int Action) = 0;
virtual void OnMouse(int Button, int Action) = 0;
virtual void OnMouseMove(int X, int Y) = 0;
virtual void OnIdle(double Time, float ElapsedTime) = 0;
virtual void OnWindowResized(int Width, int Height) = 0;
};
#endif // _IGUI_ENABLED_COMPUTE_TASK_H
<?xml version="1.0" encoding="UTF-8"?>
<Project ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<CustomBuild Include="D:\Projekte\GPGPU\Assignment2\Assignment2\CMakeLists.txt" />
</ItemGroup>
<ItemGroup>
</ItemGroup>
</Project>
<?xml version="1.0" encoding="UTF-8"?>
<Project ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="D:\Projekte\GPGPU\Assignment2\Assignment2\CAssignment2.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment2\Assignment2\CReductionTask.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment2\Assignment2\CScanTask.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment2\Assignment2\main.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="D:\Projekte\GPGPU\Assignment2\Assignment2\CAssignment2.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment2\Assignment2\CReductionTask.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment2\Assignment2\CScanTask.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="D:\Projekte\GPGPU\Assignment2\Assignment2\CMakeLists.txt" />
</ItemGroup>
<ItemGroup>
<None Include="D:\Projekte\GPGPU\Assignment2\Assignment2\Reduction.cl" />
<None Include="D:\Projekte\GPGPU\Assignment2\Assignment2\Scan.cl" />
</ItemGroup>
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{F6E304CD-0084-3FF4-B85F-1EAEDA42C629}</UniqueIdentifier>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{B2679C4E-9B86-304A-AA4C-74F70A0EAC52}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>
<?xml version="1.0" encoding="UTF-8"?>
<Project ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="D:\Projekte\GPGPU\Assignment2\Common\CAssignmentBase.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment2\Common\CLUtil.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="D:\Projekte\GPGPU\Assignment2\Common\CTimer.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="D:\Projekte\GPGPU\Assignment2\Common\CAssignmentBase.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment2\Common\CLUtil.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment2\Common\CTimer.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment2\Common\CommonDefs.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment2\Common\IComputeTask.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="D:\Projekte\GPGPU\Assignment2\Common\IGUIEnabledComputeTask.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="D:\Projekte\GPGPU\Assignment2\Common\CMakeLists.txt" />
</ItemGroup>
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{F6E304CD-0084-3FF4-B85F-1EAEDA42C629}</UniqueIdentifier>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{B2679C4E-9B86-304A-AA4C-74F70A0EAC52}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 14
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ALL_BUILD", "ALL_BUILD.vcxproj", "{8CE65BA2-73EA-3BC7-8DD4-C4CF32B6EF3F}"
ProjectSection(ProjectDependencies) = postProject
{17879C2F-1E39-30B7-AD1F-1D74A7912DEE} = {17879C2F-1E39-30B7-AD1F-1D74A7912DEE}
{7ACA3E34-73C3-3EB4-A2A4-0DA32162BA5F} = {7ACA3E34-73C3-3EB4-A2A4-0DA32162BA5F}
{B3F98345-39AE-339D-9A68-0AFAC3D8FE84} = {B3F98345-39AE-339D-9A68-0AFAC3D8FE84}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Assignment", "Assignment.vcxproj", "{17879C2F-1E39-30B7-AD1F-1D74A7912DEE}"
ProjectSection(ProjectDependencies) = postProject
{7ACA3E34-73C3-3EB4-A2A4-0DA32162BA5F} = {7ACA3E34-73C3-3EB4-A2A4-0DA32162BA5F}
{B3F98345-39AE-339D-9A68-0AFAC3D8FE84} = {B3F98345-39AE-339D-9A68-0AFAC3D8FE84}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "GPUCommon", "Common\GPUCommon.vcxproj", "{7ACA3E34-73C3-3EB4-A2A4-0DA32162BA5F}"
ProjectSection(ProjectDependencies) = postProject
{B3F98345-39AE-339D-9A68-0AFAC3D8FE84} = {B3F98345-39AE-339D-9A68-0AFAC3D8FE84}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ZERO_CHECK", "ZERO_CHECK.vcxproj", "{B3F98345-39AE-339D-9A68-0AFAC3D8FE84}"
ProjectSection(ProjectDependencies) = postProject
EndProjectSection
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
MinSizeRel|x64 = MinSizeRel|x64
RelWithDebInfo|x64 = RelWithDebInfo|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{8CE65BA2-73EA-3BC7-8DD4-C4CF32B6EF3F}.Debug|x64.ActiveCfg = Debug|x64
{8CE65BA2-73EA-3BC7-8DD4-C4CF32B6EF3F}.Release|x64.ActiveCfg = Release|x64
{8CE65BA2-73EA-3BC7-8DD4-C4CF32B6EF3F}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{8CE65BA2-73EA-3BC7-8DD4-C4CF32B6EF3F}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{17879C2F-1E39-30B7-AD1F-1D74A7912DEE}.Debug|x64.ActiveCfg = Debug|x64
{17879C2F-1E39-30B7-AD1F-1D74A7912DEE}.Debug|x64.Build.0 = Debug|x64
{17879C2F-1E39-30B7-AD1F-1D74A7912DEE}.Release|x64.ActiveCfg = Release|x64
{17879C2F-1E39-30B7-AD1F-1D74A7912DEE}.Release|x64.Build.0 = Release|x64
{17879C2F-1E39-30B7-AD1F-1D74A7912DEE}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{17879C2F-1E39-30B7-AD1F-1D74A7912DEE}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{17879C2F-1E39-30B7-AD1F-1D74A7912DEE}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{17879C2F-1E39-30B7-AD1F-1D74A7912DEE}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{7ACA3E34-73C3-3EB4-A2A4-0DA32162BA5F}.Debug|x64.ActiveCfg = Debug|x64
{7ACA3E34-73C3-3EB4-A2A4-0DA32162BA5F}.Debug|x64.Build.0 = Debug|x64
{7ACA3E34-73C3-3EB4-A2A4-0DA32162BA5F}.Release|x64.ActiveCfg = Release|x64
{7ACA3E34-73C3-3EB4-A2A4-0DA32162BA5F}.Release|x64.Build.0 = Release|x64
{7ACA3E34-73C3-3EB4-A2A4-0DA32162BA5F}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{7ACA3E34-73C3-3EB4-A2A4-0DA32162BA5F}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{7ACA3E34-73C3-3EB4-A2A4-0DA32162BA5F}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{7ACA3E34-73C3-3EB4-A2A4-0DA32162BA5F}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
{B3F98345-39AE-339D-9A68-0AFAC3D8FE84}.Debug|x64.ActiveCfg = Debug|x64
{B3F98345-39AE-339D-9A68-0AFAC3D8FE84}.Debug|x64.Build.0 = Debug|x64
{B3F98345-39AE-339D-9A68-0AFAC3D8FE84}.Release|x64.ActiveCfg = Release|x64
{B3F98345-39AE-339D-9A68-0AFAC3D8FE84}.Release|x64.Build.0 = Release|x64
{B3F98345-39AE-339D-9A68-0AFAC3D8FE84}.MinSizeRel|x64.ActiveCfg = MinSizeRel|x64
{B3F98345-39AE-339D-9A68-0AFAC3D8FE84}.MinSizeRel|x64.Build.0 = MinSizeRel|x64
{B3F98345-39AE-339D-9A68-0AFAC3D8FE84}.RelWithDebInfo|x64.ActiveCfg = RelWithDebInfo|x64
{B3F98345-39AE-339D-9A68-0AFAC3D8FE84}.RelWithDebInfo|x64.Build.0 = RelWithDebInfo|x64
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
EndGlobalSection
GlobalSection(ExtensibilityAddIns) = postSolution
EndGlobalSection
EndGlobal
<?xml version="1.0" encoding="UTF-8"?>
<Project ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<CustomBuild Include="D:\Projekte\GPGPU\Assignment2\buildVS15\CMakeFiles\348703b5db2c0151a605f81047f5b4d1\generate.stamp.rule">
<Filter>CMake Rules</Filter>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<Filter Include="CMake Rules">
<UniqueIdentifier>{D943AC8E-153A-31B2-98BF-B32B53A81E2C}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>
#if defined(__arm__) || defined(__TARGET_ARCH_ARM)
#if defined(__ARM_ARCH_7__) \
|| defined(__ARM_ARCH_7A__) \
|| defined(__ARM_ARCH_7R__) \
|| defined(__ARM_ARCH_7M__) \
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 7)
#error cmake_ARCH armv7
#elif defined(__ARM_ARCH_6__) \
|| defined(__ARM_ARCH_6J__) \
|| defined(__ARM_ARCH_6T2__) \
|| defined(__ARM_ARCH_6Z__) \
|| defined(__ARM_ARCH_6K__) \
|| defined(__ARM_ARCH_6ZK__) \
|| defined(__ARM_ARCH_6M__) \
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 6)
#error cmake_ARCH armv6
#elif defined(__ARM_ARCH_5TEJ__) \
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 5)
#error cmake_ARCH armv5
#else
#error cmake_ARCH arm
#endif
#elif defined(__i386) || defined(__i386__) || defined(_M_IX86)
#error cmake_ARCH i386
#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64)
#error cmake_ARCH x86_64
#elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
#error cmake_ARCH ia64
#elif defined(__ppc__) || defined(__ppc) || defined(__powerpc__) \
|| defined(_ARCH_COM) || defined(_ARCH_PWR) || defined(_ARCH_PPC) \
|| defined(_M_MPPC) || defined(_M_PPC)
#if defined(__ppc64__) || defined(__powerpc64__) || defined(__64BIT__)
#error cmake_ARCH ppc64
#else
#error cmake_ARCH ppc
#endif
#endif
#error cmake_ARCH unknown
function(change_workingdir EXE WorkingDir)
# add a user file to automatically configure the working directory for debugging
if (MSVC)
set(Platform "Win32")
if (CMAKE_CL_64)
set(Platform "x64")
endif (CMAKE_CL_64)
configure_file (
${CMAKE_SOURCE_DIR}/../cmake/WorkingDirectory.vcxproj.user.in
${CMAKE_CURRENT_BINARY_DIR}/${EXE}.vcxproj.user
@ONLY
)
endif()
endfunction()
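# Illustrative usage sketch (commented out; the target name "Assignment" and the
# source file name are assumptions, not taken from this commit). After defining an
# executable target, change_workingdir generates <target>.vcxproj.user from
# WorkingDirectory.vcxproj.user.in so the Visual Studio debugger starts in the
# given directory:
#
#   add_executable(Assignment main.cpp)
#   change_workingdir(Assignment "${CMAKE_SOURCE_DIR}")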
# - Try to find OpenCL
# This module tries to find an OpenCL implementation on your system. It supports
# the AMD / ATI, Apple and NVIDIA implementations, but other implementations should work, too.
#
# To set the paths manually, define these environment variables:
# OpenCL_INCPATH - Include path (e.g. OpenCL_INCPATH=/opt/cuda/4.0/cuda/include)
# OpenCL_LIBPATH - Library path (e.g. OpenCL_LIBPATH=/usr/lib64/nvidia)
#
# Once done this will define
# OPENCL_FOUND - system has OpenCL
# OPENCL_INCLUDE_DIRS - the OpenCL include directory
# OPENCL_LIBRARIES - link these to use OpenCL
#
# WIN32 should work, but is untested
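#
# Example usage (an illustrative, commented-out sketch; the target name
# "Assignment" is an assumption, not taken from this module):
#
#   find_package(OpenCL REQUIRED)
#   include_directories(${OPENCL_INCLUDE_DIRS})
#   target_link_libraries(Assignment ${OPENCL_LIBRARIES})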
FIND_PACKAGE(PackageHandleStandardArgs)
SET (OPENCL_VERSION_STRING "0.1.0")
SET (OPENCL_VERSION_MAJOR 0)
SET (OPENCL_VERSION_MINOR 1)
SET (OPENCL_VERSION_PATCH 0)
include(${CMAKE_SOURCE_DIR}/../cmake/TargetArch.cmake)
target_architecture(TARGET_ARCH)
IF (APPLE)
FIND_LIBRARY(OPENCL_LIBRARIES OpenCL DOC "OpenCL lib for OSX")
FIND_PATH(OPENCL_INCLUDE_DIRS OpenCL/cl.h DOC "Include for OpenCL on OSX")
FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS OpenCL/cl.hpp DOC "Include for OpenCL CPP bindings on OSX")
ELSE (APPLE)
IF (WIN32)
FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h PATH_SUFFIXES include PATHS ENV AMDAPPSDKROOT ENV CUDA_PATH ENV INTELOCLSDKROOT)
FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp PATH_SUFFIXES include PATHS ENV AMDAPPSDKROOT ENV CUDA_PATH ENV INTELOCLSDKROOT)
# The AMD SDK currently installs both x86 and x86_64 libraries
# This is only a hack to find out the architecture
# The same is true for CUDA SDK
IF( ${TARGET_ARCH} STREQUAL "x86_64" )
SET(OPENCL_AMD_LIB_DIR "$ENV{ATISTREAMSDKROOT}/lib/x86_64")
SET(OPENCL_NVIDIA_LIB_DIR "$ENV{CUDA_PATH}/lib/x64")
message(STATUS "Using 64bit libraries")
ELSE (${TARGET_ARCH} STREQUAL "x86_64")
SET(OPENCL_AMD_LIB_DIR "$ENV{ATISTREAMSDKROOT}/lib/x86")
SET(OPENCL_NVIDIA_LIB_DIR "$ENV{CUDA_PATH}/lib/Win32")
message(STATUS "Using 32bit libraries")
ENDIF( ${TARGET_ARCH} STREQUAL "x86_64" )
# Find library
FIND_LIBRARY(OPENCL_LIBRARIES OpenCL.lib PATHS ${OPENCL_AMD_LIB_DIR} ${OPENCL_NVIDIA_LIB_DIR} ENV OpenCL_LIBPATH )
# Derive the library directory from the found library first; OPENCL_LIB_DIR is not set yet on this branch
GET_FILENAME_COMPONENT(OPENCL_LIB_DIR ${OPENCL_LIBRARIES} PATH)
GET_FILENAME_COMPONENT(_OPENCL_INC_CAND ${OPENCL_LIB_DIR}/../../include ABSOLUTE)
# On Win32 search relative to the library
FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h PATHS "${_OPENCL_INC_CAND}" ENV OpenCL_INCPATH)
FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp PATHS "${_OPENCL_INC_CAND}" ENV OpenCL_INCPATH)
ELSE (WIN32)
# Unix style platforms
IF( ${TARGET_ARCH} STREQUAL "x86_64" )
SET(OPENCL_NVIDIA_LIB_DIR /usr/local/cuda/lib64 /usr/lib64/nvidia-304xx )
SET(OPENCL_NVIDIA_ATIS_LIB_DIR /usr/lib64/nvidia)
message(STATUS "Using 64bit libraries")
ELSE (${TARGET_ARCH} STREQUAL "x86_64")
SET(OPENCL_NVIDIA_LIB_DIR /usr/local/cuda/lib)
SET(OPENCL_NVIDIA_ATIS_LIB_DIR /usr/lib/nvidia)
message(STATUS "Using 32bit libraries")
ENDIF( ${TARGET_ARCH} STREQUAL "x86_64" )
FIND_LIBRARY(OPENCL_LIBRARIES OpenCL
PATHS ENV LD_LIBRARY_PATH ENV OpenCL_LIBPATH ${OPENCL_NVIDIA_LIB_DIR} ${OPENCL_NVIDIA_ATIS_LIB_DIR}
)
# Alternatives (for ATIS pool)
FIND_LIBRARY(OPENCL_LIBRARIES libOpenCL.so.1
PATHS ENV LD_LIBRARY_PATH ENV OpenCL_LIBPATH ${OPENCL_NVIDIA_LIB_DIR} ${OPENCL_NVIDIA_ATIS_LIB_DIR}
)
GET_FILENAME_COMPONENT(OPENCL_LIB_DIR ${OPENCL_LIBRARIES} PATH)
GET_FILENAME_COMPONENT(_OPENCL_INC_CAND ${OPENCL_LIB_DIR}/../../include ABSOLUTE)
FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h PATH_SUFFIXES include PATHS ${_OPENCL_INC_CAND} /usr/ /usr/local/cuda/ /opt/AMDAPP/ /opt/cuda-5.0/ ENV OpenCL_INCPATH ENV INCLUDE)
FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp PATH_SUFFIXES include PATHS ${_OPENCL_INC_CAND} /usr/ /usr/local/cuda /opt/AMDAPP ENV OpenCL_INCPATH)
ENDIF (WIN32)
ENDIF (APPLE)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(OpenCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS)
IF(_OPENCL_CPP_INCLUDE_DIRS)
SET( OPENCL_HAS_CPP_BINDINGS TRUE )
LIST( APPEND OPENCL_INCLUDE_DIRS ${_OPENCL_CPP_INCLUDE_DIRS} )
# This is often the same, so clean up
LIST( REMOVE_DUPLICATES OPENCL_INCLUDE_DIRS )
ENDIF(_OPENCL_CPP_INCLUDE_DIRS)
MARK_AS_ADVANCED(
OPENCL_INCLUDE_DIRS
)
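# Illustrative follow-up sketch (commented out): callers can branch on the
# variables this module defines, e.g. to report whether the C++ bindings were
# found alongside the C headers.
#
#   if(OPENCL_FOUND AND OPENCL_HAS_CPP_BINDINGS)
#     message(STATUS "OpenCL found with C++ bindings in ${OPENCL_INCLUDE_DIRS}")
#   elseif(OPENCL_FOUND)
#     message(STATUS "OpenCL found (C headers only)")
#   endif()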
# Based on the Qt 5 processor detection code, so should be very accurate
# https://qt.gitorious.org/qt/qtbase/blobs/master/src/corelib/global/qprocessordetection.h
# Currently handles arm (v5, v6, v7), x86 (32/64), ia64, and ppc (32/64)
# Regarding POWER/PowerPC, just as is noted in the Qt source,
# "There are many more known variants/revisions that we do not handle/detect."
set(archdetect_c_code "
#if defined(__arm__) || defined(__TARGET_ARCH_ARM)
#if defined(__ARM_ARCH_7__) \\
|| defined(__ARM_ARCH_7A__) \\
|| defined(__ARM_ARCH_7R__) \\
|| defined(__ARM_ARCH_7M__) \\
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 7)
#error cmake_ARCH armv7
#elif defined(__ARM_ARCH_6__) \\
|| defined(__ARM_ARCH_6J__) \\
|| defined(__ARM_ARCH_6T2__) \\
|| defined(__ARM_ARCH_6Z__) \\
|| defined(__ARM_ARCH_6K__) \\
|| defined(__ARM_ARCH_6ZK__) \\
|| defined(__ARM_ARCH_6M__) \\
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 6)
#error cmake_ARCH armv6
#elif defined(__ARM_ARCH_5TEJ__) \\
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 5)
#error cmake_ARCH armv5
#else
#error cmake_ARCH arm
#endif
#elif defined(__i386) || defined(__i386__) || defined(_M_IX86)
#error cmake_ARCH i386
#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64)
#error cmake_ARCH x86_64
#elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
#error cmake_ARCH ia64
#elif defined(__ppc__) || defined(__ppc) || defined(__powerpc__) \\
|| defined(_ARCH_COM) || defined(_ARCH_PWR) || defined(_ARCH_PPC) \\
|| defined(_M_MPPC) || defined(_M_PPC)
#if defined(__ppc64__) || defined(__powerpc64__) || defined(__64BIT__)
#error cmake_ARCH ppc64
#else
#error cmake_ARCH ppc
#endif
#endif
#error cmake_ARCH unknown
")
# Set ppc_support to TRUE before including this file or ppc and ppc64
# will be treated as invalid architectures since they are no longer supported by Apple
function(target_architecture output_var)
if(APPLE AND CMAKE_OSX_ARCHITECTURES)
# On OS X we use CMAKE_OSX_ARCHITECTURES *if* it was set
# First let's normalize the order of the values
# Note that it's not possible to compile PowerPC applications if you are using
# the OS X SDK version 10.6 or later - you'll need 10.4/10.5 for that, so we
# disable it by default
# See this page for more information:
# http://stackoverflow.com/questions/5333490/how-can-we-restore-ppc-ppc64-as-well-as-full-10-4-10-5-sdk-support-to-xcode-4
# Architecture defaults to i386 or ppc on OS X 10.5 and earlier, depending on the CPU type detected at runtime.
# On OS X 10.6+ the default is x86_64 if the CPU supports it, i386 otherwise.
foreach(osx_arch ${CMAKE_OSX_ARCHITECTURES})
if("${osx_arch}" STREQUAL "ppc" AND ppc_support)
set(osx_arch_ppc TRUE)
elseif("${osx_arch}" STREQUAL "i386")
set(osx_arch_i386 TRUE)
elseif("${osx_arch}" STREQUAL "x86_64")
set(osx_arch_x86_64 TRUE)
elseif("${osx_arch}" STREQUAL "ppc64" AND ppc_support)
set(osx_arch_ppc64 TRUE)
else()
message(FATAL_ERROR "Invalid OS X arch name: ${osx_arch}")
endif()
endforeach()
# Now add all the architectures in our normalized order
if(osx_arch_ppc)
list(APPEND ARCH ppc)
endif()
if(osx_arch_i386)
list(APPEND ARCH i386)
endif()
if(osx_arch_x86_64)
list(APPEND ARCH x86_64)
endif()
if(osx_arch_ppc64)
list(APPEND ARCH ppc64)
endif()
else()
file(WRITE "${CMAKE_BINARY_DIR}/arch.c" "${archdetect_c_code}")
enable_language(C)
# Detect the architecture in a rather creative way...
# This compiles a small C program which is a series of ifdefs that selects a
# particular #error preprocessor directive whose message string contains the
# target architecture. The program will always fail to compile (both because the
# file is not a valid C program, and obviously because of the presence of the
# #error preprocessor directives), but by exploiting the preprocessor in this
# way we can detect the correct target architecture even when cross-compiling,
# since the program itself never needs to be run (only the compiler/preprocessor is invoked).
try_run(
run_result_unused
compile_result_unused
"${CMAKE_BINARY_DIR}"
"${CMAKE_BINARY_DIR}/arch.c"
COMPILE_OUTPUT_VARIABLE ARCH
CMAKE_FLAGS CMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES}
)
# Parse the architecture name from the compiler output
string(REGEX MATCH "cmake_ARCH ([a-zA-Z0-9_]+)" ARCH "${ARCH}")
# Get rid of the value marker leaving just the architecture name
string(REPLACE "cmake_ARCH " "" ARCH "${ARCH}")
# If we are compiling with an unknown architecture this variable should
# already be set to "unknown" but in the case that it's empty (i.e. due
# to a typo in the code), then set it to unknown
if (NOT ARCH)
set(ARCH unknown)
endif()
endif()
set(${output_var} "${ARCH}" PARENT_SCOPE)
endfunction()
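# Illustrative usage sketch (commented out; it mirrors the call already made in
# FindOpenCL.cmake above). Set ppc_support before inclusion only if PowerPC
# architectures should be accepted on OS X:
#
#   # set(ppc_support TRUE)
#   include(${CMAKE_SOURCE_DIR}/../cmake/TargetArch.cmake)
#   target_architecture(TARGET_ARCH)
#   message(STATUS "Detected target architecture: ${TARGET_ARCH}")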
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|@Platform@'">
<LocalDebuggerWorkingDirectory>@WorkingDir@</LocalDebuggerWorkingDirectory>
<DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|@Platform@'">
<LocalDebuggerWorkingDirectory>@WorkingDir@</LocalDebuggerWorkingDirectory>
<DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='RelWithDebInfo|@Platform@'">
<LocalDebuggerWorkingDirectory>@WorkingDir@</LocalDebuggerWorkingDirectory>
<DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='MinSizeRel|@Platform@'">
<LocalDebuggerWorkingDirectory>@WorkingDir@</LocalDebuggerWorkingDirectory>
<DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
</PropertyGroup>
</Project>
\ No newline at end of file
File added