Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
G
GPGPU
Project
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
CI / CD
CI / CD
Pipelines
Schedules
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Commits
Open sidebar
Kai Westerkamp
GPGPU
Commits
0d7ab823
Commit
0d7ab823
authored
Nov 07, 2016
by
Kai Westerkamp
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
local
parent
5c5f4cb3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
122 additions
and
9 deletions
+122
-9
CAssignment2.cpp
Assignment2/Assignment2/CAssignment2.cpp
+8
-4
CScanTask.cpp
Assignment2/Assignment2/CScanTask.cpp
+46
-3
Scan.cl
Assignment2/Assignment2/Scan.cl
+68
-2
No files found.
Assignment2/Assignment2/CAssignment2.cpp
View file @
0d7ab823
...
...
@@ -22,8 +22,8 @@ bool CAssignment2::DoCompute()
cout
<<
"Running parallel reduction task..."
<<
endl
<<
endl
;
{
size_t
LocalWorkSize
[
3
]
=
{
256
,
1
,
1
};
CReductionTask
reduction
(
1024
*
1024
*
16
);
RunComputeTask
(
reduction
,
LocalWorkSize
);
//
CReductionTask reduction(1024 * 1024 * 16);
//
RunComputeTask(reduction, LocalWorkSize);
}
// Task 2: parallel prefix sum
...
...
@@ -31,8 +31,12 @@ bool CAssignment2::DoCompute()
cout
<<
"Running parallel prefix sum task..."
<<
endl
<<
endl
;
{
size_t
LocalWorkSize
[
3
]
=
{
256
,
1
,
1
};
//CScanTask scan(1024 * 1024 * 64, LocalWorkSize[0]);
//RunComputeTask(scan, LocalWorkSize);
CScanTask
scan
(
512
,
LocalWorkSize
[
0
]);
RunComputeTask
(
scan
,
LocalWorkSize
);
CScanTask
scan2
(
1024
*
1024
*
64
,
LocalWorkSize
[
0
]);
RunComputeTask
(
scan2
,
LocalWorkSize
);
}
...
...
Assignment2/Assignment2/CScanTask.cpp
View file @
0d7ab823
...
...
@@ -62,8 +62,9 @@ bool CScanTask::InitResources(cl_device_id Device, cl_context Context)
//fill the array with some values
for
(
unsigned
int
i
=
0
;
i
<
m_N
;
i
++
)
//m_hArray[i] = 1; // Use this for debugging
m_hArray
[
i
]
=
rand
()
&
15
;
m_hArray
[
i
]
=
1
;
// Use this for debugging
//m_hArray[i] = rand() & 15;
cout
<<
"Debug array"
<<
endl
;
//device resources
// ping-pong buffers
...
...
@@ -138,7 +139,7 @@ void CScanTask::ComputeGPU(cl_context Context, cl_command_queue CommandQueue, si
cout
<<
endl
;
TestPerformance
(
Context
,
CommandQueue
,
LocalWorkSize
,
0
);
TestPerformance
(
Context
,
CommandQueue
,
LocalWorkSize
,
1
);
//
TestPerformance(Context, CommandQueue, LocalWorkSize, 1);
cout
<<
endl
;
}
...
...
@@ -178,6 +179,27 @@ bool CScanTask::ValidateResults()
void
CScanTask
::
Scan_Naive
(
cl_context
Context
,
cl_command_queue
CommandQueue
,
size_t
LocalWorkSize
[
3
])
{
cl_int
clErr
;
size_t
globalWorkSize
[
1
];
size_t
localWorkSize
[
1
];
localWorkSize
[
0
]
=
LocalWorkSize
[
0
];
globalWorkSize
[
0
]
=
CLUtil
::
GetGlobalWorkSize
(
m_N
,
localWorkSize
[
0
]);
clErr
=
clSetKernelArg
(
m_ScanNaiveKernel
,
2
,
sizeof
(
cl_uint
),
(
void
*
)
&
m_N
);
V_RETURN_CL
(
clErr
,
"Error setting Kernel Arg 3"
);
for
(
unsigned
int
offset
=
1
;
offset
<=
m_N
;
offset
*=
2
)
{
clErr
=
clSetKernelArg
(
m_ScanNaiveKernel
,
0
,
sizeof
(
cl_mem
),
(
void
*
)
&
m_dPingArray
);
clErr
|=
clSetKernelArg
(
m_ScanNaiveKernel
,
1
,
sizeof
(
cl_mem
),
(
void
*
)
&
m_dPongArray
);
clErr
|=
clSetKernelArg
(
m_ScanNaiveKernel
,
3
,
sizeof
(
cl_uint
),
(
void
*
)
&
offset
);
V_RETURN_CL
(
clErr
,
"Error setting Kernel args"
);
clErr
=
clEnqueueNDRangeKernel
(
CommandQueue
,
m_ScanNaiveKernel
,
1
,
NULL
,
globalWorkSize
,
localWorkSize
,
0
,
NULL
,
NULL
);
V_RETURN_CL
(
clErr
,
"Error running Kernel"
);
swap
(
m_dPingArray
,
m_dPongArray
);
}
// TO DO: Implement naive version of scan
...
...
@@ -190,6 +212,23 @@ void CScanTask::Scan_Naive(cl_context Context, cl_command_queue CommandQueue, si
void
CScanTask
::
Scan_WorkEfficient
(
cl_context
Context
,
cl_command_queue
CommandQueue
,
size_t
LocalWorkSize
[
3
])
{
cl_int
clErr
;
size_t
globalWorkSize
[
1
];
size_t
localWorkSize
[
1
];
localWorkSize
[
0
]
=
LocalWorkSize
[
0
];
globalWorkSize
[
0
]
=
CLUtil
::
GetGlobalWorkSize
(
m_N
/
2
,
localWorkSize
[
0
]);
clErr
=
clSetKernelArg
(
m_ScanWorkEfficientKernel
,
0
,
sizeof
(
cl_mem
),
(
void
*
)
&
m_dLevelArrays
[
0
]);
clErr
|=
clSetKernelArg
(
m_ScanWorkEfficientKernel
,
1
,
sizeof
(
cl_mem
),
(
void
*
)
&
m_dLevelArrays
[
1
]);
clErr
|=
clSetKernelArg
(
m_ScanWorkEfficientKernel
,
2
,
localWorkSize
[
0
]
*
2
*
sizeof
(
cl_uint
),
NULL
);
V_RETURN_CL
(
clErr
,
"Error setting Kernel args3"
);
clErr
=
clEnqueueNDRangeKernel
(
CommandQueue
,
m_ScanWorkEfficientKernel
,
1
,
NULL
,
globalWorkSize
,
localWorkSize
,
0
,
NULL
,
NULL
);
V_RETURN_CL
(
clErr
,
"Error running Kernel"
);
swap
(
m_dPingArray
,
m_dPongArray
);
// TO DO: Implement efficient version of scan
...
...
@@ -215,6 +254,10 @@ void CScanTask::ValidateTask(cl_context Context, cl_command_queue CommandQueue,
// validate results
m_bValidationResults
[
Task
]
=
(
memcmp
(
m_hResultCPU
,
m_hResultGPU
,
m_N
*
sizeof
(
unsigned
int
))
==
0
);
//for (int j = 0; j < m_N; j++)
// printf("%2d, ", m_hResultGPU[j]);
//cout << endl;
}
void
CScanTask
::
TestPerformance
(
cl_context
Context
,
cl_command_queue
CommandQueue
,
size_t
LocalWorkSize
[
3
],
unsigned
int
Task
)
...
...
Assignment2/Assignment2/Scan.cl
View file @
0d7ab823
...
...
@@ -4,7 +4,18 @@
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// One pass of the naive parallel prefix sum (scan).
//
// Each work-item handles the element at its global index `left`:
//   - elements below `offset` have no partner yet and are copied through unchanged;
//   - the element `offset` positions to the right receives the sum of both inputs.
//
// Parameters:
//   inArray   input buffer for this pass (read only)
//   outArray  output buffer for this pass (must not alias inArray)
//   N         number of valid elements in both buffers
//   offset    distance between the two elements combined in this pass
//
// The host is expected to call this kernel repeatedly with a growing offset
// and to exchange the two buffers between passes.
__kernel void Scan_Naive(const __global uint* inArray, __global uint* outArray, uint N, uint offset)
{
    const uint left = get_global_id(0);

    // If the host launches more work-items than N (e.g. because the global
    // size is padded to a multiple of the work-group size), the surplus
    // work-items must not touch memory at all.
    if (left >= N)
        return;

    const uint value = inArray[left];

    // Elements in front of the offset are already final for this pass.
    if (left < offset)
        outArray[left] = value;

    // Add this element onto its partner 'offset' positions to the right.
    const uint right = left + offset;
    if (right < N)
        outArray[right] = value + inArray[right];
}
...
...
@@ -27,7 +38,62 @@ __kernel void Scan_Naive(const __global uint* inArray, __global uint* outArray,
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Work-efficient scan of one block per work-group, done in local memory.
//
// Every work-item loads two consecutive elements of `array`, so a work-group
// processes 2 * get_local_size(0) elements. The block is scanned with an
// up-sweep followed by a down-sweep; the original inputs are then added back,
// so the values written to `array` are the inclusive prefix sums within the block.
// The last work-item of each group additionally stores the block total in
// higherLevelArray[group id].
//
// Parameters:
//   array             data to scan, overwritten in place
//   higherLevelArray  receives one block total per work-group
//   localBlock        local scratch memory, indexed up to 2 * get_local_size(0) - 1
//
// NOTE(review): the index arithmetic assumes the work-group size is a power of
// two and that `array` holds a full block for every work-group - confirm with the host.
__kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLevelArray, __local uint* localBlock)
{
    const int groupSize  = get_local_size(0);
    const int blockSize  = groupSize * 2;
    const int localId    = get_local_id(0);
    const int globalPair = get_global_id(0) * 2;
    const int localPair  = localId * 2;

    // Stage this work-item's two inputs in local memory and keep private copies
    // for the final conversion to an inclusive result.
    const uint inLeft  = array[globalPair];
    const uint inRight = array[globalPair + 1];
    localBlock[localPair]     = inLeft;
    localBlock[localPair + 1] = inRight;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Up-sweep: build partial sums, anchored at the end of the block.
    for (uint stride = 1; stride <= groupSize; stride <<= 1) {
        // Unsigned arithmetic: for work-items that have nothing to do at this
        // level the subtraction wraps around and the range check rejects it.
        const uint right = (blockSize - 1) - localId * stride * 2;
        const uint left  = right - stride;

        if (right < blockSize)
            localBlock[right] = localBlock[right] + localBlock[left];

        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Set the last element to 0 before sweeping down.
    if (localId == 0)
        localBlock[blockSize - 1] = 0;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Down-sweep: push the partial sums back down the tree.
    for (uint stride = groupSize; stride > 0; stride >>= 1) {
        const uint right = (blockSize - 1) - localId * stride * 2;
        const uint left  = right - stride;

        if (right < blockSize) {
            const uint carried = localBlock[right];
            localBlock[right] = localBlock[left] + carried;
            localBlock[left]  = carried;
        }

        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Write back: add the original inputs to the swept values.
    const uint resultLeft  = localBlock[localPair] + inLeft;
    const uint resultRight = localBlock[localPair + 1] + inRight;
    array[globalPair]     = resultLeft;
    array[globalPair + 1] = resultRight;

    // The last work-item holds the block total; publish it for the next level.
    if (localId == groupSize - 1)
        higherLevelArray[get_group_id(0)] = resultRight;
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment