Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
G
GPGPU
Project
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
CI / CD
CI / CD
Pipelines
Schedules
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Commits
Open sidebar
Kai Westerkamp
GPGPU
Commits
5c5f4cb3
Commit
5c5f4cb3
authored
Nov 07, 2016
by
Kai Westerkamp
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Decomp Kernel
parent
a752caf0
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
80 additions
and
8 deletions
+80
-8
CReductionTask.cpp
Assignment2/Assignment2/CReductionTask.cpp
+46
-4
Reduction.cl
Assignment2/Assignment2/Reduction.cl
+34
-4
No files found.
Assignment2/Assignment2/CReductionTask.cpp
View file @
5c5f4cb3
...
...
@@ -42,8 +42,10 @@ bool CReductionTask::InitResources(cl_device_id Device, cl_context Context)
//fill the array with some values
for
(
unsigned
int
i
=
0
;
i
<
m_N
;
i
++
)
m_hInput
[
i
]
=
1
;
// Use this for debugging
//m_hInput[i] = rand() & 15;
//m_hInput[i] = 1; // Use this for debugging
m_hInput
[
i
]
=
rand
()
&
15
;
//cout << "init are 1" << endl;
//device resources
cl_int
clError
,
clError2
;
...
...
@@ -96,9 +98,13 @@ void CReductionTask::ReleaseResources()
void
CReductionTask
::
ComputeGPU
(
cl_context
Context
,
cl_command_queue
CommandQueue
,
size_t
LocalWorkSize
[
3
])
{
ExecuteTask
(
Context
,
CommandQueue
,
LocalWorkSize
,
0
);
cout
<<
"Task 1 Done"
<<
endl
;
ExecuteTask
(
Context
,
CommandQueue
,
LocalWorkSize
,
1
);
cout
<<
"Task 2 Done"
<<
endl
;
ExecuteTask
(
Context
,
CommandQueue
,
LocalWorkSize
,
2
);
cout
<<
"Task 3 Done"
<<
endl
;
ExecuteTask
(
Context
,
CommandQueue
,
LocalWorkSize
,
3
);
cout
<<
"Task 4 Done"
<<
endl
;
TestPerformance
(
Context
,
CommandQueue
,
LocalWorkSize
,
0
);
TestPerformance
(
Context
,
CommandQueue
,
LocalWorkSize
,
1
);
...
...
@@ -192,10 +198,13 @@ void CReductionTask::Reduction_Decomp(cl_context Context, cl_command_queue Comma
cl_int
clErr
;
size_t
globalWorkSize
[
1
];
size_t
localWorkSize
[
1
];
localWorkSize
[
0
]
=
LocalWorkSize
[
0
];
//for (int j = 0; j < 32; j++)
// printf("%4d, ", m_hInput[j]);
//cout << endl;
for
(
unsigned
int
i
=
m_N
/
2
;
i
>
1
;
i
/=
2
*
localWorkSize
[
0
])
{
localWorkSize
[
0
]
=
min
(
l
ocalWorkSize
[
0
],
i
);
localWorkSize
[
0
]
=
min
(
L
ocalWorkSize
[
0
],
i
);
clErr
=
clSetKernelArg
(
m_DecompKernel
,
0
,
sizeof
(
cl_mem
),
(
void
*
)
&
m_dPingArray
);
clErr
|=
clSetKernelArg
(
m_DecompKernel
,
1
,
sizeof
(
cl_mem
),
(
void
*
)
&
m_dPongArray
);
...
...
@@ -209,6 +218,10 @@ void CReductionTask::Reduction_Decomp(cl_context Context, cl_command_queue Comma
swap
(
m_dPingArray
,
m_dPongArray
);
//V_RETURN_CL(clEnqueueReadBuffer(CommandQueue, m_dPingArray, CL_TRUE, 0, m_N * sizeof(cl_uint), m_hDebug, 0, NULL, NULL), "error reading data from device!");
//for (int j = 0; j < 32; j++)
// printf("%4d, ", m_hDebug[j]);
//cout << endl;
}
...
...
@@ -223,6 +236,35 @@ void CReductionTask::Reduction_Decomp(cl_context Context, cl_command_queue Comma
void
CReductionTask
::
Reduction_DecompUnroll
(
cl_context
Context
,
cl_command_queue
CommandQueue
,
size_t
LocalWorkSize
[
3
])
{
cl_int
clErr
;
size_t
globalWorkSize
[
1
];
size_t
localWorkSize
[
1
];
//for (int j = 0; j < 32; j++)
// printf("%4d, ", m_hInput[j]);
//cout << endl;
for
(
unsigned
int
i
=
m_N
/
2
;
i
>
1
;
i
/=
2
*
localWorkSize
[
0
])
{
localWorkSize
[
0
]
=
min
(
LocalWorkSize
[
0
],
i
);
clErr
=
clSetKernelArg
(
m_DecompUnrollKernel
,
0
,
sizeof
(
cl_mem
),
(
void
*
)
&
m_dPingArray
);
clErr
|=
clSetKernelArg
(
m_DecompUnrollKernel
,
1
,
sizeof
(
cl_mem
),
(
void
*
)
&
m_dPongArray
);
clErr
|=
clSetKernelArg
(
m_DecompUnrollKernel
,
2
,
sizeof
(
cl_uint
),
(
void
*
)
&
i
);
clErr
|=
clSetKernelArg
(
m_DecompUnrollKernel
,
3
,
localWorkSize
[
0
]
*
sizeof
(
cl_uint
),
NULL
);
V_RETURN_CL
(
clErr
,
"Error setting Kernel args"
);
globalWorkSize
[
0
]
=
CLUtil
::
GetGlobalWorkSize
(
i
,
localWorkSize
[
0
]);
clErr
=
clEnqueueNDRangeKernel
(
CommandQueue
,
m_DecompUnrollKernel
,
1
,
NULL
,
globalWorkSize
,
localWorkSize
,
0
,
NULL
,
NULL
);
V_RETURN_CL
(
clErr
,
"Error running Kernel"
);
swap
(
m_dPingArray
,
m_dPongArray
);
//V_RETURN_CL(clEnqueueReadBuffer(CommandQueue, m_dPingArray, CL_TRUE, 0, m_N * sizeof(cl_uint), m_hDebug, 0, NULL, NULL), "error reading data from device!");
//for (int j = 0; j < 32; j++)
// printf("%4d, ", m_hDebug[j]);
//cout << endl;
}
// TO DO: Implement reduction with loop unrolling
...
...
Assignment2/Assignment2/Reduction.cl
View file @
5c5f4cb3
...
...
@@ -28,22 +28,24 @@ __kernel void Reduction_Decomp(const __global uint* inArray, __global uint* outA
{
int
ls
=
get_local_size
(
0
)
;
int
gs
=
get_global_size
(
0
)
;
int
gid
=
get_global_id
(
0
)
;
int
lid
=
get_local_id
(
0
)
;
int
groupID
=
get_group_id
(
0
)
;
localBlock[lid]
=
inArray[gid]
+
inArray[gid
+
gs]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
uint
stride
=
ls
/
2
; stride > 0;
ls
/= 2){
for
(
uint
stride
=
ls
/
2
; stride > 0;
stride
/= 2){
if
(
lid
<
stride
)
{
localBlock[lid]
=
inArray[lid]
+
inArray
[lid
+
stride]
;
localBlock[lid]
=
localBlock[lid]
+
localBlock
[lid
+
stride]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
lid
=
0
)
if
(
lid
=
=
0
)
outArray[groupID]
=
localBlock[0]
;
}
...
...
@@ -51,5 +53,33 @@ __kernel void Reduction_Decomp(const __global uint* inArray, __global uint* outA
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel
void
Reduction_DecompUnroll
(
const
__global
uint*
inArray,
__global
uint*
outArray,
uint
N,
__local
uint*
localBlock
)
{
//
TO
DO:
Kernel
implementation
int
ls
=
get_local_size
(
0
)
;
int
gs
=
get_global_size
(
0
)
;
int
gid
=
get_global_id
(
0
)
;
int
lid
=
get_local_id
(
0
)
;
int
groupID
=
get_group_id
(
0
)
;
localBlock[lid]
=
inArray[gid]
+
inArray[gid
+
gs]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
uint
stride
=
ls
/
2
; stride > 32; stride /= 2){
if
(
lid
<
stride
)
{
localBlock[lid]
=
localBlock[lid]
+
localBlock[lid
+
stride]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
lid
<
32
)
{
if
(
ls
>=
64
)
localBlock[lid]
+=
localBlock[lid
+
32]
;
if
(
ls
>=
32
)
localBlock[lid]
+=
localBlock[lid
+
16]
;
if
(
ls
>=
16
)
localBlock[lid]
+=
localBlock[lid
+
8]
;
if
(
ls
>=
8
)
localBlock[lid]
+=
localBlock[lid
+
4]
;
if
(
ls
>=
4
)
localBlock[lid]
+=
localBlock[lid
+
2]
;
}
if
(
lid
==
0
)
outArray[groupID]
=
localBlock[0]+localBlock[1]
;
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment