Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
G
GPGPU
Project
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
CI / CD
CI / CD
Pipelines
Schedules
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Commits
Open sidebar
Kai Westerkamp
GPGPU
Commits
84bbc7a6
Commit
84bbc7a6
authored
Nov 08, 2016
by
Kai Westerkamp
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
scan work efficient
parent
0d7ab823
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
90 additions
and
34 deletions
+90
-34
CAssignment2.cpp
Assignment2/Assignment2/CAssignment2.cpp
+4
-4
CScanTask.cpp
Assignment2/Assignment2/CScanTask.cpp
+55
-18
Scan.cl
Assignment2/Assignment2/Scan.cl
+31
-12
No files found.
Assignment2/Assignment2/CAssignment2.cpp
View file @
84bbc7a6
...
@@ -22,8 +22,8 @@ bool CAssignment2::DoCompute()
...
@@ -22,8 +22,8 @@ bool CAssignment2::DoCompute()
cout
<<
"Running parallel reduction task..."
<<
endl
<<
endl
;
cout
<<
"Running parallel reduction task..."
<<
endl
<<
endl
;
{
{
size_t
LocalWorkSize
[
3
]
=
{
256
,
1
,
1
};
size_t
LocalWorkSize
[
3
]
=
{
256
,
1
,
1
};
//
CReductionTask reduction(1024 * 1024 * 16);
CReductionTask
reduction
(
1024
*
1024
*
16
);
//
RunComputeTask(reduction, LocalWorkSize);
RunComputeTask
(
reduction
,
LocalWorkSize
);
}
}
// Task 2: parallel prefix sum
// Task 2: parallel prefix sum
...
@@ -32,8 +32,8 @@ bool CAssignment2::DoCompute()
...
@@ -32,8 +32,8 @@ bool CAssignment2::DoCompute()
{
{
size_t
LocalWorkSize
[
3
]
=
{
256
,
1
,
1
};
size_t
LocalWorkSize
[
3
]
=
{
256
,
1
,
1
};
CScanTask
scan
(
512
,
LocalWorkSize
[
0
]);
// CScanTask scan(1024
, LocalWorkSize[0]);
RunComputeTask
(
scan
,
LocalWorkSize
);
//
RunComputeTask(scan, LocalWorkSize);
CScanTask
scan2
(
1024
*
1024
*
64
,
LocalWorkSize
[
0
]);
CScanTask
scan2
(
1024
*
1024
*
64
,
LocalWorkSize
[
0
]);
RunComputeTask
(
scan2
,
LocalWorkSize
);
RunComputeTask
(
scan2
,
LocalWorkSize
);
...
...
Assignment2/Assignment2/CScanTask.cpp
View file @
84bbc7a6
...
@@ -62,9 +62,8 @@ bool CScanTask::InitResources(cl_device_id Device, cl_context Context)
...
@@ -62,9 +62,8 @@ bool CScanTask::InitResources(cl_device_id Device, cl_context Context)
//fill the array with some values
//fill the array with some values
for
(
unsigned
int
i
=
0
;
i
<
m_N
;
i
++
)
for
(
unsigned
int
i
=
0
;
i
<
m_N
;
i
++
)
m_hArray
[
i
]
=
1
;
// Use this for debugging
//m_hArray[i] = 1; // Use this for debugging
//m_hArray[i] = rand() & 15;
m_hArray
[
i
]
=
rand
()
&
15
;
cout
<<
"Debug array"
<<
endl
;
//device resources
//device resources
// ping-pong buffers
// ping-pong buffers
...
@@ -139,7 +138,7 @@ void CScanTask::ComputeGPU(cl_context Context, cl_command_queue CommandQueue, si
...
@@ -139,7 +138,7 @@ void CScanTask::ComputeGPU(cl_context Context, cl_command_queue CommandQueue, si
cout
<<
endl
;
cout
<<
endl
;
TestPerformance
(
Context
,
CommandQueue
,
LocalWorkSize
,
0
);
TestPerformance
(
Context
,
CommandQueue
,
LocalWorkSize
,
0
);
//
TestPerformance(Context, CommandQueue, LocalWorkSize, 1);
TestPerformance
(
Context
,
CommandQueue
,
LocalWorkSize
,
1
);
cout
<<
endl
;
cout
<<
endl
;
}
}
...
@@ -215,19 +214,61 @@ void CScanTask::Scan_WorkEfficient(cl_context Context, cl_command_queue CommandQ
...
@@ -215,19 +214,61 @@ void CScanTask::Scan_WorkEfficient(cl_context Context, cl_command_queue CommandQ
cl_int
clErr
;
cl_int
clErr
;
size_t
globalWorkSize
[
1
];
size_t
globalWorkSize
[
1
];
size_t
localWorkSize
[
1
];
size_t
localWorkSize
[
1
];
size_t
levelSize
=
m_N
;
size_t
*
levelSizes
=
new
size_t
[
m_nLevels
];
//local PPS
for
(
size_t
level
=
0
;
level
<
m_nLevels
-
1
;
level
++
)
{
levelSizes
[
level
]
=
levelSize
;
localWorkSize
[
0
]
=
min
(
levelSize
/
2
,
LocalWorkSize
[
0
]);
if
(
localWorkSize
[
0
]
==
0
)
continue
;
globalWorkSize
[
0
]
=
CLUtil
::
GetGlobalWorkSize
(
levelSize
/
2
,
localWorkSize
[
0
]);
clErr
=
clSetKernelArg
(
m_ScanWorkEfficientKernel
,
0
,
sizeof
(
cl_mem
),
(
void
*
)
&
m_dLevelArrays
[
level
]);
clErr
|=
clSetKernelArg
(
m_ScanWorkEfficientKernel
,
1
,
sizeof
(
cl_mem
),
(
void
*
)
&
m_dLevelArrays
[
level
+
1
]);
clErr
|=
clSetKernelArg
(
m_ScanWorkEfficientKernel
,
2
,
localWorkSize
[
0
]
*
3
*
sizeof
(
cl_uint
),
NULL
);
V_RETURN_CL
(
clErr
,
"Error setting Kernel args3"
);
clErr
=
clEnqueueNDRangeKernel
(
CommandQueue
,
m_ScanWorkEfficientKernel
,
1
,
NULL
,
globalWorkSize
,
localWorkSize
,
0
,
NULL
,
NULL
);
V_RETURN_CL
(
clErr
,
"Error running Kernel"
);
localWorkSize
[
0
]
=
LocalWorkSize
[
0
];
//cout << "Reading level " << levelSize << " Elements " << endl;
globalWorkSize
[
0
]
=
CLUtil
::
GetGlobalWorkSize
(
m_N
/
2
,
localWorkSize
[
0
]);
//V_RETURN_CL(clEnqueueReadBuffer(CommandQueue, m_dLevelArrays[level], CL_TRUE, 0, levelSize * sizeof(cl_uint), m_hResultGPU, 0, NULL, NULL), "Error reading data from device!");
//for (int j = 0; j < levelSize; j++)
// printf("%2d, ", m_hResultGPU[j]);
//cout << endl;
levelSize
/=
(
2
*
LocalWorkSize
[
0
]);
//cout << "Reading level +1 " << levelSize << " Elements " << endl;
//V_RETURN_CL(clEnqueueReadBuffer(CommandQueue, m_dLevelArrays[level+1], CL_TRUE, 0, levelSize * sizeof(cl_uint), m_hResultGPU, 0, NULL, NULL), "Error reading data from device!");
//for (int j = 0; j < levelSize; j++)
// printf("%2d, ", m_hResultGPU[j]);
//cout << endl;
}
clErr
=
clSetKernelArg
(
m_ScanWorkEfficientKernel
,
0
,
sizeof
(
cl_mem
),
(
void
*
)
&
m_dLevelArrays
[
0
]);
clErr
|=
clSetKernelArg
(
m_ScanWorkEfficientKernel
,
1
,
sizeof
(
cl_mem
),
(
void
*
)
&
m_dLevelArrays
[
1
]);
clErr
|=
clSetKernelArg
(
m_ScanWorkEfficientKernel
,
2
,
localWorkSize
[
0
]
*
2
*
sizeof
(
cl_uint
),
NULL
);
V_RETURN_CL
(
clErr
,
"Error setting Kernel args3"
);
clErr
=
clEnqueueNDRangeKernel
(
CommandQueue
,
m_ScanWorkEfficientKernel
,
1
,
NULL
,
globalWorkSize
,
localWorkSize
,
0
,
NULL
,
NULL
);
V_RETURN_CL
(
clErr
,
"Error running Kernel"
);
swap
(
m_dPingArray
,
m_dPongArray
);
for
(
size_t
level
=
m_nLevels
-
3
;
level
<
m_nLevels
;
level
--
)
{
// unsigned int -> -1 > m_nLevels
levelSize
=
levelSizes
[
level
];
localWorkSize
[
0
]
=
min
(
levelSize
/
2
,
LocalWorkSize
[
0
]);
if
(
localWorkSize
[
0
]
==
0
)
continue
;
globalWorkSize
[
0
]
=
CLUtil
::
GetGlobalWorkSize
(
levelSize
/
2
,
localWorkSize
[
0
]);
clErr
=
clSetKernelArg
(
m_ScanWorkEfficientAddKernel
,
0
,
sizeof
(
cl_mem
),
(
void
*
)
&
m_dLevelArrays
[
level
+
1
]);
V_RETURN_CL
(
clErr
,
"Failed to set kernel args1"
);
clErr
|=
clSetKernelArg
(
m_ScanWorkEfficientAddKernel
,
1
,
sizeof
(
cl_mem
),
(
void
*
)
&
m_dLevelArrays
[
level
]);
V_RETURN_CL
(
clErr
,
"Failed to set kernel args2"
);
clErr
|=
clSetKernelArg
(
m_ScanWorkEfficientAddKernel
,
2
,
localWorkSize
[
0
]
*
2
*
sizeof
(
cl_uint
),
NULL
);
V_RETURN_CL
(
clErr
,
"Failed to set kernel args3"
);
clErr
=
clEnqueueNDRangeKernel
(
CommandQueue
,
m_ScanWorkEfficientAddKernel
,
1
,
NULL
,
globalWorkSize
,
localWorkSize
,
0
,
NULL
,
NULL
);
V_RETURN_CL
(
clErr
,
"Failed to execute kernel"
);
}
// TO DO: Implement efficient version of scan
// TO DO: Implement efficient version of scan
...
@@ -254,10 +295,6 @@ void CScanTask::ValidateTask(cl_context Context, cl_command_queue CommandQueue,
...
@@ -254,10 +295,6 @@ void CScanTask::ValidateTask(cl_context Context, cl_command_queue CommandQueue,
// validate results
// validate results
m_bValidationResults
[
Task
]
=
(
memcmp
(
m_hResultCPU
,
m_hResultGPU
,
m_N
*
sizeof
(
unsigned
int
))
==
0
);
m_bValidationResults
[
Task
]
=
(
memcmp
(
m_hResultCPU
,
m_hResultGPU
,
m_N
*
sizeof
(
unsigned
int
))
==
0
);
//for (int j = 0; j < m_N; j++)
// printf("%2d, ", m_hResultGPU[j]);
//cout << endl;
}
}
void
CScanTask
::
TestPerformance
(
cl_context
Context
,
cl_command_queue
CommandQueue
,
size_t
LocalWorkSize
[
3
],
unsigned
int
Task
)
void
CScanTask
::
TestPerformance
(
cl_context
Context
,
cl_command_queue
CommandQueue
,
size_t
LocalWorkSize
[
3
],
unsigned
int
Task
)
...
@@ -273,7 +310,7 @@ void CScanTask::TestPerformance(cl_context Context, cl_command_queue CommandQueu
...
@@ -273,7 +310,7 @@ void CScanTask::TestPerformance(cl_context Context, cl_command_queue CommandQueu
timer
.
Start
();
timer
.
Start
();
//run the kernel N times
//run the kernel N times
unsigned
int
nIterations
=
10
0
;
unsigned
int
nIterations
=
10
;
for
(
unsigned
int
i
=
0
;
i
<
nIterations
;
i
++
)
{
for
(
unsigned
int
i
=
0
;
i
<
nIterations
;
i
++
)
{
//run selected task
//run selected task
switch
(
Task
){
switch
(
Task
){
...
...
Assignment2/Assignment2/Scan.cl
View file @
84bbc7a6
...
@@ -31,6 +31,7 @@ __kernel void Scan_Naive(const __global uint* inArray, __global uint* outArray,
...
@@ -31,6 +31,7 @@ __kernel void Scan_Naive(const __global uint* inArray, __global uint* outArray,
#
define
AVOID_BANK_CONFLICTS
#
define
AVOID_BANK_CONFLICTS
#
ifdef
AVOID_BANK_CONFLICTS
#
ifdef
AVOID_BANK_CONFLICTS
//
TO
DO:
define
your
conflict-free
macro
here
//
TO
DO:
define
your
conflict-free
macro
here
#
define
OFFSET
(
A
)
((
A
)
+
(
A
)
/NUM_BANKS
)
#
else
#
else
#
define
OFFSET
(
A
)
(
A
)
#
define
OFFSET
(
A
)
(
A
)
#
endif
#
endif
...
@@ -47,8 +48,8 @@ __kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLeve
...
@@ -47,8 +48,8 @@ __kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLeve
//read
to
local
//read
to
local
uint
inLeft,
inRight
;;
uint
inLeft,
inRight
;;
localBlock[
lid2
]
=
inLeft
=
array[gid2]
;
localBlock[
OFFSET
(
lid2
)
]
=
inLeft
=
array[gid2]
;
localBlock[
lid2+1
]
=
inRight
=
array[gid2+1]
;
localBlock[
OFFSET
(
lid2+1
)
]
=
inRight
=
array[gid2+1]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//Upsweep
//Upsweep
...
@@ -57,8 +58,8 @@ __kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLeve
...
@@ -57,8 +58,8 @@ __kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLeve
uint
right
=
(
ls2
-
1
)
-
lid
*
stride
*
2
;
uint
right
=
(
ls2
-
1
)
-
lid
*
stride
*
2
;
uint
left
=
right
-
stride
;
uint
left
=
right
-
stride
;
if
(
right
<
ls2
)
{
if
(
right
<
ls2
&&
left
<
ls2
)
{
localBlock[
right]
+=
localBlock[left
]
;
localBlock[
OFFSET
(
right
)
]
+=
localBlock[OFFSET
(
left
)
]
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
@@ -66,7 +67,7 @@ __kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLeve
...
@@ -66,7 +67,7 @@ __kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLeve
//letzten
auf
0
setzen
//letzten
auf
0
setzen
if
(
lid
==
0
)
if
(
lid
==
0
)
localBlock[
ls2
-
1
]
=
0
;
localBlock[
OFFSET
(
ls2
-
1
)
]
=
0
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
...
@@ -76,19 +77,19 @@ __kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLeve
...
@@ -76,19 +77,19 @@ __kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLeve
uint
right
=
(
ls2
-
1
)
-
lid
*
stride
*
2
;
uint
right
=
(
ls2
-
1
)
-
lid
*
stride
*
2
;
uint
left
=
right
-
stride
;
uint
left
=
right
-
stride
;
if
(
right
<
ls2
)
{
if
(
right
<
ls2
&&
left
<
ls2
)
{
uint
vright
=
localBlock[
right
]
;
uint
vright
=
localBlock[
OFFSET
(
right
)
]
;
uint
vleft
=
localBlock[
left
]
;
uint
vleft
=
localBlock[
OFFSET
(
left
)
]
;
localBlock[
left
]
=
vright
;
localBlock[
OFFSET
(
left
)
]
=
vright
;
localBlock[
right
]
=
vleft
+
vright
;
localBlock[
OFFSET
(
right
)
]
=
vleft
+
vright
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
}
//write
back
//write
back
array[gid2]
=
localBlock[
lid2
]
+
inLeft
;
array[gid2]
=
localBlock[
OFFSET
(
lid2
)
]
+
inLeft
;
uint
last
=
localBlock[
lid2
+
1
]
+
inRight
;
uint
last
=
localBlock[
OFFSET
(
lid2
+
1
)
]
+
inRight
;
array[gid2
+
1]
=
last
;
array[gid2
+
1]
=
last
;
if
(
lid
+
1
==
ls
)
{
if
(
lid
+
1
==
ls
)
{
...
@@ -100,6 +101,23 @@ __kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLeve
...
@@ -100,6 +101,23 @@ __kernel void Scan_WorkEfficient(__global uint* array, __global uint* higherLeve
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel
void
Scan_WorkEfficientAdd
(
__global
uint*
higherLevelArray,
__global
uint*
array,
__local
uint*
localBlock
)
__kernel
void
Scan_WorkEfficientAdd
(
__global
uint*
higherLevelArray,
__global
uint*
array,
__local
uint*
localBlock
)
{
{
int
ls2
=
get_local_size
(
0
)
*
2
;
int
gs2
=
get_global_size
(
0
)
*
2
;
int
lid
=
get_local_id
(
0
)
;
int
lid2
=
lid
*2
;
uint
id
=
(
get_group_id
(
0
)
+
1
)
*
ls2
+
lid2
;
uint
add
=
higherLevelArray[get_group_id
(
0
)
]
;
if
(
id
<
gs2
)
{
array[id]
+=
add
;
}
if
(
id
+
1
<
gs2
)
{
array[id
+
1]
+=
add
;
}
//
TO
DO:
Kernel
implementation
(
large
arrays
)
//
TO
DO:
Kernel
implementation
(
large
arrays
)
//
Kernel
that
should
add
the
group
PPS
to
the
local
PPS
(
Figure
14
)
//
Kernel
that
should
add
the
group
PPS
to
the
local
PPS
(
Figure
14
)
}
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment