// driver_api_multi.cpp
/* This sample demonstrates how to perform independent tasks
   on different GPUs */

// Disable some MSVC warnings that are triggered by the CUDA headers
#if defined(_MSC_VER)
#pragma warning(disable: 4201 4408 4100)
#endif

#include <cstdlib>
#include <iostream>

#include "opencv2/core/core.hpp"
#include "opencv2/gpu/gpu.hpp"

#if defined(__arm__)
wester committed
14 15 16 17 18 19 20 21 22 23 24 25
int main()
    std::cout << "Unsupported for ARM CUDA library." << std::endl;
    return 0;

#include <cuda.h>
#include <cuda_runtime.h>

using namespace std;
using namespace cv;
wester committed
using namespace cv::gpu;
wester committed
27 28 29 30 31 32 33

#define safeCall(expr) safeCall_(expr, #expr, __FILE__, __LINE__)
inline void safeCall_(int code, const char* expr, const char* file, int line)
    if (code != CUDA_SUCCESS)
        std::cout << "CUDA driver API error: code " << code << ", expr " << expr
wester committed
        << ", file " << file << ", line " << line << endl;
wester committed
35 36 37 38

wester committed
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
struct Worker: public ParallelLoopBody
    Worker(int num_devices)
        count = num_devices;
        contexts = new CUcontext[num_devices];
        for (int device_id = 0; device_id < num_devices; device_id++)
            CUdevice device;
            safeCall(cuDeviceGet(&device, device_id));
            safeCall(cuCtxCreate(&contexts[device_id], 0, device));

    virtual void operator() (const Range& range) const
        for (int device_id = range.start; device_id != range.end; ++device_id)
            // Set the proper context

            Mat src(1000, 1000, CV_32F);
            Mat dst;

            RNG rng(0);
            rng.fill(src, RNG::UNIFORM, 0, 1);

            // CPU works
            transpose(src, dst);

            // GPU works
            GpuMat d_src(src);
            GpuMat d_dst;
            transpose(d_src, d_dst);

            // Check results
            bool passed = norm(dst - Mat(d_dst), NORM_INF) < 1e-3;
            std::cout << "GPU #" << device_id << " (" << DeviceInfo().name() << "): "
            << (passed ? "passed" : "FAILED") << endl;

            // Deallocate data here, otherwise deallocation will be performed
            // after context is extracted from the stack

            CUcontext prev_context;

        if ((contexts != NULL) && count != 0)
            for (int device_id = 0; device_id < count; device_id++)

            delete[] contexts;

    CUcontext* contexts;
    int count;
wester committed
105 106 107 108 109 110 111 112 113 114 115 116

int main()
    int num_devices = getCudaEnabledDeviceCount();
    if (num_devices < 2)
        std::cout << "Two or more GPUs are required\n";
        return -1;

    for (int i = 0; i < num_devices; ++i)
wester committed
wester committed
118 119 120 121

        DeviceInfo dev_info(i);
        if (!dev_info.isCompatible())
wester committed
            std::cout << "GPU module isn't built for GPU #" << i << " ("
wester committed
123 124 125 126 127 128 129 130 131
                 << << ", CC " << dev_info.majorVersion()
                 << dev_info.minorVersion() << "\n";
            return -1;

    // Init CUDA Driver API

wester committed
132 133
    // Execute calculation
    parallel_for_(cv::Range(0, num_devices), Worker(num_devices));
wester committed
134 135 136 137 138

    return 0;
