fast.cl 5.01 KB
Newer Older
wester committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
// OpenCL port of the FAST corner detector.
// Copyright (C) 2014, Itseez Inc. See the license at http://opencv.org

// Computes the FAST corner score of the pixel at img[0].
//
// img  : pointer to the pixel being scored; the 16 ring samples around it are
//        read at column offsets up to +-3 and row offsets up to +-3.
// step : distance in bytes between vertically adjacent pixels.
//
// diff[n] holds (center - ring sample n). Samples 0..7 walk one half of the
// radius-3 ring starting at (dx=+3, dy=0); sample n+8 is the point-mirrored
// counterpart of sample n, so indices 0..15 go once around the ring.
//
// Over all 16 arcs of 9 consecutive ring samples the function determines
//   A = max(0, largest per-arc minimum of diff)
//   B = smallest per-arc maximum of diff
// and returns max(A, -B) - 1.
inline int cornerScore(__global const uchar* img, int step)
{
    const int center = img[0];
    int diff[16];

    // Loads one ring sample and its mirrored counterpart. Both operands are
    // 8-bit unsigned values, so the difference lies in [-255, 255] and the
    // (short) cast leaves it unchanged.
    #define LOAD2(idx, ofs) \
        do { \
            const int tofs = (ofs); \
            diff[(idx)] = (short)(center - img[tofs]); \
            diff[(idx) + 8] = (short)(center - img[-tofs]); \
        } while (0)

    LOAD2(0, 3);
    LOAD2(1, -step+3);
    LOAD2(2, -step*2+2);
    LOAD2(3, -step*3+1);
    LOAD2(4, -step*3);
    LOAD2(5, -step*3-1);
    LOAD2(6, -step*2-2);
    LOAD2(7, -step-3);

    // Pass 1: largest per-arc minimum. Each iteration shares the minimum of
    // samples start+1 .. start+8 between the arc beginning at `start` and the
    // arc beginning at `start + 1`, so even starts are enough to visit all 16.
    int best_min = 0;
    #pragma unroll
    for( int start = 0; start < 16; start += 2 )
    {
        int shared_min = diff[(start + 1) & 15];
        #pragma unroll
        for( int n = 2; n <= 8; n++ )
            shared_min = min(shared_min, diff[(start + n) & 15]);

        best_min = max(best_min, min(shared_min, diff[start & 15]));
        best_min = max(best_min, min(shared_min, diff[(start + 9) & 15]));
    }

    // Pass 2: smallest per-arc maximum, seeded with -best_min so that the
    // final negation yields the larger of the two magnitudes.
    int best_max = -best_min;
    #pragma unroll
    for( int start = 0; start < 16; start += 2 )
    {
        int shared_max = diff[(start + 1) & 15];
        #pragma unroll
        for( int n = 2; n <= 8; n++ )
            shared_max = max(shared_max, diff[(start + n) & 15]);

        best_max = min(best_max, max(shared_max, diff[start & 15]));
        best_max = min(best_max, max(shared_max, diff[(start + 9) & 15]));
    }

    return -best_max - 1;
}

// Detects FAST corner candidates, one work-item per pixel.
//
// _img, step, img_offset : image bytes; the pixel in row r, column c is read
//                          at _img[r*step + c + img_offset].
// img_rows, img_cols     : bounds used to keep a 3-pixel margin on every side.
// kp_loc                 : output; kp_loc[0] is atomically incremented for
//                          every detected corner, and the coordinates of
//                          corner number n (n < max_keypoints) are stored as
//                          kp_loc[1 + 2n] = column, kp_loc[2 + 2n] = row.
//                          The counter keeps growing past max_keypoints.
// threshold              : a ring pixel counts as darker/brighter only when
//                          it differs from the centre by more than this.
//
// A pixel is reported when 9 consecutive samples of the 16-sample radius-3
// ring are all darker than (centre - threshold) or all brighter than
// (centre + threshold).
__kernel
void FAST_findKeypoints(
    __global const uchar * _img, int step, int img_offset,
    int img_rows, int img_cols,
    volatile __global int* kp_loc,
    int max_keypoints, int threshold )
{
    // Shift by 3 so the ring never reaches above row 0 or left of column 0.
    const int col = get_global_id(0) + 3;
    const int row = get_global_id(1) + 3;

    if( row >= img_rows - 3 || col >= img_cols - 3 )
        return;

    __global const uchar* center = _img + mad24(row, step, col + img_offset);
    const int value = center[0];
    const int dark_limit = value - threshold;
    const int bright_limit = value + threshold;

    // Bit n of each mask describes ring sample n (0..15); sample n+8 is the
    // point-mirrored counterpart of sample n.
    int darker = 0, brighter = 0;

    #define UPDATE_MASK(idx, ofs) \
        do { \
            const int tofs = (ofs); \
            const int near_px = center[tofs]; \
            const int far_px = center[-tofs]; \
            darker |= ((near_px < dark_limit) << (idx)) | \
                      ((far_px < dark_limit) << (8 + (idx))); \
            brighter |= ((near_px > bright_limit) << (idx)) | \
                        ((far_px > bright_limit) << (8 + (idx))); \
        } while (0)

    // Any run of 9 consecutive samples contains sample 0 or sample 8, so a
    // pixel where neither passes either test cannot be a corner.
    UPDATE_MASK(0, 3);
    if( (darker | brighter) == 0 )
        return;

    UPDATE_MASK(2, -step*2+2);
    UPDATE_MASK(4, -step*3);
    UPDATE_MASK(6, -step*2-2);

    #define EVEN_MASK (1+4+16+64)

    // Same argument for the even samples: every mirrored pair among
    // {0,8}, {2,10}, {4,12}, {6,14} must contribute at least one hit.
    const int dark_even_ok = ((darker | (darker >> 8)) & EVEN_MASK) == EVEN_MASK;
    const int bright_even_ok = ((brighter | (brighter >> 8)) & EVEN_MASK) == EVEN_MASK;
    if( !dark_even_ok && !bright_even_ok )
        return;

    UPDATE_MASK(1, -step+3);
    UPDATE_MASK(3, -step*3+1);
    UPDATE_MASK(5, -step*3-1);
    UPDATE_MASK(7, -step-3);

    // And for all eight mirrored pairs once the odd samples are known.
    const int dark_pairs_ok = ((darker | (darker >> 8)) & 255) == 255;
    const int bright_pairs_ok = ((brighter | (brighter >> 8)) & 255) == 255;
    if( !dark_pairs_ok && !bright_pairs_ok )
        return;

    // Duplicate the 16-bit ring into the upper half so that arcs wrapping
    // past sample 15 appear as 9 consecutive set bits.
    darker |= darker << 16;
    brighter |= brighter << 16;

    #define CHECK0(i) ((darker & (511 << (i))) == (511 << (i)))
    #define CHECK1(i) ((brighter & (511 << (i))) == (511 << (i)))

    // Test every one of the 16 possible arc start positions.
    int arc_found = 0;
    #pragma unroll
    for( int start = 0; start < 16; start++ )
        arc_found |= CHECK0(start) | CHECK1(start);

    if( !arc_found )
        return;

    // Reserve an output slot; coordinates beyond the capacity are dropped
    // but still counted.
    const int slot = atomic_inc(kp_loc);
    if( slot < max_keypoints )
    {
        kp_loc[1 + 2*slot] = col;
        kp_loc[2 + 2*slot] = row;
    }
}

///////////////////////////////////////////////////////////////////////////
// Non-maximum suppression of detected keypoints (kernel FAST_nonmaxSupression)

// Keeps only the keypoints whose corner score is strictly greater than the
// score of each of their 8 neighbouring pixels, one work-item per keypoint.
//
// kp_in                  : input coordinates; keypoint n is read from
//                          kp_in[1 + 2n] (x) and kp_in[2 + 2n] (y).
// kp_out                 : output; kp_out[0] is atomically incremented for
//                          every surviving keypoint, and survivor number n
//                          (n < max_keypoints) is stored as
//                          kp_out[1 + 3n] = x, kp_out[2 + 3n] = y,
//                          kp_out[3 + 3n] = score.
// _img, step, img_offset : image bytes; pixel (x, y) is read at
//                          _img[y*step + x + img_offset].
// rows, cols             : bounds used for the 4-pixel border tests below.
// counter                : number of keypoints in kp_in to process.
__kernel
void FAST_nonmaxSupression(
    __global const int* kp_in, volatile __global int* kp_out,
    __global const uchar * _img, int step, int img_offset,
    int rows, int cols, int counter, int max_keypoints)
{
    const int idx = get_global_id(0);
    if( idx >= counter )
        return;

    const int x = kp_in[1 + 2*idx];
    const int y = kp_in[2 + 2*idx];
    __global const uchar* img = _img + mad24(y, step, x + img_offset);

    const int s = cornerScore(img, step);

    // A neighbour lying within 3 pixels of a border is not scored (its ring
    // would reach 4 pixels away from this keypoint); it is treated as beaten.
    const int no_left = x < 4;
    const int no_up = y < 4;
    const int no_right = x >= cols - 4;
    const int no_down = y >= rows - 4;

    // First the left and upper neighbours; bail out early if either of them
    // scores at least as high.
    int wins = (no_left || s > cornerScore(img-1, step)) +
               (no_up || s > cornerScore(img-step, step));
    if( wins != 2 )
        return;

    // Then the remaining six neighbours.
    wins = (no_right || s > cornerScore(img+1, step)) +
           (no_down || s > cornerScore(img+step, step)) +
           (no_left || no_up || s > cornerScore(img-step-1, step)) +
           (no_right || no_up || s > cornerScore(img-step+1, step)) +
           (no_left || no_down || s > cornerScore(img+step-1, step)) +
           (no_right || no_down || s > cornerScore(img+step+1, step));
    if( wins != 6 )
        return;

    // Reserve an output slot; survivors beyond the capacity are counted but
    // not stored.
    const int slot = atomic_inc(kp_out);
    if( slot < max_keypoints )
    {
        kp_out[1 + 3*slot] = x;
        kp_out[2 + 3*slot] = y;
        kp_out[3 + 3*slot] = s;
    }
}