morph3x3.cl 4.47 KB
Newer Older
wester committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifdef OP_ERODE
#define OP(m1, m2) min(m1, m2)
#define VAL UCHAR_MAX
#endif

#ifdef OP_DILATE
#define OP(m1, m2) max(m1, m2)
#define VAL 0
#endif

#if defined OP_GRADIENT || defined OP_TOPHAT || defined OP_BLACKHAT
#define EXTRA_PARAMS , __global const uchar * matptr, int mat_step, int mat_offset
#else
#define EXTRA_PARAMS
#endif

#define PROCESS(_y, _x) \
    line_out[0] = OP(line_out[0], arr[_x + 3 * _y]); \
    line_out[1] = OP(line_out[1], arr[_x + 3 * (_y + 1)]);

#define PROCESS_ELEM \
    line_out[0] = (uchar16)VAL; \
    line_out[1] = (uchar16)VAL; \
    PROCESS_ELEM_

__kernel void morph3x3_8UC1_cols16_rows2(__global const uint* src, int src_step,
                                         __global uint* dst, int dst_step,
                                         int rows, int cols
                                         EXTRA_PARAMS)
{
    int block_x = get_global_id(0);
    int y = get_global_id(1) * 2;
    int ssx = 1, dsx = 1;

    if ((block_x * 16) >= cols || y >= rows) return;

    uchar a; uchar16 b; uchar c;
    uchar d; uchar16 e; uchar f;
    uchar g; uchar16 h; uchar i;
    uchar j; uchar16 k; uchar l;

    uchar16 line[4];
    uchar16 line_out[2];

    int src_index = block_x * 4 * ssx + (y - 1) * (src_step / 4);
    line[0] = (y == 0) ? (uchar16)VAL: as_uchar16(vload4(0, src + src_index));
    line[1] = as_uchar16(vload4(0, src + src_index + (src_step / 4)));
    line[2] = as_uchar16(vload4(0, src + src_index + 2 * (src_step / 4)));
    line[3] = (y == (rows - 2)) ? (uchar16)VAL: as_uchar16(vload4(0, src + src_index + 3 * (src_step / 4)));

    __global uchar *src_p = (__global uchar *)src;
    bool line_end = ((block_x + 1) * 16 == cols);

    src_index = block_x * 16 * ssx + (y - 1) * src_step;

    a = (block_x == 0 || y == 0) ? VAL : src_p[src_index - 1];
    b = line[0];
    c = (line_end || y == 0) ? VAL : src_p[src_index + 16];

    d = (block_x == 0) ? VAL : src_p[src_index + src_step - 1];
    e = line[1];
    f = line_end ? VAL : src_p[src_index + src_step + 16];

    g = (block_x == 0) ? VAL : src_p[src_index + 2 * src_step - 1];
    h = line[2];
    i = line_end ? VAL : src_p[src_index + 2 * src_step + 16];

    j = (block_x == 0 || y == (rows - 2)) ? VAL : src_p[src_index + 3 * src_step - 1];
    k = line[3];
    l = (line_end || y == (rows - 2)) ? VAL : src_p[src_index + 3 * src_step + 16];

    uchar16 arr[12];
    arr[0] = (uchar16)(a, b.s01234567, b.s89ab, b.scde);
    arr[1] = b;
    arr[2] = (uchar16)(b.s12345678, b.s9abc, b.sdef, c);
    arr[3] = (uchar16)(d, e.s01234567, e.s89ab, e.scde);
    arr[4] = e;
    arr[5] = (uchar16)(e.s12345678, e.s9abc, e.sdef, f);
    arr[6] = (uchar16)(g, h.s01234567, h.s89ab, h.scde);
    arr[7] = h;
    arr[8] = (uchar16)(h.s12345678, h.s9abc, h.sdef, i);
    arr[9] = (uchar16)(j, k.s01234567, k.s89ab, k.scde);
    arr[10] = k;
    arr[11] = (uchar16)(k.s12345678, k.s9abc, k.sdef, l);

    PROCESS_ELEM;

    int dst_index = block_x * 4 * dsx + y * (dst_step / 4);

#if defined OP_GRADIENT || defined OP_TOPHAT || defined OP_BLACKHAT
    int mat_index = y * mat_step + block_x * 16 * ssx + mat_offset;
    uchar16 val0 = vload16(0, matptr + mat_index);
    uchar16 val1 = vload16(0, matptr + mat_index + mat_step);

#ifdef OP_GRADIENT
    line_out[0] = convert_uchar16_sat(convert_int16(line_out[0]) - convert_int16(val0));
    line_out[1] = convert_uchar16_sat(convert_int16(line_out[1]) - convert_int16(val1));
    vstore4(as_uint4(line_out[0]), 0, dst + dst_index);
    vstore4(as_uint4(line_out[1]), 0, dst + dst_index + (dst_step / 4));
#elif defined OP_TOPHAT
    line_out[0] = convert_uchar16_sat(convert_int16(val0) - convert_int16(line_out[0]));
    line_out[1] = convert_uchar16_sat(convert_int16(val1) - convert_int16(line_out[1]));
    vstore4(as_uint4(line_out[0]), 0, dst + dst_index);
    vstore4(as_uint4(line_out[1]), 0, dst + dst_index + (dst_step / 4));
#elif defined OP_BLACKHAT
    line_out[0] = convert_uchar16_sat(convert_int16(line_out[0]) - convert_int16(val0));
    line_out[1] = convert_uchar16_sat(convert_int16(line_out[1]) - convert_int16(val1));
    vstore4(as_uint4(line_out[0]), 0, dst + dst_index);
    vstore4(as_uint4(line_out[1]), 0, dst + dst_index + (dst_step / 4));
#endif
#else
    vstore4(as_uint4(line_out[0]), 0, dst + dst_index);
    vstore4(as_uint4(line_out[1]), 0, dst + dst_index + (dst_step / 4));
#endif
}