laplacian5.cl 6.5 KB
Newer Older
wester committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

// Copyright (C) 2014, Itseez, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.


#define noconvert

#ifdef ONLY_SUM_CONVERT

__kernel void sumConvert(__global const uchar * src1ptr, int src1_step, int src1_offset,
                         __global const uchar * src2ptr, int src2_step, int src2_offset,
                         __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                         coeffT scale, coeffT delta)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (y < dst_rows && x < dst_cols)
    {
        int src1_index = mad24(y, src1_step, mad24(x, (int)sizeof(srcT), src1_offset));
        int src2_index = mad24(y, src2_step, mad24(x, (int)sizeof(srcT), src2_offset));
        int dst_index = mad24(y, dst_step, mad24(x, (int)sizeof(dstT), dst_offset));

        __global const srcT * src1 = (__global const srcT *)(src1ptr + src1_index);
        __global const srcT * src2 = (__global const srcT *)(src2ptr + src2_index);
        __global dstT * dst = (__global dstT *)(dstptr + dst_index);

#if wdepth <= 4
        dst[0] = convertToDT( mad24((WT)(scale), convertToWT(src1[0]) + convertToWT(src2[0]), (WT)(delta)) );
#else
        dst[0] = convertToDT( mad((WT)(scale), convertToWT(src1[0]) + convertToWT(src2[0]), (WT)(delta)) );
#endif
    }
}

#else

///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////Macro for border type////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////

#ifdef BORDER_CONSTANT
// CCCCCC|abcdefgh|CCCCCCC
#define EXTRAPOLATE(x, maxV)
#elif defined BORDER_REPLICATE
// aaaaaa|abcdefgh|hhhhhhh
#define EXTRAPOLATE(x, maxV) \
    { \
        (x) = clamp((x), 0, (maxV)-1); \
    }
#elif defined BORDER_WRAP
// cdefgh|abcdefgh|abcdefg
#define EXTRAPOLATE(x, maxV) \
    { \
        (x) = ( (x) + (maxV) ) % (maxV); \
    }
#elif defined BORDER_REFLECT
// fedcba|abcdefgh|hgfedcb
#define EXTRAPOLATE(x, maxV) \
    { \
        (x) = min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ); \
    }
#elif defined BORDER_REFLECT_101
// gfedcb|abcdefgh|gfedcba
#define EXTRAPOLATE(x, maxV) \
    { \
        (x) = min(((maxV)-1)*2-(x), max((x),-(x)) ); \
    }
#else
#error No extrapolation method
#endif

#if CN != 3
#define loadpix(addr) *(__global const srcT *)(addr)
#define storepix(val, addr)  *(__global dstT *)(addr) = val
#define SRCSIZE (int)sizeof(srcT)
#define DSTSIZE (int)sizeof(dstT)
#else
#define loadpix(addr)  vload3(0, (__global const srcT1 *)(addr))
#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
#define SRCSIZE (int)sizeof(srcT1)*3
#define DSTSIZE (int)sizeof(dstT1)*3
#endif

#define SRC(_x,_y) convertToWT(loadpix(Src + mad24(_y, src_step, SRCSIZE * _x)))

#ifdef BORDER_CONSTANT
// CCCCCC|abcdefgh|CCCCCCC
#define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? (const_v) : SRC((_x),(_y))
#else
#define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y))
#endif

// horizontal and vertical filter kernels
// should be defined on host during compile time to avoid overhead
#define DIG(a) a,
__constant WT1 mat_kernelX[] = { KERNEL_MATRIX_X };
__constant WT1 mat_kernelY[] = { KERNEL_MATRIX_Y };

__kernel void laplacian(__global uchar* Src, int src_step, int srcOffsetX, int srcOffsetY, int height, int width,
                         __global uchar* Dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                         WT1 scale, WT1 delta)
{
    __local WT lsmem[BLK_Y + 2 * RADIUS][BLK_X + 2 * RADIUS];
    __local WT lsmemDy1[BLK_Y][BLK_X + 2 * RADIUS];
    __local WT lsmemDy2[BLK_Y][BLK_X + 2 * RADIUS];

    int lix = get_local_id(0);
    int liy = get_local_id(1);

    int x = get_global_id(0);

    int srcX = x + srcOffsetX - RADIUS;

    int clocY = liy;
    do
    {
        int yb = clocY + srcOffsetY - RADIUS;
        EXTRAPOLATE(yb, (height));

        int clocX = lix;
        int cSrcX = srcX;
        do
        {
            int xb = cSrcX;
            EXTRAPOLATE(xb,(width));
            lsmem[clocY][clocX] = ELEM(xb, yb, (width), (height), 0 );

            clocX += BLK_X;
            cSrcX += BLK_X;
        }
        while(clocX < BLK_X+(RADIUS*2));

        clocY += BLK_Y;
    }
    while (clocY < BLK_Y+(RADIUS*2));
    barrier(CLK_LOCAL_MEM_FENCE);

    WT scale_v = (WT)scale;
    WT delta_v = (WT)delta;
    for (int y = 0; y < dst_rows; y+=BLK_Y)
    {
        int i, clocX = lix;
        WT sum1 = (WT) 0;
        WT sum2 = (WT) 0;
        do
        {
            sum1 = (WT) 0;
            sum2 = (WT) 0;
            for (i=0; i<=2*RADIUS; i++)
            {
                sum1 = mad(lsmem[liy + i][clocX], mat_kernelY[i], sum1);
                sum2 = mad(lsmem[liy + i][clocX], mat_kernelX[i], sum2);
            }
            lsmemDy1[liy][clocX] = sum1;
            lsmemDy2[liy][clocX] = sum2;
            clocX += BLK_X;
        }
        while(clocX < BLK_X+(RADIUS*2));
        barrier(CLK_LOCAL_MEM_FENCE);

        if ((x < dst_cols) && (y + liy < dst_rows))
        {
            sum1 = (WT) 0;
            sum2 = (WT) 0;
            for (i=0; i<=2*RADIUS; i++)
            {
                sum1 = mad(lsmemDy1[liy][lix+i], mat_kernelX[i], sum1);
                sum2 = mad(lsmemDy2[liy][lix+i], mat_kernelY[i], sum2);
            }

            WT sum = mad(scale_v, (sum1 + sum2), delta_v);
            storepix(convertToDT(sum), Dst + mad24(y + liy, dst_step, mad24(x, DSTSIZE, dst_offset)));
        }

        for (int i = liy * BLK_X + lix; i < (RADIUS*2) * (BLK_X+(RADIUS*2)); i += BLK_X * BLK_Y)
        {
            int clocX = i % (BLK_X+(RADIUS*2));
            int clocY = i / (BLK_X+(RADIUS*2));
            lsmem[clocY][clocX] = lsmem[clocY + BLK_Y][clocX];
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        int yb = y + liy + BLK_Y + srcOffsetY + RADIUS;
        EXTRAPOLATE(yb, (height));

        clocX = lix;
        int cSrcX = x + srcOffsetX - RADIUS;
        do
        {
            int xb = cSrcX;
            EXTRAPOLATE(xb,(width));
            lsmem[liy + 2*RADIUS][clocX] = ELEM(xb, yb, (width), (height), 0 );

            clocX += BLK_X;
            cSrcX += BLK_X;
        }
        while(clocX < BLK_X+(RADIUS*2));
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}

#endif