/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//    Dachuan Zhao, dachuan@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifdef DOUBLE_SUPPORT
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#endif

#if defined BORDER_REPLICATE
// aaaaaa|abcdefgh|hhhhhhh
#define EXTRAPOLATE(x, maxV) clamp((x), 0, (maxV)-1)
#elif defined BORDER_WRAP
// cdefgh|abcdefgh|abcdefg
#define EXTRAPOLATE(x, maxV) ( (x) + (maxV) ) % (maxV)
#elif defined BORDER_REFLECT
// fedcba|abcdefgh|hgfedcb
#define EXTRAPOLATE(x, maxV) clamp(min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ), 0, (maxV)-1)
#elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101
// gfedcb|abcdefgh|gfedcba
#define EXTRAPOLATE(x, maxV) clamp(min(((maxV)-1)*2-(x), max((x),-(x)) ), 0, (maxV)-1)
#else
#error No extrapolation method
#endif

#if cn != 3
#define loadpix(addr)  *(__global const T*)(addr)
#define storepix(val, addr)  *(__global T*)(addr) = (val)
#define PIXSIZE ((int)sizeof(T))
#else
#define loadpix(addr)  vload3(0, (__global const T1*)(addr))
#define storepix(val, addr) vstore3((val), 0, (__global T1*)(addr))
#define PIXSIZE ((int)sizeof(T1)*3)
#endif

#define SRC(_x,_y) convertToFT(loadpix(srcData + mad24(_y, src_step, PIXSIZE * _x)))

#if kercn == 4
#define SRC4(_x,_y) convert_float4(vload4(0, srcData + mad24(_y, src_step, PIXSIZE * _x)))
#endif

#ifdef INTEL_DEVICE
#define MAD(x,y,z) fma((x),(y),(z))
#else
#define MAD(x,y,z) mad((x),(y),(z))
#endif

#define LOAD_LOCAL(col_gl, col_lcl) \
    sum0 =     co3* SRC(col_gl, EXTRAPOLATE_(src_y - 2, src_rows));         \
    sum0 = MAD(co2, SRC(col_gl, EXTRAPOLATE_(src_y - 1, src_rows)), sum0);  \
    temp = SRC(col_gl, EXTRAPOLATE_(src_y, src_rows));                      \
    sum0 = MAD(co1, temp, sum0);                                            \
    sum1 = co3 * temp;                                                      \
    temp = SRC(col_gl, EXTRAPOLATE_(src_y + 1, src_rows));                  \
    sum0 = MAD(co2, temp, sum0);                                            \
    sum1 = MAD(co2, temp, sum1);                                            \
    temp = SRC(col_gl, EXTRAPOLATE_(src_y + 2, src_rows));                  \
    sum0 = MAD(co3, temp, sum0);                                            \
    sum1 = MAD(co1, temp, sum1);                                            \
    smem[0][col_lcl] = sum0;                                                \
    sum1 = MAD(co2, SRC(col_gl, EXTRAPOLATE_(src_y + 3, src_rows)), sum1);  \
    sum1 = MAD(co3, SRC(col_gl, EXTRAPOLATE_(src_y + 4, src_rows)), sum1);  \
    smem[1][col_lcl] = sum1;


#if kercn == 4
#define LOAD_LOCAL4(col_gl, col_lcl) \
    sum40 =     co3* SRC4(col_gl, EXTRAPOLATE_(src_y - 2, src_rows));           \
    sum40 = MAD(co2, SRC4(col_gl, EXTRAPOLATE_(src_y - 1, src_rows)), sum40);   \
    temp4 = SRC4(col_gl,  EXTRAPOLATE_(src_y, src_rows));                       \
    sum40 = MAD(co1, temp4, sum40);                                             \
    sum41 = co3 * temp4;                                                        \
    temp4 = SRC4(col_gl,  EXTRAPOLATE_(src_y + 1, src_rows));                   \
    sum40 = MAD(co2, temp4, sum40);                                             \
    sum41 = MAD(co2, temp4, sum41);                                             \
    temp4 = SRC4(col_gl,  EXTRAPOLATE_(src_y + 2, src_rows));                   \
    sum40 = MAD(co3, temp4, sum40);                                             \
    sum41 = MAD(co1, temp4, sum41);                                             \
    vstore4(sum40, col_lcl, (__local float*) &smem[0][2]);                      \
    sum41 = MAD(co2, SRC4(col_gl,  EXTRAPOLATE_(src_y + 3, src_rows)), sum41);  \
    sum41 = MAD(co3, SRC4(col_gl,  EXTRAPOLATE_(src_y + 4, src_rows)), sum41);  \
    vstore4(sum41, col_lcl, (__local float*) &smem[1][2]);
#endif

#define noconvert

__kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
                         __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)
{
    const int x = get_global_id(0)*kercn;
    const int y = 2*get_global_id(1);

    __local FT smem[2][LOCAL_SIZE + 4];
    __global uchar * dstData = dst + dst_offset;
    __global const uchar * srcData = src + src_offset;

    FT sum0, sum1, temp;
    FT co1 = 0.375f;
    FT co2 = 0.25f;
    FT co3 = 0.0625f;

    const int src_y = 2*y;
    int col;

    if (src_y >= 2 && src_y < src_rows - 4)
    {
#define EXTRAPOLATE_(val, maxVal)   val
#if kercn == 1
        col = EXTRAPOLATE(x, src_cols);
        LOAD_LOCAL(col, 2 + get_local_id(0))
#else
        if (x < src_cols-4)
        {
            float4 sum40, sum41, temp4;
            LOAD_LOCAL4(x, get_local_id(0))
        }
        else
        {
            for (int i=0; i<4; i++)
            {
                col = EXTRAPOLATE(x+i, src_cols);
                LOAD_LOCAL(col, 2 + 4 * get_local_id(0) + i)
            }
        }
#endif
        if (get_local_id(0) < 2)
        {
            col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
            LOAD_LOCAL(col, get_local_id(0))
        }
        else if (get_local_id(0) < 4)
        {
            col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
            LOAD_LOCAL(col, LOCAL_SIZE + get_local_id(0))
        }
    }
    else // need extrapolate y
    {
#define EXTRAPOLATE_(val, maxVal)   EXTRAPOLATE(val, maxVal)
#if kercn == 1
        col = EXTRAPOLATE(x, src_cols);
        LOAD_LOCAL(col, 2 + get_local_id(0))
#else
        if (x < src_cols-4)
        {
            float4 sum40, sum41, temp4;
            LOAD_LOCAL4(x, get_local_id(0))
        }
        else
        {
            for (int i=0; i<4; i++)
            {
                col = EXTRAPOLATE(x+i, src_cols);
                LOAD_LOCAL(col, 2 + 4*get_local_id(0) + i)
            }
        }
#endif
        if (get_local_id(0) < 2)
        {
            col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
            LOAD_LOCAL(col, get_local_id(0))
        }
        else if (get_local_id(0) < 4)
        {
            col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
            LOAD_LOCAL(col, LOCAL_SIZE + get_local_id(0))
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

#if kercn == 1
    if (get_local_id(0) < LOCAL_SIZE / 2)
    {
        const int tid2 = get_local_id(0) * 2;

        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;

        if (dst_x < dst_cols)
        {
            for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)
            {
#if cn == 1
#if fdepth <= 5
                FT sum = dot(vload4(0, (__local float*) (&smem) + tid2 + (yin - y) * (LOCAL_SIZE + 4)), (float4)(co3, co2, co1, co2));
#else
                FT sum = dot(vload4(0, (__local double*) (&smem) + tid2 + (yin - y) * (LOCAL_SIZE + 4)), (double4)(co3, co2, co1, co2));
#endif
#else
                FT sum = co3 * smem[yin - y][2 + tid2 - 2];
                sum = MAD(co2, smem[yin - y][2 + tid2 - 1], sum);
                sum = MAD(co1, smem[yin - y][2 + tid2    ], sum);
                sum = MAD(co2, smem[yin - y][2 + tid2 + 1], sum);
#endif
                sum = MAD(co3, smem[yin - y][2 + tid2 + 2], sum);
                storepix(convertToT(sum), dstData + yin * dst_step + dst_x * PIXSIZE);
            }
        }
    }
#else
    int tid4 = get_local_id(0) * 4;
    int dst_x = (get_group_id(0) * LOCAL_SIZE + tid4) / 2;
    if (dst_x < dst_cols - 1)
    {
        for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)
        {

            FT sum =  co3* smem[yin - y][2 + tid4 + 2];
            sum = MAD(co3, smem[yin - y][2 + tid4 - 2], sum);
            sum = MAD(co2, smem[yin - y][2 + tid4 - 1], sum);
            sum = MAD(co1, smem[yin - y][2 + tid4    ], sum);
            sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);
            storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));

            dst_x ++;
            sum =     co3* smem[yin - y][2 + tid4 + 4];
            sum = MAD(co3, smem[yin - y][2 + tid4    ], sum);
            sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);
            sum = MAD(co1, smem[yin - y][2 + tid4 + 2], sum);
            sum = MAD(co2, smem[yin - y][2 + tid4 + 3], sum);
            storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
            dst_x --;
        }

    }
    else if (dst_x < dst_cols)
    {
        for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)
        {
            FT sum =  co3* smem[yin - y][2 + tid4 + 2];
            sum = MAD(co3, smem[yin - y][2 + tid4 - 2], sum);
            sum = MAD(co2, smem[yin - y][2 + tid4 - 1], sum);
            sum = MAD(co1, smem[yin - y][2 + tid4    ], sum);
            sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);

            storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
        }
    }
#endif

}