/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//    Jin Ma,  jin@multicorewareinc.com
//    Sen Liu, swjtuls1987@126.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifdef DOUBLE_SUPPORT
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
typedef double T;
#else
typedef long T;
#endif

#define DST_ROW_00     0
#define DST_ROW_10     1
#define DST_ROW_01     2
#define DST_ROW_20     3
#define DST_ROW_11     4
#define DST_ROW_02     5
#define DST_ROW_30     6
#define DST_ROW_21     7
#define DST_ROW_12     8
#define DST_ROW_03     9

__kernel void icvContourMoments(int contour_total,
                                __global float* reader_oclmat_data,
                                __global T* dst_a,
                                int dst_step)
{
    T xi_1, yi_1, xi_12, yi_12, xi, yi, xi2, yi2, dxy, xii_1, yii_1;
    int idx = get_global_id(0);

    if (idx < 0 || idx >= contour_total)
        return;

    xi_1 = (T)(*(reader_oclmat_data + (get_global_id(0) << 1)));
    yi_1 = (T)(*(reader_oclmat_data + (get_global_id(0) << 1) + 1));
    xi_12 = xi_1 * xi_1;
    yi_12 = yi_1 * yi_1;

    if(idx == contour_total - 1)
    {
        xi = (T)(*(reader_oclmat_data));
        yi = (T)(*(reader_oclmat_data + 1));
    }
    else
    {
        xi = (T)(*(reader_oclmat_data + (idx + 1) * 2));
        yi = (T)(*(reader_oclmat_data + (idx + 1) * 2 + 1));
    }
    xi2 = xi * xi;
    yi2 = yi * yi;
    dxy = xi_1 * yi - xi * yi_1;
    xii_1 = xi_1 + xi;
    yii_1 = yi_1 + yi;

    dst_step /= sizeof(T);
    *( dst_a + DST_ROW_00 * dst_step + idx) = dxy;
    *( dst_a + DST_ROW_10 * dst_step + idx) = dxy * xii_1;
    *( dst_a + DST_ROW_01 * dst_step + idx) = dxy * yii_1;
    *( dst_a + DST_ROW_20 * dst_step + idx) = dxy * (xi_1 * xii_1 + xi2);
    *( dst_a + DST_ROW_11 * dst_step + idx) = dxy * (xi_1 * (yii_1 + yi_1) + xi * (yii_1 + yi));
    *( dst_a + DST_ROW_02 * dst_step + idx) = dxy * (yi_1 * yii_1 + yi2);
    *( dst_a + DST_ROW_30 * dst_step + idx) = dxy * xii_1 * (xi_12 + xi2);
    *( dst_a + DST_ROW_03 * dst_step + idx) = dxy * yii_1 * (yi_12 + yi2);
    *( dst_a + DST_ROW_21 * dst_step + idx) =
        dxy * (xi_12 * (3 * yi_1 + yi) + 2 * xi * xi_1 * yii_1 +
        xi2 * (yi_1 + 3 * yi));
    *( dst_a + DST_ROW_12 * dst_step + idx) =
        dxy * (yi_12 * (3 * xi_1 + xi) + 2 * yi * yi_1 * xii_1 +
        yi2 * (xi_1 + 3 * xi));
}

#if defined (DOUBLE_SUPPORT)
#define WT double
#define WT4 double4
#define convert_T4 convert_double4
#define convert_T convert_double
#else
#define WT float
#define WT4 float4
#define convert_T4 convert_float4
#define convert_T convert_float
#endif

#ifdef CV_8UC1
#define TT uchar
#elif defined CV_16UC1
#define TT ushort
#elif defined CV_16SC1
#define TT short
#elif defined CV_32FC1
#define TT float
#elif defined CV_64FC1
#ifdef DOUBLE_SUPPORT
#define TT double
#else
#define TT float
#endif
#endif
__kernel void CvMoments(__global TT* src_data, int src_rows, int src_cols, int src_step,
                        __global WT* dst_m,
                        int dst_cols, int dst_step, int binary)
{
    int dy = get_global_id(1);
    int ly = get_local_id(1);
    int gidx = get_group_id(0);
    int gidy = get_group_id(1);
    int x_rest = src_cols % 256;
    int y_rest = src_rows % 256;
    __local int codxy[256];
    codxy[ly] = ly;
    barrier(CLK_LOCAL_MEM_FENCE);

    WT4 x0 = (WT4)(0.f);
    WT4 x1 = (WT4)(0.f);
    WT4 x2 = (WT4)(0.f);
    WT4 x3 = (WT4)(0.f);

    __global TT* row = src_data + gidy * src_step + ly * src_step + gidx * 256;

    WT4 p;
    WT4 x;
    WT4 xp;
    WT4 xxp;

    WT py = 0.f, sy = 0.f;

    if(dy < src_rows)
    {
        if((x_rest > 0) && (gidx == ((int)get_num_groups(0) - 1)))
        {
            int i;
            for(i = 0; i < x_rest - 4; i += 4)
            {
                p = convert_T4(vload4(0, row + i));
                x = convert_T4(vload4(0, codxy + i));
                xp = x * p;
                xxp = xp * x;

                x0 += p;
                x1 += xp;
                x2 += xxp;
                x3 += convert_T4(xxp * x);
            }

            x0.s0 = x0.s0 + x0.s1 + x0.s2 + x0.s3;
            x1.s0 = x1.s0 + x1.s1 + x1.s2 + x1.s3;
            x2.s0 = x2.s0 + x2.s1 + x2.s2 + x2.s3;
            x3.s0 = x3.s0 + x3.s1 + x3.s2 + x3.s3;

            WT x0_ = 0;
            WT x1_ = 0;
            WT x2_ = 0;
            WT x3_ = 0;

            for(; i < x_rest; i++)
            {
                WT p_ = 0;
                p_ = row[i];
                WT x_ = convert_T(codxy[i]);


                WT xp_ = x_ * p_;
                WT xxp_ = xp_ * x_;

                x0_ += p_;
                x1_ += xp_;
                x2_ += xxp_;
                x3_ += xxp_ * x_;
            }

            x0.s0 += x0_;
            x1.s0 += x1_;
            x2.s0 += x2_;
            x3.s0 += x3_;
        }else
        {
            for(int i = 0; i < 256; i += 4)
            {
                p = convert_T4(vload4(0, row + i));
                x = convert_T4(vload4(0, codxy + i));
                xp = x * p;
                xxp = xp * x;

                x0 += p;
                x1 += xp;
                x2 += xxp;
                x3 += convert_T4(xxp * x);
            }

            x0.s0 = x0.s0 + x0.s1 + x0.s2 + x0.s3;
            x1.s0 = x1.s0 + x1.s1 + x1.s2 + x1.s3;
            x2.s0 = x2.s0 + x2.s1 + x2.s2 + x2.s3;
            x3.s0 = x3.s0 + x3.s1 + x3.s2 + x3.s3;
        }

        py = ly * x0.s0;
        sy = ly * ly;
    }
    __local WT mom[10][256];

    if((y_rest > 0) && (gidy == ((int)get_num_groups(1) - 1)))
    {
        if(ly < y_rest)
        {
            mom[9][ly] = py * sy;
            mom[8][ly] = x1.s0 * sy;
            mom[7][ly] = x2.s0 * ly;
            mom[6][ly] = x3.s0;
            mom[5][ly] = x0.s0 * sy;
            mom[4][ly] = x1.s0 * ly;
            mom[3][ly] = x2.s0;
            mom[2][ly] = py;
            mom[1][ly] = x1.s0;
            mom[0][ly] = x0.s0;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        if(ly < 10)
            for(int i = 1; i < y_rest; i++)
                mom[ly][0] = mom[ly][i] + mom[ly][0];
    }
    else
    {
        mom[9][ly] = py * sy;
        mom[8][ly] = x1.s0 * sy;
        mom[7][ly] = x2.s0 * ly;
        mom[6][ly] = x3.s0;
        mom[5][ly] = x0.s0 * sy;
        mom[4][ly] = x1.s0 * ly;
        mom[3][ly] = x2.s0;
        mom[2][ly] = py;
        mom[1][ly] = x1.s0;
        mom[0][ly] = x0.s0;

        barrier(CLK_LOCAL_MEM_FENCE);

        if(ly < 128)
        {
            mom[0][ly] = mom[0][ly] + mom[0][ly + 128];
            mom[1][ly] = mom[1][ly] + mom[1][ly + 128];
            mom[2][ly] = mom[2][ly] + mom[2][ly + 128];
            mom[3][ly] = mom[3][ly] + mom[3][ly + 128];
            mom[4][ly] = mom[4][ly] + mom[4][ly + 128];
            mom[5][ly] = mom[5][ly] + mom[5][ly + 128];
            mom[6][ly] = mom[6][ly] + mom[6][ly + 128];
            mom[7][ly] = mom[7][ly] + mom[7][ly + 128];
            mom[8][ly] = mom[8][ly] + mom[8][ly + 128];
            mom[9][ly] = mom[9][ly] + mom[9][ly + 128];
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        if(ly < 64)
        {
            mom[0][ly] = mom[0][ly] + mom[0][ly + 64];
            mom[1][ly] = mom[1][ly] + mom[1][ly + 64];
            mom[2][ly] = mom[2][ly] + mom[2][ly + 64];
            mom[3][ly] = mom[3][ly] + mom[3][ly + 64];
            mom[4][ly] = mom[4][ly] + mom[4][ly + 64];
            mom[5][ly] = mom[5][ly] + mom[5][ly + 64];
            mom[6][ly] = mom[6][ly] + mom[6][ly + 64];
            mom[7][ly] = mom[7][ly] + mom[7][ly + 64];
            mom[8][ly] = mom[8][ly] + mom[8][ly + 64];
            mom[9][ly] = mom[9][ly] + mom[9][ly + 64];
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        if(ly < 32)
        {
            mom[0][ly] = mom[0][ly] + mom[0][ly + 32];
            mom[1][ly] = mom[1][ly] + mom[1][ly + 32];
            mom[2][ly] = mom[2][ly] + mom[2][ly + 32];
            mom[3][ly] = mom[3][ly] + mom[3][ly + 32];
            mom[4][ly] = mom[4][ly] + mom[4][ly + 32];
            mom[5][ly] = mom[5][ly] + mom[5][ly + 32];
            mom[6][ly] = mom[6][ly] + mom[6][ly + 32];
            mom[7][ly] = mom[7][ly] + mom[7][ly + 32];
            mom[8][ly] = mom[8][ly] + mom[8][ly + 32];
            mom[9][ly] = mom[9][ly] + mom[9][ly + 32];
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        if(ly < 16)
        {
            mom[0][ly] = mom[0][ly] + mom[0][ly + 16];
            mom[1][ly] = mom[1][ly] + mom[1][ly + 16];
            mom[2][ly] = mom[2][ly] + mom[2][ly + 16];
            mom[3][ly] = mom[3][ly] + mom[3][ly + 16];
            mom[4][ly] = mom[4][ly] + mom[4][ly + 16];
            mom[5][ly] = mom[5][ly] + mom[5][ly + 16];
            mom[6][ly] = mom[6][ly] + mom[6][ly + 16];
            mom[7][ly] = mom[7][ly] + mom[7][ly + 16];
            mom[8][ly] = mom[8][ly] + mom[8][ly + 16];
            mom[9][ly] = mom[9][ly] + mom[9][ly + 16];
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        if(ly < 8)
        {
            mom[0][ly] = mom[0][ly] + mom[0][ly + 8];
            mom[1][ly] = mom[1][ly] + mom[1][ly + 8];
            mom[2][ly] = mom[2][ly] + mom[2][ly + 8];
            mom[3][ly] = mom[3][ly] + mom[3][ly + 8];
            mom[4][ly] = mom[4][ly] + mom[4][ly + 8];
            mom[5][ly] = mom[5][ly] + mom[5][ly + 8];
            mom[6][ly] = mom[6][ly] + mom[6][ly + 8];
            mom[7][ly] = mom[7][ly] + mom[7][ly + 8];
            mom[8][ly] = mom[8][ly] + mom[8][ly + 8];
            mom[9][ly] = mom[9][ly] + mom[9][ly + 8];
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        if(ly < 4)
        {
            mom[0][ly] = mom[0][ly] + mom[0][ly + 4];
            mom[1][ly] = mom[1][ly] + mom[1][ly + 4];
            mom[2][ly] = mom[2][ly] + mom[2][ly + 4];
            mom[3][ly] = mom[3][ly] + mom[3][ly + 4];
            mom[4][ly] = mom[4][ly] + mom[4][ly + 4];
            mom[5][ly] = mom[5][ly] + mom[5][ly + 4];
            mom[6][ly] = mom[6][ly] + mom[6][ly + 4];
            mom[7][ly] = mom[7][ly] + mom[7][ly + 4];
            mom[8][ly] = mom[8][ly] + mom[8][ly + 4];
            mom[9][ly] = mom[9][ly] + mom[9][ly + 4];
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        if(ly < 2)
        {
            mom[0][ly] = mom[0][ly] + mom[0][ly + 2];
            mom[1][ly] = mom[1][ly] + mom[1][ly + 2];
            mom[2][ly] = mom[2][ly] + mom[2][ly + 2];
            mom[3][ly] = mom[3][ly] + mom[3][ly + 2];
            mom[4][ly] = mom[4][ly] + mom[4][ly + 2];
            mom[5][ly] = mom[5][ly] + mom[5][ly + 2];
            mom[6][ly] = mom[6][ly] + mom[6][ly + 2];
            mom[7][ly] = mom[7][ly] + mom[7][ly + 2];
            mom[8][ly] = mom[8][ly] + mom[8][ly + 2];
            mom[9][ly] = mom[9][ly] + mom[9][ly + 2];
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        if(ly < 1)
        {
            mom[0][ly] = mom[0][ly] + mom[0][ly + 1];
            mom[1][ly] = mom[1][ly] + mom[1][ly + 1];
            mom[2][ly] = mom[2][ly] + mom[2][ly + 1];
            mom[3][ly] = mom[3][ly] + mom[3][ly + 1];
            mom[4][ly] = mom[4][ly] + mom[4][ly + 1];
            mom[5][ly] = mom[5][ly] + mom[5][ly + 1];
            mom[6][ly] = mom[6][ly] + mom[6][ly + 1];
            mom[7][ly] = mom[7][ly] + mom[7][ly + 1];
            mom[8][ly] = mom[8][ly] + mom[8][ly + 1];
            mom[9][ly] = mom[9][ly] + mom[9][ly + 1];
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    if(binary)
    {
        WT s = 1.0f/255;
        if(ly < 10)
            mom[ly][0] *= s;
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    WT xm = (gidx * 256) * mom[0][0];
    WT ym = (gidy * 256) * mom[0][0];

    if(ly == 0)
    {
        mom[0][1] = mom[0][0];
        mom[1][1] = mom[1][0] + xm;
        mom[2][1] = mom[2][0] + ym;
        mom[3][1] = mom[3][0] + gidx * 256 * (mom[1][0] * 2 + xm);
        mom[4][1] = mom[4][0] + gidx * 256 * (mom[2][0] + ym) + gidy * 256 * mom[1][0];
        mom[5][1] = mom[5][0] + gidy * 256 * (mom[2][0] * 2 + ym);
        mom[6][1] = mom[6][0] + gidx * 256 * (3 * mom[3][0] + 256 * gidx * (3 * mom[1][0] + xm));
        mom[7][1] = mom[7][0] + gidx * 256 * (2 * (mom[4][0] + 256 * gidy * mom[1][0]) + 256 * gidx * (mom[2][0] + ym)) + 256 * gidy * mom[3][0];
        mom[8][1] = mom[8][0] + gidy * 256 * (2 * (mom[4][0] + 256 * gidx * mom[2][0]) + 256 * gidy * (mom[1][0] + xm)) + 256 * gidx * mom[5][0];
        mom[9][1] = mom[9][0] + gidy * 256 * (3 * mom[5][0] + 256 * gidy * (3 * mom[2][0] + ym));
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    if(ly < 10)
        dst_m[10 * gidy * dst_step + ly * dst_step + gidx] = mom[ly][1];
}