/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010,2014, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//    Jia Haipeng, jiahaipeng95@gmail.com
//    Peng Xiao,   pengxiao@outlook.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if defined (DOUBLE_SUPPORT)
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#endif

#ifdef T_FLOAT
#define T float
#define T4 float4
#else
#define T short
#define T4 short4
#endif

///////////////////////////////////////////////////////////////
/////////////////common///////////////////////////////////////
/////////////////////////////////////////////////////////////
T saturate_cast(float v){
#ifdef T_SHORT
    return convert_short_sat_rte(v);
#else
    return v;
#endif
}

T4 saturate_cast4(float4 v){
#ifdef T_SHORT
    return convert_short4_sat_rte(v);
#else
    return v;
#endif
}

#define FLOAT_MAX 3.402823466e+38f
typedef struct
{
    int   cndisp;
    float cmax_data_term;
    float cdata_weight;
    float cmax_disc_term;
    float cdisc_single_jump;
}con_srtuct_t;
///////////////////////////////////////////////////////////////
////////////////////////// comp data //////////////////////////
///////////////////////////////////////////////////////////////

float pix_diff_1(const uchar4 l, __global const uchar *rs)
{
    return abs((int)(l.x) - *rs);
}

float pix_diff_4(const uchar4 l, __global const uchar *rs)
{
    uchar4 r;
    r = *((__global uchar4 *)rs);

    const float tr = 0.299f;
    const float tg = 0.587f;
    const float tb = 0.114f;

    float val;

    val  = tb * abs((int)l.x - r.x);
    val += tg * abs((int)l.y - r.y);
    val += tr * abs((int)l.z - r.z);

    return val;
}

float pix_diff_3(const uchar4 l, __global const uchar *rs)
{
    return pix_diff_4(l, rs);
}

#ifndef CN
#define CN 4
#endif

#ifndef CNDISP
#define CNDISP 64
#endif

#define CAT(X,Y) X##Y
#define CAT2(X,Y) CAT(X,Y)

#define PIX_DIFF CAT2(pix_diff_, CN)

__kernel void comp_data(__global uchar *left,  int left_rows,  int left_cols,  int left_step,
                        __global uchar *right, int right_step,
                        __global T *data, int data_step,
                        __constant con_srtuct_t *con_st)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (y > 0 && y < (left_rows - 1) && x > 0 && x < (left_cols - 1))
    {
        data_step /= sizeof(T);
        const __global uchar* ls = left  + y * left_step  + x * CN;
        const __global uchar* rs = right + y * right_step + x * CN;

        __global T *ds = data + y * data_step + x;

        const unsigned int disp_step = data_step * left_rows;
        const float weightXterm = con_st -> cdata_weight * con_st -> cmax_data_term;
        const uchar4 ls_data = vload4(0, ls);

        for (int disp = 0; disp < con_st -> cndisp; disp++)
        {
            if (x - disp >= 1)
            {
                float val = 0;
                val = PIX_DIFF(ls_data, rs - disp * CN);
                ds[disp * disp_step] =  saturate_cast(fmin(con_st -> cdata_weight * val, weightXterm));
            }
            else
            {
                ds[disp * disp_step] =  saturate_cast(weightXterm);
            }
        }
    }
}

///////////////////////////////////////////////////////////////
//////////////////////// data step down ///////////////////////
///////////////////////////////////////////////////////////////
__kernel void data_step_down(__global T *src, int src_rows,
                             __global T *dst, int dst_rows, int dst_cols,
                             int src_step, int dst_step,
                             int cndisp)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (x < dst_cols && y < dst_rows)
    {
        src_step /= sizeof(T);
        dst_step /= sizeof(T);
        int4 coor_step = (int4)(src_rows * src_step);
        int4 coor = (int4)(min(2*y+0, src_rows-1) * src_step + 2*x+0,
                           min(2*y+1, src_rows-1) * src_step + 2*x+0,
                           min(2*y+0, src_rows-1) * src_step + 2*x+1,
                           min(2*y+1, src_rows-1) * src_step + 2*x+1);

        for (int d = 0; d < cndisp; ++d)
        {
            float dst_reg;
            dst_reg  = src[coor.x];
            dst_reg += src[coor.y];
            dst_reg += src[coor.z];
            dst_reg += src[coor.w];
            coor += coor_step;

            dst[(d * dst_rows + y) * dst_step + x] = saturate_cast(dst_reg);
        }
    }
}

///////////////////////////////////////////////////////////////
/////////////////// level up messages  ////////////////////////
///////////////////////////////////////////////////////////////
__kernel void level_up_message(__global T *src, int src_rows, int src_step,
                               __global T *dst, int dst_rows, int dst_cols, int dst_step,
                               int cndisp)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (x < dst_cols && y < dst_rows)
    {
        src_step /= sizeof(T);
        dst_step /= sizeof(T);

        const int dst_disp_step = dst_step * dst_rows;
        const int src_disp_step = src_step * src_rows;

        __global T       *dstr = dst + y * dst_step + x;
        __global const T *srcr = src + (y / 2 * src_step) + (x / 2);

        for (int d = 0; d < cndisp; ++d)
            dstr[d * dst_disp_step] = srcr[d * src_disp_step];
    }
}

///////////////////////////////////////////////////////////////
////////////////////  calc all iterations /////////////////////
///////////////////////////////////////////////////////////////
void message(__global T *us_, __global T *ds_, __global T *ls_, __global T *rs_,
              const __global T *dt,
              int u_step, int msg_disp_step, int data_disp_step,
              float4 cmax_disc_term, float4 cdisc_single_jump)
{
    __global T *us = us_ + u_step;
    __global T *ds = ds_ - u_step;
    __global T *ls = ls_ + 1;
    __global T *rs = rs_ - 1;

    float4 minimum = (float4)(FLOAT_MAX);

    T4 t_dst[CNDISP];
    float4 dst_reg;
    float4 prev;
    float4 cur;

    T t_us = us[0];
    T t_ds = ds[0];
    T t_ls = ls[0];
    T t_rs = rs[0];
    T t_dt = dt[0];

    prev = (float4)(t_us + t_ls + t_rs + t_dt,
                    t_ds + t_ls + t_rs + t_dt,
                    t_us + t_ds + t_rs + t_dt,
                    t_us + t_ds + t_ls + t_dt);

    minimum = min(prev, minimum);

    t_dst[0] = saturate_cast4(prev);

    for(int i = 1, idx = msg_disp_step; i < CNDISP; ++i, idx+=msg_disp_step)
    {
        t_us = us[idx];
        t_ds = ds[idx];
        t_ls = ls[idx];
        t_rs = rs[idx];
        t_dt = dt[data_disp_step * i];

        dst_reg = (float4)(t_us + t_ls + t_rs + t_dt,
                           t_ds + t_ls + t_rs + t_dt,
                           t_us + t_ds + t_rs + t_dt,
                           t_us + t_ds + t_ls + t_dt);

        minimum = min(dst_reg, minimum);

        prev += cdisc_single_jump;
        prev = min(prev, dst_reg);

        t_dst[i] = saturate_cast4(prev);
    }

    minimum += cmax_disc_term;

    float4 sum = (float4)(0);
    prev = convert_float4(t_dst[CNDISP - 1]);
    for (int disp = CNDISP - 2; disp >= 0; disp--)
    {
        prev += cdisc_single_jump;
        cur = convert_float4(t_dst[disp]);
        prev = min(prev, cur);
        cur = min(prev, minimum);
        sum += cur;

        t_dst[disp] = saturate_cast4(cur);
    }

    dst_reg = convert_float4(t_dst[CNDISP - 1]);
    dst_reg = min(dst_reg, minimum);
    t_dst[CNDISP - 1] = saturate_cast4(dst_reg);
    sum += dst_reg;

    sum /= (float4)(CNDISP);
#pragma unroll
    for(int i = 0, idx = 0; i < CNDISP; ++i, idx+=msg_disp_step)
    {
        T4 dst = t_dst[i];
        us_[idx] = dst.x - sum.x;
        ds_[idx] = dst.y - sum.y;
        rs_[idx] = dst.z - sum.z;
        ls_[idx] = dst.w - sum.w;
    }
}
__kernel void one_iteration(__global T *u,    int u_step,
                            __global T *data, int data_step,
                            __global T *d,    __global T *l, __global T *r,
                            int t, int cols, int rows,
                            float cmax_disc_term, float cdisc_single_jump)
{
    const int y = get_global_id(1);
    const int x = ((get_global_id(0)) << 1) + ((y + t) & 1);

    if ((y > 0) && (y < rows - 1) && (x > 0) && (x < cols - 1))
    {
        u_step    /= sizeof(T);
        data_step /= sizeof(T);

        __global T *us = u + y * u_step + x;
        __global T *ds = d + y * u_step + x;
        __global T *ls = l + y * u_step + x;
        __global T *rs = r + y * u_step + x;
        const __global  T *dt = data + y * data_step + x;

        int msg_disp_step = u_step * rows;
        int data_disp_step = data_step * rows;

        message(us, ds, ls, rs, dt,
                u_step, msg_disp_step, data_disp_step,
                (float4)(cmax_disc_term), (float4)(cdisc_single_jump));
    }
}

///////////////////////////////////////////////////////////////
/////////////////////////// output ////////////////////////////
///////////////////////////////////////////////////////////////
__kernel void output(const __global T *u, int u_step,
                     const __global T *d, const __global T *l,
                     const __global T *r, const __global T *data,
                     __global T *disp, int disp_rows, int disp_cols, int disp_step,
                     int cndisp)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if (y > 0 && y < disp_rows - 1 && x > 0 && x < disp_cols - 1)
    {
        u_step    /= sizeof(T);
        disp_step /= sizeof(T);
        const __global T *us = u + (y + 1) * u_step + x;
        const __global T *ds = d + (y - 1) * u_step + x;
        const __global T *ls = l + y * u_step + (x + 1);
        const __global T *rs = r + y * u_step + (x - 1);
        const __global T *dt = data + y * u_step + x;

        int disp_steps = disp_rows * u_step;

        int best = 0;
        float best_val = FLOAT_MAX;
        for (int d = 0; d < cndisp; ++d)
        {
            float val;
            val  = us[d * disp_steps];
            val += ds[d * disp_steps];
            val += ls[d * disp_steps];
            val += rs[d * disp_steps];
            val += dt[d * disp_steps];

            if (val < best_val)
            {
                best_val = val;
                best = d;
            }
        }

        (disp + y * disp_step)[x] = convert_short_sat(best);
    }
}