//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//    Niko Li, newlife20080214@gmail.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//

// Double-precision support is opt-in: nothing is enabled unless DOUBLE_SUPPORT
// is defined (it is not defined in this file, so it is expected to come from
// the build options). When it is, the AMD fp64 extension is enabled if its
// macro is defined; otherwise the Khronos fp64 extension is enabled if its
// macro is defined. If neither macro is defined, no extension is enabled.
#ifdef DOUBLE_SUPPORT
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#endif

// Regroups packed 3-component data into 4-component pixels.
//
// Each work-item loads three consecutive GENTYPE4 vectors from src (twelve
// scalars) and rearranges them into four GENTYPE4 outputs: every output takes
// three consecutive input scalars and gets 0 as its fourth component.
//
//   src              - input buffer, indexed linearly in GENTYPE4 units
//   dst              - output buffer, indexed as row * dstStep_in_piexl + col
//   cols, rows       - extent used to map an output pixel number to (col, row)
//                      and to decide how many of the four outputs are stored
//   dstStep_in_piexl - multiplier applied to the row when addressing dst
//                      (presumably the dst row pitch in GENTYPE4 units)
//   pixel_end        - upper clamp for the src index (presumably the last
//                      valid GENTYPE4 index in src; confirm against the host)
//
// NOTE(review): mul24/mad24 are 24-bit multiplies; confirm that the work-item
// id, row index and step stay within that range for the largest images used.
__kernel void convertC3C4(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst,
                         int cols, int rows,
                         int dstStep_in_piexl, int pixel_end)
{
    const int gid = get_global_id(0);

    // Source vectors 3*gid, 3*gid+1 and 3*gid+2, clamped into [0, pixel_end]
    // so that every work-item reads from a bounded index.
    const int3 srcIdx = clamp((int3)(mul24(gid, 3), mad24(gid, 3, 1), mad24(gid, 3, 2)),
                              0, pixel_end);

    const GENTYPE4 in0 = src[srcIdx.x];
    const GENTYPE4 in1 = src[srcIdx.y];
    const GENTYPE4 in2 = src[srcIdx.z];

    // Twelve input scalars -> four outputs of three scalars plus a zero.
    const GENTYPE4 out0 = (GENTYPE4)(in0.x, in0.y, in0.z, 0);
    const GENTYPE4 out1 = (GENTYPE4)(in0.w, in1.x, in1.y, 0);
    const GENTYPE4 out2 = (GENTYPE4)(in1.z, in1.w, in2.x, 0);
    const GENTYPE4 out3 = (GENTYPE4)(in2.y, in2.z, in2.w, 0);

    // This work-item produces output pixels 4*gid .. 4*gid+3.
    const int firstPixel = gid << 2;
    int4 row = firstPixel / cols;
    int4 col = firstPixel % cols;
    col += (int4)(0, 1, 2, 3);

    // Carry columns that ran past the end of a row into the following row(s).
    // One pass is enough for wide rows; narrow images need more, up to three
    // passes when cols == 1.
    for (int pass = 0; pass < 3; ++pass)
    {
        const int4 overflow = col >= cols;
        row = select(row, row + 1, overflow);
        col = select(col, col - cols, overflow);
    }

    const int4 dstIdx = mad24(row, (int4)dstStep_in_piexl, col);

    // Number of leading outputs to store: the highest-numbered pixel that is
    // still inside the cols x rows extent determines the count.
    int storeCount = 0;
    if (col.w < cols && row.w < rows)
        storeCount = 4;
    else if (col.z < cols && row.z < rows)
        storeCount = 3;
    else if (col.y < cols && row.y < rows)
        storeCount = 2;
    else if (col.x < cols && row.x < rows)
        storeCount = 1;

    if (storeCount > 0)
        dst[dstIdx.x] = out0;
    if (storeCount > 1)
        dst[dstIdx.y] = out1;
    if (storeCount > 2)
        dst[dstIdx.z] = out2;
    if (storeCount > 3)
        dst[dstIdx.w] = out3;
}

// Packs 4-component pixels into contiguous 3-component data.
//
// Each work-item loads four GENTYPE4 pixels from src, drops the fourth
// component of each, and stores the remaining twelve scalars as three
// consecutive GENTYPE4 vectors in dst.
//
//   src              - input buffer, indexed as row * srcStep_in_pixel + col
//   dst              - output buffer, indexed linearly in GENTYPE4 units
//   cols, rows       - extent used to map an input pixel number to (col, row);
//                      the row is clamped to rows - 1 before reading
//   srcStep_in_pixel - multiplier applied to the row when addressing src
//                      (presumably the src row pitch in GENTYPE4 units)
//   pixel_end        - largest dst index that may be written
//
// NOTE(review): mul24/mad24 are 24-bit multiplies; confirm that the work-item
// id, row index and step stay within that range for the largest images used.
__kernel void convertC4C3(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst,
                          int cols, int rows,
                          int srcStep_in_pixel, int pixel_end)
{
    // This work-item consumes input pixels firstPixel .. firstPixel+3.
    const int firstPixel = get_global_id(0) << 2;
    const int row0 = firstPixel / cols;
    const int col0 = firstPixel % cols;

    int4 row = (int4)row0;
    int4 col = (int4)(col0, col0 + 1, col0 + 2, col0 + 3);

    // Carry columns that ran past the end of a row into the following row(s).
    // One pass is enough for wide rows; narrow images need more, up to three
    // passes when cols == 1.
    for (int pass = 0; pass < 3; ++pass)
    {
        const int4 overflow = col >= (int4)cols;
        row = select(row, row + 1, overflow);
        col = select(col, col - (int4)cols, overflow);
    }

    // Keep reads of pixels past the last row inside the image.
    row = clamp(row, (int4)0, (int4)(rows - 1));
    const int4 srcIdx = mad24(row, (int4)srcStep_in_pixel, col);

    const GENTYPE4 in0 = src[srcIdx.x];
    const GENTYPE4 in1 = src[srcIdx.y];
    const GENTYPE4 in2 = src[srcIdx.z];
    const GENTYPE4 in3 = src[srcIdx.w];

    // Concatenate the x/y/z components of the four pixels into three vectors.
    const GENTYPE4 out0 = (GENTYPE4)(in0.x, in0.y, in0.z, in1.x);
    const GENTYPE4 out1 = (GENTYPE4)(in1.y, in1.z, in2.x, in2.y);
    const GENTYPE4 out2 = (GENTYPE4)(in2.z, in3.x, in3.y, in3.z);

    // Destination vectors 3*k, 3*k+1 and 3*k+2, where k is firstPixel / 4.
    const int dst0 = mul24(firstPixel >> 2, 3);
    const int dst1 = dst0 + 1;
    const int dst2 = dst0 + 2;

    // Number of leading vectors to store without passing pixel_end.
    int storeCount = 0;
    if (dst2 <= pixel_end)
        storeCount = 3;
    else if (dst1 <= pixel_end)
        storeCount = 2;
    else if (dst0 <= pixel_end)
        storeCount = 1;

    if (storeCount > 0)
        dst[dst0] = out0;
    if (storeCount > 1)
        dst[dst1] = out1;
    if (storeCount > 2)
        dst[dst2] = out2;
}