/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//    Jin Ma, jin@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifdef DOUBLE_SUPPORT
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#define TYPE double
#else
#define TYPE float
#endif

#define CV_SWAP(a,b,t) ((t) = (a), (a) = (b), (b) = (t))
///////////////////////////////////// find_nearest //////////////////////////////////////
__kernel void knn_find_nearest(__global float* sample, int sample_row, int sample_col, int sample_step,
                               int k, __global float* samples_ocl, int sample_ocl_row, int sample_ocl_step,
                               __global float* _results, int _results_step, int _regression, int K1,
                               int sample_ocl_col, int nThreads, __local float* nr)
{
    int k1 = 0;
    int k2 = 0;

    bool regression = false;

    if(_regression)
        regression = true;

    TYPE inv_scale;
#ifdef DOUBLE_SUPPORT
    inv_scale = 1.0/K1;
#else
    inv_scale = 1.0f/K1;
#endif

    int y = get_global_id(1);
    int j, j1;
    int threadY = (y % nThreads);
    __local float* dd = nr + nThreads * k;
    if(y >= sample_row)
    {
        return;
    }
    for(j = 0; j < sample_ocl_row; j++)
    {
        TYPE sum;
#ifdef DOUBLE_SUPPORT
        sum = 0.0;
#else
        sum = 0.0f;
#endif
        float si;
        int t, ii, ii1;
        for(t = 0; t < sample_col - 16; t += 16)
        {
            float16 t0 = vload16(0, sample + y * sample_step + t) - vload16(0, samples_ocl + j * sample_ocl_step + t);
            t0 *= t0;
            sum += t0.s0 + t0.s1 + t0.s2 + t0.s3 + t0.s4 + t0.s5 + t0.s6 + t0.s7 +
                t0.s8 + t0.s9 + t0.sa + t0.sb + t0.sc + t0.sd + t0.se + t0.sf;
        }

        for(; t < sample_col; t++)
        {
#ifdef DOUBLE_SUPPORT
            double t0 = sample[y * sample_step + t] - samples_ocl[j * sample_ocl_step + t];
#else
            float t0 = sample[y * sample_step + t] - samples_ocl[j * sample_ocl_step + t];
#endif
            sum = sum + t0 * t0;
        }

        si = (float)sum;
        for(ii = k1 - 1; ii >= 0; ii--)
        {
            if(as_int(si) > as_int(dd[ii * nThreads + threadY]))
                break;
        }
        if(ii < k - 1)
        {
            for(ii1 = k2 - 1; ii1 > ii; ii1--)
            {
                dd[(ii1 + 1) * nThreads + threadY] = dd[ii1 * nThreads + threadY];
                nr[(ii1 + 1) * nThreads + threadY] = nr[ii1 * nThreads + threadY];
            }

            dd[(ii + 1) * nThreads + threadY] = si;
            nr[(ii + 1) * nThreads + threadY] = samples_ocl[sample_col + j * sample_ocl_step];
        }
        k1 = (k1 + 1) < k ? (k1 + 1) : k;
        k2 = k1 < (k - 1) ? k1 : (k - 1);
    }
    /*! find_nearest_neighbor done!*/
    /*! write_results start!*/
    if (regression)
    {
        TYPE s;
#ifdef DOUBLE_SUPPORT
        s = 0.0;
#else
        s = 0.0f;
#endif
        for(j = 0; j < K1; j++)
            s += nr[j * nThreads + threadY];

        _results[y * _results_step] = (float)(s * inv_scale);
    }
    else
    {
        int prev_start = 0, best_count = 0, cur_count;
        float best_val;

        for(j = K1 - 1; j > 0; j--)
        {
            bool swap_f1 = false;
            for(j1 = 0; j1 < j; j1++)
            {
                if(nr[j1 * nThreads + threadY] > nr[(j1 + 1) * nThreads + threadY])
                {
                    int t;
                    CV_SWAP(nr[j1 * nThreads + threadY], nr[(j1 + 1) * nThreads + threadY], t);
                    swap_f1 = true;
                }
            }
            if(!swap_f1)
                break;
        }

        best_val = 0;
        for(j = 1; j <= K1; j++)
            if(j == K1 || nr[j * nThreads + threadY] != nr[(j - 1) * nThreads + threadY])
            {
                cur_count = j - prev_start;
                if(best_count < cur_count)
                {
                    best_count = cur_count;
                    best_val = nr[(j - 1) * nThreads + threadY];
                }
                prev_start = j;
            }
            _results[y * _results_step] = best_val;
    }
    ///*! write_results done!*/
}