/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */

#include "common.hpp"
#include "vtransform.hpp"

#include <limits>

namespace CAROTENE_NS {

#ifdef CAROTENE_NEON

namespace {

template <typename T>
void minMaxVals(const Size2D &size,
                const T * srcBase, ptrdiff_t srcStride,
                T * pMinVal, T * pMaxVal)
{
    using namespace internal;

    typedef typename VecTraits<T>::vec128 vec128;
    typedef typename VecTraits<T>::vec64 vec64;

    u32 step_base = 32 / sizeof(T), step_tail = 8 / sizeof(T);
    size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
    size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0;

    T maxVal = std::numeric_limits<T>::min();
    T minVal = std::numeric_limits<T>::max();
    vec128 v_min_base = vdupq_n(minVal), v_max_base = vdupq_n(maxVal);
    vec64 v_min_tail = vdup_n(minVal), v_max_tail = vdup_n(maxVal);

    for (size_t i = 0; i < size.height; ++i)
    {
        const T * src = getRowPtr(srcBase, srcStride, i);
        size_t j = 0;

        for (; j < roiw_base; j += step_base)
        {
            prefetch(src + j);
            vec128 v_src0 = vld1q(src + j), v_src1 = vld1q(src + j + 16 / sizeof(T));
            v_min_base = vminq(v_min_base, v_src0);
            v_max_base = vmaxq(v_max_base, v_src0);
            v_min_base = vminq(v_min_base, v_src1);
            v_max_base = vmaxq(v_max_base, v_src1);
        }
        for (; j < roiw_tail; j += step_tail)
        {
            vec64 v_src0 = vld1(src + j);
            v_min_tail = vmin(v_min_tail, v_src0);
            v_max_tail = vmax(v_max_tail, v_src0);
        }

        for (; j < size.width; j++)
        {
            T srcval = src[j];
            minVal = std::min(srcval, minVal);
            maxVal = std::max(srcval, maxVal);
        }
    }

    // collect min & max values
    T ar[16 / sizeof(T)];
    vst1q(ar, vcombine(vmin(v_min_tail, vmin(vget_low(v_min_base), vget_high(v_min_base))),
                       vmax(v_max_tail, vmax(vget_low(v_max_base), vget_high(v_max_base)))));

    for (size_t x = 0; x < 8u / sizeof(T); ++x)
    {
        minVal = std::min(minVal, ar[x]);
        maxVal = std::max(maxVal, ar[x + 8 / sizeof(T)]);
    }

    if (pMaxVal)
        *pMaxVal = maxVal;
    if (pMinVal)
        *pMinVal = minVal;
}

} // namespace

#endif

/*
 * Minimum / maximum element of a u8 image.
 *
 * With CAROTENE_NEON defined this forwards to the templated implementation;
 * each result is written only when the corresponding pointer is non-null.
 * Without CAROTENE_NEON nothing is computed and the outputs are not written.
 * internal::assertSupportedConfiguration() is invoked first in both cases.
 */
void minMaxVals(const Size2D &size,
                const u8 * srcBase, ptrdiff_t srcStride,
                u8 * pMinVal, u8 * pMaxVal)
{
    internal::assertSupportedConfiguration();
#ifndef CAROTENE_NEON
    // No NEON implementation available: only silence unused-parameter warnings.
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMinVal;
    (void)pMaxVal;
#else
    minMaxVals<u8>(size, srcBase, srcStride, pMinVal, pMaxVal);
#endif
}

/*
 * Minimum / maximum element of an s16 image.
 *
 * With CAROTENE_NEON defined this forwards to the templated implementation;
 * each result is written only when the corresponding pointer is non-null.
 * Without CAROTENE_NEON nothing is computed and the outputs are not written.
 * internal::assertSupportedConfiguration() is invoked first in both cases.
 */
void minMaxVals(const Size2D &size,
                const s16 * srcBase, ptrdiff_t srcStride,
                s16 * pMinVal, s16 * pMaxVal)
{
    internal::assertSupportedConfiguration();
#ifndef CAROTENE_NEON
    // No NEON implementation available: only silence unused-parameter warnings.
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMinVal;
    (void)pMaxVal;
#else
    minMaxVals<s16>(size, srcBase, srcStride, pMinVal, pMaxVal);
#endif
}

/*
 * Minimum / maximum element of a u16 image.
 *
 * With CAROTENE_NEON defined this forwards to the templated implementation;
 * each result is written only when the corresponding pointer is non-null.
 * Without CAROTENE_NEON nothing is computed and the outputs are not written.
 * internal::assertSupportedConfiguration() is invoked first in both cases.
 */
void minMaxVals(const Size2D &size,
                const u16 * srcBase, ptrdiff_t srcStride,
                u16 * pMinVal, u16 * pMaxVal)
{
    internal::assertSupportedConfiguration();
#ifndef CAROTENE_NEON
    // No NEON implementation available: only silence unused-parameter warnings.
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMinVal;
    (void)pMaxVal;
#else
    minMaxVals<u16>(size, srcBase, srcStride, pMinVal, pMaxVal);
#endif
}

/*
 * Minimum / maximum element of an s32 image.
 *
 * With CAROTENE_NEON defined this forwards to the templated implementation;
 * each result is written only when the corresponding pointer is non-null.
 * Without CAROTENE_NEON nothing is computed and the outputs are not written.
 * internal::assertSupportedConfiguration() is invoked first in both cases.
 */
void minMaxVals(const Size2D &size,
                const s32 * srcBase, ptrdiff_t srcStride,
                s32 * pMinVal, s32 * pMaxVal)
{
    internal::assertSupportedConfiguration();
#ifndef CAROTENE_NEON
    // No NEON implementation available: only silence unused-parameter warnings.
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMinVal;
    (void)pMaxVal;
#else
    minMaxVals<s32>(size, srcBase, srcStride, pMinVal, pMaxVal);
#endif
}

/*
 * Minimum / maximum element of a u32 image.
 *
 * With CAROTENE_NEON defined this forwards to the templated implementation;
 * each result is written only when the corresponding pointer is non-null.
 * Without CAROTENE_NEON nothing is computed and the outputs are not written.
 * internal::assertSupportedConfiguration() is invoked first in both cases.
 */
void minMaxVals(const Size2D &size,
                const u32 * srcBase, ptrdiff_t srcStride,
                u32 * pMinVal, u32 * pMaxVal)
{
    internal::assertSupportedConfiguration();
#ifndef CAROTENE_NEON
    // No NEON implementation available: only silence unused-parameter warnings.
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMinVal;
    (void)pMaxVal;
#else
    minMaxVals<u32>(size, srcBase, srcStride, pMinVal, pMaxVal);
#endif
}

/*
 * Finds the minimum and maximum value of an f32 image together with the
 * column and row at which each one is located.
 *
 *   size               - width/height of the region to scan
 *   srcBase, srcStride - first row and row pitch as consumed by
 *                        internal::getRowPtr (presumably bytes - confirm)
 *   minVal/minCol/minRow, maxVal/maxCol/maxRow - outputs
 *
 * Both extrema are seeded from srcBase[0], which is read unconditionally,
 * so the caller must supply at least one element. All comparisons are
 * strict, so an element equal to the current extremum never replaces it.
 *
 * Without CAROTENE_NEON nothing is computed and the outputs are not written.
 */
void minMaxLoc(const Size2D &size,
               const f32 * srcBase, ptrdiff_t srcStride,
               f32 &minVal, size_t &minCol, size_t &minRow,
               f32 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Seed the running extrema with the element at (0, 0).
    minVal = srcBase[0];
    minCol = 0;
    minRow = 0;
    maxVal = srcBase[0];
    maxCol = 0;
    maxRow = 0;
    // l is the row index; i is the column index and is reset for every row.
    for(size_t l = 0, i = 0; l < size.height; ++l, i = 0)
    {
        const f32 * src = internal::getRowPtr( srcBase, srcStride, l);
        // The vector path is only taken for rows of at least 16 elements.
        if (size.width >= 16)
        {
            u32 tmp0123[4] = { 0, 1, 2, 3 };
            uint32x4_t   c4       = vdupq_n_u32(4);

            // Lane indices are 32-bit. When size_t is wider than 32 bits the
            // row is processed in chunks of at most 0xffffFFFC columns, with b
            // holding the first column of the current chunk. Otherwise the
            // block below runs once for the whole row. Both variants share
            // the closing brace further down.
#if SIZE_MAX > UINT32_MAX
            size_t boundAll = size.width - (4 - 1);
            for(size_t b = 0; i < boundAll; b = i)
            {
                size_t bound = std::min<size_t>(boundAll, b + 0xffffFFFC);
#else
            {
                size_t bound = size.width - (4 - 1);
#endif
                // Per-lane column offsets relative to the chunk start.
                uint32x4_t  lineIdxOffset = vld1q_u32(tmp0123);
                // Every lane starts from the current global extrema.
                // 0xffffFFFC is a sentinel index meaning "this lane has not
                // found anything better in this chunk".
                float32x4_t  n_min    = vdupq_n_f32(minVal);
                uint32x4_t   n_minIdx = vdupq_n_u32(0xffffFFFC);
                float32x4_t  n_max    = vdupq_n_f32(maxVal);
                uint32x4_t   n_maxIdx = vdupq_n_u32(0xffffFFFC);

                for(; i < bound; i+=4)
                {
                    internal::prefetch(src + i);
                    float32x4_t line = vld1q_f32(src + i);

                    // Lanes where the new value strictly beats the lane's extremum.
                    uint32x4_t minmask = vcltq_f32(line, n_min);
                    uint32x4_t maxmask = vcgtq_f32(line, n_max);

                    // Take value and index from the new data in those lanes only.
                    n_min    = vbslq_f32(minmask, line, n_min);
                    n_minIdx = vbslq_u32(minmask, lineIdxOffset, n_minIdx);
                    n_max    = vbslq_f32(maxmask, line, n_max);
                    n_maxIdx = vbslq_u32(maxmask, lineIdxOffset, n_maxIdx);

                    // idx[] +=4
                    lineIdxOffset = vaddq_u32(lineIdxOffset, c4);
                }

                // Spill the lanes and reduce them in scalar code.
                f32 fmin[4], fmax[4];
                u32 fminIdx[4], fmaxIdx[4];

                vst1q_f32(fmin, n_min);
                vst1q_f32(fmax, n_max);

                vst1q_u32(fminIdx, n_minIdx);
                vst1q_u32(fmaxIdx, n_maxIdx);

                size_t minIdx = fminIdx[0];
                size_t maxIdx = fmaxIdx[0];
                minVal = fmin[0];
                maxVal = fmax[0];

                // Equal values across lanes are resolved in favour of the
                // smaller column index.
                for (s32 j = 1; j < 4; ++j)
                {
                    f32 minval = fmin[j];
                    f32 maxval = fmax[j];
                    if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
                    {
                        minIdx = fminIdx[j];
                        minVal = minval;
                    }
                    if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
                    {
                        maxIdx = fmaxIdx[j];
                        maxVal = maxval;
                    }
                }
                // A sentinel index means the chunk did not improve the
                // extremum, so the previously stored location is kept.
                if(minIdx < 0xffffFFFC)
                {
#if SIZE_MAX > UINT32_MAX
                    minCol = b + minIdx;
#else
                    minCol = minIdx;
#endif
                    minRow = l;
                }
                if(maxIdx < 0xffffFFFC)
                {
#if SIZE_MAX > UINT32_MAX
                    maxCol = b + maxIdx;
#else
                    maxCol = maxIdx;
#endif
                    maxRow = l;
                }
            }
        }
        // Scalar loop: the remaining columns of a vectorised row, or the
        // whole row when it is shorter than 16 elements.
        for(; i < size.width; ++i )
        {
            float val = src[i];
            if( val < minVal )
            {
                minVal = val;
                minCol = i;
                minRow = l;
            }
            else if( val > maxVal )
            {
                maxVal = val;
                maxCol = i;
                maxRow = l;
            }
        }
    }
#else
    // Non-NEON build: no computation, just silence unused-parameter warnings.
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}

/*
 * Masked variant of minMaxLoc for f32 images: only elements whose mask byte
 * is non-zero take part in the search.
 *
 *   size                 - width/height of the region to scan
 *   srcBase, srcStride   - first image row and row pitch as consumed by
 *                          internal::getRowPtr (presumably bytes - confirm)
 *   maskBase, maskStride - first mask row and its row pitch (one u8 per element)
 *   minVal/minCol/minRow, maxVal/maxCol/maxRow - outputs
 *
 * The outputs are initialised to minVal = FLT_MAX, maxVal = -FLT_MAX,
 * minCol = maxCol = size.width and minRow = maxRow = size.height. They keep
 * those values when no selected element strictly improves on them (e.g. when
 * the mask is all zeros), which lets the caller detect "nothing found".
 * All comparisons are strict, so an element equal to the current extremum
 * never replaces it.
 *
 * Without CAROTENE_NEON nothing is computed and the outputs are not written.
 */
void minMaxLoc(const Size2D &size,
               const f32 * srcBase, ptrdiff_t srcStride,
               const u8 * maskBase, ptrdiff_t maskStride,
               f32 &minVal, size_t &minCol, size_t &minRow,
               f32 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // "Nothing found yet" state: out-of-range location and extrema that any
    // selected finite value will beat.
    minVal = std::numeric_limits<f32>::max();
    minCol = size.width;
    minRow = size.height;
    maxVal = -std::numeric_limits<f32>::max();
    maxCol = size.width;
    maxRow = size.height;
    // l is the row index; i is the column index and is reset for every row.
    for(size_t l = 0, i = 0; l < size.height; ++l, i = 0)
    {
        const f32 * src = internal::getRowPtr( srcBase, srcStride, l);
        const u8 * mask = internal::getRowPtr( maskBase, maskStride, l);
        // The vector path is only taken for rows of at least 16 elements.
        if (size.width >= 16)
        {
            u32 tmp0123[4] = { 0, 1, 2, 3 };
            uint32x4_t  uOne      = vdupq_n_u32(1);
            uint32x4_t   c4       = vdupq_n_u32(4);

            // Lane indices are 32-bit. When size_t is wider than 32 bits the
            // row is processed in chunks of at most 0xffffFFFC columns, with b
            // holding the first column of the current chunk. Otherwise the
            // block below runs once for the whole row. Both variants share
            // the closing brace further down.
#if SIZE_MAX > UINT32_MAX
            size_t boundAll = size.width - (4 - 1);
            for(size_t b = 0; i < boundAll; b = i)
            {
                size_t bound = std::min<size_t>(boundAll, b + 0xffffFFFC);
#else
            {
                size_t bound = size.width - (4 - 1);
#endif
                // Per-lane column offsets relative to the chunk start.
                uint32x4_t  lineIdxOffset = vld1q_u32(tmp0123);
                // Every lane starts from the current global extrema.
                // 0xffffFFFC is a sentinel index meaning "this lane has not
                // found anything better in this chunk".
                float32x4_t  n_min    = vdupq_n_f32(minVal);
                uint32x4_t   n_minIdx = vdupq_n_u32(0xffffFFFC);
                float32x4_t  n_max    = vdupq_n_f32(maxVal);
                uint32x4_t   n_maxIdx = vdupq_n_u32(0xffffFFFC);

                for(; i < bound; i+=4)
                {
                    internal::prefetch(src + i);
                    internal::prefetch(mask + i);
                    float32x4_t line = vld1q_f32(src + i);

                    // Only four mask bytes belong to this vector, but vld1_u8
                    // always reads eight. Near the end of the row that would
                    // touch up to four bytes past the last mask element, so
                    // the direct load is used only while eight bytes remain;
                    // otherwise the four needed bytes are staged in a
                    // zero-padded local buffer.
                    uint8x8_t maskLine;
                    if (size.width - i >= 8)
                    {
                        maskLine = vld1_u8(mask + i);
                    }
                    else
                    {
                        const u8 maskTail[8] = { mask[i], mask[i + 1], mask[i + 2], mask[i + 3], 0, 0, 0, 0 };
                        maskLine = vld1_u8(maskTail);
                    }

                    // Widen the low four mask bytes to 32 bits and turn every
                    // non-zero entry into an all-ones lane.
                    uint32x4_t maskLine4 = vmovl_u16(vget_low_u16(vmovl_u8(maskLine)));
                    maskLine4 = vcgeq_u32(maskLine4, uOne);

                    // Lanes where the new value strictly beats the lane's extremum...
                    uint32x4_t minmask = vcltq_f32(line, n_min);
                    uint32x4_t maxmask = vcgtq_f32(line, n_max);

                    // ...restricted to lanes selected by the mask.
                    minmask = vandq_u32(minmask, maskLine4);
                    maxmask = vandq_u32(maxmask, maskLine4);

                    n_min    = vbslq_f32(minmask, line, n_min);
                    n_minIdx = vbslq_u32(minmask, lineIdxOffset, n_minIdx);
                    n_max    = vbslq_f32(maxmask, line, n_max);
                    n_maxIdx = vbslq_u32(maxmask, lineIdxOffset, n_maxIdx);

                    // idx[] +=4
                    lineIdxOffset = vaddq_u32(lineIdxOffset, c4);
                }

                // Spill the lanes and reduce them in scalar code.
                f32 fmin[4], fmax[4];
                u32 fminIdx[4], fmaxIdx[4];

                vst1q_f32(fmin, n_min);
                vst1q_f32(fmax, n_max);

                vst1q_u32(fminIdx, n_minIdx);
                vst1q_u32(fmaxIdx, n_maxIdx);

                size_t minIdx = fminIdx[0];
                size_t maxIdx = fmaxIdx[0];
                minVal = fmin[0];
                maxVal = fmax[0];

                // Equal values across lanes are resolved in favour of the
                // smaller column index.
                for (s32 j = 1; j < 4; ++j)
                {
                    f32 minval = fmin[j];
                    f32 maxval = fmax[j];
                    if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
                    {
                        minIdx = fminIdx[j];
                        minVal = minval;
                    }
                    if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
                    {
                        maxIdx = fmaxIdx[j];
                        maxVal = maxval;
                    }
                }
                // A sentinel index means the chunk did not improve the
                // extremum, so the previously stored location is kept.
                if(minIdx < 0xffffFFFC)
                {
#if SIZE_MAX > UINT32_MAX
                    minCol = b + minIdx;
#else
                    minCol = minIdx;
#endif
                    minRow = l;
                }
                if(maxIdx < 0xffffFFFC)
                {
#if SIZE_MAX > UINT32_MAX
                    maxCol = b + maxIdx;
#else
                    maxCol = maxIdx;
#endif
                    maxRow = l;
                }
            }
        }
        // Scalar loop: the remaining columns of a vectorised row, or the
        // whole row when it is shorter than 16 elements.
        for(; i < size.width; i++ )
        {
            if (!mask[i])
                continue;
            f32 val = src[i];
            // Two independent checks: with the initial seeds (min > max) the
            // first selected element has to update both extrema.
            if( val < minVal )
            {
                minVal = val;
                minCol = i;
                minRow = l;
            }
            if( val > maxVal )
            {
                maxVal = val;
                maxCol = i;
                maxRow = l;
            }
        }
    }
#else
    // Non-NEON build: no computation, just silence unused-parameter warnings.
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)maskBase;
    (void)maskStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}

/*
 * Finds the minimum and maximum value of an s32 image together with the
 * column and row at which each one is located.
 *
 *   size               - width/height of the region to scan
 *   srcBase, srcStride - first row and row pitch as consumed by
 *                        internal::getRowPtr (presumably bytes - confirm)
 *   minVal/minCol/minRow, maxVal/maxCol/maxRow - outputs
 *
 * Both extrema are seeded from srcBase[0], which is read unconditionally,
 * so the caller must supply at least one element. All comparisons are
 * strict, so an element equal to the current extremum never replaces it.
 *
 * Without CAROTENE_NEON nothing is computed and the outputs are not written.
 */
void minMaxLoc(const Size2D &size,
               const s32 * srcBase, ptrdiff_t srcStride,
               s32 &minVal, size_t &minCol, size_t &minRow,
               s32 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Seed the running extrema with the element at (0, 0).
    minVal = srcBase[0];
    minCol = 0;
    minRow = 0;
    maxVal = srcBase[0];
    maxCol = 0;
    maxRow = 0;
    // l is the row index; i is the column index and is reset for every row.
    for(size_t l = 0, i = 0; l < size.height; ++l, i = 0)
    {
        const s32 * src = internal::getRowPtr( srcBase, srcStride, l);
        // The vector path is only taken for rows of at least 16 elements.
        if (size.width >= 16)
        {
            u32 tmp0123[4] = { 0, 1, 2, 3 };
            uint32x4_t c4       = vdupq_n_u32(4);

            // Lane indices are 32-bit. When size_t is wider than 32 bits the
            // row is processed in chunks of at most 0xffffFFFC columns, with b
            // holding the first column of the current chunk. Otherwise the
            // block below runs once for the whole row. Both variants share
            // the closing brace further down.
#if SIZE_MAX > UINT32_MAX
            size_t boundAll = size.width - (4 - 1);
            for(size_t b = 0; i < boundAll; b = i)
            {
                size_t bound = std::min<size_t>(boundAll, b + 0xffffFFFC);
#else
            {
                size_t bound = size.width - (4 - 1);
#endif
                // Per-lane column offsets relative to the chunk start.
                uint32x4_t  lineIdxOffset = vld1q_u32(tmp0123);
                // Every lane starts from the current global extrema.
                // 0xffffFFFC is a sentinel index meaning "this lane has not
                // found anything better in this chunk".
                int32x4_t  n_min    = vdupq_n_s32(minVal);
                uint32x4_t   n_minIdx = vdupq_n_u32(0xffffFFFC);
                int32x4_t  n_max    = vdupq_n_s32(maxVal);
                uint32x4_t   n_maxIdx = vdupq_n_u32(0xffffFFFC);

                for(; i < bound; i+=4 )
                {
                    internal::prefetch(src + i);
                    int32x4_t line = vld1q_s32(src + i);

                    // Lanes where the new value strictly beats the lane's extremum.
                    uint32x4_t minmask = vcltq_s32(line, n_min);
                    uint32x4_t maxmask = vcgtq_s32(line, n_max);

                    // Take value and index from the new data in those lanes only.
                    n_min    = vbslq_s32(minmask, line, n_min);
                    n_minIdx = vbslq_u32(minmask, lineIdxOffset, n_minIdx);
                    n_max    = vbslq_s32(maxmask, line, n_max);
                    n_maxIdx = vbslq_u32(maxmask, lineIdxOffset, n_maxIdx);

                    // idx[] +=4
                    lineIdxOffset = vaddq_u32(lineIdxOffset, c4);
                }

                // Spill the lanes and reduce them in scalar code.
                s32 fmin[4], fmax[4];
                u32 fminIdx[4], fmaxIdx[4];

                vst1q_s32(fmin, n_min);
                vst1q_s32(fmax, n_max);

                vst1q_u32(fminIdx, n_minIdx);
                vst1q_u32(fmaxIdx, n_maxIdx);

                size_t minIdx = fminIdx[0];
                size_t maxIdx = fmaxIdx[0];
                minVal = fmin[0];
                maxVal = fmax[0];

                // Equal values across lanes are resolved in favour of the
                // smaller column index.
                for (s32 j = 1; j < 4; ++j)
                {
                    s32 minval = fmin[j];
                    s32 maxval = fmax[j];
                    if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
                    {
                        minIdx = fminIdx[j];
                        minVal = minval;
                    }
                    if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
                    {
                        maxIdx = fmaxIdx[j];
                        maxVal = maxval;
                    }
                }
                // A sentinel index means the chunk did not improve the
                // extremum, so the previously stored location is kept.
                if(minIdx < 0xffffFFFC)
                {
#if SIZE_MAX > UINT32_MAX
                    minCol = b + minIdx;
#else
                    minCol = minIdx;
#endif
                    minRow = l;
                }
                if(maxIdx < 0xffffFFFC)
                {
#if SIZE_MAX > UINT32_MAX
                    maxCol = b + maxIdx;
#else
                    maxCol = maxIdx;
#endif
                    maxRow = l;
                }
            }
        }
        // Scalar loop: the remaining columns of a vectorised row, or the
        // whole row when it is shorter than 16 elements.
        for(; i < size.width; ++i )
        {
            s32 val = src[i];
            if( val < minVal )
            {
                minVal = val;
                minCol = i;
                minRow = l;
            }
            else if( val > maxVal )
            {
                maxVal = val;
                maxCol = i;
                maxRow = l;
            }
        }
    }
#else
    // Non-NEON build: no computation, just silence unused-parameter warnings.
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}

/*
 * Finds the minimum and maximum value of an s16 image together with the
 * column and row at which each one is located.
 *
 *   size               - width/height of the region to scan
 *   srcBase, srcStride - first row and row pitch as consumed by
 *                        internal::getRowPtr (presumably bytes - confirm)
 *   minVal/minCol/minRow, maxVal/maxCol/maxRow - outputs
 *
 * Both extrema are seeded from srcBase[0], which is read unconditionally,
 * so the caller must supply at least one element. All comparisons are
 * strict, so an element equal to the current extremum never replaces it.
 *
 * Without CAROTENE_NEON nothing is computed and the outputs are not written.
 */
void minMaxLoc(const Size2D &size,
               const s16 * srcBase, ptrdiff_t srcStride,
               s16 &minVal, size_t &minCol, size_t &minRow,
               s16 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Seed the running extrema with the element at (0, 0).
    minVal = srcBase[0];
    minCol = 0;
    minRow = 0;
    maxVal = srcBase[0];
    maxCol = 0;
    maxRow = 0;
    // l is the row index; i is the column index and is reset for every row.
    for(size_t l = 0, i = 0; l < size.height; ++l, i = 0)
    {
        const s16 * src = internal::getRowPtr( srcBase,  srcStride, l);
        // The vector path is only taken for rows of at least 32 elements.
        if (size.width >= 32)
        {
            u32 tmp0123[4] = { 0, 1, 2, 3 };
            uint32x4_t c8        = vdupq_n_u32(8);

            // Lane indices are 32-bit. When size_t is wider than 32 bits the
            // row is processed in chunks of at most 0xffffFFF8 columns, with b
            // holding the first column of the current chunk. Otherwise the
            // block below runs once for the whole row. Both variants share
            // the closing brace further down.
#if SIZE_MAX > UINT32_MAX
            size_t boundAll = size.width - (8 - 1);
            for(size_t b = 0; i < boundAll; b = i)
            {
                size_t bound = std::min<size_t>(boundAll, b + 0xffffFFF8);
#else
            {
                size_t bound = size.width - (8 - 1);
#endif
                // Offsets for lanes 0..3 of the 8-lane vector. The same
                // offsets are stored for lanes 4..7 and corrected by +4 once
                // the loop has finished.
                uint32x4_t  lineIdxOffset = vld1q_u32(tmp0123);
                // Every lane starts from the current global extrema.
                // 0xffffFFF8 is a sentinel index meaning "this lane has not
                // found anything better in this chunk". Indices are kept as
                // two 4-lane halves (l = lanes 0..3, h = lanes 4..7).
                int16x8_t  n_min    = vdupq_n_s16(minVal);
                uint32x4_t n_minIdxl = vdupq_n_u32(0xffffFFF8);
                uint32x4_t n_minIdxh = vdupq_n_u32(0xffffFFF8);
                int16x8_t  n_max    = vdupq_n_s16(maxVal);
                uint32x4_t n_maxIdxl = vdupq_n_u32(0xffffFFF8);
                uint32x4_t n_maxIdxh = vdupq_n_u32(0xffffFFF8);

                for(; i < bound; i+=8 )
                {
                    internal::prefetch(src + i);
                    int16x8_t line = vld1q_s16(src + i);

                    // Lanes where the new value strictly beats the lane's extremum.
                    uint16x8_t minmask = vcltq_s16(line, n_min);
                    uint16x8_t maxmask = vcgtq_s16(line, n_max);

                    n_min    = vbslq_s16(minmask, line, n_min);
                    // Turn the 16-bit lane masks into 32-bit lane masks:
                    // zero-extending gives 0x0000FFFF for a set lane, and the
                    // saturating shift left by 31 saturates that to
                    // 0xFFFFFFFF while leaving 0 unchanged.
                    uint16x4_t minml = vget_low_u16(minmask);
                    uint16x4_t minmh = vget_high_u16(minmask);
                    uint32x4_t minml2 = vmovl_u16(minml);
                    uint32x4_t minmh2 = vmovl_u16(minmh);
                    minml2 = vqshlq_n_u32(minml2, 31);
                    minmh2 = vqshlq_n_u32(minmh2, 31);
                    n_minIdxl = vbslq_u32(minml2, lineIdxOffset, n_minIdxl);
                    n_minIdxh = vbslq_u32(minmh2, lineIdxOffset, n_minIdxh);

                    // Same procedure for the maximum.
                    n_max    = vbslq_s16(maxmask, line, n_max);
                    uint16x4_t maxml = vget_low_u16(maxmask);
                    uint16x4_t maxmh = vget_high_u16(maxmask);
                    uint32x4_t maxml2 = vmovl_u16(maxml);
                    uint32x4_t maxmh2 = vmovl_u16(maxmh);
                    maxml2 = vqshlq_n_u32(maxml2, 31);
                    maxmh2 = vqshlq_n_u32(maxmh2, 31);
                    n_maxIdxl = vbslq_u32(maxml2, lineIdxOffset, n_maxIdxl);
                    n_maxIdxh = vbslq_u32(maxmh2, lineIdxOffset, n_maxIdxh);

                    // idx[] +=8
                    lineIdxOffset = vaddq_u32(lineIdxOffset, c8);
                }

                // fix high part of indexes
                // (untouched high lanes become 0xffffFFFC, which the
                // "< 0xffffFFF8" checks below still treat as "not found")
                uint32x4_t c4 = vdupq_n_u32((int32_t) 4);
                n_minIdxh = vaddq_u32(n_minIdxh, c4);
                n_maxIdxh = vaddq_u32(n_maxIdxh, c4);

                // Spill the lanes and reduce them in scalar code.
                s16 fmin[8], fmax[8];
                u32 fminIdx[8], fmaxIdx[8];

                vst1q_s16(fmin, n_min);
                vst1q_s16(fmax, n_max);
                vst1q_u32(fminIdx+0, n_minIdxl);
                vst1q_u32(fmaxIdx+0, n_maxIdxl);
                vst1q_u32(fminIdx+4, n_minIdxh);
                vst1q_u32(fmaxIdx+4, n_maxIdxh);

                size_t minIdx = fminIdx[0];
                size_t maxIdx = fmaxIdx[0];
                minVal = fmin[0];
                maxVal = fmax[0];

                // Equal values across lanes are resolved in favour of the
                // smaller column index.
                for (s32 j = 1; j < 8; ++j)
                {
                    s16 minval = fmin[j];
                    s16 maxval = fmax[j];
                    if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
                    {
                        minIdx = fminIdx[j];
                        minVal = minval;
                    }
                    if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
                    {
                        maxIdx = fmaxIdx[j];
                        maxVal = maxval;
                    }
                }
                // A sentinel index means the chunk did not improve the
                // extremum, so the previously stored location is kept.
                if(minIdx < 0xffffFFF8)
                {
#if SIZE_MAX > UINT32_MAX
                    minCol = b + minIdx;
#else
                    minCol = minIdx;
#endif
                    minRow = l;
                }
                if(maxIdx < 0xffffFFF8)
                {
#if SIZE_MAX > UINT32_MAX
                    maxCol = b + maxIdx;
#else
                    maxCol = maxIdx;
#endif
                    maxRow = l;
                }
            }
        }
        // Scalar loop: the remaining columns of a vectorised row, or the
        // whole row when it is shorter than 32 elements.
        for(; i < size.width; ++i )
        {
            short val = src[i];
            if( val < minVal )
            {
                minVal = val;
                minCol = i;
                minRow = l;
            }
            else if( val > maxVal )
            {
                maxVal = val;
                maxCol = i;
                maxRow = l;
            }
        }
    }
#else
    // Non-NEON build: no computation, just silence unused-parameter warnings.
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}

/*
 * Finds the minimum and maximum value of a u16 image together with the
 * column and row at which each one is located.
 *
 *   size               - width/height of the region to scan
 *   srcBase, srcStride - first row and row pitch as consumed by
 *                        internal::getRowPtr (presumably bytes - confirm)
 *   minVal/minCol/minRow, maxVal/maxCol/maxRow - outputs
 *
 * Both extrema are seeded from srcBase[0], which is read unconditionally,
 * so the caller must supply at least one element. All comparisons are
 * strict, so an element equal to the current extremum never replaces it.
 *
 * Without CAROTENE_NEON nothing is computed and the outputs are not written.
 */
void minMaxLoc(const Size2D &size,
               const u16 * srcBase, ptrdiff_t srcStride,
               u16 &minVal, size_t &minCol, size_t &minRow,
               u16 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Seed the running extrema with the element at (0, 0).
    minVal = srcBase[0];
    minCol = 0;
    minRow = 0;
    maxVal = srcBase[0];
    maxCol = 0;
    maxRow = 0;
    // l is the row index; i is the column index and is reset for every row.
    for(size_t l = 0, i = 0; l < size.height; ++l, i = 0)
    {
        const u16 * src = internal::getRowPtr( srcBase,  srcStride, l);
        // The vector path is only taken for rows of at least 32 elements.
        if (size.width >= 32)
        {
            u32 tmp0123[4] = { 0, 1, 2, 3 };
            uint32x4_t c8        = vdupq_n_u32(8);

            // Lane indices are 32-bit. When size_t is wider than 32 bits the
            // row is processed in chunks of at most 0xffffFFF8 columns, with b
            // holding the first column of the current chunk. Otherwise the
            // block below runs once for the whole row. Both variants share
            // the closing brace further down.
#if SIZE_MAX > UINT32_MAX
            size_t boundAll = size.width - (8 - 1);
            for(size_t b = 0; i < boundAll; b = i)
            {
                size_t bound = std::min<size_t>(boundAll, b + 0xffffFFF8);
#else
            {
                size_t bound = size.width - (8 - 1);
#endif
                // Offsets for lanes 0..3 of the 8-lane vector. The same
                // offsets are stored for lanes 4..7 and corrected by +4 once
                // the loop has finished.
                uint32x4_t  lineIdxOffset = vld1q_u32(tmp0123);
                // Every lane starts from the current global extrema.
                // 0xffffFFF8 is a sentinel index meaning "this lane has not
                // found anything better in this chunk". Indices are kept as
                // two 4-lane halves (l = lanes 0..3, h = lanes 4..7).
                uint16x8_t  n_min    = vdupq_n_u16(minVal);
                uint32x4_t n_minIdxl = vdupq_n_u32(0xffffFFF8);
                uint32x4_t n_minIdxh = vdupq_n_u32(0xffffFFF8);
                uint16x8_t  n_max    = vdupq_n_u16(maxVal);
                uint32x4_t n_maxIdxl = vdupq_n_u32(0xffffFFF8);
                uint32x4_t n_maxIdxh = vdupq_n_u32(0xffffFFF8);

                for(; i < bound; i+=8 )
                {
                    internal::prefetch(src + i);
                    uint16x8_t line = vld1q_u16(src + i);

                    // Lanes where the new value strictly beats the lane's extremum.
                    uint16x8_t minmask = vcltq_u16(line, n_min);
                    uint16x8_t maxmask = vcgtq_u16(line, n_max);

                    n_min    = vbslq_u16(minmask, line, n_min);
                    // Turn the 16-bit lane masks into 32-bit lane masks:
                    // zero-extending gives 0x0000FFFF for a set lane, and the
                    // saturating shift left by 31 saturates that to
                    // 0xFFFFFFFF while leaving 0 unchanged.
                    uint16x4_t minml = vget_low_u16(minmask);
                    uint16x4_t minmh = vget_high_u16(minmask);
                    uint32x4_t minml2 = vmovl_u16(minml);
                    uint32x4_t minmh2 = vmovl_u16(minmh);
                    minml2 = vqshlq_n_u32(minml2, 31);
                    minmh2 = vqshlq_n_u32(minmh2, 31);
                    n_minIdxl = vbslq_u32(minml2, lineIdxOffset, n_minIdxl);
                    n_minIdxh = vbslq_u32(minmh2, lineIdxOffset, n_minIdxh);

                    // Same procedure for the maximum.
                    n_max    = vbslq_u16(maxmask, line, n_max);
                    uint16x4_t maxml = vget_low_u16(maxmask);
                    uint16x4_t maxmh = vget_high_u16(maxmask);
                    uint32x4_t maxml2 = vmovl_u16(maxml);
                    uint32x4_t maxmh2 = vmovl_u16(maxmh);
                    maxml2 = vqshlq_n_u32(maxml2, 31);
                    maxmh2 = vqshlq_n_u32(maxmh2, 31);
                    n_maxIdxl = vbslq_u32(maxml2, lineIdxOffset, n_maxIdxl);
                    n_maxIdxh = vbslq_u32(maxmh2, lineIdxOffset, n_maxIdxh);

                    // idx[] +=8
                    lineIdxOffset = vaddq_u32(lineIdxOffset, c8);
                }

                // fix high part of indexes
                // (untouched high lanes become 0xffffFFFC, which the
                // "< 0xffffFFF8" checks below still treat as "not found")
                uint32x4_t c4 = vdupq_n_u32(4);
                n_minIdxh = vaddq_u32(n_minIdxh, c4);
                n_maxIdxh = vaddq_u32(n_maxIdxh, c4);

                // Spill the lanes and reduce them in scalar code.
                u16 fmin[8], fmax[8];
                u32 fminIdx[8], fmaxIdx[8];

                vst1q_u16(fmin, n_min);
                vst1q_u16(fmax, n_max);
                vst1q_u32(fminIdx+0, n_minIdxl);
                vst1q_u32(fmaxIdx+0, n_maxIdxl);
                vst1q_u32(fminIdx+4, n_minIdxh);
                vst1q_u32(fmaxIdx+4, n_maxIdxh);

                size_t minIdx = fminIdx[0];
                size_t maxIdx = fmaxIdx[0];
                minVal = fmin[0];
                maxVal = fmax[0];

                // Equal values across lanes are resolved in favour of the
                // smaller column index.
                for (s32 j = 1; j < 8; ++j)
                {
                    u16 minval = fmin[j];
                    u16 maxval = fmax[j];
                    if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
                    {
                        minIdx = fminIdx[j];
                        minVal = minval;
                    }
                    if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
                    {
                        maxIdx = fmaxIdx[j];
                        maxVal = maxval;
                    }
                }
                // A sentinel index means the chunk did not improve the
                // extremum, so the previously stored location is kept.
                if(minIdx < 0xffffFFF8)
                {
#if SIZE_MAX > UINT32_MAX
                    minCol = b + minIdx;
#else
                    minCol = minIdx;
#endif
                    minRow = l;
                }
                if(maxIdx < 0xffffFFF8)
                {
#if SIZE_MAX > UINT32_MAX
                    maxCol = b + maxIdx;
#else
                    maxCol = maxIdx;
#endif
                    maxRow = l;
                }
            }
        }
        // Scalar loop: the remaining columns of a vectorised row, or the
        // whole row when it is shorter than 32 elements.
        for(; i < size.width; ++i )
        {
            u16 val = src[i];
            if( val < minVal )
            {
                minVal = val;
                minCol = i;
                minRow = l;
            }
            else if( val > maxVal )
            {
                maxVal = val;
                maxCol = i;
                maxRow = l;
            }
        }
    }
#else
    // Non-NEON build: no computation, just silence unused-parameter warnings.
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}

#ifdef CAROTENE_NEON
namespace {

void minMaxLocBlock(const u8 * src, u32 len,
                    u8 &minVal, u16 &minIdx,
                    u8 &maxVal, u16 &maxIdx)
{
    u16 tmp0123[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };

    uint8x16_t n_min     = vdupq_n_u8(src[0]);
    uint16x8_t n_minIdxl = vdupq_n_u16(0);
    uint16x8_t n_minIdxh = vdupq_n_u16(0);
    uint8x16_t n_max     = vdupq_n_u8(src[0]);
    uint16x8_t n_maxIdxl = vdupq_n_u16(0);
    uint16x8_t n_maxIdxh = vdupq_n_u16(0);
    uint16x8_t c16       = vdupq_n_u16(16);
    uint16x8_t lineIdxOffset = vld1q_u16(tmp0123);

    s32 i = 0;
    s32 bound = len - (16 - 1);
    for(; i < bound; i+=16 )
    {
        internal::prefetch(src + i);
        uint8x16_t line = vld1q_u8(src + i);

        uint8x16_t minmask = vcltq_u8(line, n_min);
        uint8x16_t maxmask = vcgtq_u8(line, n_max);

        n_min    = vbslq_u8(minmask, line, n_min);
        uint8x8_t minml = vget_low_u8(minmask);
        uint8x8_t minmh = vget_high_u8(minmask);
        uint16x8_t minml2 = vmovl_u8(minml);
        uint16x8_t minmh2 = vmovl_u8(minmh);
        minml2 = vqshlq_n_u16(minml2, 15);
        minmh2 = vqshlq_n_u16(minmh2, 15);
        n_minIdxl = vbslq_u16(minml2, lineIdxOffset, n_minIdxl);
        n_minIdxh = vbslq_u16(minmh2, lineIdxOffset, n_minIdxh);

        n_max    = vbslq_u8(maxmask, line, n_max);
        uint8x8_t maxml = vget_low_u8(maxmask);
        uint8x8_t maxmh = vget_high_u8(maxmask);
        uint16x8_t maxml2 = vmovl_u8(maxml);
        uint16x8_t maxmh2 = vmovl_u8(maxmh);
        maxml2 = vqshlq_n_u16(maxml2, 15);
        maxmh2 = vqshlq_n_u16(maxmh2, 15);
        n_maxIdxl = vbslq_u16(maxml2, lineIdxOffset, n_maxIdxl);
        n_maxIdxh = vbslq_u16(maxmh2, lineIdxOffset, n_maxIdxh);

        // idx[] +=16
        lineIdxOffset = vaddq_u16(lineIdxOffset, c16);
    }

    // fix high part of indexes
    uint16x8_t c8 = vdupq_n_u16(8);
    n_minIdxh = vaddq_u16(n_minIdxh, c8);
    n_maxIdxh = vaddq_u16(n_maxIdxh, c8);

    u8 fmin[16], fmax[16];
    u16 fminIdx[16], fmaxIdx[16];
    /*{
        uint8x8_t min_low  = vget_low_u8(n_min);
        uint8x8_t min_high = vget_high_u8(n_min);
        uint8x8_t max_low  = vget_low_u8(n_max);
        uint8x8_t max_high = vget_high_u8(n_max);

        uint8x8_t minmask  = vclt_u8(min_low, min_high);
        uint8x8_t maxmask  = vcgt_u8(max_low, max_high);

        uint8x8_t min2     = vbsl_u8(minmask, min_low, min_high);
        uint8x8_t max2     = vbsl_u8(maxmask, max_low, max_high);

        uint16x8_t minidxmask = vmovl_u8(minmask);
        uint16x8_t maxidxmask = vmovl_u8(maxmask);
        minidxmask = vqshlq_n_u16(minidxmask, 15);
        maxidxmask = vqshlq_n_u16(maxidxmask, 15);

        uint16x8_t n_minIdx = vbslq_u16(minidxmask, n_minIdxl, n_minIdxh);
        uint16x8_t n_maxIdx = vbslq_u16(maxidxmask, n_maxIdxl, n_maxIdxh);

        vst1_u8((uint8_t*)fmin, min2);
        vst1_u8((uint8_t*)fmax, max2);

        vst1q_u16((uint16_t*)(fminIdx), n_minIdx);
        vst1q_u16((uint16_t*)(fmaxIdx), n_maxIdx);
    }*/

    vst1q_u8(fmin, n_min);
    vst1q_u8(fmax, n_max);
    vst1q_u16(fminIdx+0, n_minIdxl);
    vst1q_u16(fmaxIdx+0, n_maxIdxl);
    vst1q_u16(fminIdx+8, n_minIdxh);
    vst1q_u16(fmaxIdx+8, n_maxIdxh);

    minIdx = fminIdx[0];
    maxIdx = fmaxIdx[0];
    minVal = fmin[0];
    maxVal = fmax[0];

    for (s32 j = 1; j < 16; ++j)
    {
        u8 minval = fmin[j];
        u8 maxval = fmax[j];
        if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
        {
            minIdx = fminIdx[j];
            minVal = minval;
        }
        if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
        {
            maxIdx = fmaxIdx[j];
            maxVal = maxval;
        }
    }

    for(; i < (s32)len; ++i )
    {
        u8 val = src[i];
        if( val < minVal )
        {
            minVal = val;
            minIdx = (u16)i;
        }
        else if( val > maxVal )
        {
            maxVal = val;
            maxIdx = (u16)i;
        }
    }
}

void minMaxLocBlock(const s8 * src, u32 len,
                    s8 &minVal, u16 &minIdx,
                    s8 &maxVal, u16 &maxIdx)
{
    u16 tmp0123[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };

    int8x16_t n_min      = vdupq_n_s8(src[0]);
    uint16x8_t n_minIdxl = vdupq_n_u16(0);
    uint16x8_t n_minIdxh = vdupq_n_u16(0);
    int8x16_t n_max      = vdupq_n_s8(src[0]);
    uint16x8_t n_maxIdxl = vdupq_n_u16(0);
    uint16x8_t n_maxIdxh = vdupq_n_u16(0);
    uint16x8_t c16       = vdupq_n_u16(16);
    uint16x8_t lineIdxOffset = vld1q_u16(tmp0123);

    s32 i = 0;
    s32 bound = len - (16 - 1);
    for(; i < bound; i+=16 )
    {
        internal::prefetch(src + i);
        int8x16_t line = vld1q_s8(src + i);

        uint8x16_t minmask = vcltq_s8(line, n_min);
        uint8x16_t maxmask = vcgtq_s8(line, n_max);

        n_min    = vbslq_s8(minmask, line, n_min);
        uint8x8_t minml = vget_low_u8(minmask);
        uint8x8_t minmh = vget_high_u8(minmask);
        uint16x8_t minml2 = vmovl_u8(minml);
        uint16x8_t minmh2 = vmovl_u8(minmh);
        minml2 = vqshlq_n_u16(minml2, 15);
        minmh2 = vqshlq_n_u16(minmh2, 15);
        n_minIdxl = vbslq_u16(minml2, lineIdxOffset, n_minIdxl);
        n_minIdxh = vbslq_u16(minmh2, lineIdxOffset, n_minIdxh);

        n_max    = vbslq_s8(maxmask, line, n_max);
        uint8x8_t maxml = vget_low_u8(maxmask);
        uint8x8_t maxmh = vget_high_u8(maxmask);
        uint16x8_t maxml2 = vmovl_u8(maxml);
        uint16x8_t maxmh2 = vmovl_u8(maxmh);
        maxml2 = vqshlq_n_u16(maxml2, 15);
        maxmh2 = vqshlq_n_u16(maxmh2, 15);
        n_maxIdxl = vbslq_u16(maxml2, lineIdxOffset, n_maxIdxl);
        n_maxIdxh = vbslq_u16(maxmh2, lineIdxOffset, n_maxIdxh);

        // idx[] +=16
        lineIdxOffset = vaddq_u16(lineIdxOffset, c16);
    }

    // fix high part of indexes
    uint16x8_t c8 = vdupq_n_u16(8);
    n_minIdxh = vaddq_u16(n_minIdxh, c8);
    n_maxIdxh = vaddq_u16(n_maxIdxh, c8);

    s8 fmin[16], fmax[16];
    u16 fminIdx[16], fmaxIdx[16];

    vst1q_s8(fmin, n_min);
    vst1q_s8(fmax, n_max);
    vst1q_u16(fminIdx+0, n_minIdxl);
    vst1q_u16(fmaxIdx+0, n_maxIdxl);
    vst1q_u16(fminIdx+8, n_minIdxh);
    vst1q_u16(fmaxIdx+8, n_maxIdxh);

    minIdx = fminIdx[0];
    maxIdx = fmaxIdx[0];
    minVal = fmin[0];
    maxVal = fmax[0];

    for (s32 j = 1; j < 16; ++j)
    {
        s8 minval = fmin[j];
        s8 maxval = fmax[j];
        if (minval < minVal || (minval == minVal && fminIdx[j] < minIdx))
        {
            minIdx = fminIdx[j];
            minVal = minval;
        }
        if (maxval > maxVal || (maxval == maxVal && fmaxIdx[j] < maxIdx))
        {
            maxIdx = fmaxIdx[j];
            maxVal = maxval;
        }
    }

    for(; i < (s32)len; ++i )
    {
        s8 val = src[i];
        if( val < minVal )
        {
            minVal = val;
            minIdx = (u16)i;
        }
        else if( val > maxVal )
        {
            maxVal = val;
            maxIdx = (u16)i;
        }
    }
}

} // namespace
#endif // CAROTENE_NEON

#define USHORT_BLOCK_MAX_SIZE (1 << 16)

/*
 * Finds the smallest and the largest element of a u8 image and where they are.
 *
 * size                   - image dimensions; size.width and size.height are read
 * srcBase                - pointer to the first row; srcBase[0] is read unconditionally
 * srcStride              - row stride handed to internal::getRowPtr
 * minVal, minCol, minRow - receive the minimum value and its column / row
 * maxVal, maxCol, maxRow - receive the maximum value and its column / row
 *
 * A stored result is only replaced by a strictly better value, so among equal
 * extrema the one met first while scanning rows top to bottom and columns left
 * to right is reported.
 *
 * When CAROTENE_NEON is not defined the output arguments are left untouched.
 */
void minMaxLoc(const Size2D &size,
               const u8 * srcBase, ptrdiff_t srcStride,
               u8 &minVal, size_t &minCol, size_t &minRow,
               u8 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    minVal = srcBase[0];
    minCol = 0;
    minRow = 0;
    maxVal = srcBase[0];
    maxCol = 0;
    maxRow = 0;
    for(size_t l = 0; l < size.height; ++l)
    {
        const u8 * src = internal::getRowPtr( srcBase,  srcStride, l);
        if (size.width > 128)
        {
            // minMaxLocBlock reports positions as u16, so a row is consumed in
            // pieces of at most USHORT_BLOCK_MAX_SIZE elements.
            for(size_t blockStart = 0; blockStart < size.width; blockStart += USHORT_BLOCK_MAX_SIZE)
            {
                u8 locMinVal, locMaxVal;
                u16 locMinIdx, locMaxIdx;
                size_t tail = size.width - blockStart;
                minMaxLocBlock(src + blockStart, tail < USHORT_BLOCK_MAX_SIZE ? tail : USHORT_BLOCK_MAX_SIZE,
                               locMinVal, locMinIdx, locMaxVal, locMaxIdx);

                // Merge with strict comparisons so that a position found in an
                // earlier block or row is never replaced by an equal value
                // found later.
                if (locMinVal < minVal)
                {
                    minCol = blockStart + locMinIdx;
                    minRow = l;
                    minVal = locMinVal;
                }
                if (locMaxVal > maxVal)
                {
                    maxCol = blockStart + locMaxIdx;
                    maxRow = l;
                    maxVal = locMaxVal;
                }

                // No u8 value is strictly below 0 or strictly above 255, so
                // once both bounds are reached the result cannot change.
                if (minVal == 0 && maxVal == 255)
                    return;
            }
        }
        else
        {
            // Narrow rows are scanned element by element.
            for(size_t i = 0; i < size.width; ++i )
            {
                u8 val = src[i];
                if( val < minVal )
                {
                    minVal = val;
                    minCol = i;
                    minRow = l;
                }
                else if( val > maxVal )
                {
                    maxVal = val;
                    maxCol = i;
                    maxRow = l;
                }
            }
        }

    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}

/*
 * Finds the smallest and the largest element of an s8 image and where they are.
 *
 * size                   - image dimensions; size.width and size.height are read
 * srcBase                - pointer to the first row; srcBase[0] is read unconditionally
 * srcStride              - row stride handed to internal::getRowPtr
 * minVal, minCol, minRow - receive the minimum value and its column / row
 * maxVal, maxCol, maxRow - receive the maximum value and its column / row
 *
 * A stored result is only replaced by a strictly better value, so among equal
 * extrema the one met first while scanning rows top to bottom and columns left
 * to right is reported.
 *
 * When CAROTENE_NEON is not defined the output arguments are left untouched.
 */
void minMaxLoc(const Size2D &size,
               const s8 * srcBase, ptrdiff_t srcStride,
               s8 &minVal, size_t &minCol, size_t &minRow,
               s8 &maxVal, size_t &maxCol, size_t &maxRow)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    minVal = srcBase[0];
    minCol = 0;
    minRow = 0;
    maxVal = srcBase[0];
    maxCol = 0;
    maxRow = 0;
    for(size_t l = 0; l < size.height; ++l)
    {
        const s8 * src = internal::getRowPtr( srcBase,  srcStride, l);
        if (size.width > 128)
        {
            // minMaxLocBlock reports positions as u16, so a row is consumed in
            // pieces of at most USHORT_BLOCK_MAX_SIZE elements.
            for(size_t blockStart = 0; blockStart < size.width; blockStart += USHORT_BLOCK_MAX_SIZE)
            {
                s8 locMinVal, locMaxVal;
                u16 locMinIdx, locMaxIdx;
                size_t tail = size.width - blockStart;
                minMaxLocBlock(src + blockStart, tail < USHORT_BLOCK_MAX_SIZE ? tail : USHORT_BLOCK_MAX_SIZE,
                               locMinVal, locMinIdx, locMaxVal, locMaxIdx);

                // Merge with strict comparisons so that a position found in an
                // earlier block or row is never replaced by an equal value
                // found later.
                if (locMinVal < minVal)
                {
                    minCol = blockStart + locMinIdx;
                    minRow = l;
                    minVal = locMinVal;
                }
                if (locMaxVal > maxVal)
                {
                    maxCol = blockStart + locMaxIdx;
                    maxRow = l;
                    maxVal = locMaxVal;
                }

                // No s8 value is strictly below -128 or strictly above 127, so
                // once both bounds are reached the result cannot change.
                if (minVal == -128 && maxVal == 127)
                    return;
            }
        }
        else
        {
            // Narrow rows are scanned element by element.
            for(size_t i = 0; i < size.width; ++i )
            {
                s8 val = src[i];
                if( val < minVal )
                {
                    minVal = val;
                    minRow = l;
                    minCol = i;
                }
                else if( val > maxVal )
                {
                    maxVal = val;
                    maxRow = l;
                    maxCol = i;
                }
            }
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)minVal;
    (void)minCol;
    (void)minRow;
    (void)maxVal;
    (void)maxCol;
    (void)maxRow;
#endif
}

} // namespace CAROTENE_NS