/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */

#include "common.hpp"

namespace CAROTENE_NS {

// Magic number: upper bound on the number of elements that the blocked NEON
// loops below add up in single-precision lanes before the partial sums are
// flushed into the f64 total. Must be a multiple of 4.
#define NORM32F_BLOCK_SIZE 2048

/*
 * Infinity norm of an unsigned 8-bit image: the largest element value.
 *
 *   _size      region width/height in elements
 *   srcBase    pointer to the first row
 *   srcStride  row stride forwarded to internal::getRowPtr
 *
 * Returns the maximum element, or 0 for an empty region. When CAROTENE_NEON
 * is not defined the body reduces to "return 0".
 */
s32 normInf(const Size2D &_size,
            const u8 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the row width all rows are scanned as one long row.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    s32 maxValue = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const u8 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        if (roi.width >= 16)
        {
            // Per-lane running maximum, seeded with the first 16 elements.
            uint8x16_t best = vld1q_u8(line);
            x = 16;
            while (x <= roi.width - 16)
            {
                internal::prefetch(line + x);
                const uint8x16_t cur = vld1q_u8(line + x);
                best = vmaxq_u8(cur, best);
                x += 16;
            }

            // Fold 16 lanes into 8 and finish the reduction in scalar code.
            u8 lanes[8];
            vst1_u8(lanes, vmax_u8(vget_low_u8(best), vget_high_u8(best)));
            for (u32 lane = 0; lane < 8; ++lane)
                maxValue = std::max((s32)(lanes[lane]), maxValue);
        }

        // Elements left over after the last full vector.
        while (x < roi.width)
        {
            maxValue = std::max((s32)(line[x]), maxValue);
            ++x;
        }
    }
    return maxValue;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}

/*
 * Infinity norm of a signed 8-bit image: the largest absolute element value.
 *
 *   _size      region width/height in elements
 *   srcBase    pointer to the first row
 *   srcStride  row stride forwarded to internal::getRowPtr
 *
 * Returns max |src|, or 0 for an empty region. When CAROTENE_NEON is not
 * defined the body reduces to "return 0".
 */
s32 normInf(const Size2D &_size,
            const s8 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the row width all rows are scanned as one long row.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    s32 maxAbs = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const s8 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        if (roi.width >= 16)
        {
            // Absolute values are reinterpreted as unsigned before comparing.
            uint8x16_t best = vreinterpretq_u8_s8(vabsq_s8(vld1q_s8(line)));
            x = 16;
            while (x <= roi.width - 16)
            {
                internal::prefetch(line + x);
                const uint8x16_t mag = vreinterpretq_u8_s8(vabsq_s8(vld1q_s8(line + x)));
                best = vmaxq_u8(mag, best);
                x += 16;
            }

            // Fold 16 lanes into 8 and finish the reduction in scalar code.
            u8 lanes[8];
            vst1_u8(lanes, vmax_u8(vget_low_u8(best), vget_high_u8(best)));
            for (u32 lane = 0; lane < 8; ++lane)
                maxAbs = std::max((s32)(lanes[lane]), maxAbs);
        }

        // Elements left over after the last full vector.
        while (x < roi.width)
        {
            maxAbs = std::max((s32)(std::abs(line[x])), maxAbs);
            ++x;
        }
    }
    return maxAbs;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}

/*
 * Infinity norm of an unsigned 16-bit image: the largest element value.
 *
 *   _size      region width/height in elements
 *   srcBase    pointer to the first row
 *   srcStride  row stride forwarded to internal::getRowPtr
 *
 * Returns the maximum element, or 0 for an empty region. When CAROTENE_NEON
 * is not defined the body reduces to "return 0".
 */
s32 normInf(const Size2D &_size,
            const u16 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the row width all rows are scanned as one long row.
    // NOTE(review): the stride is compared with the element count; if
    // internal::getRowPtr takes a byte stride this never matches densely
    // packed u16 rows -- confirm.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    s32 maxValue = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const u16 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        if (roi.width >= 8)
        {
            // Per-lane running maximum, seeded with the first 8 elements.
            uint16x8_t best = vld1q_u16(line);
            x = 8;
            while (x <= roi.width - 8)
            {
                internal::prefetch(line + x);
                const uint16x8_t cur = vld1q_u16(line + x);
                best = vmaxq_u16(cur, best);
                x += 8;
            }

            // Fold 8 lanes into 4 and finish the reduction in scalar code.
            u16 lanes[4];
            vst1_u16(lanes, vmax_u16(vget_low_u16(best), vget_high_u16(best)));
            for (u32 lane = 0; lane < 4; ++lane)
                maxValue = std::max((s32)(lanes[lane]), maxValue);
        }

        // Elements left over after the last full vector.
        while (x < roi.width)
        {
            maxValue = std::max((s32)(line[x]), maxValue);
            ++x;
        }
    }
    return maxValue;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}

/*
 * Infinity norm of a signed 16-bit image: the largest absolute element value.
 *
 *   _size      region width/height in elements
 *   srcBase    pointer to the first row
 *   srcStride  row stride forwarded to internal::getRowPtr
 *
 * Returns max |src|, or 0 for an empty region. When CAROTENE_NEON is not
 * defined the body reduces to "return 0".
 */
s32 normInf(const Size2D &_size,
            const s16 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the row width all rows are scanned as one long row.
    // NOTE(review): the stride is compared with the element count; if
    // internal::getRowPtr takes a byte stride this never matches densely
    // packed s16 rows -- confirm.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    s32 maxAbs = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const s16 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        if (roi.width >= 8)
        {
            // Absolute values are reinterpreted as unsigned before comparing.
            uint16x8_t best = vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(line)));
            x = 8;
            while (x <= roi.width - 8)
            {
                internal::prefetch(line + x);
                const uint16x8_t mag = vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(line + x)));
                best = vmaxq_u16(mag, best);
                x += 8;
            }

            // Fold 8 lanes into 4 and finish the reduction in scalar code.
            u16 lanes[4];
            vst1_u16(lanes, vmax_u16(vget_low_u16(best), vget_high_u16(best)));
            for (u32 lane = 0; lane < 4; ++lane)
                maxAbs = std::max((s32)(lanes[lane]), maxAbs);
        }

        // Elements left over after the last full vector.
        while (x < roi.width)
        {
            maxAbs = std::max(std::abs((s32)(line[x])), maxAbs);
            ++x;
        }
    }
    return maxAbs;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}

/*
 * Infinity norm of a signed 32-bit image: the largest absolute element value.
 *
 *   _size      region width/height in elements
 *   srcBase    pointer to the first row
 *   srcStride  row stride forwarded to internal::getRowPtr
 *
 * Returns max |src|, or 0 for an empty region. The magnitude of the most
 * negative s32 value does not fit the s32 return type, so it is reported
 * saturated as 0x7fFFffFF. When CAROTENE_NEON is not defined the body reduces
 * to "return 0".
 */
s32 normInf(const Size2D &_size,
            const s32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // When the stride equals the row width all rows are scanned as one long row.
    // NOTE(review): the stride is compared with the element count; if
    // internal::getRowPtr takes a byte stride this never matches densely
    // packed s32 rows -- confirm.
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    s32 result = 0;
    for(size_t k = 0; k < size.height; ++k)
    {
        const s32* src = internal::getRowPtr( srcBase,  srcStride, k);
        size_t i = 0;
        if (size.width >= 4)
        {
            // Saturating abs: the plain vabsq_s32 leaves the most negative
            // value unchanged (0x80000000), which used to win every unsigned
            // lane maximum and was then discarded as a negative s32, hiding
            // all other values of that lane.
            int32x4_t s = vqabsq_s32(vld1q_s32(src));
            for (i = 4; i <= size.width - 4; i += 4)
            {
                internal::prefetch(src + i);
                int32x4_t s1 = vqabsq_s32(vld1q_s32(src + i));
                s = vmaxq_s32(s1, s);
            }
            // Fold 4 lanes into 2 and finish the reduction in scalar code.
            s32 s2[2];
            int32x2_t s3 = vmax_s32(vget_low_s32(s), vget_high_s32(s));
            vst1_s32(s2, s3);
            for (u32 j = 0; j < 2; j++)
                result = std::max(s2[j], result);
        }
        for ( ; i < size.width; i++)
        {
            // std::abs is undefined for the most negative value; saturate
            // it the same way the vector path does.
            const s32 v = src[i];
            const s32 a = (v < -0x7fFFffFF) ? 0x7fFFffFF : std::abs(v);
            result = std::max(a, result);
        }
    }
    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}

/*
 * Infinity norm of a 32-bit float image: the largest absolute element value.
 *
 *   _size      region width/height in elements
 *   srcBase    pointer to the first row
 *   srcStride  row stride forwarded to internal::getRowPtr
 *
 * Returns max |src|, or 0 for an empty region. When CAROTENE_NEON is not
 * defined the body reduces to "return 0.".
 */
f32 normInf(const Size2D &_size,
            const f32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the row width all rows are scanned as one long row.
    // NOTE(review): the stride is compared with the element count; if
    // internal::getRowPtr takes a byte stride this never matches densely
    // packed f32 rows -- confirm.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    f32 maxAbs = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const f32 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        if (roi.width >= 4)
        {
            // Per-lane running maximum of |src|, seeded with the first 4 elements.
            float32x4_t best = vabsq_f32(vld1q_f32(line));
            x = 4;
            while (x <= roi.width - 4)
            {
                internal::prefetch(line + x);
                const float32x4_t mag = vabsq_f32(vld1q_f32(line + x));
                best = vmaxq_f32(mag, best);
                x += 4;
            }

            // Fold 4 lanes into 2 and finish the reduction in scalar code.
            f32 lanes[2];
            vst1_f32(lanes, vmax_f32(vget_low_f32(best), vget_high_f32(best)));
            for (u32 lane = 0; lane < 2; ++lane)
                maxAbs = std::max(lanes[lane], maxAbs);
        }

        // Elements left over after the last full vector.
        while (x < roi.width)
        {
            maxAbs = std::max(std::abs(line[x]), maxAbs);
            ++x;
        }
    }
    return maxAbs;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0.;
#endif
}

/*
 * L1 norm of an unsigned 8-bit image: the sum of all element values.
 *
 *   _size      region width/height in elements
 *   srcBase    pointer to the first row
 *   srcStride  row stride forwarded to internal::getRowPtr
 *
 * Returns the sum as s32 (0 for an empty region). When CAROTENE_NEON is not
 * defined the body reduces to "return 0".
 */
s32 normL1(const Size2D &_size,
           const u8 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the row width all rows are scanned as one long row.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    // Indices below this bound still have 8 readable elements in the row.
    const size_t vecEnd = roi.width >= 7 ? roi.width - 7 : 0;
    s32 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const u8 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;
        uint32x4_t rowSum = vmovq_n_u32(0);

        while (x < vecEnd)
        {
            // Accumulate at most 256 elements in 16-bit lanes, then widen
            // the partial sums into the 32-bit row accumulator.
            const size_t blockLast = std::min(roi.width, x + 256) - 8;
            uint16x8_t partial = vmovl_u8(vld1_u8(line + x));
            x += 8;

            while (x <= blockLast)
            {
                internal::prefetch(line + x);
                partial = vaddw_u8(partial, vld1_u8(line + x));
                x += 8;
            }

            const uint16x4_t folded = vadd_u16(vget_low_u16(partial), vget_high_u16(partial));
            rowSum = vaddw_u16(rowSum, folded);
        }

        u32 halves[2];
        vst1_u32(halves, vadd_u32(vget_low_u32(rowSum), vget_high_u32(rowSum)));
        total += (s32)(halves[0] + halves[1]);

        // Elements left over after the last full vector.
        while (x < roi.width)
        {
            total += (s32)(line[x]);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}

/*
 * L1 norm of a signed 8-bit image: the sum of absolute element values.
 *
 *   _size      region width/height in elements
 *   srcBase    pointer to the first row
 *   srcStride  row stride forwarded to internal::getRowPtr
 *
 * Returns the sum as s32 (0 for an empty region). When CAROTENE_NEON is not
 * defined the body reduces to "return 0".
 */
s32 normL1(const Size2D &_size,
           const s8 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the row width all rows are scanned as one long row.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    // Indices below this bound still have 8 readable elements in the row.
    const size_t vecEnd = roi.width >= 7 ? roi.width - 7 : 0;
    s32 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const s8 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;
        uint32x4_t rowSum = vmovq_n_u32(0);

        while (x < vecEnd)
        {
            // Accumulate at most 256 magnitudes in 16-bit lanes, then widen
            // the partial sums into the 32-bit row accumulator.
            const size_t blockLast = std::min(roi.width, x + 256) - 8;
            const uint8x8_t first = vreinterpret_u8_s8(vabs_s8(vld1_s8(line + x)));
            uint16x8_t partial = vmovl_u8(first);
            x += 8;

            while (x <= blockLast)
            {
                internal::prefetch(line + x);
                const uint8x8_t mag = vreinterpret_u8_s8(vabs_s8(vld1_s8(line + x)));
                partial = vaddw_u8(partial, mag);
                x += 8;
            }

            const uint16x4_t folded = vadd_u16(vget_low_u16(partial), vget_high_u16(partial));
            rowSum = vaddw_u16(rowSum, folded);
        }

        u32 halves[2];
        vst1_u32(halves, vadd_u32(vget_low_u32(rowSum), vget_high_u32(rowSum)));
        total += (s32)(halves[0] + halves[1]);

        // Elements left over after the last full vector.
        while (x < roi.width)
        {
            total += (s32)(std::abs(line[x]));
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}

/*
 * L1 norm of an unsigned 16-bit image: the sum of all element values.
 *
 *   _size      region width/height in elements
 *   srcBase    pointer to the first row
 *   srcStride  row stride forwarded to internal::getRowPtr
 *
 * Returns the sum as s32 (0 for an empty region). When CAROTENE_NEON is not
 * defined the body reduces to "return 0".
 */
s32 normL1(const Size2D &_size,
           const u16 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the row width all rows are scanned as one long row.
    // NOTE(review): the stride is compared with the element count; if
    // internal::getRowPtr takes a byte stride this never matches densely
    // packed u16 rows -- confirm.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    // Indices below this bound still have 4 readable elements in the row.
    const size_t vecEnd = roi.width >= 3 ? roi.width - 3 : 0;
    s32 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const u16 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        // Widening add: four u16 elements per step into 32-bit lanes.
        uint32x4_t rowSum = vmovq_n_u32(0);
        while (x < vecEnd)
        {
            internal::prefetch(line + x);
            rowSum = vaddw_u16(rowSum, vld1_u16(line + x));
            x += 4;
        }

        u32 lanes[4];
        vst1q_u32(lanes, rowSum);
        for (u32 lane = 0; lane < 4; ++lane)
            total += lanes[lane];

        // Elements left over after the last full vector.
        while (x < roi.width)
        {
            total += (s32)(line[x]);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}

/*
 * L1 norm of a signed 16-bit image: the sum of absolute element values.
 *
 *   _size      region width/height in elements
 *   srcBase    pointer to the first row
 *   srcStride  row stride forwarded to internal::getRowPtr
 *
 * Returns the sum as s32 (0 for an empty region). When CAROTENE_NEON is not
 * defined the body reduces to "return 0".
 */
s32 normL1(const Size2D &_size,
           const s16 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the row width all rows are scanned as one long row.
    // NOTE(review): the stride is compared with the element count; if
    // internal::getRowPtr takes a byte stride this never matches densely
    // packed s16 rows -- confirm.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    // Indices below this bound still have 4 readable elements in the row.
    const size_t vecEnd = roi.width >= 3 ? roi.width - 3 : 0;
    s32 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const s16 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        // Widening add of the magnitudes, reinterpreted as unsigned 16-bit.
        uint32x4_t rowSum = vmovq_n_u32(0);
        while (x < vecEnd)
        {
            internal::prefetch(line + x);
            const uint16x4_t mag = vreinterpret_u16_s16(vabs_s16(vld1_s16(line + x)));
            rowSum = vaddw_u16(rowSum, mag);
            x += 4;
        }

        u32 lanes[4];
        vst1q_u32(lanes, rowSum);
        for (u32 lane = 0; lane < 4; ++lane)
            total += lanes[lane];

        // Elements left over after the last full vector.
        while (x < roi.width)
        {
            total += (s32)(std::abs(line[x]));
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}

/*
 * L1 norm of a signed 32-bit image: the sum of absolute element values.
 *
 *   _size      region width/height in elements
 *   srcBase    pointer to the first row
 *   srcStride  row stride forwarded to internal::getRowPtr
 *
 * Returns the sum as f64 (0 for an empty region). Vector partial sums are
 * kept in f32 lanes for at most NORM32F_BLOCK_SIZE elements before being
 * added to the f64 total. When CAROTENE_NEON is not defined the body reduces
 * to "return 0.".
 */
f64 normL1(const Size2D &_size,
           const s32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // When the stride equals the row width all rows are scanned as one long row.
    // NOTE(review): the stride is compared with the element count; if
    // internal::getRowPtr takes a byte stride this never matches densely
    // packed s32 rows -- confirm.
    if (srcStride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    // Indices below this bound still have 4 readable elements in the row.
    size_t roiw4 = size.width >= 3 ? size.width - 3 : 0;
    f64 result = 0;
    for(size_t k = 0; k < size.height; ++k)
    {
        const s32* src = internal::getRowPtr( srcBase,  srcStride, k);
        size_t i = 0;
        for (; i < roiw4;)
        {
            size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4;
            // Convert first, take the magnitude in floating point: an integer
            // abs leaves the most negative s32 value negative, which made it
            // contribute -2^31 to the sum instead of +2^31.
            float32x4_t s = vabsq_f32(vcvtq_f32_s32(vld1q_s32(src + i)));
            for (i += 4; i <= limit; i += 4 )
            {
                internal::prefetch(src + i);
                float32x4_t s1 = vabsq_f32(vcvtq_f32_s32(vld1q_s32(src + i)));
                s = vaddq_f32(s, s1);
            }

            f32 s2[4];
            vst1q_f32(s2, s);

            for (u32 j = 0; j < 4; j++)
                result += (f64)(s2[j]);
        }
        // Scalar tail; the magnitude is taken after widening to f64 because
        // the integer std::abs is undefined for the most negative value.
        for ( ; i < size.width; i++)
            result += std::abs((f64)(src[i]));
    }
    return result;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0.;
#endif
}

/*
 * L1 norm of a 32-bit float image: the sum of absolute element values.
 *
 *   _size      region width/height in elements
 *   srcBase    pointer to the first row
 *   srcStride  row stride forwarded to internal::getRowPtr
 *
 * Returns the sum as f64 (0 for an empty region). Vector partial sums are
 * kept in f32 lanes for at most NORM32F_BLOCK_SIZE elements before being
 * added to the f64 total. When CAROTENE_NEON is not defined the body reduces
 * to "return 0.".
 */
f64 normL1(const Size2D &_size,
           const f32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the row width all rows are scanned as one long row.
    // NOTE(review): the stride is compared with the element count; if
    // internal::getRowPtr takes a byte stride this never matches densely
    // packed f32 rows -- confirm.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    // Indices below this bound still have 4 readable elements in the row.
    const size_t vecEnd = roi.width >= 3 ? roi.width - 3 : 0;
    f64 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const f32 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        while (x < vecEnd)
        {
            // One block: f32 lane sums seeded with the first vector.
            const size_t blockLast = std::min(roi.width, x + NORM32F_BLOCK_SIZE) - 4;
            float32x4_t partial = vabsq_f32(vld1q_f32(line + x));
            x += 4;
            while (x <= blockLast)
            {
                internal::prefetch(line + x);
                const float32x4_t mag = vabsq_f32(vld1q_f32(line + x));
                partial = vaddq_f32(mag, partial);
                x += 4;
            }

            // Flush the block into the double-precision total, lane by lane.
            f32 lanes[4];
            vst1q_f32(lanes, partial);
            for (u32 lane = 0; lane < 4; ++lane)
                total += (f64)(lanes[lane]);
        }

        // Elements left over after the last full vector.
        while (x < roi.width)
        {
            total += std::abs((f64)(line[x]));
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0.;
#endif
}

/*
 * Squared L2 norm of an unsigned 8-bit image: the sum of squared elements
 * (no square root is taken here).
 *
 *   _size      region width/height in elements
 *   srcBase    pointer to the first row
 *   srcStride  row stride forwarded to internal::getRowPtr
 *
 * Returns the sum as s32 (0 for an empty region). When CAROTENE_NEON is not
 * defined the body reduces to "return 0".
 */
s32 normL2(const Size2D &_size,
           const u8 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the row width all rows are scanned as one long row.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    // Indices below this bound still have 8 readable elements in the row.
    const size_t vecEnd = roi.width >= 7 ? roi.width - 7 : 0;
    s32 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const u8 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        // Two 32-bit accumulators: one per half of the widened 16-bit squares.
        uint32x4_t accLo = vmovq_n_u32(0);
        uint32x4_t accHi = vmovq_n_u32(0);
        while (x < vecEnd)
        {
            internal::prefetch(line + x);
            const uint8x8_t v = vld1_u8(line + x);
            const uint16x8_t squares = vmull_u8(v, v);

            accLo = vaddw_u16(accLo, vget_low_u16(squares));
            accHi = vaddw_u16(accHi, vget_high_u16(squares));
            x += 8;
        }

        const uint32x4_t acc = vaddq_u32(accLo, accHi);
        u32 halves[2];
        vst1_u32(halves, vadd_u32(vget_low_u32(acc), vget_high_u32(acc)));
        total += (s32)(halves[0] + halves[1]);

        // Elements left over after the last full vector.
        while (x < roi.width)
        {
            total += (s32)(line[x]) * (s32)(line[x]);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}

/*
 * Squared L2 norm of a signed 8-bit image: the sum of squared elements
 * (no square root is taken here).
 *
 *   _size      region width/height in elements
 *   srcBase    pointer to the first row
 *   srcStride  row stride forwarded to internal::getRowPtr
 *
 * Returns the sum as s32 (0 for an empty region). When CAROTENE_NEON is not
 * defined the body reduces to "return 0".
 */
s32 normL2(const Size2D &_size,
           const s8 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the row width all rows are scanned as one long row.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    // Indices below this bound still have 8 readable elements in the row.
    const size_t vecEnd = roi.width >= 7 ? roi.width - 7 : 0;
    s32 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const s8 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        // Two 32-bit accumulators: one per half of the widened 16-bit squares.
        int32x4_t accLo = vmovq_n_s32(0);
        int32x4_t accHi = vmovq_n_s32(0);
        while (x < vecEnd)
        {
            internal::prefetch(line + x);
            const int8x8_t v = vld1_s8(line + x);
            const int16x8_t squares = vmull_s8(v, v);

            accLo = vaddw_s16(accLo, vget_low_s16(squares));
            accHi = vaddw_s16(accHi, vget_high_s16(squares));
            x += 8;
        }

        const int32x4_t acc = vaddq_s32(accLo, accHi);
        s32 halves[2];
        vst1_s32(halves, vadd_s32(vget_low_s32(acc), vget_high_s32(acc)));
        total += halves[0] + halves[1];

        // Elements left over after the last full vector.
        while (x < roi.width)
        {
            total += (s32)(line[x]) * (s32)(line[x]);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0;
#endif
}

/*
 * Squared L2 norm of an unsigned 16-bit image: the sum of squared elements
 * (no square root is taken here).
 *
 *   _size      region width/height in elements
 *   srcBase    pointer to the first row
 *   srcStride  row stride forwarded to internal::getRowPtr
 *
 * Returns the sum as f64 (0 for an empty region). Squares are formed in
 * 32-bit integers, converted to f32 and summed in f32 lanes for at most
 * NORM32F_BLOCK_SIZE elements before being added to the f64 total. When
 * CAROTENE_NEON is not defined the body reduces to "return 0.".
 */
f64 normL2(const Size2D &_size,
           const u16 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the row width all rows are scanned as one long row.
    // NOTE(review): the stride is compared with the element count; if
    // internal::getRowPtr takes a byte stride this never matches densely
    // packed u16 rows -- confirm.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    // Indices below this bound still have 4 readable elements in the row.
    const size_t vecEnd = roi.width >= 3 ? roi.width - 3 : 0;
    f64 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const u16 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        while (x < vecEnd)
        {
            // One block: f32 lane sums seeded with the first vector of squares.
            const size_t blockLast = std::min(roi.width, x + NORM32F_BLOCK_SIZE) - 4;
            const uint16x4_t first = vld1_u16(line + x);
            float32x4_t partial = vcvtq_f32_u32(vmull_u16(first, first));
            x += 4;
            while (x <= blockLast)
            {
                internal::prefetch(line + x);
                const uint16x4_t v = vld1_u16(line + x);
                const float32x4_t squares = vcvtq_f32_u32(vmull_u16(v, v));
                partial = vaddq_f32(partial, squares);
                x += 4;
            }

            // Flush the block into the double-precision total, lane by lane.
            f32 lanes[4];
            vst1q_f32(lanes, partial);
            for (u32 lane = 0; lane < 4; ++lane)
                total += (f64)(lanes[lane]);
        }

        // Elements left over after the last full vector.
        while (x < roi.width)
        {
            total += (f64)(line[x]) * (f64)(line[x]);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0.;
#endif
}

/*
 * Squared L2 norm of a signed 16-bit image: the sum of squared elements
 * (no square root is taken here).
 *
 *   _size      region width/height in elements
 *   srcBase    pointer to the first row
 *   srcStride  row stride forwarded to internal::getRowPtr
 *
 * Returns the sum as f64 (0 for an empty region). Squares are formed in
 * 32-bit integers, converted to f32 and summed in f32 lanes for at most
 * NORM32F_BLOCK_SIZE elements before being added to the f64 total. When
 * CAROTENE_NEON is not defined the body reduces to "return 0.".
 */
f64 normL2(const Size2D &_size,
           const s16 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the row width all rows are scanned as one long row.
    // NOTE(review): the stride is compared with the element count; if
    // internal::getRowPtr takes a byte stride this never matches densely
    // packed s16 rows -- confirm.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    // Indices below this bound still have 4 readable elements in the row.
    const size_t vecEnd = roi.width >= 3 ? roi.width - 3 : 0;
    f64 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const s16 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        while (x < vecEnd)
        {
            // One block: f32 lane sums seeded with the first vector of squares.
            const size_t blockLast = std::min(roi.width, x + NORM32F_BLOCK_SIZE) - 4;
            const int16x4_t first = vld1_s16(line + x);
            float32x4_t partial = vcvtq_f32_s32(vmull_s16(first, first));
            x += 4;
            while (x <= blockLast)
            {
                internal::prefetch(line + x);
                const int16x4_t v = vld1_s16(line + x);
                const float32x4_t squares = vcvtq_f32_s32(vmull_s16(v, v));
                partial = vaddq_f32(partial, squares);
                x += 4;
            }

            // Flush the block into the double-precision total, lane by lane.
            f32 lanes[4];
            vst1q_f32(lanes, partial);
            for (u32 lane = 0; lane < 4; ++lane)
                total += (f64)(lanes[lane]);
        }

        // Elements left over after the last full vector.
        while (x < roi.width)
        {
            total += (f64)(line[x]) * (f64)(line[x]);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0.;
#endif
}

/*
 * Squared L2 norm of a signed 32-bit image: the sum of squared elements
 * (no square root is taken here).
 *
 *   _size      region width/height in elements
 *   srcBase    pointer to the first row
 *   srcStride  row stride forwarded to internal::getRowPtr
 *
 * Returns the sum as f64 (0 for an empty region). In the vector path the
 * elements are converted to f32 before squaring and summed in f32 lanes for
 * at most NORM32F_BLOCK_SIZE elements before being added to the f64 total;
 * the scalar tail squares in f64. When CAROTENE_NEON is not defined the body
 * reduces to "return 0.".
 */
f64 normL2(const Size2D &_size,
           const s32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the row width all rows are scanned as one long row.
    // NOTE(review): the stride is compared with the element count; if
    // internal::getRowPtr takes a byte stride this never matches densely
    // packed s32 rows -- confirm.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    // Indices below this bound still have 4 readable elements in the row.
    const size_t vecEnd = roi.width >= 3 ? roi.width - 3 : 0;
    f64 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const s32 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        while (x < vecEnd)
        {
            // One block: f32 lane sums seeded with the first vector of squares.
            const size_t blockLast = std::min(roi.width, x + NORM32F_BLOCK_SIZE) - 4;
            float32x4_t partial = vcvtq_f32_s32(vld1q_s32(line + x));
            partial = vmulq_f32(partial, partial);
            x += 4;
            while (x <= blockLast)
            {
                internal::prefetch(line + x);
                const float32x4_t v = vcvtq_f32_s32(vld1q_s32(line + x));
                partial = vmlaq_f32(partial, v, v);
                x += 4;
            }

            // Flush the block into the double-precision total, lane by lane.
            f32 lanes[4];
            vst1q_f32(lanes, partial);
            for (u32 lane = 0; lane < 4; ++lane)
                total += (f64)(lanes[lane]);
        }

        // Elements left over after the last full vector.
        while (x < roi.width)
        {
            total += (f64)(line[x]) * (f64)(line[x]);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0.;
#endif
}

/*
 * Squared L2 norm of a 32-bit float image: the sum of squared elements
 * (no square root is taken here).
 *
 *   _size      region width/height in elements
 *   srcBase    pointer to the first row
 *   srcStride  row stride forwarded to internal::getRowPtr
 *
 * Returns the sum as f64 (0 for an empty region). Vector partial sums are
 * kept in f32 lanes for at most NORM32F_BLOCK_SIZE elements before being
 * added to the f64 total; the scalar tail squares in f64. When CAROTENE_NEON
 * is not defined the body reduces to "return 0.".
 */
f64 normL2(const Size2D &_size,
           const f32 * srcBase, ptrdiff_t srcStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When the stride equals the row width all rows are scanned as one long row.
    // NOTE(review): the stride is compared with the element count; if
    // internal::getRowPtr takes a byte stride this never matches densely
    // packed f32 rows -- confirm.
    if ((ptrdiff_t)(roi.width) == srcStride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    // Indices below this bound still have 4 readable elements in the row.
    const size_t vecEnd = roi.width >= 3 ? roi.width - 3 : 0;
    f64 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const f32 * line = internal::getRowPtr(srcBase, srcStride, row);
        size_t x = 0;

        while (x < vecEnd)
        {
            // One block: f32 lane sums seeded with the first vector of squares.
            const size_t blockLast = std::min(roi.width, x + NORM32F_BLOCK_SIZE) - 4;
            float32x4_t partial = vld1q_f32(line + x);
            partial = vmulq_f32(partial, partial);
            x += 4;
            while (x <= blockLast)
            {
                internal::prefetch(line + x);
                const float32x4_t v = vld1q_f32(line + x);
                partial = vmlaq_f32(partial, v, v);
                x += 4;
            }

            // Flush the block into the double-precision total, lane by lane.
            f32 lanes[4];
            vst1q_f32(lanes, partial);
            for (u32 lane = 0; lane < 4; ++lane)
                total += (f64)(lanes[lane]);
        }

        // Elements left over after the last full vector.
        while (x < roi.width)
        {
            total += (f64)(line[x]) * (f64)(line[x]);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)srcBase;
    (void)srcStride;

    return 0.;
#endif
}

/*
 * Infinity norm of the difference of two unsigned 8-bit images: the largest
 * absolute element-wise difference.
 *
 *   _size       region width/height in elements
 *   src0Base    pointer to the first row of the first image
 *   src0Stride  row stride of the first image, forwarded to internal::getRowPtr
 *   src1Base    pointer to the first row of the second image
 *   src1Stride  row stride of the second image, forwarded to internal::getRowPtr
 *
 * Returns max |src0 - src1|, or 0 for an empty region. When CAROTENE_NEON is
 * not defined the body reduces to "return 0".
 */
s32 diffNormInf(const Size2D &_size,
                const u8 * src0Base, ptrdiff_t src0Stride,
                const u8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // When both strides equal the row width all rows are scanned as one long row.
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }
    s32 result = 0;
    for(size_t k = 0; k < size.height; ++k)
    {
        const u8* src1 = internal::getRowPtr( src0Base,  src0Stride, k);
        const u8* src2 = internal::getRowPtr( src1Base,  src1Stride, k);
        size_t i = 0;

        if (size.width >= 16)
        {
            uint8x16_t vs3 = vdupq_n_u8(0);
            // "<=" so that the last full 16-element vector (including the
            // only one when width == 16) is handled here rather than by the
            // scalar tail, like the other kernels in this file.
            for (; i <= size.width - 16; i += 16)
            {
                internal::prefetch(src1 + i);
                internal::prefetch(src2 + i);

                uint8x16_t vs1 = vld1q_u8(src1 + i);
                uint8x16_t vs2 = vld1q_u8(src2 + i);

                vs3 = vmaxq_u8(vs3, vabdq_u8(vs1, vs2));
            }

            // Pairwise max folds 16 lanes into 8; the rest is done in scalar code.
            u8 s2[8];
            vst1_u8(s2, vpmax_u8(vget_low_u8(vs3), vget_high_u8(vs3)));

            for (u32 j = 0; j < 8; j++)
                result = std::max((s32)(s2[j]), result);
        }

        // Elements left over after the last full vector.
        for (; i < size.width; i++)
        {
            result = std::max(std::abs((s32)(src1[i]) - (s32)(src2[i])), result);
        }
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}

/*
 * Infinity norm of the difference of two 32-bit float images: the largest
 * absolute element-wise difference.
 *
 *   _size       region width/height in elements
 *   src0Base    pointer to the first row of the first image
 *   src0Stride  row stride of the first image, forwarded to internal::getRowPtr
 *   src1Base    pointer to the first row of the second image
 *   src1Stride  row stride of the second image, forwarded to internal::getRowPtr
 *
 * Returns max |src0 - src1|, or 0 for an empty region. When CAROTENE_NEON is
 * not defined the body reduces to "return 0.".
 */
f32 diffNormInf(const Size2D &_size,
                const f32 * src0Base, ptrdiff_t src0Stride,
                const f32 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When both strides equal the row width all rows are scanned as one long row.
    // NOTE(review): the stride is compared with the element count; if
    // internal::getRowPtr takes a byte stride this never matches densely
    // packed f32 rows -- confirm.
    if (src0Stride == src1Stride &&
        (ptrdiff_t)(roi.width) == src0Stride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    f32 maxDiff = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const f32 * lineA = internal::getRowPtr(src0Base, src0Stride, row);
        const f32 * lineB = internal::getRowPtr(src1Base, src1Stride, row);
        size_t x = 0;

        if (roi.width >= 4)
        {
            // Per-lane running maximum, seeded with the first 4 differences.
            float32x4_t best = vabdq_f32(vld1q_f32(lineA), vld1q_f32(lineB));
            x = 4;
            while (x <= roi.width - 4)
            {
                internal::prefetch(lineA + x);
                internal::prefetch(lineB + x);

                const float32x4_t a = vld1q_f32(lineA + x);
                const float32x4_t b = vld1q_f32(lineB + x);

                best = vmaxq_f32(best, vabdq_f32(b, a));
                x += 4;
            }

            // Finish the reduction over the 4 lanes in scalar code.
            f32 lanes[4];
            vst1q_f32(lanes, best);
            for (u32 lane = 0; lane < 4; ++lane)
            {
                if (lanes[lane] > maxDiff)
                    maxDiff = lanes[lane];
            }
        }

        // Elements left over after the last full vector.
        while (x < roi.width)
        {
            const f32 diff = std::abs(lineA[x] - lineB[x]);
            if (diff > maxDiff)
                maxDiff = diff;
            ++x;
        }
    }
    return maxDiff;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0.;
#endif
}

/*
 * L1 norm of the difference of two unsigned 8-bit images: the sum of
 * absolute element-wise differences, saturated at 0x7fFFffFF.
 *
 *   _size       region width/height in elements
 *   src0Base    pointer to the first row of the first image
 *   src0Stride  row stride of the first image, forwarded to internal::getRowPtr
 *   src1Base    pointer to the first row of the second image
 *   src1Stride  row stride of the second image, forwarded to internal::getRowPtr
 *
 * Returns the sum as s32 (0 for an empty region); as soon as the running sum
 * would reach 0x7fFFffFF the function returns that value. When CAROTENE_NEON
 * is not defined the body reduces to "return 0".
 */
s32 diffNormL1(const Size2D &_size,
               const u8 * src0Base, ptrdiff_t src0Stride,
               const u8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When both strides equal the row width all rows are scanned as one long row.
    if (src0Stride == src1Stride &&
        (ptrdiff_t)(roi.width) == src0Stride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    s32 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const u8 * lineA = internal::getRowPtr(src0Base, src0Stride, row);
        const u8 * lineB = internal::getRowPtr(src1Base, src1Stride, row);
        size_t x = 0;

        if (roi.width >= 16)
        {
            while (x <= roi.width - 16)
            {
                // One block of at most 512 elements, accumulated in 16-bit lanes.
                const size_t blockLast = std::min(roi.width, x + 2*256) - 16;
                uint16x8_t accLo = vmovq_n_u16(0);
                uint16x8_t accHi = vmovq_n_u16(0);

                while (x <= blockLast)
                {
                    internal::prefetch(lineA + x);
                    internal::prefetch(lineB + x);

                    const uint8x16_t a = vld1q_u8(lineA + x);
                    const uint8x16_t b = vld1q_u8(lineB + x);

                    accLo = vabal_u8(accLo, vget_low_u8(a), vget_low_u8(b));
                    accHi = vabal_u8(accHi, vget_high_u8(a), vget_high_u8(b));
                    x += 16;
                }

                // Widen the block sums to 32 bits and add them with a
                // saturation check on every lane.
                u32 lanes[4];
                vst1q_u32(lanes, vaddq_u32(vpaddlq_u16(accLo), vpaddlq_u16(accHi)));

                for (u32 lane = 0; lane < 4; ++lane)
                {
                    if ((s32)(0x7fFFffFFu - lanes[lane]) <= total)
                    {
                        return 0x7fFFffFF; //result already saturated
                    }
                    total = (s32)((u32)(total) + lanes[lane]);
                }
            }
        }

        // Elements left over after the last full vector.
        while (x < roi.width)
        {
            const u32 diff = std::abs((s32)(lineA[x]) - (s32)(lineB[x]));

            if ((s32)(0x7fFFffFFu - diff) <= total)
            {
                return 0x7fFFffFF; //result already saturated
            }
            total = (s32)((u32)(total) + diff);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}

/*
 * L1 norm of the difference of two 32-bit float images: the sum of absolute
 * element-wise differences.
 *
 *   _size       region width/height in elements
 *   src0Base    pointer to the first row of the first image
 *   src0Stride  row stride of the first image, forwarded to internal::getRowPtr
 *   src1Base    pointer to the first row of the second image
 *   src1Stride  row stride of the second image, forwarded to internal::getRowPtr
 *
 * Returns the sum as f64 (0 for an empty region). Vector partial sums are
 * kept in f32 lanes for at most NORM32F_BLOCK_SIZE elements before being
 * added to the f64 total. When CAROTENE_NEON is not defined the body reduces
 * to "return 0.".
 */
f64 diffNormL1(const Size2D &_size,
               const f32 * src0Base, ptrdiff_t src0Stride,
               const f32 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D roi(_size);
    // When both strides equal the row width all rows are scanned as one long row.
    // NOTE(review): the stride is compared with the element count; if
    // internal::getRowPtr takes a byte stride this never matches densely
    // packed f32 rows -- confirm.
    if (src0Stride == src1Stride &&
        (ptrdiff_t)(roi.width) == src0Stride)
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

    f64 total = 0;
    for (size_t row = 0; row < roi.height; ++row)
    {
        const f32 * lineA = internal::getRowPtr(src0Base, src0Stride, row);
        const f32 * lineB = internal::getRowPtr(src1Base, src1Stride, row);
        size_t x = 0;

        if (roi.width >= 4)
        {
            while (x <= roi.width - 4)
            {
                // One block: f32 lane sums starting from zero.
                const size_t blockLast = std::min(roi.width, x + NORM32F_BLOCK_SIZE) - 4;
                float32x4_t partial = vmovq_n_f32(0.0f);

                while (x <= blockLast)
                {
                    internal::prefetch(lineA + x);
                    internal::prefetch(lineB + x);

                    const float32x4_t a = vld1q_f32(lineA + x);
                    const float32x4_t b = vld1q_f32(lineB + x);

                    partial = vaddq_f32(partial, vabdq_f32(b, a));
                    x += 4;
                }

                // Flush the block into the double-precision total, lane by lane.
                f32 lanes[4];
                vst1q_f32(lanes, partial);
                for (u32 lane = 0; lane < 4; ++lane)
                    total += (f64)(lanes[lane]);
            }
        }

        // Elements left over after the last full vector.
        while (x < roi.width)
        {
            const f32 diff = std::abs(lineA[x] - lineB[x]);
            total += (f64)(diff);
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0.;
#endif
}

/*
 * Sum of squared differences between two u8 images, saturated to 0x7fFFffFF.
 *
 * _size                  image extent; its width and height fields are read
 * src0Base / src0Stride  first image: pointer to its first row and its row stride
 * src1Base / src1Stride  second image, described the same way
 *
 * Returns the s32 total of (src0 - src1)^2 over all elements, or 0x7fFFffFF as
 * soon as adding the next partial sum would reach that value. Builds without
 * CAROTENE_NEON ignore the arguments and return 0.
 */
s32 diffNormL2(const Size2D &_size,
               const u8 * src0Base, ptrdiff_t src0Stride,
               const u8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
#define NORML28U_BLOCK_SIZE (33024*2) //bigger block size can result in integer overflow
    Size2D dims(_size);
    // Equal strides that match the row width: walk everything as one long row.
    const bool packed = (src0Stride == src1Stride) &&
                        (src0Stride == (ptrdiff_t)(dims.width));
    if (packed)
    {
        dims.width *= dims.height;
        dims.height = 1;
    }

    s32 total = 0;
    for (size_t row = 0; row < dims.height; ++row)
    {
        const u8* rowA = internal::getRowPtr(src0Base, src0Stride, row);
        const u8* rowB = internal::getRowPtr(src1Base, src1Stride, row);
        size_t x = 0;

        if (dims.width >= 16)
        {
            // Last index at which a full 16-byte load still fits in the row.
            const size_t lastVecStart = dims.width - 16;

            while (x <= lastVecStart)
            {
                // One block: u32 lane accumulators are restarted per block so
                // they cannot overflow (see NORML28U_BLOCK_SIZE).
                const size_t blockLast = std::min(dims.width, x + NORML28U_BLOCK_SIZE) - 16;
                uint32x4_t accA = vmovq_n_u32(0);
                uint32x4_t accB = vmovq_n_u32(0);

                while (x <= blockLast)
                {
                    internal::prefetch(rowA + x);
                    internal::prefetch(rowB + x);

                    const uint8x16_t a = vld1q_u8(rowA + x);
                    const uint8x16_t b = vld1q_u8(rowB + x);

                    // |a - b| widened to 16 bits, low and high eight bytes.
                    const uint16x8_t diffLo = vabdl_u8(vget_low_u8(a), vget_low_u8(b));
                    const uint16x8_t diffHi = vabdl_u8(vget_high_u8(a), vget_high_u8(b));

                    const uint16x4_t d0 = vget_low_u16(diffLo);
                    const uint16x4_t d1 = vget_high_u16(diffLo);
                    const uint16x4_t d2 = vget_low_u16(diffHi);
                    const uint16x4_t d3 = vget_high_u16(diffHi);

                    // Square each difference and accumulate in 32-bit lanes.
                    accA = vmlal_u16(accA, d0, d0);
                    accB = vmlal_u16(accB, d1, d1);

                    accA = vmlal_u16(accA, d2, d2);
                    accB = vmlal_u16(accB, d3, d3);

                    x += 16;
                }

                u32 lanes[4];
                vst1q_u32(lanes, vqaddq_u32(accA, accB));

                for (u32 lane = 0; lane < 4; ++lane)
                {
                    // Room left below the saturation value for this lane's sum.
                    const s32 headroom = (s32)(0x7fFFffFFu - lanes[lane]);
                    if (headroom <= total)
                    {
                        return 0x7fFFffFF; //result already saturated
                    }
                    total += (s32)lanes[lane];
                }
            }
        }

        // Scalar tail for whatever the 16-wide loop did not cover.
        while (x < dims.width)
        {
            const s32 delta = (s32)(rowA[x]) - (s32)(rowB[x]);
            const s32 sq = delta * delta;

            if ((s32)(0x7fFFffFFu - (u32)(sq)) <= total)
            {
                return 0x7fFFffFF; //result already saturated
            }
            total += sq;
            ++x;
        }
    }
    return total;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}

/*
 * Sum of squared differences between two f32 images.
 *
 * _size                  image extent; its width and height fields are read
 * src0Base / src0Stride  first image: pointer to its first row and its row stride
 * src1Base / src1Stride  second image, described the same way
 *
 * Rows are located with internal::getRowPtr(). The strides are treated as byte
 * distances, which is why the contiguity test below scales the width by
 * sizeof(f32) (NOTE(review): confirm against internal::getRowPtr).
 *
 * Returns the f64 total of (src0 - src1)^2 over all elements; no square root
 * is taken here. Builds without CAROTENE_NEON ignore the arguments and
 * return 0.
 */
f64 diffNormL2(const Size2D &_size,
               const f32 * src0Base, ptrdiff_t src0Stride,
               const f32 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // Both images densely packed: handle them as a single long row.
    // A packed f32 row spans width * sizeof(f32) bytes; comparing the stride
    // with the bare element count would never match a packed image and would
    // flatten a layout whose rows overlap, reading past the end of the data.
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width * sizeof(f32)))
    {
        size.width *= size.height;
        size.height = 1;
    }
    f64 result = 0;
    for(size_t k = 0; k < size.height; ++k)
    {
        const f32* src1 = internal::getRowPtr( src0Base,  src0Stride, k);
        const f32* src2 = internal::getRowPtr( src1Base,  src1Stride, k);
        size_t i = 0;

        // The guard keeps the unsigned "size.width - 4" below from wrapping.
        if (size.width >= 4)
        {
            for(; i <= size.width - 4;)
            {
                // Partial sums stay in f32 lanes for at most NORM32F_BLOCK_SIZE
                // elements, then get folded into the f64 total.
                size_t limit = std::min(size.width, i + NORM32F_BLOCK_SIZE) - 4;
                float32x4_t s = vmovq_n_f32(0.0f);

                for (; i <= limit; i += 4 )
                {
                    internal::prefetch(src1 + i);
                    internal::prefetch(src2 + i);

                    float32x4_t vs1 = vld1q_f32(src1 + i);
                    float32x4_t vs2 = vld1q_f32(src2 + i);

                    // s += (vs2 - vs1)^2, lane by lane
                    float32x4_t vd = vsubq_f32(vs2,vs1);
                    s = vmlaq_f32(s, vd, vd);
                }

                f32 s2[4];
                vst1q_f32(s2, s);

                for (u32 j = 0; j < 4; j++)
                    result += (f64)(s2[j]);
            }
        }

        // Scalar tail: the square is formed in f32, as in the vector loop,
        // and widened to f64 only when it is added to the total.
        for (; i < size.width; i++)
        {
            f32 v = src1[i] - src2[i];
            result += v * v;
        }
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0.;
#endif
}

} // namespace CAROTENE_NS