/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */

#include "common.hpp"

#include <cstring>

namespace CAROTENE_NS {

void reduceColSum(const Size2D &size,
                  const u8 * srcBase, ptrdiff_t srcStride,
                  s32 * dstBase)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    memset(dstBase, 0, size.width*sizeof(s32));
    size_t i = 0;
    for (; i + 16 <= size.width; i += 16)
    {
        const u8* src_address = srcBase + i;

        int32x4_t sll = vmovq_n_s32(0);
        int32x4_t slh = vmovq_n_s32(0);
        int32x4_t shl = vmovq_n_s32(0);
        int32x4_t shh = vmovq_n_s32(0);

        for (size_t h = 0; h < size.height; h += 256)
        {
            size_t lim = std::min(h + 256, size.height);

            uint16x8_t sl = vmovq_n_u16(0);
            uint16x8_t sh = vmovq_n_u16(0);

            for (size_t k = h; k < lim; ++k, src_address += srcStride)
            {
                internal::prefetch(src_address + srcStride, 0);

                uint8x16_t v = vld1q_u8(src_address);

                sl = vaddw_u8(sl, vget_low_u8(v));
                sh = vaddw_u8(sh, vget_high_u8(v));
            }

            int32x4_t vsll = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sl)));
            int32x4_t vslh = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sl)));
            int32x4_t vshl = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sh)));
            int32x4_t vshh = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sh)));

            sll = vqaddq_s32(sll, vsll);
            slh = vqaddq_s32(slh, vslh);
            shl = vqaddq_s32(shl, vshl);
            shh = vqaddq_s32(shh, vshh);
        }

        vst1q_s32(dstBase + i + 0, sll);
        vst1q_s32(dstBase + i + 4, slh);
        vst1q_s32(dstBase + i + 8, shl);
        vst1q_s32(dstBase + i + 12, shh);
    }

    for(size_t h = 0; h < size.height; ++h)
    {
        for(size_t j = i ; j < size.width; j++ )
        {
            if (((u32)(dstBase[j] += srcBase[j + srcStride * h])) > 0x7fFFffFFu)
                dstBase[j] = 0x7fFFffFF;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
#endif
}

void reduceColMax(const Size2D &size,
                  const u8 * srcBase, ptrdiff_t srcStride,
                  u8 * dstBase)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    memcpy(dstBase, srcBase, size.width);
    size_t i = 0;
    for (; i + 16*4 <= size.width; i += 16*4)
    {
        const u8* src_address = srcBase + i;

        uint8x16_t s1 = vld1q_u8(src_address + 0);
        uint8x16_t s2 = vld1q_u8(src_address + 16);
        uint8x16_t s3 = vld1q_u8(src_address + 32);
        uint8x16_t s4 = vld1q_u8(src_address + 48);

        src_address += srcStride;

        for(size_t h = 1; h < size.height; ++h, src_address += srcStride)
        {
            internal::prefetch(src_address + srcStride, 0);
            internal::prefetch(src_address + srcStride, 32);

            uint8x16_t v1 = vld1q_u8(src_address + 0);
            uint8x16_t v2 = vld1q_u8(src_address + 16);
            uint8x16_t v3 = vld1q_u8(src_address + 32);
            uint8x16_t v4 = vld1q_u8(src_address + 48);

            s1 = vmaxq_u8(s1, v1);
            s2 = vmaxq_u8(s2, v2);
            s3 = vmaxq_u8(s3, v3);
            s4 = vmaxq_u8(s4, v4);
        }

        vst1q_u8(dstBase + i + 0, s1);
        vst1q_u8(dstBase + i + 16, s2);
        vst1q_u8(dstBase + i + 32, s3);
        vst1q_u8(dstBase + i + 48, s4);
    }

    for (; i + 16 <= size.width; i += 16)
    {
        const u8* src_address = srcBase + i;
        uint8x16_t s1 = vld1q_u8(src_address);
        src_address += srcStride;
        for(size_t h = 1; h < size.height; ++h, src_address += srcStride)
        {
            internal::prefetch(src_address + srcStride, 0);

            uint8x16_t v1 = vld1q_u8(src_address);
            s1 = vmaxq_u8(s1, v1);
        }
        vst1q_u8(dstBase + i, s1);
    }

    if (i < size.width)
        for(size_t h = 1; h < size.height; ++h)
            for(size_t j = i ; j < size.width; j++ )
                dstBase[j] = std::max(dstBase[j], srcBase[j + srcStride * h]);
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
#endif
}

void reduceColMin(const Size2D &size,
                  const u8 * srcBase, ptrdiff_t srcStride,
                  u8 * dstBase)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    memcpy(dstBase, srcBase, size.width);
    size_t i = 0;
    for (; i + 16*4 <= size.width; i += 16*4)
    {
        const u8* src_address = srcBase + i;

        uint8x16_t s1 = vld1q_u8(src_address + 0);
        uint8x16_t s2 = vld1q_u8(src_address + 16);
        uint8x16_t s3 = vld1q_u8(src_address + 32);
        uint8x16_t s4 = vld1q_u8(src_address + 48);

        src_address += srcStride;

        for(size_t h = 1; h < size.height; ++h, src_address += srcStride)
        {
            internal::prefetch(src_address + srcStride, 0);
            internal::prefetch(src_address + srcStride, 32);

            uint8x16_t v1 = vld1q_u8(src_address + 0);
            uint8x16_t v2 = vld1q_u8(src_address + 16);
            uint8x16_t v3 = vld1q_u8(src_address + 32);
            uint8x16_t v4 = vld1q_u8(src_address + 48);

            s1 = vminq_u8(s1, v1);
            s2 = vminq_u8(s2, v2);
            s3 = vminq_u8(s3, v3);
            s4 = vminq_u8(s4, v4);
        }

        vst1q_u8(dstBase + i + 0, s1);
        vst1q_u8(dstBase + i + 16, s2);
        vst1q_u8(dstBase + i + 32, s3);
        vst1q_u8(dstBase + i + 48, s4);
    }

    for (; i + 16 <= size.width; i += 16)
    {
        const u8* src_address = srcBase + i;
        uint8x16_t s1 = vld1q_u8(src_address);
        src_address += srcStride;
        for(size_t h = 1; h < size.height; ++h, src_address += srcStride)
        {
            internal::prefetch(src_address + srcStride, 0);

            uint8x16_t v1 = vld1q_u8(src_address);
            s1 = vminq_u8(s1, v1);
        }
        vst1q_u8(dstBase + i, s1);
    }

    if (i < size.width)
        for(size_t h = 1; h < size.height; ++h)
            for(size_t j = i ; j < size.width; j++ )
                dstBase[j] = std::min(dstBase[j], srcBase[j + srcStride * h]);
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
#endif
}

void reduceColSum(const Size2D &size,
                  const f32 * srcBase, ptrdiff_t srcStride,
                  f32 * dstBase)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    memcpy(dstBase, srcBase, size.width*sizeof(f32));
    size_t srcstep = srcStride/sizeof(f32);
    size_t i = 0;
    for (; i + 16 <= size.width; i += 16)
    {
        const f32* src_address = srcBase + i;

        float32x4_t s1 = vld1q_f32(src_address + 0);
        float32x4_t s2 = vld1q_f32(src_address + 4);
        float32x4_t s3 = vld1q_f32(src_address + 8);
        float32x4_t s4 = vld1q_f32(src_address + 12);

        src_address += srcstep;

        for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
        {
            internal::prefetch(src_address + srcstep, 0);
            internal::prefetch(src_address + srcstep, 32);

            float32x4_t v1 = vld1q_f32(src_address + 0);
            float32x4_t v2 = vld1q_f32(src_address + 4);
            float32x4_t v3 = vld1q_f32(src_address + 8);
            float32x4_t v4 = vld1q_f32(src_address + 12);

            s1 = vaddq_f32(s1, v1);
            s2 = vaddq_f32(s2, v2);
            s3 = vaddq_f32(s3, v3);
            s4 = vaddq_f32(s4, v4);
        }

        vst1q_f32(dstBase + i + 0, s1);
        vst1q_f32(dstBase + i + 4, s2);
        vst1q_f32(dstBase + i + 8, s3);
        vst1q_f32(dstBase + i + 12, s4);
    }

    for (; i + 4 <= size.width; i += 4)
    {
        const f32* src_address = srcBase + i;
        float32x4_t s1 = vld1q_f32(src_address);
        src_address += srcstep;
        for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
        {
            internal::prefetch(src_address + srcstep, 0);

            float32x4_t v1 = vld1q_f32(src_address);
            s1 = vaddq_f32(s1, v1);
        }
        vst1q_f32(dstBase + i, s1);
    }

    if (i < size.width)
        for(size_t h = 1; h < size.height; ++h)
        {
            for(size_t j = i ; j < size.width; j++ )
            {
                dstBase[j] += srcBase[j + srcstep * h];
            }
        }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
#endif
}

void reduceColMax(const Size2D &size,
                  const f32 * srcBase, ptrdiff_t srcStride,
                  f32 * dstBase)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    memcpy(dstBase, srcBase, size.width*sizeof(f32));
    size_t srcstep = srcStride/sizeof(f32);
    size_t i = 0;
    for (; i + 16 <= size.width; i += 16)
    {
        const f32* src_address = srcBase + i;

        float32x4_t s1 = vld1q_f32(src_address + 0);
        float32x4_t s2 = vld1q_f32(src_address + 4);
        float32x4_t s3 = vld1q_f32(src_address + 8);
        float32x4_t s4 = vld1q_f32(src_address + 12);

        src_address += srcstep;

        for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
        {
            internal::prefetch(src_address + srcstep, 0);
            internal::prefetch(src_address + srcstep, 32);

            float32x4_t v1 = vld1q_f32(src_address + 0);
            float32x4_t v2 = vld1q_f32(src_address + 4);
            float32x4_t v3 = vld1q_f32(src_address + 8);
            float32x4_t v4 = vld1q_f32(src_address + 12);

            s1 = vmaxq_f32(s1, v1);
            s2 = vmaxq_f32(s2, v2);
            s3 = vmaxq_f32(s3, v3);
            s4 = vmaxq_f32(s4, v4);
        }

        vst1q_f32(dstBase + i + 0, s1);
        vst1q_f32(dstBase + i + 4, s2);
        vst1q_f32(dstBase + i + 8, s3);
        vst1q_f32(dstBase + i + 12, s4);
    }

    for (; i + 4 <= size.width; i += 4)
    {
        const f32* src_address = srcBase + i;
        float32x4_t s1 = vld1q_f32(src_address);
        src_address += srcstep;
        for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
        {
            internal::prefetch(src_address + srcstep, 0);

            float32x4_t v1 = vld1q_f32(src_address);
            s1 = vmaxq_f32(s1, v1);
        }
        vst1q_f32(dstBase + i, s1);
    }

    if (i < size.width)
        for(size_t h = 1; h < size.height; ++h)
            for(size_t j = i ; j < size.width; j++ )
                dstBase[j] = std::max(dstBase[j], srcBase[j + srcstep * h]);
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
#endif
}

void reduceColMin(const Size2D &size,
                  const f32 * srcBase, ptrdiff_t srcStride,
                  f32 * dstBase)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    memcpy(dstBase, srcBase, size.width*sizeof(f32));
    size_t srcstep = srcStride/sizeof(f32);
    size_t i = 0;
    for (; i + 16 <= size.width; i += 16)
    {
        const f32* src_address = srcBase + i;

        float32x4_t s1 = vld1q_f32(src_address + 0);
        float32x4_t s2 = vld1q_f32(src_address + 4);
        float32x4_t s3 = vld1q_f32(src_address + 8);
        float32x4_t s4 = vld1q_f32(src_address + 12);

        src_address += srcstep;

        for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
        {
            internal::prefetch(src_address + srcstep, 0);
            internal::prefetch(src_address + srcstep, 32);

            float32x4_t v1 = vld1q_f32(src_address + 0);
            float32x4_t v2 = vld1q_f32(src_address + 4);
            float32x4_t v3 = vld1q_f32(src_address + 8);
            float32x4_t v4 = vld1q_f32(src_address + 12);

            s1 = vminq_f32(s1, v1);
            s2 = vminq_f32(s2, v2);
            s3 = vminq_f32(s3, v3);
            s4 = vminq_f32(s4, v4);
        }

        vst1q_f32(dstBase + i + 0, s1);
        vst1q_f32(dstBase + i + 4, s2);
        vst1q_f32(dstBase + i + 8, s3);
        vst1q_f32(dstBase + i + 12, s4);
    }

    for (; i + 4 <= size.width; i += 4)
    {
        const f32* src_address = srcBase + i;
        float32x4_t s1 = vld1q_f32(src_address);
        src_address += srcstep;
        for(size_t h = 1; h < size.height; ++h, src_address += srcstep)
        {
            internal::prefetch(src_address + srcstep, 0);

            float32x4_t v1 = vld1q_f32(src_address);
            s1 = vminq_f32(s1, v1);
        }
        vst1q_f32(dstBase + i, s1);
    }

    if (i < size.width)
        for(size_t h = 1; h < size.height; ++h)
            for(size_t j = i ; j < size.width; j++ )
                dstBase[j] = std::min(dstBase[j], srcBase[j + srcstep * h]);
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
#endif
}

} // namespace CAROTENE_NS