/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */

#include "common.hpp"
#include "vtransform.hpp"

namespace CAROTENE_NS {

#ifdef CAROTENE_NEON

namespace {

inline void vnst(u8* dst, uint8x16_t v1, uint8x16_t v2) { vst1q_u8(dst, v1); vst1q_u8(dst+16, v2); }
inline void vnst(u8* dst, uint16x8_t v1, uint16x8_t v2) { vst1q_u8(dst, vcombine_u8(vmovn_u16(v1), vmovn_u16(v2))); }
inline void vnst(u8* dst, uint32x4_t v1, uint32x4_t v2) { vst1_u8(dst, vmovn_u16(vcombine_u16(vmovn_u32(v1), vmovn_u32(v2)))); }

template <typename Op, int elsize> struct vtail
{
    static inline void compare(const typename Op::type * src0, const typename Op::type * src1,
                               u8 * dst, const Op & op,
                               size_t &x, size_t width)
    {
        //do nothing since there couldn't be enough data
        (void)src0;
        (void)src1;
        (void)dst;
        (void)op;
        (void)x;
        (void)width;
    }
};
template <typename Op> struct vtail<Op, 2>
{
    static inline void compare(const typename Op::type * src0, const typename Op::type * src1,
                               u8 * dst, const Op & op,
                               size_t &x, size_t width)
    {
        typedef typename Op::type type;
        typedef typename internal::VecTraits<type>::vec128 vec128;
        typedef typename internal::VecTraits<type>::unsign::vec128 uvec128;
        //There no more than 15 elements in the tail, so we could handle 8 element vector only once
        if( x + 8 < width)
        {
            vec128  v_src0, v_src1;
            uvec128 v_dst;

            v_src0 = internal::vld1q(src0 + x);
            v_src1 = internal::vld1q(src1 + x);
            op(v_src0, v_src1, v_dst);
            internal::vst1(dst + x, internal::vmovn(v_dst));
            x+=8;
        }
    }
};
template <typename Op> struct vtail<Op, 1>
{
    static inline void compare(const typename Op::type * src0, const typename Op::type * src1,
                               u8 * dst, const Op & op,
                               size_t &x, size_t width)
    {
        typedef typename Op::type type;
        typedef typename internal::VecTraits<type>::vec128 vec128;
        typedef typename internal::VecTraits<type>::unsign::vec128 uvec128;
        typedef typename internal::VecTraits<type>::vec64 vec64;
        typedef typename internal::VecTraits<type>::unsign::vec64 uvec64;
        //There no more than 31 elements in the tail, so we could handle once 16+8 or 16 or 8 elements
        if( x + 16 < width)
        {
            vec128  v_src0, v_src1;
            uvec128 v_dst;

            v_src0 = internal::vld1q(src0 + x);
            v_src1 = internal::vld1q(src1 + x);
            op(v_src0, v_src1, v_dst);
            internal::vst1q(dst + x, v_dst);
            x+=16;
        }
        if( x + 8 < width)
        {
            vec64  v_src0, v_src1;
            uvec64 v_dst;

            v_src0 = internal::vld1(src0 + x);
            v_src1 = internal::vld1(src1 + x);
            op(v_src0, v_src1, v_dst);
            internal::vst1(dst + x, v_dst);
            x+=8;
        }
    }
};

template <typename Op>
void vcompare(Size2D size,
              const typename Op::type * src0Base, ptrdiff_t src0Stride,
              const typename Op::type * src1Base, ptrdiff_t src1Stride,
              u8 * dstBase, ptrdiff_t dstStride, const Op & op)
{
    typedef typename Op::type type;
    typedef typename internal::VecTraits<type>::vec128 vec128;
    typedef typename internal::VecTraits<type>::unsign::vec128 uvec128;

    if (src0Stride == src1Stride && src0Stride == dstStride &&
        src0Stride == (ptrdiff_t)(size.width * sizeof(type)))
    {
        size.width *= size.height;
        size.height = 1;
    }

    const u32 step_base = 32 / sizeof(type);
    size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;

    for (size_t y = 0; y < size.height; ++y)
    {
        const type * src0 = internal::getRowPtr(src0Base, src0Stride, y);
        const type * src1 = internal::getRowPtr(src1Base, src1Stride, y);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, y);
        size_t x = 0;

        for( ; x < roiw_base; x += step_base )
        {
            internal::prefetch(src0 + x);
            internal::prefetch(src1 + x);

            vec128 v_src00 = internal::vld1q(src0 + x), v_src01 = internal::vld1q(src0 + x + 16 / sizeof(type));
            vec128 v_src10 = internal::vld1q(src1 + x), v_src11 = internal::vld1q(src1 + x + 16 / sizeof(type));
            uvec128 v_dst0;
            uvec128 v_dst1;

            op(v_src00, v_src10, v_dst0);
            op(v_src01, v_src11, v_dst1);

            vnst(dst + x, v_dst0, v_dst1);
        }

        vtail<Op, sizeof(type)>::compare(src0, src1, dst, op, x, size.width);

        for (; x < size.width; ++x)
        {
            op(src0 + x, src1 + x, dst + x);
        }
    }
}

template<typename T>
struct OpCmpEQ
{
    typedef T type;

    void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
              typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
    {
        v_dst = internal::vceqq(v_src0, v_src1);
    }

    void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
              typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
    {
        v_dst = internal::vceq(v_src0, v_src1);
    }

    void operator() (const T * src0, const T * src1, u8 * dst) const
    {
        dst[0] = src0[0] == src1[0] ? 255 : 0;
    }
};

template<typename T>
struct OpCmpNE
{
    typedef T type;

    void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
              typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
    {
        v_dst = internal::vmvnq(internal::vceqq(v_src0, v_src1));
    }

    void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
              typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
    {
        v_dst = internal::vmvn(internal::vceq(v_src0, v_src1));
    }

    void operator() (const T * src0, const T * src1, u8 * dst) const
    {
        dst[0] = src0[0] == src1[0] ? 0 : 255;
    }
};

template<typename T>
struct OpCmpGT
{
    typedef T type;

    void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
              typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
    {
        v_dst = internal::vcgtq(v_src0, v_src1);
    }

    void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
              typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
    {
        v_dst = internal::vcgt(v_src0, v_src1);
    }

    void operator() (const T * src0, const T * src1, u8 * dst) const
    {
        dst[0] = src0[0] > src1[0] ? 255 : 0;
    }
};

template<typename T>
struct OpCmpGE
{
    typedef T type;

    void operator() (const typename internal::VecTraits<T>::vec128 & v_src0, const typename internal::VecTraits<T>::vec128 & v_src1,
              typename internal::VecTraits<T>::unsign::vec128 & v_dst) const
    {
        v_dst = internal::vcgeq(v_src0, v_src1);
    }

    void operator() (const typename internal::VecTraits<T>::vec64 & v_src0, const typename internal::VecTraits<T>::vec64 & v_src1,
              typename internal::VecTraits<T>::unsign::vec64 & v_dst) const
    {
        v_dst = internal::vcge(v_src0, v_src1);
    }

    void operator() (const T * src0, const T * src1, u8 * dst) const
    {
        dst[0] = src0[0] >= src1[0] ? 255 : 0;
    }
};

}

#define IMPL_CMPOP(op, type)                              \
void cmp##op(const Size2D &size,                          \
             const type * src0Base, ptrdiff_t src0Stride, \
             const type * src1Base, ptrdiff_t src1Stride, \
                       u8 *dstBase, ptrdiff_t dstStride)  \
{                                                         \
    internal::assertSupportedConfiguration();             \
    vcompare(size,                                        \
             src0Base, src0Stride,                        \
             src1Base, src1Stride,                        \
             dstBase, dstStride,                          \
             OpCmp##op<type>());                          \
}

#else

#define IMPL_CMPOP(op, type)                              \
void cmp##op(const Size2D &size,                          \
             const type * src0Base, ptrdiff_t src0Stride, \
             const type * src1Base, ptrdiff_t src1Stride, \
             u8 *dstBase, ptrdiff_t dstStride)            \
{                                                         \
    internal::assertSupportedConfiguration();             \
    (void)size;                                           \
    (void)src0Base;                                       \
    (void)src0Stride;                                     \
    (void)src1Base;                                       \
    (void)src1Stride;                                     \
    (void)dstBase;                                        \
    (void)dstStride;                                      \
}

#endif

IMPL_CMPOP(EQ, u8)
IMPL_CMPOP(EQ, s8)
IMPL_CMPOP(EQ, u16)
IMPL_CMPOP(EQ, s16)
IMPL_CMPOP(EQ, u32)
IMPL_CMPOP(EQ, s32)
IMPL_CMPOP(EQ, f32)

IMPL_CMPOP(NE, u8)
IMPL_CMPOP(NE, s8)
IMPL_CMPOP(NE, u16)
IMPL_CMPOP(NE, s16)
IMPL_CMPOP(NE, u32)
IMPL_CMPOP(NE, s32)
IMPL_CMPOP(NE, f32)

IMPL_CMPOP(GT, u8)
IMPL_CMPOP(GT, s8)
IMPL_CMPOP(GT, u16)
IMPL_CMPOP(GT, s16)
IMPL_CMPOP(GT, u32)
IMPL_CMPOP(GT, s32)
IMPL_CMPOP(GT, f32)

IMPL_CMPOP(GE, u8)
IMPL_CMPOP(GE, s8)
IMPL_CMPOP(GE, u16)
IMPL_CMPOP(GE, s16)
IMPL_CMPOP(GE, u32)
IMPL_CMPOP(GE, s32)
IMPL_CMPOP(GE, f32)

} // namespace CAROTENE_NS