/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */

#include "common.hpp"
#include "vtransform.hpp"

#include <cstring>

namespace CAROTENE_NS {

/*
 * Reports whether flip() can handle the requested mode / element size pair.
 *
 * Nothing is supported unless isSupportedConfiguration() holds. Given that:
 *   - FLIP_VERTICAL_MODE is accepted for every element size;
 *   - FLIP_HORIZONTAL_MODE and FLIP_BOTH_MODE are accepted only for element
 *     sizes of 1, 2, 3 or 4 bytes;
 *   - any other mode value is rejected.
 */
bool isFlipSupported(FLIP_MODE flipMode, u32 elemSize)
{
    if (!isSupportedConfiguration())
        return false;

    // A purely vertical flip places no constraint on the element size.
    if (flipMode == FLIP_VERTICAL_MODE)
        return true;

    // Only the two modes that mirror columns remain acceptable.
    if (flipMode != FLIP_BOTH_MODE && flipMode != FLIP_HORIZONTAL_MODE)
        return false;

    // Column mirroring is available for 1-, 2-, 3- and 4-byte elements.
    return elemSize >= 1 && elemSize <= 4;
}

#ifdef CAROTENE_NEON

namespace {

template <typename T>
void flip(const Size2D & size,
          const void * srcBase, ptrdiff_t srcStride,
          void * dstBase, ptrdiff_t dstStride,
          FLIP_MODE flipMode)
{
    using namespace internal;

    typedef typename VecTraits<T>::vec128 vec128;
    typedef typename VecTraits<T>::vec64 vec64;

    u32 step_base = 16 / sizeof(T), step_tail = 8 / sizeof(T);
    size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
    size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0;

    for (size_t i = 0; i < size.height; ++i)
    {
        const T * src = getRowPtr((const T *)srcBase, srcStride, i);
        T * dst = getRowPtr((T *)dstBase, dstStride, (flipMode & FLIP_VERTICAL_MODE) != 0 ? size.height - i - 1 : i);
        size_t js = 0, jd = size.width;

        for (; js < roiw_base; js += step_base, jd -= step_base)
        {
            prefetch(src + js);

            vec128 v_src = vld1q(src + js);
            vec128 v_dst = vrev64q(v_src);
            v_dst = vcombine(vget_high(v_dst), vget_low(v_dst));
            vst1q(dst + jd - step_base, v_dst);
        }
        for (; js < roiw_tail; js += step_tail, jd -= step_tail)
        {
            vec64 v_src = vld1(src + js);
            vst1(dst + jd - step_tail, vrev64(v_src));
        }

        for (--jd; js < size.width; ++js, --jd)
            dst[jd] = src[js];
    }
}

/*
 * Column-mirroring kernel for images whose pixels are three consecutive
 * elements of type T.
 *
 * Every source row is written to the destination with its pixels in reverse
 * order while the three elements inside a pixel keep their order. If flipMode
 * has the FLIP_VERTICAL_MODE bit set, source row i lands in destination row
 * (height - 1 - i); otherwise it stays in row i. The pixel order is reversed
 * unconditionally - flipMode is only inspected for the vertical bit.
 *
 * The 128-bit path is compiled only when __ANDROID__ is not defined; with
 * __ANDROID__ defined the 64-bit loop and the scalar loop do all the work.
 * NOTE(review): the reason for excluding the 128-bit path on Android is not
 * visible in this file - confirm before removing the guards.
 */
template <typename T>
void flip3(const Size2D & size,
           const void * srcBase, ptrdiff_t srcStride,
           void * dstBase, ptrdiff_t dstStride,
           FLIP_MODE flipMode)
{
    using namespace internal;

#ifndef __ANDROID__
    typedef typename VecTraits<T, 3>::vec128 vec128;
#endif
    typedef typename VecTraits<T, 3>::vec64 vec64;

#ifndef __ANDROID__
    // step_base: pixels per 128-bit block; step_base3: elements per block (3 per pixel).
    u32 step_base = 16 / sizeof(T), step_base3 = step_base * 3;
    // Exclusive upper bound for the first pixel of a full 128-bit block (0 if the row is too short).
    size_t roiw_base = size.width >= (step_base - 1) ? size.width - step_base + 1 : 0;
#endif
    // step_tail: pixels per 64-bit block; step_tail3: elements per block (3 per pixel).
    u32 step_tail = 8 / sizeof(T), step_tail3 = step_tail * 3;
    // Exclusive upper bound for the first pixel of a full 64-bit block (0 if the row is too short).
    size_t roiw_tail = size.width >= (step_tail - 1) ? size.width - step_tail + 1 : 0;

    for (size_t i = 0; i < size.height; ++i)
    {
        const T * src = getRowPtr((const T *)srcBase, srcStride, i);
        // Destination row index is mirrored only when the vertical bit is set.
        T * dst = getRowPtr((T *)dstBase, dstStride, (flipMode & FLIP_VERTICAL_MODE) != 0 ? size.height - i - 1 : i);
        // j counts pixels consumed; js is the source element offset (moves forward);
        // jd is the destination element offset one past the next slot to fill (moves backward).
        size_t j = 0, js = 0, jd = size.width * 3;

#ifndef __ANDROID__
        // 128-bit blocks.
        for (; j < roiw_base; j += step_base, js += step_base3, jd -= step_base3)
        {
            prefetch(src + js);

            // v_src.val[0..2] are handled as three separate vectors; presumably
            // vld3q/vst3q de-interleave and re-interleave the three channels
            // (their definitions are outside this file).
            vec128 v_src = vld3q(src + js), v_dst;
            // Reverse inside each 64-bit half of every vector ...
            v_src.val[0] = vrev64q(v_src.val[0]);
            v_src.val[1] = vrev64q(v_src.val[1]);
            v_src.val[2] = vrev64q(v_src.val[2]);

            // ... then swap the halves to complete the 128-bit reversal.
            v_dst.val[0] = vcombine(vget_high(v_src.val[0]), vget_low(v_src.val[0]));
            v_dst.val[1] = vcombine(vget_high(v_src.val[1]), vget_low(v_src.val[1]));
            v_dst.val[2] = vcombine(vget_high(v_src.val[2]), vget_low(v_src.val[2]));

            vst3q(dst + jd - step_base3, v_dst);
        }
#endif // __ANDROID__

        // 64-bit blocks: a single vrev64 per vector reverses the whole register.
        for (; j < roiw_tail; j += step_tail, js += step_tail3, jd -= step_tail3)
        {
            vec64 v_src = vld3(src + js), v_dst;
            v_dst.val[0] = vrev64(v_src.val[0]);
            v_dst.val[1] = vrev64(v_src.val[1]);
            v_dst.val[2] = vrev64(v_src.val[2]);

            vst3(dst + jd - step_tail3, v_dst);
        }

        // Remaining pixels, one at a time. jd is first moved back by one pixel
        // so that it addresses the first element of the slot being written.
        for (jd -= 3; j < size.width; ++j, js += 3, jd -= 3)
        {
            dst[jd] = src[js];
            dst[jd + 1] = src[js + 1];
            dst[jd + 2] = src[js + 2];
        }
    }
}

// Common signature of the type-erased flip kernels: image size, source base
// pointer and stride, destination base pointer and stride, and the flip mode.
typedef void (* flipFunc)(const Size2D &size,
                  const void * srcBase, ptrdiff_t srcStride,
                  void * dstBase, ptrdiff_t dstStride,
                  FLIP_MODE flipMode);

} // namespace

#endif

/*
 * Flips an image whose pixels are elemSize bytes wide.
 *
 * The call first asserts that isFlipSupported(flipMode, elemSize) holds.
 * In NEON builds:
 *   - FLIP_VERTICAL_MODE copies each source row, as elemSize * width bytes,
 *     into the mirrored destination row;
 *   - every other mode is forwarded to a kernel chosen by elemSize
 *     (1, 2 and 4 bytes -> single-element kernel for u8/u16/u32,
 *      3 bytes -> three-channel u8 kernel). If no kernel matches, the call
 *     returns without touching the destination.
 * In non-NEON builds the body is empty apart from the assertion.
 */
void flip(const Size2D &size,
          const u8 * srcBase, ptrdiff_t srcStride,
          u8 * dstBase, ptrdiff_t dstStride,
          FLIP_MODE flipMode, u32 elemSize)
{
    internal::assertSupportedConfiguration(isFlipSupported(flipMode, elemSize));
#ifdef CAROTENE_NEON

    if (flipMode != FLIP_VERTICAL_MODE)
    {
        // Column mirroring (optionally combined with row mirroring):
        // pick the kernel that matches the pixel width in bytes.
        flipFunc kernel = NULL;

        switch (elemSize)
        {
        case (u32)sizeof(u8):
            kernel = &flip<u8>;
            break;
        case (u32)sizeof(u16):
            kernel = &flip<u16>;
            break;
        case (u32)sizeof(u32):
            kernel = &flip<u32>;
            break;
        case (u32)sizeof(u8) * 3:
            kernel = &flip3<u8>;
            break;
        default:
            break;
        }

        if (kernel != NULL)
            kernel(size,
                   srcBase, srcStride,
                   dstBase, dstStride,
                   flipMode);

        return;
    }

    // Pure vertical flip: rows are moved as opaque byte runs.
    const size_t rowBytes = elemSize * size.width;

    for (size_t row = 0; row < size.height; ++row)
    {
        const u8 * from = internal::getRowPtr(srcBase, srcStride, row);
        u8 * to = internal::getRowPtr(dstBase, dstStride, size.height - row - 1);

        std::memcpy(to, from, rowBytes);
    }

#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)flipMode;
    (void)elemSize;
#endif
}

} // namespace CAROTENE_NS