/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2014-2015, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */

#ifndef CAROTENE_SRC_VTRANSFORM_HPP
#define CAROTENE_SRC_VTRANSFORM_HPP

#include "common.hpp"

#include <carotene/types.hpp>

#ifdef CAROTENE_NEON

namespace CAROTENE_NS { namespace internal {

////////////////////////////// Type Traits ///////////////////////

// Maps an element type T and a channel count cn (1 when omitted) to NEON
// vector typedefs. Only declared here: there is no generic definition, so
// only the explicitly specialized <T, cn> pairs are usable as complete types.
template <typename T, int cn = 1> struct VecTraits;

template <> struct VecTraits< u8, 1> { typedef  uint8x16_t vec128; typedef   uint8x8_t vec64; typedef VecTraits<  u8, 1> unsign; };
template <> struct VecTraits< s8, 1> { typedef   int8x16_t vec128; typedef    int8x8_t vec64; typedef VecTraits<  u8, 1> unsign; };
template <> struct VecTraits<u16, 1> { typedef  uint16x8_t vec128; typedef  uint16x4_t vec64; typedef VecTraits< u16, 1> unsign; };
template <> struct VecTraits<s16, 1> { typedef   int16x8_t vec128; typedef   int16x4_t vec64; typedef VecTraits< u16, 1> unsign; };
template <> struct VecTraits<s32, 1> { typedef   int32x4_t vec128; typedef   int32x2_t vec64; typedef VecTraits< u32, 1> unsign; };
template <> struct VecTraits<u32, 1> { typedef  uint32x4_t vec128; typedef  uint32x2_t vec64; typedef VecTraits< u32, 1> unsign; };
template <> struct VecTraits<s64, 1> { typedef   int64x2_t vec128; typedef   int64x1_t vec64; typedef VecTraits< u64, 1> unsign; };
template <> struct VecTraits<u64, 1> { typedef  uint64x2_t vec128; typedef  uint64x1_t vec64; typedef VecTraits< u64, 1> unsign; };
template <> struct VecTraits<f32, 1> { typedef float32x4_t vec128; typedef float32x2_t vec64; typedef VecTraits< u32, 1> unsign; };

template <> struct VecTraits< u8, 2> { typedef  uint8x16x2_t vec128; typedef   uint8x8x2_t vec64; typedef VecTraits<  u8, 2> unsign; };
template <> struct VecTraits< s8, 2> { typedef   int8x16x2_t vec128; typedef    int8x8x2_t vec64; typedef VecTraits<  u8, 2> unsign; };
template <> struct VecTraits<u16, 2> { typedef  uint16x8x2_t vec128; typedef  uint16x4x2_t vec64; typedef VecTraits< u16, 2> unsign; };
template <> struct VecTraits<s16, 2> { typedef   int16x8x2_t vec128; typedef   int16x4x2_t vec64; typedef VecTraits< u16, 2> unsign; };
template <> struct VecTraits<s32, 2> { typedef   int32x4x2_t vec128; typedef   int32x2x2_t vec64; typedef VecTraits< u32, 2> unsign; };
template <> struct VecTraits<u32, 2> { typedef  uint32x4x2_t vec128; typedef  uint32x2x2_t vec64; typedef VecTraits< u32, 2> unsign; };
template <> struct VecTraits<s64, 2> { typedef   int64x2x2_t vec128; typedef   int64x1x2_t vec64; typedef VecTraits< u64, 2> unsign; };
template <> struct VecTraits<u64, 2> { typedef  uint64x2x2_t vec128; typedef  uint64x1x2_t vec64; typedef VecTraits< u64, 2> unsign; };
template <> struct VecTraits<f32, 2> { typedef float32x4x2_t vec128; typedef float32x2x2_t vec64; typedef VecTraits< u32, 2> unsign; };

template <> struct VecTraits< u8, 3> { typedef  uint8x16x3_t vec128; typedef   uint8x8x3_t vec64; typedef VecTraits<  u8, 3> unsign; };
template <> struct VecTraits< s8, 3> { typedef   int8x16x3_t vec128; typedef    int8x8x3_t vec64; typedef VecTraits<  u8, 3> unsign; };
template <> struct VecTraits<u16, 3> { typedef  uint16x8x3_t vec128; typedef  uint16x4x3_t vec64; typedef VecTraits< u16, 3> unsign; };
template <> struct VecTraits<s16, 3> { typedef   int16x8x3_t vec128; typedef   int16x4x3_t vec64; typedef VecTraits< u16, 3> unsign; };
template <> struct VecTraits<s32, 3> { typedef   int32x4x3_t vec128; typedef   int32x2x3_t vec64; typedef VecTraits< u32, 3> unsign; };
template <> struct VecTraits<u32, 3> { typedef  uint32x4x3_t vec128; typedef  uint32x2x3_t vec64; typedef VecTraits< u32, 3> unsign; };
template <> struct VecTraits<s64, 3> { typedef   int64x2x3_t vec128; typedef   int64x1x3_t vec64; typedef VecTraits< u64, 2> unsign; };
template <> struct VecTraits<u64, 3> { typedef  uint64x2x3_t vec128; typedef  uint64x1x3_t vec64; typedef VecTraits< u64, 2> unsign; };
template <> struct VecTraits<f32, 3> { typedef float32x4x3_t vec128; typedef float32x2x3_t vec64; typedef VecTraits< u32, 3> unsign; };

template <> struct VecTraits< u8, 4> { typedef  uint8x16x4_t vec128; typedef   uint8x8x4_t vec64; typedef VecTraits<  u8, 3> unsign; };
template <> struct VecTraits< s8, 4> { typedef   int8x16x4_t vec128; typedef    int8x8x4_t vec64; typedef VecTraits<  u8, 3> unsign; };
template <> struct VecTraits<u16, 4> { typedef  uint16x8x4_t vec128; typedef  uint16x4x4_t vec64; typedef VecTraits< u16, 3> unsign; };
template <> struct VecTraits<s16, 4> { typedef   int16x8x4_t vec128; typedef   int16x4x4_t vec64; typedef VecTraits< u16, 3> unsign; };
template <> struct VecTraits<s32, 4> { typedef   int32x4x4_t vec128; typedef   int32x2x4_t vec64; typedef VecTraits< u32, 3> unsign; };
template <> struct VecTraits<u32, 4> { typedef  uint32x4x4_t vec128; typedef  uint32x2x4_t vec64; typedef VecTraits< u32, 3> unsign; };
template <> struct VecTraits<s64, 4> { typedef   int64x2x4_t vec128; typedef   int64x1x4_t vec64; typedef VecTraits< u64, 2> unsign; };
template <> struct VecTraits<u64, 4> { typedef  uint64x2x4_t vec128; typedef  uint64x1x4_t vec64; typedef VecTraits< u64, 2> unsign; };
template <> struct VecTraits<f32, 4> { typedef float32x4x4_t vec128; typedef float32x2x4_t vec64; typedef VecTraits< u32, 3> unsign; };

////////////////////////////// vld1q ///////////////////////

// vld1q(ptr): picks the vld1q_<type> intrinsic from the pointer's element
// type and returns the matching 128-bit vector.
inline uint8x16_t vld1q(const u8 *ptr) { return vld1q_u8(ptr); }
inline int8x16_t vld1q(const s8 *ptr) { return vld1q_s8(ptr); }
inline uint16x8_t vld1q(const u16 *ptr) { return vld1q_u16(ptr); }
inline int16x8_t vld1q(const s16 *ptr) { return vld1q_s16(ptr); }
inline uint32x4_t vld1q(const u32 *ptr) { return vld1q_u32(ptr); }
inline int32x4_t vld1q(const s32 *ptr) { return vld1q_s32(ptr); }
inline float32x4_t vld1q(const f32 *ptr) { return vld1q_f32(ptr); }

////////////////////////////// vld1 ///////////////////////

// vld1(ptr): picks the vld1_<type> intrinsic from the pointer's element
// type and returns the matching 64-bit vector.
inline uint8x8_t vld1(const u8 *ptr) { return vld1_u8(ptr); }
inline int8x8_t vld1(const s8 *ptr) { return vld1_s8(ptr); }
inline uint16x4_t vld1(const u16 *ptr) { return vld1_u16(ptr); }
inline int16x4_t vld1(const s16 *ptr) { return vld1_s16(ptr); }
inline uint32x2_t vld1(const u32 *ptr) { return vld1_u32(ptr); }
inline int32x2_t vld1(const s32 *ptr) { return vld1_s32(ptr); }
inline float32x2_t vld1(const f32 *ptr) { return vld1_f32(ptr); }

////////////////////////////// vld2q ///////////////////////

// vld2q(ptr): forwards to the vld2q_<type> intrinsic chosen by the pointer's
// element type; the result is the 128-bit "x2" vector-array type.
inline uint8x16x2_t vld2q(const u8 *ptr) { return vld2q_u8(ptr); }
inline int8x16x2_t vld2q(const s8 *ptr) { return vld2q_s8(ptr); }
inline uint16x8x2_t vld2q(const u16 *ptr) { return vld2q_u16(ptr); }
inline int16x8x2_t vld2q(const s16 *ptr) { return vld2q_s16(ptr); }
inline uint32x4x2_t vld2q(const u32 *ptr) { return vld2q_u32(ptr); }
inline int32x4x2_t vld2q(const s32 *ptr) { return vld2q_s32(ptr); }
inline float32x4x2_t vld2q(const f32 *ptr) { return vld2q_f32(ptr); }

////////////////////////////// vld2 ///////////////////////

// vld2(ptr): forwards to the vld2_<type> intrinsic chosen by the pointer's
// element type; the result is the 64-bit "x2" vector-array type.
inline uint8x8x2_t vld2(const u8 *ptr) { return vld2_u8(ptr); }
inline int8x8x2_t vld2(const s8 *ptr) { return vld2_s8(ptr); }
inline uint16x4x2_t vld2(const u16 *ptr) { return vld2_u16(ptr); }
inline int16x4x2_t vld2(const s16 *ptr) { return vld2_s16(ptr); }
inline uint32x2x2_t vld2(const u32 *ptr) { return vld2_u32(ptr); }
inline int32x2x2_t vld2(const s32 *ptr) { return vld2_s32(ptr); }
inline float32x2x2_t vld2(const f32 *ptr) { return vld2_f32(ptr); }

////////////////////////////// vld3q ///////////////////////

// vld3q(ptr): forwards to the vld3q_<type> intrinsic chosen by the pointer's
// element type; the result is the 128-bit "x3" vector-array type.
inline uint8x16x3_t vld3q(const u8 *ptr) { return vld3q_u8(ptr); }
inline int8x16x3_t vld3q(const s8 *ptr) { return vld3q_s8(ptr); }
inline uint16x8x3_t vld3q(const u16 *ptr) { return vld3q_u16(ptr); }
inline int16x8x3_t vld3q(const s16 *ptr) { return vld3q_s16(ptr); }
inline uint32x4x3_t vld3q(const u32 *ptr) { return vld3q_u32(ptr); }
inline int32x4x3_t vld3q(const s32 *ptr) { return vld3q_s32(ptr); }
inline float32x4x3_t vld3q(const f32 *ptr) { return vld3q_f32(ptr); }

////////////////////////////// vld3 ///////////////////////

// vld3(ptr): forwards to the vld3_<type> intrinsic chosen by the pointer's
// element type; the result is the 64-bit "x3" vector-array type.
inline uint8x8x3_t vld3(const u8 *ptr) { return vld3_u8(ptr); }
inline int8x8x3_t vld3(const s8 *ptr) { return vld3_s8(ptr); }
inline uint16x4x3_t vld3(const u16 *ptr) { return vld3_u16(ptr); }
inline int16x4x3_t vld3(const s16 *ptr) { return vld3_s16(ptr); }
inline uint32x2x3_t vld3(const u32 *ptr) { return vld3_u32(ptr); }
inline int32x2x3_t vld3(const s32 *ptr) { return vld3_s32(ptr); }
inline float32x2x3_t vld3(const f32 *ptr) { return vld3_f32(ptr); }

////////////////////////////// vld4q ///////////////////////

// vld4q(ptr): forwards to the vld4q_<type> intrinsic chosen by the pointer's
// element type; the result is the 128-bit "x4" vector-array type.
inline uint8x16x4_t vld4q(const u8 *ptr) { return vld4q_u8(ptr); }
inline int8x16x4_t vld4q(const s8 *ptr) { return vld4q_s8(ptr); }
inline uint16x8x4_t vld4q(const u16 *ptr) { return vld4q_u16(ptr); }
inline int16x8x4_t vld4q(const s16 *ptr) { return vld4q_s16(ptr); }
inline uint32x4x4_t vld4q(const u32 *ptr) { return vld4q_u32(ptr); }
inline int32x4x4_t vld4q(const s32 *ptr) { return vld4q_s32(ptr); }
inline float32x4x4_t vld4q(const f32 *ptr) { return vld4q_f32(ptr); }

////////////////////////////// vld4 ///////////////////////

// vld4(ptr): forwards to the vld4_<type> intrinsic chosen by the pointer's
// element type; the result is the 64-bit "x4" vector-array type.
inline uint8x8x4_t vld4(const u8 *ptr) { return vld4_u8(ptr); }
inline int8x8x4_t vld4(const s8 *ptr) { return vld4_s8(ptr); }
inline uint16x4x4_t vld4(const u16 *ptr) { return vld4_u16(ptr); }
inline int16x4x4_t vld4(const s16 *ptr) { return vld4_s16(ptr); }
inline uint32x2x4_t vld4(const u32 *ptr) { return vld4_u32(ptr); }
inline int32x2x4_t vld4(const s32 *ptr) { return vld4_s32(ptr); }
inline float32x2x4_t vld4(const f32 *ptr) { return vld4_f32(ptr); }

////////////////////////////// vst1q ///////////////////////

// vst1q(ptr, v): hands the 128-bit vector v to the vst1q_<type> intrinsic
// selected by the argument types. Nothing is returned.
inline void vst1q(u8 *ptr, const uint8x16_t &v) { vst1q_u8(ptr, v); }
inline void vst1q(s8 *ptr, const int8x16_t &v) { vst1q_s8(ptr, v); }
inline void vst1q(u16 *ptr, const uint16x8_t &v) { vst1q_u16(ptr, v); }
inline void vst1q(s16 *ptr, const int16x8_t &v) { vst1q_s16(ptr, v); }
inline void vst1q(u32 *ptr, const uint32x4_t &v) { vst1q_u32(ptr, v); }
inline void vst1q(s32 *ptr, const int32x4_t &v) { vst1q_s32(ptr, v); }
inline void vst1q(f32 *ptr, const float32x4_t &v) { vst1q_f32(ptr, v); }

////////////////////////////// vst1 ///////////////////////

// vst1(ptr, v): hands the 64-bit vector v to the vst1_<type> intrinsic
// selected by the argument types. Nothing is returned.
inline void vst1(u8 *ptr, const uint8x8_t &v) { vst1_u8(ptr, v); }
inline void vst1(s8 *ptr, const int8x8_t &v) { vst1_s8(ptr, v); }
inline void vst1(u16 *ptr, const uint16x4_t &v) { vst1_u16(ptr, v); }
inline void vst1(s16 *ptr, const int16x4_t &v) { vst1_s16(ptr, v); }
inline void vst1(u32 *ptr, const uint32x2_t &v) { vst1_u32(ptr, v); }
inline void vst1(s32 *ptr, const int32x2_t &v) { vst1_s32(ptr, v); }
inline void vst1(f32 *ptr, const float32x2_t &v) { vst1_f32(ptr, v); }

////////////////////////////// vst2q ///////////////////////

// vst2q(ptr, v): hands the 128-bit "x2" vector array v to the vst2q_<type>
// intrinsic selected by the argument types. Nothing is returned.
inline void vst2q(u8 *ptr, const uint8x16x2_t &v) { vst2q_u8(ptr, v); }
inline void vst2q(s8 *ptr, const int8x16x2_t &v) { vst2q_s8(ptr, v); }
inline void vst2q(u16 *ptr, const uint16x8x2_t &v) { vst2q_u16(ptr, v); }
inline void vst2q(s16 *ptr, const int16x8x2_t &v) { vst2q_s16(ptr, v); }
inline void vst2q(u32 *ptr, const uint32x4x2_t &v) { vst2q_u32(ptr, v); }
inline void vst2q(s32 *ptr, const int32x4x2_t &v) { vst2q_s32(ptr, v); }
inline void vst2q(f32 *ptr, const float32x4x2_t &v) { vst2q_f32(ptr, v); }

////////////////////////////// vst2 ///////////////////////

// vst2(ptr, v): hands the 64-bit "x2" vector array v to the vst2_<type>
// intrinsic selected by the argument types. Nothing is returned.
inline void vst2(u8 *ptr, const uint8x8x2_t &v) { vst2_u8(ptr, v); }
inline void vst2(s8 *ptr, const int8x8x2_t &v) { vst2_s8(ptr, v); }
inline void vst2(u16 *ptr, const uint16x4x2_t &v) { vst2_u16(ptr, v); }
inline void vst2(s16 *ptr, const int16x4x2_t &v) { vst2_s16(ptr, v); }
inline void vst2(u32 *ptr, const uint32x2x2_t &v) { vst2_u32(ptr, v); }
inline void vst2(s32 *ptr, const int32x2x2_t &v) { vst2_s32(ptr, v); }
inline void vst2(f32 *ptr, const float32x2x2_t &v) { vst2_f32(ptr, v); }

////////////////////////////// vst3q ///////////////////////

// vst3q(ptr, v): hands the 128-bit "x3" vector array v to the vst3q_<type>
// intrinsic selected by the argument types. Nothing is returned.
inline void vst3q(u8 *ptr, const uint8x16x3_t &v) { vst3q_u8(ptr, v); }
inline void vst3q(s8 *ptr, const int8x16x3_t &v) { vst3q_s8(ptr, v); }
inline void vst3q(u16 *ptr, const uint16x8x3_t &v) { vst3q_u16(ptr, v); }
inline void vst3q(s16 *ptr, const int16x8x3_t &v) { vst3q_s16(ptr, v); }
inline void vst3q(u32 *ptr, const uint32x4x3_t &v) { vst3q_u32(ptr, v); }
inline void vst3q(s32 *ptr, const int32x4x3_t &v) { vst3q_s32(ptr, v); }
inline void vst3q(f32 *ptr, const float32x4x3_t &v) { vst3q_f32(ptr, v); }

////////////////////////////// vst3 ///////////////////////

// vst3(ptr, v): hands the 64-bit "x3" vector array v to the vst3_<type>
// intrinsic selected by the argument types. Nothing is returned.
inline void vst3(u8 *ptr, const uint8x8x3_t &v) { vst3_u8(ptr, v); }
inline void vst3(s8 *ptr, const int8x8x3_t &v) { vst3_s8(ptr, v); }
inline void vst3(u16 *ptr, const uint16x4x3_t &v) { vst3_u16(ptr, v); }
inline void vst3(s16 *ptr, const int16x4x3_t &v) { vst3_s16(ptr, v); }
inline void vst3(u32 *ptr, const uint32x2x3_t &v) { vst3_u32(ptr, v); }
inline void vst3(s32 *ptr, const int32x2x3_t &v) { vst3_s32(ptr, v); }
inline void vst3(f32 *ptr, const float32x2x3_t &v) { vst3_f32(ptr, v); }

////////////////////////////// vst4q ///////////////////////

// vst4q(ptr, v): hands the 128-bit "x4" vector array v to the vst4q_<type>
// intrinsic selected by the argument types. Nothing is returned.
inline void vst4q(u8 *ptr, const uint8x16x4_t &v) { vst4q_u8(ptr, v); }
inline void vst4q(s8 *ptr, const int8x16x4_t &v) { vst4q_s8(ptr, v); }
inline void vst4q(u16 *ptr, const uint16x8x4_t &v) { vst4q_u16(ptr, v); }
inline void vst4q(s16 *ptr, const int16x8x4_t &v) { vst4q_s16(ptr, v); }
inline void vst4q(u32 *ptr, const uint32x4x4_t &v) { vst4q_u32(ptr, v); }
inline void vst4q(s32 *ptr, const int32x4x4_t &v) { vst4q_s32(ptr, v); }
inline void vst4q(f32 *ptr, const float32x4x4_t &v) { vst4q_f32(ptr, v); }

////////////////////////////// vst4 ///////////////////////

// vst4(ptr, v): hands the 64-bit "x4" vector array v to the vst4_<type>
// intrinsic selected by the argument types. Nothing is returned.
inline void vst4(u8 *ptr, const uint8x8x4_t &v) { vst4_u8(ptr, v); }
inline void vst4(s8 *ptr, const int8x8x4_t &v) { vst4_s8(ptr, v); }
inline void vst4(u16 *ptr, const uint16x4x4_t &v) { vst4_u16(ptr, v); }
inline void vst4(s16 *ptr, const int16x4x4_t &v) { vst4_s16(ptr, v); }
inline void vst4(u32 *ptr, const uint32x2x4_t &v) { vst4_u32(ptr, v); }
inline void vst4(s32 *ptr, const int32x2x4_t &v) { vst4_s32(ptr, v); }
inline void vst4(f32 *ptr, const float32x2x4_t &v) { vst4_f32(ptr, v); }

////////////////////////////// vabdq ///////////////////////

// vabdq(v0, v1): dispatches on the 128-bit vector type to vabdq_<type>.
// The result has the same vector type as the operands.
inline uint8x16_t vabdq(const uint8x16_t &v0, const uint8x16_t &v1) { return vabdq_u8(v0, v1); }
inline int8x16_t vabdq(const int8x16_t &v0, const int8x16_t &v1) { return vabdq_s8(v0, v1); }
inline uint16x8_t vabdq(const uint16x8_t &v0, const uint16x8_t &v1) { return vabdq_u16(v0, v1); }
inline int16x8_t vabdq(const int16x8_t &v0, const int16x8_t &v1) { return vabdq_s16(v0, v1); }
inline uint32x4_t vabdq(const uint32x4_t &v0, const uint32x4_t &v1) { return vabdq_u32(v0, v1); }
inline int32x4_t vabdq(const int32x4_t &v0, const int32x4_t &v1) { return vabdq_s32(v0, v1); }
inline float32x4_t vabdq(const float32x4_t &v0, const float32x4_t &v1) { return vabdq_f32(v0, v1); }

////////////////////////////// vabd ///////////////////////

// vabd(v0, v1): dispatches on the 64-bit vector type to vabd_<type>.
// The result has the same vector type as the operands.
inline uint8x8_t vabd(const uint8x8_t &v0, const uint8x8_t &v1) { return vabd_u8(v0, v1); }
inline int8x8_t vabd(const int8x8_t &v0, const int8x8_t &v1) { return vabd_s8(v0, v1); }
inline uint16x4_t vabd(const uint16x4_t &v0, const uint16x4_t &v1) { return vabd_u16(v0, v1); }
inline int16x4_t vabd(const int16x4_t &v0, const int16x4_t &v1) { return vabd_s16(v0, v1); }
inline uint32x2_t vabd(const uint32x2_t &v0, const uint32x2_t &v1) { return vabd_u32(v0, v1); }
inline int32x2_t vabd(const int32x2_t &v0, const int32x2_t &v1) { return vabd_s32(v0, v1); }
inline float32x2_t vabd(const float32x2_t &v0, const float32x2_t &v1) { return vabd_f32(v0, v1); }

////////////////////////////// vminq ///////////////////////

// vminq(v0, v1): dispatches on the 128-bit vector type to vminq_<type>.
// The result has the same vector type as the operands.
inline uint8x16_t vminq(const uint8x16_t &v0, const uint8x16_t &v1) { return vminq_u8(v0, v1); }
inline int8x16_t vminq(const int8x16_t &v0, const int8x16_t &v1) { return vminq_s8(v0, v1); }
inline uint16x8_t vminq(const uint16x8_t &v0, const uint16x8_t &v1) { return vminq_u16(v0, v1); }
inline int16x8_t vminq(const int16x8_t &v0, const int16x8_t &v1) { return vminq_s16(v0, v1); }
inline uint32x4_t vminq(const uint32x4_t &v0, const uint32x4_t &v1) { return vminq_u32(v0, v1); }
inline int32x4_t vminq(const int32x4_t &v0, const int32x4_t &v1) { return vminq_s32(v0, v1); }
inline float32x4_t vminq(const float32x4_t &v0, const float32x4_t &v1) { return vminq_f32(v0, v1); }

////////////////////////////// vmin ///////////////////////

// vmin(v0, v1): dispatches on the 64-bit vector type to vmin_<type>.
// The result has the same vector type as the operands.
inline uint8x8_t vmin(const uint8x8_t &v0, const uint8x8_t &v1) { return vmin_u8(v0, v1); }
inline int8x8_t vmin(const int8x8_t &v0, const int8x8_t &v1) { return vmin_s8(v0, v1); }
inline uint16x4_t vmin(const uint16x4_t &v0, const uint16x4_t &v1) { return vmin_u16(v0, v1); }
inline int16x4_t vmin(const int16x4_t &v0, const int16x4_t &v1) { return vmin_s16(v0, v1); }
inline uint32x2_t vmin(const uint32x2_t &v0, const uint32x2_t &v1) { return vmin_u32(v0, v1); }
inline int32x2_t vmin(const int32x2_t &v0, const int32x2_t &v1) { return vmin_s32(v0, v1); }
inline float32x2_t vmin(const float32x2_t &v0, const float32x2_t &v1) { return vmin_f32(v0, v1); }

////////////////////////////// vmaxq ///////////////////////

// vmaxq(v0, v1): dispatches on the 128-bit vector type to vmaxq_<type>.
// The result has the same vector type as the operands.
inline uint8x16_t vmaxq(const uint8x16_t &v0, const uint8x16_t &v1) { return vmaxq_u8(v0, v1); }
inline int8x16_t vmaxq(const int8x16_t &v0, const int8x16_t &v1) { return vmaxq_s8(v0, v1); }
inline uint16x8_t vmaxq(const uint16x8_t &v0, const uint16x8_t &v1) { return vmaxq_u16(v0, v1); }
inline int16x8_t vmaxq(const int16x8_t &v0, const int16x8_t &v1) { return vmaxq_s16(v0, v1); }
inline uint32x4_t vmaxq(const uint32x4_t &v0, const uint32x4_t &v1) { return vmaxq_u32(v0, v1); }
inline int32x4_t vmaxq(const int32x4_t &v0, const int32x4_t &v1) { return vmaxq_s32(v0, v1); }
inline float32x4_t vmaxq(const float32x4_t &v0, const float32x4_t &v1) { return vmaxq_f32(v0, v1); }

////////////////////////////// vmax ///////////////////////

// vmax(v0, v1): dispatches on the 64-bit vector type to vmax_<type>.
// The result has the same vector type as the operands.
inline uint8x8_t vmax(const uint8x8_t &v0, const uint8x8_t &v1) { return vmax_u8(v0, v1); }
inline int8x8_t vmax(const int8x8_t &v0, const int8x8_t &v1) { return vmax_s8(v0, v1); }
inline uint16x4_t vmax(const uint16x4_t &v0, const uint16x4_t &v1) { return vmax_u16(v0, v1); }
inline int16x4_t vmax(const int16x4_t &v0, const int16x4_t &v1) { return vmax_s16(v0, v1); }
inline uint32x2_t vmax(const uint32x2_t &v0, const uint32x2_t &v1) { return vmax_u32(v0, v1); }
inline int32x2_t vmax(const int32x2_t &v0, const int32x2_t &v1) { return vmax_s32(v0, v1); }
inline float32x2_t vmax(const float32x2_t &v0, const float32x2_t &v1) { return vmax_f32(v0, v1); }

////////////////////////////// vdupq_n ///////////////////////

// vdupq_n(val): builds a 128-bit vector from a scalar by forwarding to the
// vdupq_n_<type> intrinsic selected by the scalar's type. This set also
// covers the 64-bit integer element types.
inline uint8x16_t vdupq_n(const u8 &val) { return vdupq_n_u8(val); }
inline int8x16_t vdupq_n(const s8 &val) { return vdupq_n_s8(val); }
inline uint16x8_t vdupq_n(const u16 &val) { return vdupq_n_u16(val); }
inline int16x8_t vdupq_n(const s16 &val) { return vdupq_n_s16(val); }
inline uint32x4_t vdupq_n(const u32 &val) { return vdupq_n_u32(val); }
inline int32x4_t vdupq_n(const s32 &val) { return vdupq_n_s32(val); }
inline uint64x2_t vdupq_n(const u64 &val) { return vdupq_n_u64(val); }
inline int64x2_t vdupq_n(const s64 &val) { return vdupq_n_s64(val); }
inline float32x4_t vdupq_n(const f32 &val) { return vdupq_n_f32(val); }

////////////////////////////// vdup_n ///////////////////////

// vdup_n(val): builds a 64-bit vector from a scalar by forwarding to the
// vdup_n_<type> intrinsic selected by the scalar's type. This set also
// covers the 64-bit integer element types.
inline uint8x8_t vdup_n(const u8 &val) { return vdup_n_u8(val); }
inline int8x8_t vdup_n(const s8 &val) { return vdup_n_s8(val); }
inline uint16x4_t vdup_n(const u16 &val) { return vdup_n_u16(val); }
inline int16x4_t vdup_n(const s16 &val) { return vdup_n_s16(val); }
inline uint32x2_t vdup_n(const u32 &val) { return vdup_n_u32(val); }
inline int32x2_t vdup_n(const s32 &val) { return vdup_n_s32(val); }
inline uint64x1_t vdup_n(const u64 &val) { return vdup_n_u64(val); }
inline int64x1_t vdup_n(const s64 &val) { return vdup_n_s64(val); }
inline float32x2_t vdup_n(const f32 &val) { return vdup_n_f32(val); }

////////////////////////////// vget_low ///////////////////////

// vget_low(v): takes a 128-bit vector and returns the 64-bit vector produced
// by the vget_low_<type> intrinsic for that vector type.
inline uint8x8_t vget_low(const uint8x16_t &v) { return vget_low_u8(v); }
inline int8x8_t vget_low(const int8x16_t &v) { return vget_low_s8(v); }
inline uint16x4_t vget_low(const uint16x8_t &v) { return vget_low_u16(v); }
inline int16x4_t vget_low(const int16x8_t &v) { return vget_low_s16(v); }
inline uint32x2_t vget_low(const uint32x4_t &v) { return vget_low_u32(v); }
inline int32x2_t vget_low(const int32x4_t &v) { return vget_low_s32(v); }
inline float32x2_t vget_low(const float32x4_t &v) { return vget_low_f32(v); }

////////////////////////////// vget_high ///////////////////////

// vget_high(v): takes a 128-bit vector and returns the 64-bit vector produced
// by the vget_high_<type> intrinsic for that vector type.
inline uint8x8_t vget_high(const uint8x16_t &v) { return vget_high_u8(v); }
inline int8x8_t vget_high(const int8x16_t &v) { return vget_high_s8(v); }
inline uint16x4_t vget_high(const uint16x8_t &v) { return vget_high_u16(v); }
inline int16x4_t vget_high(const int16x8_t &v) { return vget_high_s16(v); }
inline uint32x2_t vget_high(const uint32x4_t &v) { return vget_high_u32(v); }
inline int32x2_t vget_high(const int32x4_t &v) { return vget_high_s32(v); }
inline float32x2_t vget_high(const float32x4_t &v) { return vget_high_f32(v); }

////////////////////////////// vcombine ///////////////////////

// vcombine(v0, v1): takes two 64-bit vectors of the same type and returns the
// 128-bit vector produced by the matching vcombine_<type> intrinsic.
inline uint8x16_t vcombine(const uint8x8_t &v0, const uint8x8_t &v1) { return vcombine_u8(v0, v1); }
inline int8x16_t vcombine(const int8x8_t &v0, const int8x8_t &v1) { return vcombine_s8(v0, v1); }
inline uint16x8_t vcombine(const uint16x4_t &v0, const uint16x4_t &v1) { return vcombine_u16(v0, v1); }
inline int16x8_t vcombine(const int16x4_t &v0, const int16x4_t &v1) { return vcombine_s16(v0, v1); }
inline uint32x4_t vcombine(const uint32x2_t &v0, const uint32x2_t &v1) { return vcombine_u32(v0, v1); }
inline int32x4_t vcombine(const int32x2_t &v0, const int32x2_t &v1) { return vcombine_s32(v0, v1); }
inline float32x4_t vcombine(const float32x2_t &v0, const float32x2_t &v1) { return vcombine_f32(v0, v1); }

////////////////////////////// vaddq ///////////////////////

// vaddq(v0, v1): dispatches on the 128-bit vector type to vaddq_<type>.
// The result has the same vector type as the operands.
inline uint8x16_t vaddq(const uint8x16_t &v0, const uint8x16_t &v1) { return vaddq_u8(v0, v1); }
inline int8x16_t vaddq(const int8x16_t &v0, const int8x16_t &v1) { return vaddq_s8(v0, v1); }
inline uint16x8_t vaddq(const uint16x8_t &v0, const uint16x8_t &v1) { return vaddq_u16(v0, v1); }
inline int16x8_t vaddq(const int16x8_t &v0, const int16x8_t &v1) { return vaddq_s16(v0, v1); }
inline uint32x4_t vaddq(const uint32x4_t &v0, const uint32x4_t &v1) { return vaddq_u32(v0, v1); }
inline int32x4_t vaddq(const int32x4_t &v0, const int32x4_t &v1) { return vaddq_s32(v0, v1); }
inline float32x4_t vaddq(const float32x4_t &v0, const float32x4_t &v1) { return vaddq_f32(v0, v1); }

////////////////////////////// vadd ///////////////////////

// vadd(v0, v1): dispatches on the 64-bit vector type to vadd_<type>.
// The result has the same vector type as the operands.
inline uint8x8_t vadd(const uint8x8_t &v0, const uint8x8_t &v1) { return vadd_u8(v0, v1); }
inline int8x8_t vadd(const int8x8_t &v0, const int8x8_t &v1) { return vadd_s8(v0, v1); }
inline uint16x4_t vadd(const uint16x4_t &v0, const uint16x4_t &v1) { return vadd_u16(v0, v1); }
inline int16x4_t vadd(const int16x4_t &v0, const int16x4_t &v1) { return vadd_s16(v0, v1); }
inline uint32x2_t vadd(const uint32x2_t &v0, const uint32x2_t &v1) { return vadd_u32(v0, v1); }
inline int32x2_t vadd(const int32x2_t &v0, const int32x2_t &v1) { return vadd_s32(v0, v1); }
inline float32x2_t vadd(const float32x2_t &v0, const float32x2_t &v1) { return vadd_f32(v0, v1); }

////////////////////////////// vqaddq ///////////////////////

inline  uint8x16_t vqaddq(const uint8x16_t  & v0, const uint8x16_t  & v1) { return vqaddq_u8 (v0, v1); }
inline   int8x16_t vqaddq(const int8x16_t   & v0, const int8x16_t   & v1) { return vqaddq_s8 (v0, v1); }
inline  uint16x8_t vqaddq(const uint16x8_t  & v0, const uint16x8_t  & v1) { return vqaddq_u16(v0, v1); }
inline   int16x8_t vqaddq(const int16x8_t   & v0, const int16x8_t   & v1) { return vqaddq_s16(v0, v1); }
inline  uint32x4_t vqaddq(const uint32x4_t  & v0, const uint32x4_t  & v1) { return vqaddq_u32(v0, v1); }
inline   int32x4_t vqaddq(const int32x4_t   & v0, const int32x4_t   & v1) { return vqaddq_s32(v0, v1); }

////////////////////////////// vqadd ///////////////////////

inline   uint8x8_t vqadd(const uint8x8_t   & v0, const uint8x8_t   & v1) { return vqadd_u8 (v0, v1); }
inline    int8x8_t vqadd(const int8x8_t    & v0, const int8x8_t    & v1) { return vqadd_s8 (v0, v1); }
inline  uint16x4_t vqadd(const uint16x4_t  & v0, const uint16x4_t  & v1) { return vqadd_u16(v0, v1); }
inline   int16x4_t vqadd(const int16x4_t   & v0, const int16x4_t   & v1) { return vqadd_s16(v0, v1); }
inline  uint32x2_t vqadd(const uint32x2_t  & v0, const uint32x2_t  & v1) { return vqadd_u32(v0, v1); }
inline   int32x2_t vqadd(const int32x2_t   & v0, const int32x2_t   & v1) { return vqadd_s32(v0, v1); }

////////////////////////////// vsubq ///////////////////////

// vsubq(v0, v1): dispatches on the 128-bit vector type to vsubq_<type>.
// The result has the same vector type as the operands.
inline uint8x16_t vsubq(const uint8x16_t &v0, const uint8x16_t &v1) { return vsubq_u8(v0, v1); }
inline int8x16_t vsubq(const int8x16_t &v0, const int8x16_t &v1) { return vsubq_s8(v0, v1); }
inline uint16x8_t vsubq(const uint16x8_t &v0, const uint16x8_t &v1) { return vsubq_u16(v0, v1); }
inline int16x8_t vsubq(const int16x8_t &v0, const int16x8_t &v1) { return vsubq_s16(v0, v1); }
inline uint32x4_t vsubq(const uint32x4_t &v0, const uint32x4_t &v1) { return vsubq_u32(v0, v1); }
inline int32x4_t vsubq(const int32x4_t &v0, const int32x4_t &v1) { return vsubq_s32(v0, v1); }
inline float32x4_t vsubq(const float32x4_t &v0, const float32x4_t &v1) { return vsubq_f32(v0, v1); }

////////////////////////////// vsub ///////////////////////

// vsub(v0, v1): dispatches on the 64-bit vector type to vsub_<type>.
// The result has the same vector type as the operands.
inline uint8x8_t vsub(const uint8x8_t &v0, const uint8x8_t &v1) { return vsub_u8(v0, v1); }
inline int8x8_t vsub(const int8x8_t &v0, const int8x8_t &v1) { return vsub_s8(v0, v1); }
inline uint16x4_t vsub(const uint16x4_t &v0, const uint16x4_t &v1) { return vsub_u16(v0, v1); }
inline int16x4_t vsub(const int16x4_t &v0, const int16x4_t &v1) { return vsub_s16(v0, v1); }
inline uint32x2_t vsub(const uint32x2_t &v0, const uint32x2_t &v1) { return vsub_u32(v0, v1); }
inline int32x2_t vsub(const int32x2_t &v0, const int32x2_t &v1) { return vsub_s32(v0, v1); }
inline float32x2_t vsub(const float32x2_t &v0, const float32x2_t &v1) { return vsub_f32(v0, v1); }

////////////////////////////// vqsubq ///////////////////////

// vqsubq(v0, v1): dispatches on the 128-bit integer vector type to
// vqsubq_<type>. The result has the same vector type as the operands.
// Integer-only (8- to 64-bit lanes): there is no f32 overload.
inline uint8x16_t vqsubq(const uint8x16_t &v0, const uint8x16_t &v1) { return vqsubq_u8(v0, v1); }
inline int8x16_t vqsubq(const int8x16_t &v0, const int8x16_t &v1) { return vqsubq_s8(v0, v1); }
inline uint16x8_t vqsubq(const uint16x8_t &v0, const uint16x8_t &v1) { return vqsubq_u16(v0, v1); }
inline int16x8_t vqsubq(const int16x8_t &v0, const int16x8_t &v1) { return vqsubq_s16(v0, v1); }
inline uint32x4_t vqsubq(const uint32x4_t &v0, const uint32x4_t &v1) { return vqsubq_u32(v0, v1); }
inline int32x4_t vqsubq(const int32x4_t &v0, const int32x4_t &v1) { return vqsubq_s32(v0, v1); }
inline uint64x2_t vqsubq(const uint64x2_t &v0, const uint64x2_t &v1) { return vqsubq_u64(v0, v1); }
inline int64x2_t vqsubq(const int64x2_t &v0, const int64x2_t &v1) { return vqsubq_s64(v0, v1); }

////////////////////////////// vqsub ///////////////////////

// vqsub(v0, v1): dispatches on the 64-bit integer vector type to
// vqsub_<type>. The result has the same vector type as the operands.
// Integer-only (8- to 64-bit lanes): there is no f32 overload.
inline uint8x8_t vqsub(const uint8x8_t &v0, const uint8x8_t &v1) { return vqsub_u8(v0, v1); }
inline int8x8_t vqsub(const int8x8_t &v0, const int8x8_t &v1) { return vqsub_s8(v0, v1); }
inline uint16x4_t vqsub(const uint16x4_t &v0, const uint16x4_t &v1) { return vqsub_u16(v0, v1); }
inline int16x4_t vqsub(const int16x4_t &v0, const int16x4_t &v1) { return vqsub_s16(v0, v1); }
inline uint32x2_t vqsub(const uint32x2_t &v0, const uint32x2_t &v1) { return vqsub_u32(v0, v1); }
inline int32x2_t vqsub(const int32x2_t &v0, const int32x2_t &v1) { return vqsub_s32(v0, v1); }
inline uint64x1_t vqsub(const uint64x1_t &v0, const uint64x1_t &v1) { return vqsub_u64(v0, v1); }
inline int64x1_t vqsub(const int64x1_t &v0, const int64x1_t &v1) { return vqsub_s64(v0, v1); }

////////////////////////////// vmull ///////////////////////

// vmull(v0, v1): dispatches on the 64-bit integer vector type to
// vmull_<type>. The result is a 128-bit vector whose lanes are twice as wide
// as the operand lanes (e.g. uint8x8_t operands give uint16x8_t).
inline uint16x8_t vmull(const uint8x8_t &v0, const uint8x8_t &v1) { return vmull_u8(v0, v1); }
inline int16x8_t vmull(const int8x8_t &v0, const int8x8_t &v1) { return vmull_s8(v0, v1); }
inline uint32x4_t vmull(const uint16x4_t &v0, const uint16x4_t &v1) { return vmull_u16(v0, v1); }
inline int32x4_t vmull(const int16x4_t &v0, const int16x4_t &v1) { return vmull_s16(v0, v1); }
inline uint64x2_t vmull(const uint32x2_t &v0, const uint32x2_t &v1) { return vmull_u32(v0, v1); }
inline int64x2_t vmull(const int32x2_t &v0, const int32x2_t &v1) { return vmull_s32(v0, v1); }

////////////////////////////// vrev64q ///////////////////////

// Type-dispatched wrappers over the vrev64q_<suffix> intrinsics, which reverse the
// order of the elements inside each 64-bit half of a 128-bit vector.
// Only 8/16/32-bit element types (and f32) are covered.
inline uint8x16_t  vrev64q(const uint8x16_t  & v) { return vrev64q_u8 (v); }
inline int8x16_t   vrev64q(const int8x16_t   & v) { return vrev64q_s8 (v); }
inline uint16x8_t  vrev64q(const uint16x8_t  & v) { return vrev64q_u16(v); }
inline int16x8_t   vrev64q(const int16x8_t   & v) { return vrev64q_s16(v); }
inline uint32x4_t  vrev64q(const uint32x4_t  & v) { return vrev64q_u32(v); }
inline int32x4_t   vrev64q(const int32x4_t   & v) { return vrev64q_s32(v); }
inline float32x4_t vrev64q(const float32x4_t & v) { return vrev64q_f32(v); }

////////////////////////////// vrev64 ///////////////////////

// Type-dispatched wrappers over the vrev64_<suffix> intrinsics, which reverse the
// order of the elements of a 64-bit vector.
// Only 8/16/32-bit element types (and f32) are covered.
inline uint8x8_t   vrev64(const uint8x8_t   & v) { return vrev64_u8 (v); }
inline int8x8_t    vrev64(const int8x8_t    & v) { return vrev64_s8 (v); }
inline uint16x4_t  vrev64(const uint16x4_t  & v) { return vrev64_u16(v); }
inline int16x4_t   vrev64(const int16x4_t   & v) { return vrev64_s16(v); }
inline uint32x2_t  vrev64(const uint32x2_t  & v) { return vrev64_u32(v); }
inline int32x2_t   vrev64(const int32x2_t   & v) { return vrev64_s32(v); }
inline float32x2_t vrev64(const float32x2_t & v) { return vrev64_f32(v); }

////////////////////////////// vceqq ///////////////////////

// Type-dispatched wrappers over the vceqq_<suffix> intrinsics (lane-wise v0 == v1
// on 128-bit vectors). Whatever the input element type (signed, unsigned or float),
// the result is an unsigned mask vector with the same lane width: all bits set in
// lanes where the comparison holds, zero elsewhere.
inline  uint8x16_t vceqq(const uint8x16_t  & v0, const uint8x16_t  & v1) { return vceqq_u8 (v0, v1); }
inline  uint8x16_t vceqq(const int8x16_t   & v0, const int8x16_t   & v1) { return vceqq_s8 (v0, v1); }
inline  uint16x8_t vceqq(const uint16x8_t  & v0, const uint16x8_t  & v1) { return vceqq_u16(v0, v1); }
inline  uint16x8_t vceqq(const int16x8_t   & v0, const int16x8_t   & v1) { return vceqq_s16(v0, v1); }
inline  uint32x4_t vceqq(const uint32x4_t  & v0, const uint32x4_t  & v1) { return vceqq_u32(v0, v1); }
inline  uint32x4_t vceqq(const int32x4_t   & v0, const int32x4_t   & v1) { return vceqq_s32(v0, v1); }
inline  uint32x4_t vceqq(const float32x4_t & v0, const float32x4_t & v1) { return vceqq_f32(v0, v1); }

////////////////////////////// vceq ///////////////////////

// Type-dispatched wrappers over the vceq_<suffix> intrinsics (lane-wise v0 == v1
// on 64-bit vectors). The result is always an unsigned mask vector with the same
// lane width as the inputs: all bits set where equal, zero elsewhere.
inline   uint8x8_t vceq(const uint8x8_t   & v0, const uint8x8_t   & v1) { return vceq_u8 (v0, v1); }
inline   uint8x8_t vceq(const int8x8_t    & v0, const int8x8_t    & v1) { return vceq_s8 (v0, v1); }
inline  uint16x4_t vceq(const uint16x4_t  & v0, const uint16x4_t  & v1) { return vceq_u16(v0, v1); }
inline  uint16x4_t vceq(const int16x4_t   & v0, const int16x4_t   & v1) { return vceq_s16(v0, v1); }
inline  uint32x2_t vceq(const uint32x2_t  & v0, const uint32x2_t  & v1) { return vceq_u32(v0, v1); }
inline  uint32x2_t vceq(const int32x2_t   & v0, const int32x2_t   & v1) { return vceq_s32(v0, v1); }
inline  uint32x2_t vceq(const float32x2_t & v0, const float32x2_t & v1) { return vceq_f32(v0, v1); }

////////////////////////////// vcgtq ///////////////////////

// Type-dispatched wrappers over the vcgtq_<suffix> intrinsics (lane-wise v0 > v1
// on 128-bit vectors). The result is always an unsigned mask vector with the same
// lane width as the inputs: all bits set where the comparison holds, zero elsewhere.
inline  uint8x16_t vcgtq(const uint8x16_t  & v0, const uint8x16_t  & v1) { return vcgtq_u8 (v0, v1); }
inline  uint8x16_t vcgtq(const int8x16_t   & v0, const int8x16_t   & v1) { return vcgtq_s8 (v0, v1); }
inline  uint16x8_t vcgtq(const uint16x8_t  & v0, const uint16x8_t  & v1) { return vcgtq_u16(v0, v1); }
inline  uint16x8_t vcgtq(const int16x8_t   & v0, const int16x8_t   & v1) { return vcgtq_s16(v0, v1); }
inline  uint32x4_t vcgtq(const uint32x4_t  & v0, const uint32x4_t  & v1) { return vcgtq_u32(v0, v1); }
inline  uint32x4_t vcgtq(const int32x4_t   & v0, const int32x4_t   & v1) { return vcgtq_s32(v0, v1); }
inline  uint32x4_t vcgtq(const float32x4_t & v0, const float32x4_t & v1) { return vcgtq_f32(v0, v1); }

////////////////////////////// vcgt ///////////////////////

// Type-dispatched wrappers over the vcgt_<suffix> intrinsics (lane-wise v0 > v1
// on 64-bit vectors). The result is always an unsigned mask vector with the same
// lane width as the inputs: all bits set where the comparison holds, zero elsewhere.
inline   uint8x8_t vcgt(const uint8x8_t   & v0, const uint8x8_t   & v1) { return vcgt_u8 (v0, v1); }
inline   uint8x8_t vcgt(const int8x8_t    & v0, const int8x8_t    & v1) { return vcgt_s8 (v0, v1); }
inline  uint16x4_t vcgt(const uint16x4_t  & v0, const uint16x4_t  & v1) { return vcgt_u16(v0, v1); }
inline  uint16x4_t vcgt(const int16x4_t   & v0, const int16x4_t   & v1) { return vcgt_s16(v0, v1); }
inline  uint32x2_t vcgt(const uint32x2_t  & v0, const uint32x2_t  & v1) { return vcgt_u32(v0, v1); }
inline  uint32x2_t vcgt(const int32x2_t   & v0, const int32x2_t   & v1) { return vcgt_s32(v0, v1); }
inline  uint32x2_t vcgt(const float32x2_t & v0, const float32x2_t & v1) { return vcgt_f32(v0, v1); }

////////////////////////////// vcgeq ///////////////////////

// Type-dispatched wrappers over the vcgeq_<suffix> intrinsics (lane-wise v0 >= v1
// on 128-bit vectors). The result is always an unsigned mask vector with the same
// lane width as the inputs: all bits set where the comparison holds, zero elsewhere.
inline  uint8x16_t vcgeq(const uint8x16_t  & v0, const uint8x16_t  & v1) { return vcgeq_u8 (v0, v1); }
inline  uint8x16_t vcgeq(const int8x16_t   & v0, const int8x16_t   & v1) { return vcgeq_s8 (v0, v1); }
inline  uint16x8_t vcgeq(const uint16x8_t  & v0, const uint16x8_t  & v1) { return vcgeq_u16(v0, v1); }
inline  uint16x8_t vcgeq(const int16x8_t   & v0, const int16x8_t   & v1) { return vcgeq_s16(v0, v1); }
inline  uint32x4_t vcgeq(const uint32x4_t  & v0, const uint32x4_t  & v1) { return vcgeq_u32(v0, v1); }
inline  uint32x4_t vcgeq(const int32x4_t   & v0, const int32x4_t   & v1) { return vcgeq_s32(v0, v1); }
inline  uint32x4_t vcgeq(const float32x4_t & v0, const float32x4_t & v1) { return vcgeq_f32(v0, v1); }

////////////////////////////// vcge ///////////////////////

// Type-dispatched wrappers over the vcge_<suffix> intrinsics (lane-wise v0 >= v1
// on 64-bit vectors). The result is always an unsigned mask vector with the same
// lane width as the inputs: all bits set where the comparison holds, zero elsewhere.
inline   uint8x8_t vcge(const uint8x8_t   & v0, const uint8x8_t   & v1) { return vcge_u8 (v0, v1); }
inline   uint8x8_t vcge(const int8x8_t    & v0, const int8x8_t    & v1) { return vcge_s8 (v0, v1); }
inline  uint16x4_t vcge(const uint16x4_t  & v0, const uint16x4_t  & v1) { return vcge_u16(v0, v1); }
inline  uint16x4_t vcge(const int16x4_t   & v0, const int16x4_t   & v1) { return vcge_s16(v0, v1); }
inline  uint32x2_t vcge(const uint32x2_t  & v0, const uint32x2_t  & v1) { return vcge_u32(v0, v1); }
inline  uint32x2_t vcge(const int32x2_t   & v0, const int32x2_t   & v1) { return vcge_s32(v0, v1); }
inline  uint32x2_t vcge(const float32x2_t & v0, const float32x2_t & v1) { return vcge_f32(v0, v1); }

////////////////////////////// vandq ///////////////////////

// Type-dispatched wrappers over the vandq_<suffix> intrinsics (bitwise AND of two
// 128-bit vectors). Only 8/16/32-bit integer element types are covered here.
inline  uint8x16_t vandq(const uint8x16_t  & v0, const uint8x16_t  & v1) { return vandq_u8 (v0, v1); }
inline   int8x16_t vandq(const int8x16_t   & v0, const int8x16_t   & v1) { return vandq_s8 (v0, v1); }
inline  uint16x8_t vandq(const uint16x8_t  & v0, const uint16x8_t  & v1) { return vandq_u16(v0, v1); }
inline   int16x8_t vandq(const int16x8_t   & v0, const int16x8_t   & v1) { return vandq_s16(v0, v1); }
inline  uint32x4_t vandq(const uint32x4_t  & v0, const uint32x4_t  & v1) { return vandq_u32(v0, v1); }
inline   int32x4_t vandq(const int32x4_t   & v0, const int32x4_t   & v1) { return vandq_s32(v0, v1); }

////////////////////////////// vand ///////////////////////

// Type-dispatched wrappers over the vand_<suffix> intrinsics (bitwise AND of two
// 64-bit vectors). Only 8/16/32-bit integer element types are covered here.
inline   uint8x8_t vand(const uint8x8_t   & v0, const uint8x8_t   & v1) { return vand_u8 (v0, v1); }
inline    int8x8_t vand(const int8x8_t    & v0, const int8x8_t    & v1) { return vand_s8 (v0, v1); }
inline  uint16x4_t vand(const uint16x4_t  & v0, const uint16x4_t  & v1) { return vand_u16(v0, v1); }
inline   int16x4_t vand(const int16x4_t   & v0, const int16x4_t   & v1) { return vand_s16(v0, v1); }
inline  uint32x2_t vand(const uint32x2_t  & v0, const uint32x2_t  & v1) { return vand_u32(v0, v1); }
inline   int32x2_t vand(const int32x2_t   & v0, const int32x2_t   & v1) { return vand_s32(v0, v1); }

////////////////////////////// vmovn ///////////////////////

// Type-dispatched wrappers over the vmovn_<suffix> intrinsics (narrowing move).
// Input is a 128-bit vector; output is a 64-bit vector with the same number of
// lanes, each half as wide. No saturation is applied (see vqmovn for that).
inline uint8x8_t   vmovn(const uint16x8_t  & v) { return vmovn_u16(v); }
inline int8x8_t    vmovn(const int16x8_t   & v) { return vmovn_s16(v); }
inline uint16x4_t  vmovn(const uint32x4_t  & v) { return vmovn_u32(v); }
inline int16x4_t   vmovn(const int32x4_t   & v) { return vmovn_s32(v); }
inline uint32x2_t  vmovn(const uint64x2_t  & v) { return vmovn_u64(v); }
inline int32x2_t   vmovn(const int64x2_t   & v) { return vmovn_s64(v); }

////////////////////////////// vqmovn ///////////////////////

// Type-dispatched wrappers over the vqmovn_<suffix> intrinsics (saturating
// narrowing move). Input is a 128-bit vector; output is a 64-bit vector with the
// same number of lanes, each half as wide, with out-of-range values saturated.
inline uint8x8_t   vqmovn(const uint16x8_t  & v) { return vqmovn_u16(v); }
inline int8x8_t    vqmovn(const int16x8_t   & v) { return vqmovn_s16(v); }
inline uint16x4_t  vqmovn(const uint32x4_t  & v) { return vqmovn_u32(v); }
inline int16x4_t   vqmovn(const int32x4_t   & v) { return vqmovn_s32(v); }
inline uint32x2_t  vqmovn(const uint64x2_t  & v) { return vqmovn_u64(v); }
inline int32x2_t   vqmovn(const int64x2_t   & v) { return vqmovn_s64(v); }

////////////////////////////// vmovl ///////////////////////

// Type-dispatched wrappers over the vmovl_<suffix> intrinsics (widening move).
// Input is a 64-bit vector; output is a 128-bit vector with the same number of
// lanes, each twice as wide (zero-extended for unsigned, sign-extended for signed).
// The 32 -> 64 bit overloads mirror the 64 -> 32 bit ones offered by vmovn/vqmovn,
// so generic code can widen every element width that it can narrow.
inline uint16x8_t  vmovl(const uint8x8_t   & v) { return  vmovl_u8(v); }
inline int16x8_t   vmovl(const int8x8_t    & v) { return  vmovl_s8(v); }
inline uint32x4_t  vmovl(const uint16x4_t  & v) { return vmovl_u16(v); }
inline int32x4_t   vmovl(const int16x4_t   & v) { return vmovl_s16(v); }
inline uint64x2_t  vmovl(const uint32x2_t  & v) { return vmovl_u32(v); }
inline int64x2_t   vmovl(const int32x2_t   & v) { return vmovl_s32(v); }

////////////////////////////// vmvnq ///////////////////////

// Type-dispatched wrappers over the vmvnq_<suffix> intrinsics (bitwise NOT of a
// 128-bit vector). Only 8/16/32-bit integer element types are covered.
inline uint8x16_t  vmvnq(const uint8x16_t  & v) { return vmvnq_u8 (v); }
inline int8x16_t   vmvnq(const int8x16_t   & v) { return vmvnq_s8 (v); }
inline uint16x8_t  vmvnq(const uint16x8_t  & v) { return vmvnq_u16(v); }
inline int16x8_t   vmvnq(const int16x8_t   & v) { return vmvnq_s16(v); }
inline uint32x4_t  vmvnq(const uint32x4_t  & v) { return vmvnq_u32(v); }
inline int32x4_t   vmvnq(const int32x4_t   & v) { return vmvnq_s32(v); }

////////////////////////////// vmvn ///////////////////////

// Type-dispatched wrappers over the vmvn_<suffix> intrinsics (bitwise NOT of a
// 64-bit vector). Only 8/16/32-bit integer element types are covered.
inline uint8x8_t   vmvn(const uint8x8_t   & v) { return vmvn_u8 (v); }
inline int8x8_t    vmvn(const int8x8_t    & v) { return vmvn_s8 (v); }
inline uint16x4_t  vmvn(const uint16x4_t  & v) { return vmvn_u16(v); }
inline int16x4_t   vmvn(const int16x4_t   & v) { return vmvn_s16(v); }
inline uint32x2_t  vmvn(const uint32x2_t  & v) { return vmvn_u32(v); }
inline int32x2_t   vmvn(const int32x2_t   & v) { return vmvn_s32(v); }

////////////////////////////// vbicq ///////////////////////

// Type-dispatched wrappers over the vbicq_<suffix> intrinsics (bit clear on
// 128-bit vectors): the result is v0 AND (NOT v1), i.e. the bits set in v1 are
// cleared in v0. Argument order matters.
inline  uint8x16_t vbicq(const uint8x16_t  & v0, const uint8x16_t  & v1) { return vbicq_u8 (v0, v1); }
inline   int8x16_t vbicq(const int8x16_t   & v0, const int8x16_t   & v1) { return vbicq_s8 (v0, v1); }
inline  uint16x8_t vbicq(const uint16x8_t  & v0, const uint16x8_t  & v1) { return vbicq_u16(v0, v1); }
inline   int16x8_t vbicq(const int16x8_t   & v0, const int16x8_t   & v1) { return vbicq_s16(v0, v1); }
inline  uint32x4_t vbicq(const uint32x4_t  & v0, const uint32x4_t  & v1) { return vbicq_u32(v0, v1); }
inline   int32x4_t vbicq(const int32x4_t   & v0, const int32x4_t   & v1) { return vbicq_s32(v0, v1); }
inline  uint64x2_t vbicq(const uint64x2_t  & v0, const uint64x2_t  & v1) { return vbicq_u64(v0, v1); }
inline   int64x2_t vbicq(const int64x2_t   & v0, const int64x2_t   & v1) { return vbicq_s64(v0, v1); }

////////////////////////////// vbic ///////////////////////

// Type-dispatched wrappers over the vbic_<suffix> intrinsics (bit clear on
// 64-bit vectors): the result is v0 AND (NOT v1), i.e. the bits set in v1 are
// cleared in v0. Argument order matters.
inline   uint8x8_t vbic(const uint8x8_t   & v0, const uint8x8_t   & v1) { return vbic_u8 (v0, v1); }
inline    int8x8_t vbic(const int8x8_t    & v0, const int8x8_t    & v1) { return vbic_s8 (v0, v1); }
inline  uint16x4_t vbic(const uint16x4_t  & v0, const uint16x4_t  & v1) { return vbic_u16(v0, v1); }
inline   int16x4_t vbic(const int16x4_t   & v0, const int16x4_t   & v1) { return vbic_s16(v0, v1); }
inline  uint32x2_t vbic(const uint32x2_t  & v0, const uint32x2_t  & v1) { return vbic_u32(v0, v1); }
inline   int32x2_t vbic(const int32x2_t   & v0, const int32x2_t   & v1) { return vbic_s32(v0, v1); }
inline  uint64x1_t vbic(const uint64x1_t  & v0, const uint64x1_t  & v1) { return vbic_u64(v0, v1); }
inline   int64x1_t vbic(const int64x1_t   & v0, const int64x1_t   & v1) { return vbic_s64(v0, v1); }

////////////////////////////// vtransform ///////////////////////

// Applies a binary functor element by element to two 2D sources and stores the
// result in a third buffer: conceptually dst(y, x) = op(src0(y, x), src1(y, x)).
//
// Requirements on Op:
//   - a nested typedef `type` naming the element type of all three buffers;
//   - callable as op(vec128, vec128, vec128 &), op(vec64, vec64, vec64 &) and
//     op(const type *, const type *, type *), where vec128 / vec64 are taken
//     from VecTraits<type>. The third argument receives the result.
//
// Parameters:
//   size                    region extent in elements (taken by value; the local
//                           copy may be reshaped, the caller's object is untouched)
//   src0Base / src0Stride   first source and its row stride
//   src1Base / src1Stride   second source and its row stride
//   dstBase  / dstStride    destination and its row stride
//   op                      the functor described above
// Row pointers are obtained from internal::getRowPtr(base, stride, y).
//
// Every row is consumed in three stages: blocks of 32 bytes (two 128-bit vectors),
// blocks of 8 bytes (one 64-bit vector), and finally one element at a time.
template <typename Op>
void vtransform(Size2D size,
                const typename Op::type * src0Base, ptrdiff_t src0Stride,
                const typename Op::type * src1Base, ptrdiff_t src1Stride,
                typename Op::type * dstBase, ptrdiff_t dstStride, const Op & op)
{
    typedef typename Op::type T;
    typedef typename VecTraits<T>::vec128 QVec;
    typedef typename VecTraits<T>::vec64 DVec;

    // If all three buffers share one stride and that stride equals the byte size
    // of a row, rows are back to back and the region can be walked as one long row.
    const bool sameStrides = (src0Stride == src1Stride) && (src0Stride == dstStride);
    if (sameStrides && src0Stride == static_cast<ptrdiff_t>(size.width * sizeof(T)))
    {
        size.width *= size.height;
        size.height = 1;
    }

    const size_t lanes128 = 16 / sizeof(T);    // elements held by one 128-bit vector
    const size_t wideStep = 32 / sizeof(T);    // elements consumed per main-loop iteration
    const size_t narrowStep = 8 / sizeof(T);   // elements held by one 64-bit vector

    // Exclusive bounds on the starting column for which a full step still fits
    // inside the row; 0 when the row is shorter than one step.
    const size_t wideEnd = size.width >= (wideStep - 1) ? size.width - wideStep + 1 : 0;
    const size_t narrowEnd = size.width >= (narrowStep - 1) ? size.width - narrowStep + 1 : 0;

    for (size_t row = 0; row < size.height; ++row)
    {
        const T * lhs = internal::getRowPtr(src0Base, src0Stride, row);
        const T * rhs = internal::getRowPtr(src1Base, src1Stride, row);
        T * out = internal::getRowPtr(dstBase, dstStride, row);
        size_t col = 0;

        // Stage 1: two 128-bit vectors per source and iteration.
        while (col < wideEnd)
        {
            internal::prefetch(lhs + col);
            internal::prefetch(rhs + col);

            // All four loads are issued before the first store.
            QVec lhsLo = vld1q(lhs + col);
            QVec lhsHi = vld1q(lhs + col + lanes128);
            QVec rhsLo = vld1q(rhs + col);
            QVec rhsHi = vld1q(rhs + col + lanes128);
            QVec result;

            op(lhsLo, rhsLo, result);
            vst1q(out + col, result);

            op(lhsHi, rhsHi, result);
            vst1q(out + col + lanes128, result);

            col += wideStep;
        }

        // Stage 2: one 64-bit vector per source and iteration.
        while (col < narrowEnd)
        {
            DVec lhsVec = vld1(lhs + col);
            DVec rhsVec = vld1(rhs + col);
            DVec result;

            op(lhsVec, rhsVec, result);
            vst1(out + col, result);

            col += narrowStep;
        }

        // Stage 3: leftover elements through the pointer-based overload of op.
        while (col < size.width)
        {
            op(lhs + col, rhs + col, out + col);
            ++col;
        }
    }
}

} }

#endif // CAROTENE_NEON

#endif