/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */

#include "common.hpp"
#include "saturate_cast.hpp"
#include <vector>
#include <float.h> // For FLT_EPSILON

namespace CAROTENE_NS {

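// CV_DESCALE(x, n): rounding right shift -- adds the bias (1 << (n-1)) before
// shifting right by n bits, i.e. divides by 2^n rounding to nearest.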
#define CV_DESCALE(x,n)     (((x) + (1 << ((n)-1))) >> (n))

/*
 *        Pyramidal Lucas-Kanade Optical Flow level processing
 *
 * For each input point on a single pyramid level: sample a window of the
 * previous image and its spatial derivatives with bilinear interpolation,
 * build the 2x2 gradient matrix, then iteratively refine the flow vector on
 * the next image until terminationCount iterations have run or the squared
 * update step is no larger than terminationEpsilon.
 */
void pyrLKOptFlowLevel(const Size2D &size, s32 cn,
                       const u8 *prevData, ptrdiff_t prevStride,
                       const s16 *prevDerivData, ptrdiff_t prevDerivStride,
                       const u8 *nextData, ptrdiff_t nextStride,
                       u32 ptCount,
                       const f32 *prevPts, f32 *nextPts,
                       u8 *status, f32 *err,
                       const Size2D &winSize,
                       u32 terminationCount, f64 terminationEpsilon,
                       u32 level, u32 maxLevel, bool useInitialFlow, bool getMinEigenVals,
                       f32 minEigThreshold)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    f32 halfWinX = (winSize.width-1)*0.5f, halfWinY = (winSize.height-1)*0.5f;
    s32 cn2 = cn*2;

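    // _buf packs the two windows that are rebuilt for every tracked point:
    // IWinBuf holds the bilinearly interpolated intensity patch (cn values per
    // pixel), derivIWinBuf the interleaved (Ix, Iy) derivative patch (cn2 = 2*cn).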
    std::vector<s16> _buf(winSize.total()*(cn + cn2));
    s16* IWinBuf = &_buf[0];
    s32  IWinBufStride = winSize.width*cn;
    s16* derivIWinBuf = &_buf[winSize.total()*cn];
    s32  derivIWinBufStride = winSize.width*cn2;

    for( u32 ptidx = 0; ptidx < ptCount; ptidx++ )
    {
        f32 levscale = 1.f/(1 << level);
        u32 ptref = ptidx << 1;
        f32 prevPtX = prevPts[ptref+0]*levscale;
        f32 prevPtY = prevPts[ptref+1]*levscale;
        f32 nextPtX;
        f32 nextPtY;
        if( level == maxLevel )
        {
            if( useInitialFlow )
            {
                nextPtX = nextPts[ptref+0]*levscale;
                nextPtY = nextPts[ptref+1]*levscale;
            }
            else
            {
                nextPtX = prevPtX;
                nextPtY = prevPtY;
            }
        }
        else
        {
            nextPtX = nextPts[ptref+0]*2.f;
            nextPtY = nextPts[ptref+1]*2.f;
        }
        nextPts[ptref+0] = nextPtX;
        nextPts[ptref+1] = nextPtY;

        s32 iprevPtX, iprevPtY;
        s32 inextPtX, inextPtY;
        prevPtX -= halfWinX;
        prevPtY -= halfWinY;
        iprevPtX = floor(prevPtX);
        iprevPtY = floor(prevPtY);

        if( iprevPtX < -(s32)winSize.width || iprevPtX >= (s32)size.width ||
            iprevPtY < -(s32)winSize.height || iprevPtY >= (s32)size.height )
        {
            if( level == 0 )
            {
                if( status )
                    status[ptidx] = false;
                if( err )
                    err[ptidx] = 0;
            }
            continue;
        }

        f32 a = prevPtX - iprevPtX;
        f32 b = prevPtY - iprevPtY;
        const s32 W_BITS = 14, W_BITS1 = 14;
        const f32 FLT_SCALE = 1.f/(1 << 20);
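        // Bilinear interpolation weights for the fractional offset (a, b) in
        // Q14 fixed point; the four weights sum exactly to (1 << W_BITS).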
        s32 iw00 = round((1.f - a)*(1.f - b)*(1 << W_BITS));
        s32 iw01 = round(a*(1.f - b)*(1 << W_BITS));
        s32 iw10 = round((1.f - a)*b*(1 << W_BITS));
        s32 iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;

        s32 dstep = prevDerivStride/sizeof(s16);
        f32 A11 = 0, A12 = 0, A22 = 0;

        int16x4_t viw00 = vmov_n_s16((s16)iw00);
        int16x4_t viw01 = vmov_n_s16((s16)iw01);
        int16x4_t viw10 = vmov_n_s16((s16)iw10);
        int16x4_t viw11 = vmov_n_s16((s16)iw11);

        float32x4_t vA11 = vmovq_n_f32(0);
        float32x4_t vA12 = vmovq_n_f32(0);
        float32x4_t vA22 = vmovq_n_f32(0);

        s32 wwcn = winSize.width*cn;

        // extract the patch from the first image, compute covariance matrix of derivatives
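        //   G = | A11 A12 | = sum over the window of | Ix*Ix  Ix*Iy |
        //       | A12 A22 |                          | Ix*Iy  Iy*Iy |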
        s32 x = 0;
        for(s32 y = 0; y < (s32)winSize.height; y++ )
        {
            const u8* src = prevData + prevStride*(y + iprevPtY) + iprevPtX*cn;
            const s16* dsrc = prevDerivData + dstep*(y + iprevPtY) + iprevPtX*cn2;

            s16* Iptr = IWinBuf + y*IWinBufStride;
            s16* dIptr = derivIWinBuf + y*derivIWinBufStride;

            internal::prefetch(src + x + prevStride * 2, 0);
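            // 8-wide bilinear interpolation of the intensity patch: all four
            // taps are accumulated into two s32 accumulators (one per half of
            // the vector) and narrowed with a rounding shift by W_BITS1-5.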
            for(x = 0; x <= wwcn - 8; x += 8)
            {
                uint8x8_t vsrc00 = vld1_u8(src + x);
                uint8x8_t vsrc10 = vld1_u8(src + x + prevStride);
                uint8x8_t vsrc01 = vld1_u8(src + x + cn);
                uint8x8_t vsrc11 = vld1_u8(src + x + prevStride + cn);

                int16x8_t vs00 = vreinterpretq_s16_u16(vmovl_u8(vsrc00));
                int16x8_t vs10 = vreinterpretq_s16_u16(vmovl_u8(vsrc10));
                int16x8_t vs01 = vreinterpretq_s16_u16(vmovl_u8(vsrc01));
                int16x8_t vs11 = vreinterpretq_s16_u16(vmovl_u8(vsrc11));

                int32x4_t vsuml = vmull_s16(vget_low_s16(vs00), viw00);
                int32x4_t vsumh = vmull_s16(vget_high_s16(vs10), viw10);

                vsuml = vmlal_s16(vsuml, vget_low_s16(vs01), viw01);
                vsumh = vmlal_s16(vsumh, vget_high_s16(vs11), viw11);

                vsuml = vmlal_s16(vsuml, vget_low_s16(vs10), viw10);
                vsumh = vmlal_s16(vsumh, vget_high_s16(vs00), viw00);

                vsuml = vmlal_s16(vsuml, vget_low_s16(vs11), viw11);
                vsumh = vmlal_s16(vsumh, vget_high_s16(vs01), viw01);

                int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5);
                int16x4_t vsumnh = vrshrn_n_s32(vsumh, W_BITS1-5);

                vst1q_s16(Iptr + x, vcombine_s16(vsumnl, vsumnh));
            }
            for(; x <= wwcn - 4; x += 4)
            {
                uint8x8_t vsrc00 = vld1_u8(src + x);
                uint8x8_t vsrc10 = vld1_u8(src + x + prevStride);
                uint8x8_t vsrc01 = vld1_u8(src + x + cn);
                uint8x8_t vsrc11 = vld1_u8(src + x + prevStride + cn);

                int16x4_t vs00 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc00)));
                int16x4_t vs10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc10)));
                int16x4_t vs01 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc01)));
                int16x4_t vs11 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vsrc11)));

                int32x4_t vsuml1 = vmull_s16(vs00, viw00);
                int32x4_t vsuml2 = vmull_s16(vs01, viw01);
                vsuml1 = vmlal_s16(vsuml1, vs10, viw10);
                vsuml2 = vmlal_s16(vsuml2, vs11, viw11);
                int32x4_t vsuml = vaddq_s32(vsuml1, vsuml2);

                int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5);

                vst1_s16(Iptr + x, vsumnl);
            }

            internal::prefetch(dsrc + dstep * 2, 0);
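            // 4-wide bilinear interpolation of the (Ix, Iy) derivative patch;
            // the rounded results also feed the G matrix accumulators.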
            for(x = 0; x <= wwcn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 )
            {
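                // The #if 0 branch keeps a disabled hand-written NEON assembly
                // variant; the #else branch does the same work with intrinsics.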
#if 0
                __asm__ (
                    "vld2.16 {d0-d1}, [%[dsrc00]]                         \n\t"
                    "vld2.16 {d2-d3}, [%[dsrc10]]                         \n\t"
                    "vld2.16 {d4-d5}, [%[dsrc01]]                         \n\t"
                    "vld2.16 {d6-d7}, [%[dsrc11]]                         \n\t"
                    "vmull.s16 q4, d3, %P[viw10]                           \n\t"
                    "vmull.s16 q5, d0, %P[viw00]                           \n\t"
                    "vmlal.s16 q4, d7, %P[viw11]                           \n\t"
                    "vmlal.s16 q5, d4, %P[viw01]                           \n\t"
                    "vmlal.s16 q4, d1, %P[viw00]                           \n\t"
                    "vmlal.s16 q5, d2, %P[viw10]                           \n\t"
                    "vmlal.s16 q4, d5, %P[viw01]                           \n\t"
                    "vmlal.s16 q5, d6, %P[viw11]                            \n\t"
                    "vrshrn.s32 d13, q4, %[W_BITS1]                       \n\t"
                    "vrshrn.s32 d12, q5, %[W_BITS1]                       \n\t"
                    "vmull.s16 q3, d13, d13                               \n\t"
                    "vmull.s16 q4, d12, d12                               \n\t"
                    "vmull.s16 q5, d13, d12                               \n\t"
                    "vcvt.f32.s32 q3, q3                                  \n\t"
                    "vcvt.f32.s32 q4, q4                                  \n\t"
                    "vcvt.f32.s32 q5, q5                                  \n\t"
                    "vadd.f32 %q[vA22], q3                                \n\t"
                    "vadd.f32 %q[vA11], q4                                \n\t"
                    "vadd.f32 %q[vA12], q5                                \n\t"
                    "vst2.16 {d12-d13}, [%[out]]                          \n\t"
                    : [vA22] "=w" (vA22),
                      [vA11] "=w" (vA11),
                      [vA12] "=w" (vA12)
                    : "0" (vA22),
                      "1" (vA11),
                      "2" (vA12),
                      [out] "r" (dIptr),
                      [dsrc00] "r" (dsrc),
                      [dsrc10] "r" (dsrc + dstep),
                      [dsrc01] "r" (dsrc + cn2),
                      [dsrc11] "r" (dsrc + dstep + cn2),
                      [viw00] "w" (viw00),
                      [viw10] "w" (viw10),
                      [viw01] "w" (viw01),
                      [viw11] "w" (viw11),
                      [W_BITS1] "I" (W_BITS1)
                    : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
                );
#else
                int16x4x2_t vdsrc00 = vld2_s16(dsrc);
                int16x4x2_t vdsrc10 = vld2_s16(dsrc + dstep);
                int16x4x2_t vdsrc01 = vld2_s16(dsrc + cn2);
                int16x4x2_t vdsrc11 = vld2_s16(dsrc + dstep + cn2);

                int32x4_t vsumy = vmull_s16(vdsrc10.val[1], viw10);
                int32x4_t vsumx = vmull_s16(vdsrc00.val[0], viw00);

                vsumy = vmlal_s16(vsumy, vdsrc11.val[1], viw11);
                vsumx = vmlal_s16(vsumx, vdsrc01.val[0], viw01);

                vsumy = vmlal_s16(vsumy, vdsrc00.val[1], viw00);
                vsumx = vmlal_s16(vsumx, vdsrc10.val[0], viw10);

                vsumy = vmlal_s16(vsumy, vdsrc01.val[1], viw01);
                vsumx = vmlal_s16(vsumx, vdsrc11.val[0], viw11);

                int16x4_t vsumny = vrshrn_n_s32(vsumy, W_BITS1);
                int16x4_t vsumnx = vrshrn_n_s32(vsumx, W_BITS1);

                int32x4_t va22i = vmull_s16(vsumny, vsumny);
                int32x4_t va11i = vmull_s16(vsumnx, vsumnx);
                int32x4_t va12i = vmull_s16(vsumnx, vsumny);

                float32x4_t va22f = vcvtq_f32_s32(va22i);
                float32x4_t va11f = vcvtq_f32_s32(va11i);
                float32x4_t va12f = vcvtq_f32_s32(va12i);

                vA22 = vaddq_f32(vA22, va22f);
                vA11 = vaddq_f32(vA11, va11f);
                vA12 = vaddq_f32(vA12, va12f);

                int16x4x2_t vsum;
                vsum.val[0] = vsumnx;
                vsum.val[1] = vsumny;
                vst2_s16(dIptr, vsum);
#endif
            }

            for( ; x < wwcn; x++, dsrc += 2, dIptr += 2 )
            {
                s32 ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 +
                                      src[x+prevStride]*iw10 + src[x+prevStride+cn]*iw11, W_BITS1-5);
                s32 ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 +
                                       dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1);
                s32 iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 +
                                       dsrc[dstep+cn2+1]*iw11, W_BITS1);
                Iptr[x] = (s16)ival;
                dIptr[0] = (s16)ixval;
                dIptr[1] = (s16)iyval;

                A11 += (f32)(ixval*ixval);
                A12 += (f32)(ixval*iyval);
                A22 += (f32)(iyval*iyval);
            }
        }

        f32 A11buf[2], A12buf[2], A22buf[2];
        vst1_f32(A11buf, vadd_f32(vget_low_f32(vA11), vget_high_f32(vA11)));
        vst1_f32(A12buf, vadd_f32(vget_low_f32(vA12), vget_high_f32(vA12)));
        vst1_f32(A22buf, vadd_f32(vget_low_f32(vA22), vget_high_f32(vA22)));
        A11 += A11buf[0] + A11buf[1];
        A12 += A12buf[0] + A12buf[1];
        A22 += A22buf[0] + A22buf[1];

        A11 *= FLT_SCALE;
        A12 *= FLT_SCALE;
        A22 *= FLT_SCALE;

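        // Determinant and minimum eigenvalue of the 2x2 gradient matrix G;
        // minEig is normalized by the window area (per-pixel value) before
        // being compared against minEigThreshold.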
        f32 D = A11*A22 - A12*A12;
        f32 minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) +
                        4.f*A12*A12))/(2*winSize.width*winSize.height);

        if( err && getMinEigenVals )
            err[ptidx] = (f32)minEig;

        if( minEig < minEigThreshold || D < FLT_EPSILON )
        {
            if( level == 0 && status )
                status[ptidx] = false;
            continue;
        }

        D = 1.f/D;

        nextPtX -= halfWinX;
        nextPtY -= halfWinY;
        f32 prevDeltaX = 0;
        f32 prevDeltaY = 0;

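        // Iterative refinement: each iteration samples the next-image window J
        // at the current estimate, accumulates the mismatch vector
        //   b = | b1 | = sum over the window of | (J - I)*Ix |
        //       | b2 |                          | (J - I)*Iy |
        // and moves the point by delta = -G^{-1} * b.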
        for(u32 j = 0; j < terminationCount; j++ )
        {
            inextPtX = floor(nextPtX);
            inextPtY = floor(nextPtY);

            if( inextPtX < -(s32)winSize.width || inextPtX >= (s32)size.width ||
               inextPtY < -(s32)winSize.height || inextPtY >= (s32)size.height )
            {
                if( level == 0 && status )
                    status[ptidx] = false;
                break;
            }

            a = nextPtX - inextPtX;
            b = nextPtY - inextPtY;
            iw00 = round((1.f - a)*(1.f - b)*(1 << W_BITS));
            iw01 = round(a*(1.f - b)*(1 << W_BITS));
            iw10 = round((1.f - a)*b*(1 << W_BITS));
            iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
            f32 b1 = 0, b2 = 0;

            viw00 = vmov_n_s16((s16)iw00);
            viw01 = vmov_n_s16((s16)iw01);
            viw10 = vmov_n_s16((s16)iw10);
            viw11 = vmov_n_s16((s16)iw11);

            float32x4_t vb1 = vmovq_n_f32(0);
            float32x4_t vb2 = vmovq_n_f32(0);

            for(s32 y = 0; y < (s32)winSize.height; y++ )
            {
                const u8* Jptr = nextData + nextStride*(y + inextPtY) + inextPtX*cn;
                const s16* Iptr = IWinBuf + y*IWinBufStride;
                const s16* dIptr = derivIWinBuf + y*derivIWinBufStride;

                x = 0;

                internal::prefetch(Jptr, nextStride * 2);
                internal::prefetch(Iptr, IWinBufStride/2);
                internal::prefetch(dIptr, derivIWinBufStride/2);

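                // 8-wide: interpolate J, take the saturating difference with
                // the stored template values, and accumulate diff*Ix and
                // diff*Iy into the b1/b2 partial sums.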
                for( ; x <= wwcn - 8; x += 8, dIptr += 8*2 )
                {
                    uint8x8_t vj00 = vld1_u8(Jptr + x);
                    uint8x8_t vj10 = vld1_u8(Jptr + x + nextStride);
                    uint8x8_t vj01 = vld1_u8(Jptr + x + cn);
                    uint8x8_t vj11 = vld1_u8(Jptr + x + nextStride + cn);
                    int16x8_t vI = vld1q_s16(Iptr + x);
                    int16x8x2_t vDerivI = vld2q_s16(dIptr);

                    int16x8_t vs00 = vreinterpretq_s16_u16(vmovl_u8(vj00));
                    int16x8_t vs10 = vreinterpretq_s16_u16(vmovl_u8(vj10));
                    int16x8_t vs01 = vreinterpretq_s16_u16(vmovl_u8(vj01));
                    int16x8_t vs11 = vreinterpretq_s16_u16(vmovl_u8(vj11));

                    int32x4_t vsuml = vmull_s16(vget_low_s16(vs00), viw00);
                    int32x4_t vsumh = vmull_s16(vget_high_s16(vs10), viw10);

                    vsuml = vmlal_s16(vsuml, vget_low_s16(vs01), viw01);
                    vsumh = vmlal_s16(vsumh, vget_high_s16(vs11), viw11);

                    vsuml = vmlal_s16(vsuml, vget_low_s16(vs10), viw10);
                    vsumh = vmlal_s16(vsumh, vget_high_s16(vs00), viw00);

                    vsuml = vmlal_s16(vsuml, vget_low_s16(vs11), viw11);
                    vsumh = vmlal_s16(vsumh, vget_high_s16(vs01), viw01);

                    int16x4_t vsumnl = vrshrn_n_s32(vsuml, W_BITS1-5);
                    int16x4_t vsumnh = vrshrn_n_s32(vsumh, W_BITS1-5);

                    int16x8_t diff = vqsubq_s16(vcombine_s16(vsumnl, vsumnh), vI);

                    int32x4_t vb1l = vmull_s16(vget_low_s16(diff), vget_low_s16(vDerivI.val[0]));
                    int32x4_t vb2h = vmull_s16(vget_high_s16(diff), vget_high_s16(vDerivI.val[1]));
                    int32x4_t vb1i = vmlal_s16(vb1l, vget_high_s16(diff), vget_high_s16(vDerivI.val[0]));
                    int32x4_t vb2i = vmlal_s16(vb2h, vget_low_s16(diff), vget_low_s16(vDerivI.val[1]));

                    float32x4_t vb1f = vcvtq_f32_s32(vb1i);
                    float32x4_t vb2f = vcvtq_f32_s32(vb2i);

                    vb1 = vaddq_f32(vb1, vb1f);
                    vb2 = vaddq_f32(vb2, vb2f);
                }

                for( ; x < wwcn; x++, dIptr += 2 )
                {
                    s32 diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
                                          Jptr[x+nextStride]*iw10 + Jptr[x+nextStride+cn]*iw11,
                                          W_BITS1-5) - Iptr[x];
                    b1 += (f32)(diff*dIptr[0]);
                    b2 += (f32)(diff*dIptr[1]);
                }
            }

            f32 bbuf[2];
            float32x2_t vb = vpadd_f32(vadd_f32(vget_low_f32(vb1), vget_high_f32(vb1)), vadd_f32(vget_low_f32(vb2), vget_high_f32(vb2)));
            vst1_f32(bbuf, vb);
            b1 += bbuf[0];
            b2 += bbuf[1];

            b1 *= FLT_SCALE;
            b2 *= FLT_SCALE;

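            // Explicit 2x2 solve using the precomputed reciprocal determinant D.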
            f32 deltaX = (f32)((A12*b2 - A22*b1) * D);
            f32 deltaY = (f32)((A12*b1 - A11*b2) * D);

            nextPtX += deltaX;
            nextPtY += deltaY;
            nextPts[ptref+0] = nextPtX + halfWinX;
            nextPts[ptref+1] = nextPtY + halfWinY;

            if( ((double)deltaX*deltaX + (double)deltaY*deltaY) <= terminationEpsilon )
                break;

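            // If the last two steps nearly cancel each other the iteration is
            // oscillating around the solution: split the difference and stop.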
            if( j > 0 && std::abs(deltaX + prevDeltaX) < 0.01 &&
               std::abs(deltaY + prevDeltaY) < 0.01 )
            {
                nextPts[ptref+0] -= deltaX*0.5f;
                nextPts[ptref+1] -= deltaY*0.5f;
                break;
            }
            prevDeltaX = deltaX;
            prevDeltaY = deltaY;
        }

        if( status && status[ptidx] && err && level == 0 && !getMinEigenVals )
        {
            f32 nextPointX = nextPts[ptref+0] - halfWinX;
            f32 nextPointY = nextPts[ptref+1] - halfWinY;

            s32 inextPointX = floor(nextPointX);
            s32 inextPointY = floor(nextPointY);

            if( inextPointX < -(s32)winSize.width || inextPointX >= (s32)size.width ||
                inextPointY < -(s32)winSize.height || inextPointY >= (s32)size.height )
            {
                if( status )
                    status[ptidx] = false;
                continue;
            }

            f32 aa = nextPointX - inextPointX;
            f32 bb = nextPointY - inextPointY;
            iw00 = round((1.f - aa)*(1.f - bb)*(1 << W_BITS));
            iw01 = round(aa*(1.f - bb)*(1 << W_BITS));
            iw10 = round((1.f - aa)*bb*(1 << W_BITS));
            iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
            f32 errval = 0.f;

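            // Tracking error: mean absolute difference between the template
            // window and the matched next-image window; the factor 32 in the
            // normalization compensates for the 2^5 fixed-point scale of the
            // interpolated values.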
            for(s32 y = 0; y < (s32)winSize.height; y++ )
            {
                const u8* Jptr = nextData + nextStride*(y + inextPointY) + inextPointX*cn;
                const s16* Iptr = IWinBuf + y*IWinBufStride;

                for( x = 0; x < wwcn; x++ )
                {
                    s32 diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
                                          Jptr[x+nextStride]*iw10 + Jptr[x+nextStride+cn]*iw11,
                                          W_BITS1-5) - Iptr[x];
                    errval += std::abs((f32)diff);
                }
            }
            err[ptidx] = errval / (32*wwcn*winSize.height);
        }
    }
#else
    (void)size;
    (void)cn;
    (void)prevData;
    (void)prevStride;
    (void)prevDerivData;
    (void)prevDerivStride;
    (void)nextData;
    (void)nextStride;
    (void)prevPts;
    (void)nextPts;
    (void)status;
    (void)err;
    (void)winSize;
    (void)terminationCount;
    (void)terminationEpsilon;
    (void)level;
    (void)maxLevel;
    (void)useInitialFlow;
    (void)getMinEigenVals;
    (void)minEigThreshold;
    (void)ptCount;
#endif
}

}//CAROTENE_NS