/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                        Intel License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of Intel Corporation may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* Haar features calculation */

#include "precomp.hpp"
#include "haar.hpp"

namespace cv_haar_avx
{

// AVX version icvEvalHidHaarClassifier.  Process 8 CvHidHaarClassifiers per call. Check AVX support before invocation!!
#if CV_HAAR_USE_AVX
double icvEvalHidHaarClassifierAVX(CvHidHaarClassifier* classifier,
    double variance_norm_factor, size_t p_offset)
{
    int  CV_DECL_ALIGNED(32) idxV[8] = { 0,0,0,0,0,0,0,0 };
    uchar flags[8] = { 0,0,0,0,0,0,0,0 };
    CvHidHaarTreeNode* nodes[8];
    double res = 0;
    uchar exitConditionFlag = 0;
    for (;;)
    {
        float CV_DECL_ALIGNED(32) tmp[8] = { 0,0,0,0,0,0,0,0 };
        nodes[0] = (classifier + 0)->node + idxV[0];
        nodes[1] = (classifier + 1)->node + idxV[1];
        nodes[2] = (classifier + 2)->node + idxV[2];
        nodes[3] = (classifier + 3)->node + idxV[3];
        nodes[4] = (classifier + 4)->node + idxV[4];
        nodes[5] = (classifier + 5)->node + idxV[5];
        nodes[6] = (classifier + 6)->node + idxV[6];
        nodes[7] = (classifier + 7)->node + idxV[7];

        __m256 t = _mm256_set1_ps(static_cast<float>(variance_norm_factor));

        t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,
            nodes[6]->threshold,
            nodes[5]->threshold,
            nodes[4]->threshold,
            nodes[3]->threshold,
            nodes[2]->threshold,
            nodes[1]->threshold,
            nodes[0]->threshold));

        __m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset),
            calc_sumf(nodes[6]->feature.rect[0], p_offset),
            calc_sumf(nodes[5]->feature.rect[0], p_offset),
            calc_sumf(nodes[4]->feature.rect[0], p_offset),
            calc_sumf(nodes[3]->feature.rect[0], p_offset),
            calc_sumf(nodes[2]->feature.rect[0], p_offset),
            calc_sumf(nodes[1]->feature.rect[0], p_offset),
            calc_sumf(nodes[0]->feature.rect[0], p_offset));

        __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
            nodes[6]->feature.rect[0].weight,
            nodes[5]->feature.rect[0].weight,
            nodes[4]->feature.rect[0].weight,
            nodes[3]->feature.rect[0].weight,
            nodes[2]->feature.rect[0].weight,
            nodes[1]->feature.rect[0].weight,
            nodes[0]->feature.rect[0].weight);

        __m256 sum = _mm256_mul_ps(offset, weight);

        offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset),
            calc_sumf(nodes[6]->feature.rect[1], p_offset),
            calc_sumf(nodes[5]->feature.rect[1], p_offset),
            calc_sumf(nodes[4]->feature.rect[1], p_offset),
            calc_sumf(nodes[3]->feature.rect[1], p_offset),
            calc_sumf(nodes[2]->feature.rect[1], p_offset),
            calc_sumf(nodes[1]->feature.rect[1], p_offset),
            calc_sumf(nodes[0]->feature.rect[1], p_offset));

        weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
            nodes[6]->feature.rect[1].weight,
            nodes[5]->feature.rect[1].weight,
            nodes[4]->feature.rect[1].weight,
            nodes[3]->feature.rect[1].weight,
            nodes[2]->feature.rect[1].weight,
            nodes[1]->feature.rect[1].weight,
            nodes[0]->feature.rect[1].weight);

        sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight));

        if (nodes[0]->feature.rect[2].p0)
            tmp[0] = calc_sumf(nodes[0]->feature.rect[2], p_offset) * nodes[0]->feature.rect[2].weight;
        if (nodes[1]->feature.rect[2].p0)
            tmp[1] = calc_sumf(nodes[1]->feature.rect[2], p_offset) * nodes[1]->feature.rect[2].weight;
        if (nodes[2]->feature.rect[2].p0)
            tmp[2] = calc_sumf(nodes[2]->feature.rect[2], p_offset) * nodes[2]->feature.rect[2].weight;
        if (nodes[3]->feature.rect[2].p0)
            tmp[3] = calc_sumf(nodes[3]->feature.rect[2], p_offset) * nodes[3]->feature.rect[2].weight;
        if (nodes[4]->feature.rect[2].p0)
            tmp[4] = calc_sumf(nodes[4]->feature.rect[2], p_offset) * nodes[4]->feature.rect[2].weight;
        if (nodes[5]->feature.rect[2].p0)
            tmp[5] = calc_sumf(nodes[5]->feature.rect[2], p_offset) * nodes[5]->feature.rect[2].weight;
        if (nodes[6]->feature.rect[2].p0)
            tmp[6] = calc_sumf(nodes[6]->feature.rect[2], p_offset) * nodes[6]->feature.rect[2].weight;
        if (nodes[7]->feature.rect[2].p0)
            tmp[7] = calc_sumf(nodes[7]->feature.rect[2], p_offset) * nodes[7]->feature.rect[2].weight;

        sum = _mm256_add_ps(sum, _mm256_load_ps(tmp));

        __m256 left = _mm256_set_ps(static_cast<float>(nodes[7]->left), static_cast<float>(nodes[6]->left),
            static_cast<float>(nodes[5]->left), static_cast<float>(nodes[4]->left),
            static_cast<float>(nodes[3]->left), static_cast<float>(nodes[2]->left),
            static_cast<float>(nodes[1]->left), static_cast<float>(nodes[0]->left));
        __m256 right = _mm256_set_ps(static_cast<float>(nodes[7]->right), static_cast<float>(nodes[6]->right),
            static_cast<float>(nodes[5]->right), static_cast<float>(nodes[4]->right),
            static_cast<float>(nodes[3]->right), static_cast<float>(nodes[2]->right),
            static_cast<float>(nodes[1]->right), static_cast<float>(nodes[0]->right));

        _mm256_store_si256((__m256i*)idxV, _mm256_cvttps_epi32(_mm256_blendv_ps(right, left, _mm256_cmp_ps(sum, t, _CMP_LT_OQ))));

        for (int i = 0; i < 8; i++)
        {
            if (idxV[i] <= 0)
            {
                if (!flags[i])
                {
                    exitConditionFlag++;
                    flags[i] = 1;
                    res += (classifier + i)->alpha[-idxV[i]];
                }
                idxV[i] = 0;
            }
        }
        if (exitConditionFlag == 8)
            return res;
    }
}

double icvEvalHidHaarStumpClassifierAVX(CvHidHaarClassifier* classifier,
    double variance_norm_factor, size_t p_offset)
{
    float CV_DECL_ALIGNED(32) tmp[8] = { 0,0,0,0,0,0,0,0 };
    CvHidHaarTreeNode* nodes[8];

    nodes[0] = classifier[0].node;
    nodes[1] = classifier[1].node;
    nodes[2] = classifier[2].node;
    nodes[3] = classifier[3].node;
    nodes[4] = classifier[4].node;
    nodes[5] = classifier[5].node;
    nodes[6] = classifier[6].node;
    nodes[7] = classifier[7].node;

    __m256 t = _mm256_set1_ps(static_cast<float>(variance_norm_factor));

    t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,
        nodes[6]->threshold,
        nodes[5]->threshold,
        nodes[4]->threshold,
        nodes[3]->threshold,
        nodes[2]->threshold,
        nodes[1]->threshold,
        nodes[0]->threshold));

    __m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset),
        calc_sumf(nodes[6]->feature.rect[0], p_offset),
        calc_sumf(nodes[5]->feature.rect[0], p_offset),
        calc_sumf(nodes[4]->feature.rect[0], p_offset),
        calc_sumf(nodes[3]->feature.rect[0], p_offset),
        calc_sumf(nodes[2]->feature.rect[0], p_offset),
        calc_sumf(nodes[1]->feature.rect[0], p_offset),
        calc_sumf(nodes[0]->feature.rect[0], p_offset));

    __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
        nodes[6]->feature.rect[0].weight,
        nodes[5]->feature.rect[0].weight,
        nodes[4]->feature.rect[0].weight,
        nodes[3]->feature.rect[0].weight,
        nodes[2]->feature.rect[0].weight,
        nodes[1]->feature.rect[0].weight,
        nodes[0]->feature.rect[0].weight);

    __m256 sum = _mm256_mul_ps(offset, weight);

    offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset),
        calc_sumf(nodes[6]->feature.rect[1], p_offset),
        calc_sumf(nodes[5]->feature.rect[1], p_offset),
        calc_sumf(nodes[4]->feature.rect[1], p_offset),
        calc_sumf(nodes[3]->feature.rect[1], p_offset),
        calc_sumf(nodes[2]->feature.rect[1], p_offset),
        calc_sumf(nodes[1]->feature.rect[1], p_offset),
        calc_sumf(nodes[0]->feature.rect[1], p_offset));

    weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
        nodes[6]->feature.rect[1].weight,
        nodes[5]->feature.rect[1].weight,
        nodes[4]->feature.rect[1].weight,
        nodes[3]->feature.rect[1].weight,
        nodes[2]->feature.rect[1].weight,
        nodes[1]->feature.rect[1].weight,
        nodes[0]->feature.rect[1].weight);

    sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight));

    if (nodes[0]->feature.rect[2].p0)
        tmp[0] = calc_sumf(nodes[0]->feature.rect[2], p_offset) * nodes[0]->feature.rect[2].weight;
    if (nodes[1]->feature.rect[2].p0)
        tmp[1] = calc_sumf(nodes[1]->feature.rect[2], p_offset) * nodes[1]->feature.rect[2].weight;
    if (nodes[2]->feature.rect[2].p0)
        tmp[2] = calc_sumf(nodes[2]->feature.rect[2], p_offset) * nodes[2]->feature.rect[2].weight;
    if (nodes[3]->feature.rect[2].p0)
        tmp[3] = calc_sumf(nodes[3]->feature.rect[2], p_offset) * nodes[3]->feature.rect[2].weight;
    if (nodes[4]->feature.rect[2].p0)
        tmp[4] = calc_sumf(nodes[4]->feature.rect[2], p_offset) * nodes[4]->feature.rect[2].weight;
    if (nodes[5]->feature.rect[2].p0)
        tmp[5] = calc_sumf(nodes[5]->feature.rect[2], p_offset) * nodes[5]->feature.rect[2].weight;
    if (nodes[6]->feature.rect[2].p0)
        tmp[6] = calc_sumf(nodes[6]->feature.rect[2], p_offset) * nodes[6]->feature.rect[2].weight;
    if (nodes[7]->feature.rect[2].p0)
        tmp[7] = calc_sumf(nodes[7]->feature.rect[2], p_offset) * nodes[7]->feature.rect[2].weight;

    sum = _mm256_add_ps(sum, _mm256_load_ps(tmp));

    __m256 alpha0 = _mm256_set_ps(classifier[7].alpha[0],
        classifier[6].alpha[0],
        classifier[5].alpha[0],
        classifier[4].alpha[0],
        classifier[3].alpha[0],
        classifier[2].alpha[0],
        classifier[1].alpha[0],
        classifier[0].alpha[0]);
    __m256 alpha1 = _mm256_set_ps(classifier[7].alpha[1],
        classifier[6].alpha[1],
        classifier[5].alpha[1],
        classifier[4].alpha[1],
        classifier[3].alpha[1],
        classifier[2].alpha[1],
        classifier[1].alpha[1],
        classifier[0].alpha[1]);

    __m256 outBuf = _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ));
    outBuf = _mm256_hadd_ps(outBuf, outBuf);
    outBuf = _mm256_hadd_ps(outBuf, outBuf);
    _mm256_store_ps(tmp, outBuf);
    return (tmp[0] + tmp[4]);
}

double icvEvalHidHaarStumpClassifierTwoRectAVX(CvHidHaarClassifier* classifier,
    double variance_norm_factor, size_t p_offset)
{
    float CV_DECL_ALIGNED(32) buf[8];
    CvHidHaarTreeNode* nodes[8];
    nodes[0] = classifier[0].node;
    nodes[1] = classifier[1].node;
    nodes[2] = classifier[2].node;
    nodes[3] = classifier[3].node;
    nodes[4] = classifier[4].node;
    nodes[5] = classifier[5].node;
    nodes[6] = classifier[6].node;
    nodes[7] = classifier[7].node;

    __m256 t = _mm256_set1_ps(static_cast<float>(variance_norm_factor));
    t = _mm256_mul_ps(t, _mm256_set_ps(nodes[7]->threshold,
        nodes[6]->threshold,
        nodes[5]->threshold,
        nodes[4]->threshold,
        nodes[3]->threshold,
        nodes[2]->threshold,
        nodes[1]->threshold,
        nodes[0]->threshold));

    __m256 offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[0], p_offset),
        calc_sumf(nodes[6]->feature.rect[0], p_offset),
        calc_sumf(nodes[5]->feature.rect[0], p_offset),
        calc_sumf(nodes[4]->feature.rect[0], p_offset),
        calc_sumf(nodes[3]->feature.rect[0], p_offset),
        calc_sumf(nodes[2]->feature.rect[0], p_offset),
        calc_sumf(nodes[1]->feature.rect[0], p_offset),
        calc_sumf(nodes[0]->feature.rect[0], p_offset));

    __m256 weight = _mm256_set_ps(nodes[7]->feature.rect[0].weight,
        nodes[6]->feature.rect[0].weight,
        nodes[5]->feature.rect[0].weight,
        nodes[4]->feature.rect[0].weight,
        nodes[3]->feature.rect[0].weight,
        nodes[2]->feature.rect[0].weight,
        nodes[1]->feature.rect[0].weight,
        nodes[0]->feature.rect[0].weight);

    __m256 sum = _mm256_mul_ps(offset, weight);

    offset = _mm256_set_ps(calc_sumf(nodes[7]->feature.rect[1], p_offset),
        calc_sumf(nodes[6]->feature.rect[1], p_offset),
        calc_sumf(nodes[5]->feature.rect[1], p_offset),
        calc_sumf(nodes[4]->feature.rect[1], p_offset),
        calc_sumf(nodes[3]->feature.rect[1], p_offset),
        calc_sumf(nodes[2]->feature.rect[1], p_offset),
        calc_sumf(nodes[1]->feature.rect[1], p_offset),
        calc_sumf(nodes[0]->feature.rect[1], p_offset));

    weight = _mm256_set_ps(nodes[7]->feature.rect[1].weight,
        nodes[6]->feature.rect[1].weight,
        nodes[5]->feature.rect[1].weight,
        nodes[4]->feature.rect[1].weight,
        nodes[3]->feature.rect[1].weight,
        nodes[2]->feature.rect[1].weight,
        nodes[1]->feature.rect[1].weight,
        nodes[0]->feature.rect[1].weight);

    sum = _mm256_add_ps(sum, _mm256_mul_ps(offset, weight));

    __m256 alpha0 = _mm256_set_ps(classifier[7].alpha[0],
        classifier[6].alpha[0],
        classifier[5].alpha[0],
        classifier[4].alpha[0],
        classifier[3].alpha[0],
        classifier[2].alpha[0],
        classifier[1].alpha[0],
        classifier[0].alpha[0]);
    __m256 alpha1 = _mm256_set_ps(classifier[7].alpha[1],
        classifier[6].alpha[1],
        classifier[5].alpha[1],
        classifier[4].alpha[1],
        classifier[3].alpha[1],
        classifier[2].alpha[1],
        classifier[1].alpha[1],
        classifier[0].alpha[1]);

    _mm256_store_ps(buf, _mm256_blendv_ps(alpha0, alpha1, _mm256_cmp_ps(t, sum, _CMP_LE_OQ)));
    return (buf[0] + buf[1] + buf[2] + buf[3] + buf[4] + buf[5] + buf[6] + buf[7]);
}

#endif //CV_HAAR_USE_AVX

}

/* End of file. */