dec_sse2.c 35.2 KB
Newer Older
wester committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 version of some decoding functions (idct, loop filtering).
//
// Author: somnath@google.com (Somnath Banerjee)
//         cduvivier@google.com (Christian Duvivier)

#include "./dsp.h"

a  
Kai Westerkamp committed
17 18 19
#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif
wester committed
20

a  
Kai Westerkamp committed
21
#if defined(WEBP_USE_SSE2)
wester committed
22 23

#include <emmintrin.h>
#include <string.h>  // memcpy()
#include "../dec/vp8i.h"
wester committed
25 26 27 28

//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)

a  
Kai Westerkamp committed
29
static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
wester committed
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
  // This implementation makes use of 16-bit fixed point versions of two
  // multiply constants:
  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
  //    K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
  //
  // To be able to use signed 16-bit integers, we use the following trick to
  // have constants within range:
  // - Associated constants are obtained by subtracting the 16-bit fixed point
  //   version of one:
  //      k = K - (1 << 16)  =>  K = k + (1 << 16)
  //      K1 = 85267  =>  k1 =  20091
  //      K2 = 35468  =>  k2 = -30068
  // - The multiplication of a variable by a constant become the sum of the
  //   variable and the multiplication of that variable by the associated
  //   constant:
  //      (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x
  const __m128i k1 = _mm_set1_epi16(20091);
  const __m128i k2 = _mm_set1_epi16(-30068);
  __m128i T0, T1, T2, T3;

  // Load and concatenate the transform coefficients (we'll do two transforms
  // in parallel). In the case of only one transform, the second half of the
  // vectors will just contain random value we'll never use nor store.
  __m128i in0, in1, in2, in3;
  {
a  
Kai Westerkamp committed
55 56 57 58
    in0 = _mm_loadl_epi64((__m128i*)&in[0]);
    in1 = _mm_loadl_epi64((__m128i*)&in[4]);
    in2 = _mm_loadl_epi64((__m128i*)&in[8]);
    in3 = _mm_loadl_epi64((__m128i*)&in[12]);
wester committed
59 60 61 62 63
    // a00 a10 a20 a30   x x x x
    // a01 a11 a21 a31   x x x x
    // a02 a12 a22 a32   x x x x
    // a03 a13 a23 a33   x x x x
    if (do_two) {
a  
Kai Westerkamp committed
64 65 66 67
      const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]);
      const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]);
      const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]);
      const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]);
wester committed
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
      in0 = _mm_unpacklo_epi64(in0, inB0);
      in1 = _mm_unpacklo_epi64(in1, inB1);
      in2 = _mm_unpacklo_epi64(in2, inB2);
      in3 = _mm_unpacklo_epi64(in3, inB3);
      // a00 a10 a20 a30   b00 b10 b20 b30
      // a01 a11 a21 a31   b01 b11 b21 b31
      // a02 a12 a22 a32   b02 b12 b22 b32
      // a03 a13 a23 a33   b03 b13 b23 b33
    }
  }

  // Vertical pass and subsequent transpose.
  {
    // First pass, c and d calculations are longer because of the "trick"
    // multiplications.
    const __m128i a = _mm_add_epi16(in0, in2);
    const __m128i b = _mm_sub_epi16(in0, in2);
    // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
    const __m128i c1 = _mm_mulhi_epi16(in1, k2);
    const __m128i c2 = _mm_mulhi_epi16(in3, k1);
    const __m128i c3 = _mm_sub_epi16(in1, in3);
    const __m128i c4 = _mm_sub_epi16(c1, c2);
    const __m128i c = _mm_add_epi16(c3, c4);
    // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
    const __m128i d1 = _mm_mulhi_epi16(in1, k1);
    const __m128i d2 = _mm_mulhi_epi16(in3, k2);
    const __m128i d3 = _mm_add_epi16(in1, in3);
    const __m128i d4 = _mm_add_epi16(d1, d2);
    const __m128i d = _mm_add_epi16(d3, d4);

    // Second pass.
    const __m128i tmp0 = _mm_add_epi16(a, d);
    const __m128i tmp1 = _mm_add_epi16(b, c);
    const __m128i tmp2 = _mm_sub_epi16(b, c);
    const __m128i tmp3 = _mm_sub_epi16(a, d);

    // Transpose the two 4x4.
a  
Kai Westerkamp committed
105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
    const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30 a01 a11 a21 a31
    // b00 b10 b20 b30 b01 b11 b21 b31
    // a02 a12 a22 a32 a03 a13 a23 a33
    // b02 b12 a22 b32 b03 b13 b23 b33
    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
wester committed
133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166
  }

  // Horizontal pass and subsequent transpose.
  {
    // First pass, c and d calculations are longer because of the "trick"
    // multiplications.
    const __m128i four = _mm_set1_epi16(4);
    const __m128i dc = _mm_add_epi16(T0, four);
    const __m128i a =  _mm_add_epi16(dc, T2);
    const __m128i b =  _mm_sub_epi16(dc, T2);
    // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
    const __m128i c1 = _mm_mulhi_epi16(T1, k2);
    const __m128i c2 = _mm_mulhi_epi16(T3, k1);
    const __m128i c3 = _mm_sub_epi16(T1, T3);
    const __m128i c4 = _mm_sub_epi16(c1, c2);
    const __m128i c = _mm_add_epi16(c3, c4);
    // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
    const __m128i d1 = _mm_mulhi_epi16(T1, k1);
    const __m128i d2 = _mm_mulhi_epi16(T3, k2);
    const __m128i d3 = _mm_add_epi16(T1, T3);
    const __m128i d4 = _mm_add_epi16(d1, d2);
    const __m128i d = _mm_add_epi16(d3, d4);

    // Second pass.
    const __m128i tmp0 = _mm_add_epi16(a, d);
    const __m128i tmp1 = _mm_add_epi16(b, c);
    const __m128i tmp2 = _mm_sub_epi16(b, c);
    const __m128i tmp3 = _mm_sub_epi16(a, d);
    const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);
    const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);
    const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);
    const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);

    // Transpose the two 4x4.
a  
Kai Westerkamp committed
167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
    const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30 a01 a11 a21 a31
    // b00 b10 b20 b30 b01 b11 b21 b31
    // a02 a12 a22 a32 a03 a13 a23 a33
    // b02 b12 a22 b32 b03 b13 b23 b33
    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
wester committed
195 196 197 198 199 200 201 202 203
  }

  // Add inverse transform to 'dst' and store.
  {
    const __m128i zero = _mm_setzero_si128();
    // Load the reference(s).
    __m128i dst0, dst1, dst2, dst3;
    if (do_two) {
      // Load eight bytes/pixels per line.
a  
Kai Westerkamp committed
204 205 206 207
      dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]);
      dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]);
      dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]);
      dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]);
wester committed
208 209
    } else {
      // Load four bytes/pixels per line.
a  
Kai Westerkamp committed
210 211 212 213
      dst0 = _mm_cvtsi32_si128(*(int*)&dst[0 * BPS]);
      dst1 = _mm_cvtsi32_si128(*(int*)&dst[1 * BPS]);
      dst2 = _mm_cvtsi32_si128(*(int*)&dst[2 * BPS]);
      dst3 = _mm_cvtsi32_si128(*(int*)&dst[3 * BPS]);
wester committed
214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232
    }
    // Convert to 16b.
    dst0 = _mm_unpacklo_epi8(dst0, zero);
    dst1 = _mm_unpacklo_epi8(dst1, zero);
    dst2 = _mm_unpacklo_epi8(dst2, zero);
    dst3 = _mm_unpacklo_epi8(dst3, zero);
    // Add the inverse transform(s).
    dst0 = _mm_add_epi16(dst0, T0);
    dst1 = _mm_add_epi16(dst1, T1);
    dst2 = _mm_add_epi16(dst2, T2);
    dst3 = _mm_add_epi16(dst3, T3);
    // Unsigned saturate to 8b.
    dst0 = _mm_packus_epi16(dst0, dst0);
    dst1 = _mm_packus_epi16(dst1, dst1);
    dst2 = _mm_packus_epi16(dst2, dst2);
    dst3 = _mm_packus_epi16(dst3, dst3);
    // Store the results.
    if (do_two) {
      // Store eight bytes/pixels per line.
a  
Kai Westerkamp committed
233 234 235 236
      _mm_storel_epi64((__m128i*)&dst[0 * BPS], dst0);
      _mm_storel_epi64((__m128i*)&dst[1 * BPS], dst1);
      _mm_storel_epi64((__m128i*)&dst[2 * BPS], dst2);
      _mm_storel_epi64((__m128i*)&dst[3 * BPS], dst3);
wester committed
237 238
    } else {
      // Store four bytes/pixels per line.
a  
Kai Westerkamp committed
239 240 241 242
      *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(dst0);
      *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(dst1);
      *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(dst2);
      *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(dst3);
wester committed
243 244 245 246 247 248 249 250 251 252 253 254
    }
  }
}

//------------------------------------------------------------------------------
// Loop Filter (Paragraph 15)

// Per-byte unsigned absolute difference |p - q|. One of the two saturating
// subtractions is always zero, so OR-ing them keeps the non-zero one.
// Both arguments are evaluated twice.
#define MM_ABS(p, q)                                                           \
  _mm_or_si128(_mm_subs_epu8((p), (q)), _mm_subs_epu8((q), (p)))

a  
Kai Westerkamp committed
255 256 257 258 259 260 261 262 263 264 265 266 267 268
// Arithmetic right shift by N bits of every signed byte of "a", in place
// ("a" must be a modifiable __m128i lvalue).
//
// SSE2 has no 8-bit arithmetic shift, so each 16-bit lane is handled in two
// halves: the low byte is moved to the top of the lane, shifted with sign
// extension and moved back down (its upper byte ends up zero); the high byte
// is shifted by N + 8 so the result lands in the low byte, then moved back to
// the top. The two halves are finally OR-ed together.
// The brace form (not do/while) is kept so existing call sites are unaffected.
#define SIGNED_SHIFT_N(a, N) {                                                 \
  __m128i t = (a);                                                             \
  t = _mm_slli_epi16(t, 8);                                                    \
  t = _mm_srai_epi16(t, (N));                                                  \
  t = _mm_srli_epi16(t, 8);                                                    \
                                                                               \
  (a) = _mm_srai_epi16((a), (N) + 8);                                          \
  (a) = _mm_slli_epi16((a), 8);                                                \
                                                                               \
  (a) = _mm_or_si128(t, (a));                                                  \
}

// Toggles the top bit of every byte of "a" and "b" by XOR-ing them with
// 'sign_bit', a __m128i that must be in scope at the point of use.
#define FLIP_SIGN_BIT2(a, b) {                                                 \
  (a) = _mm_xor_si128(sign_bit, (a));                                          \
  (b) = _mm_xor_si128(sign_bit, (b));                                          \
}

// Same as FLIP_SIGN_BIT2, for four vectors.
#define FLIP_SIGN_BIT4(a, b, c, d) {                                           \
  FLIP_SIGN_BIT2(c, d);                                                        \
  FLIP_SIGN_BIT2(a, b);                                                        \
}

a  
Kai Westerkamp committed
281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314
// Sets each byte of "not_hev" to 0xff where both abs(p1 - p0) and
// abs(q1 - q0) are <= hev_thresh (unsigned compare), and to 0x00 elsewhere.
// Only the low 8 bits of hev_thresh are used.
#define GET_NOTHEV(p1, p0, q0, q1, hev_thresh, not_hev) {                      \
  const __m128i limit = _mm_set1_epi8(hev_thresh);                             \
  const __m128i diff_p = MM_ABS(p1, p0);                                       \
  const __m128i diff_q = MM_ABS(q1, q0);                                       \
  const __m128i worst = _mm_max_epu8(diff_p, diff_q);                          \
  /* Saturating subtraction leaves 0 exactly where worst <= limit. */          \
  not_hev = _mm_cmpeq_epi8(_mm_subs_epu8(worst, limit),                        \
                           _mm_setzero_si128());                               \
}

// Computes o = (p1 - q1) + 3 * (q0 - p0) on signed bytes. Every subtraction
// and each of the three additions saturates to [-128, 127] individually, so
// the order of the steps matters.
#define GET_BASE_DELTA(p1, p0, q0, q1, o) {                                    \
  const __m128i step = _mm_subs_epi8(q0, p0);  /* q0 - p0 */                   \
  o = _mm_subs_epi8(p1, q1);                   /* p1 - q1 */                   \
  o = _mm_adds_epi8(o, step);                  /* ... + 1 * (q0 - p0) */       \
  o = _mm_adds_epi8(o, step);                  /* ... + 2 * (q0 - p0) */       \
  o = _mm_adds_epi8(o, step);                  /* ... + 3 * (q0 - p0) */       \
}

// Applies the filter value "fl" to the signed pixels p0 and q0, in place:
//   q0 -= (fl + 4) >> 3   and   p0 += (fl + 3) >> 3
// where every addition/subtraction is an 8-bit signed saturating one and the
// shift is the per-byte arithmetic shift done by SIGNED_SHIFT_N.
#define DO_SIMPLE_FILTER(p0, q0, fl) {                                         \
  const __m128i three = _mm_set1_epi8(3);                                      \
  const __m128i four = _mm_set1_epi8(4);                                       \
  __m128i v3 = _mm_adds_epi8(fl, three);                                       \
  __m128i v4 = _mm_adds_epi8(fl, four);                                        \
                                                                               \
  /* Do +4 side */                                                             \
  SIGNED_SHIFT_N(v4, 3);                /* v4 >> 3  */                         \
  q0 = _mm_subs_epi8(q0, v4);           /* q0 -= v4 */                         \
                                                                               \
  /* Now do +3 side */                                                         \
  SIGNED_SHIFT_N(v3, 3);                /* v3 >> 3  */                         \
  p0 = _mm_adds_epi8(p0, v3);           /* p0 += v3 */                         \
}

// Updates values of 2 pixels at MB edge during complex filtering.
// a_lo and a_hi hold 16-bit filter values for the low and high eight pixels.
// Update operations (8-bit signed saturating):
//   pi = pi + delta and qi = qi - delta,
// where delta = [(a_hi >> 7), (a_lo >> 7)] packed to signed bytes with
// saturation.
#define UPDATE_2PIXELS(pi, qi, a_lo, a_hi) {                                   \
  const __m128i a_lo7 = _mm_srai_epi16(a_lo, 7);                               \
  const __m128i a_hi7 = _mm_srai_epi16(a_hi, 7);                               \
  const __m128i delta = _mm_packs_epi16(a_lo7, a_hi7);                         \
  pi = _mm_adds_epi8(pi, delta);                                               \
  qi = _mm_subs_epi8(qi, delta);                                               \
}

a  
Kai Westerkamp committed
328 329 330 331 332 333 334 335 336 337 338 339 340 341
// Computes the per-byte filtering mask: '*mask' is set to 0xff where
//   2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= thresh
// and to 0x00 elsewhere. The sum is built with unsigned saturating additions
// and only the low 8 bits of 'thresh' are used.
static void NeedsFilter(const __m128i* p1, const __m128i* p0, const __m128i* q0,
                        const __m128i* q1, int thresh, __m128i *mask) {
  // Clearing bit 0 of every byte before the 16-bit shift keeps the high
  // byte's lsb from leaking into the top bit of the low byte.
  const __m128i lsb_off = _mm_set1_epi8((char)0xFE);
  const __m128i limit = _mm_set1_epi8((char)thresh);
  __m128i half_p1q1 = MM_ABS(*p1, *q1);            // abs(p1 - q1)
  __m128i sum = MM_ABS(*p0, *q0);                  // abs(p0 - q0)

  half_p1q1 = _mm_and_si128(half_p1q1, lsb_off);
  half_p1q1 = _mm_srli_epi16(half_p1q1, 1);        // abs(p1 - q1) / 2

  sum = _mm_adds_epu8(sum, sum);                   // abs(p0 - q0) * 2
  sum = _mm_adds_epu8(sum, half_p1q1);             // ... + abs(p1 - q1) / 2

  sum = _mm_subs_epu8(sum, limit);                 // 0 iff sum <= thresh
  *mask = _mm_cmpeq_epi8(sum, _mm_setzero_si128());
}

//------------------------------------------------------------------------------
// Edge filtering functions

// Applies filter on 2 pixels (p0 and q0)
a  
Kai Westerkamp committed
348 349
static WEBP_INLINE void DoFilter2(const __m128i* p1, __m128i* p0, __m128i* q0,
                                  const __m128i* q1, int thresh) {
wester committed
350 351 352 353 354 355 356
  __m128i a, mask;
  const __m128i sign_bit = _mm_set1_epi8(0x80);
  const __m128i p1s = _mm_xor_si128(*p1, sign_bit);
  const __m128i q1s = _mm_xor_si128(*q1, sign_bit);

  NeedsFilter(p1, p0, q0, q1, thresh, &mask);

a  
Kai Westerkamp committed
357
  // convert to signed values
wester committed
358
  FLIP_SIGN_BIT2(*p0, *q0);
a  
Kai Westerkamp committed
359 360

  GET_BASE_DELTA(p1s, *p0, *q0, q1s, a);
wester committed
361
  a = _mm_and_si128(a, mask);     // mask filter values we don't care about
a  
Kai Westerkamp committed
362 363 364
  DO_SIMPLE_FILTER(*p0, *q0, a);

  // unoffset
wester committed
365 366 367 368
  FLIP_SIGN_BIT2(*p0, *q0);
}

// Applies filter on 4 pixels (p1, p0, q0 and q1)
a  
Kai Westerkamp committed
369 370 371
static WEBP_INLINE void DoFilter4(__m128i* p1, __m128i *p0,
                                  __m128i* q0, __m128i* q1,
                                  const __m128i* mask, int hev_thresh) {
wester committed
372 373
  __m128i not_hev;
  __m128i t1, t2, t3;
a  
Kai Westerkamp committed
374
  const __m128i sign_bit = _mm_set1_epi8(0x80);
wester committed
375 376

  // compute hev mask
a  
Kai Westerkamp committed
377
  GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev);
wester committed
378 379 380 381 382 383 384 385 386 387 388 389

  // convert to signed values
  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);

  t1 = _mm_subs_epi8(*p1, *q1);        // p1 - q1
  t1 = _mm_andnot_si128(not_hev, t1);  // hev(p1 - q1)
  t2 = _mm_subs_epi8(*q0, *p0);        // q0 - p0
  t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 1 * (q0 - p0)
  t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 2 * (q0 - p0)
  t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 3 * (q0 - p0)
  t1 = _mm_and_si128(t1, *mask);       // mask filter values we don't care about

a  
Kai Westerkamp committed
390 391 392 393 394 395 396 397 398 399 400
  // Do +4 side
  t2 = _mm_set1_epi8(4);
  t2 = _mm_adds_epi8(t1, t2);        // 3 * (q0 - p0) + (p1 - q1) + 4
  SIGNED_SHIFT_N(t2, 3);             // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
  t3 = t2;                           // save t2
  *q0 = _mm_subs_epi8(*q0, t2);      // q0 -= t2

  // Now do +3 side
  t2 = _mm_set1_epi8(3);
  t2 = _mm_adds_epi8(t1, t2);        // +3 instead of +4
  SIGNED_SHIFT_N(t2, 3);             // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
wester committed
401 402
  *p0 = _mm_adds_epi8(*p0, t2);      // p0 += t2

a  
Kai Westerkamp committed
403 404 405
  t2 = _mm_set1_epi8(1);
  t3 = _mm_adds_epi8(t3, t2);
  SIGNED_SHIFT_N(t3, 1);             // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 4
wester committed
406 407 408 409

  t3 = _mm_and_si128(not_hev, t3);   // if !hev
  *q1 = _mm_subs_epi8(*q1, t3);      // q1 -= t3
  *p1 = _mm_adds_epi8(*p1, t3);      // p1 += t3
a  
Kai Westerkamp committed
410 411 412

  // unoffset
  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
wester committed
413 414 415
}

// Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
a  
Kai Westerkamp committed
416 417 418
static WEBP_INLINE void DoFilter6(__m128i *p2, __m128i* p1, __m128i *p0,
                                  __m128i* q0, __m128i* q1, __m128i *q2,
                                  const __m128i* mask, int hev_thresh) {
wester committed
419
  __m128i a, not_hev;
a  
Kai Westerkamp committed
420
  const __m128i sign_bit = _mm_set1_epi8(0x80);
wester committed
421 422

  // compute hev mask
a  
Kai Westerkamp committed
423
  GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev);
wester committed
424

a  
Kai Westerkamp committed
425
  // convert to signed values
wester committed
426 427
  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
  FLIP_SIGN_BIT2(*p2, *q2);
a  
Kai Westerkamp committed
428 429

  GET_BASE_DELTA(*p1, *p0, *q0, *q1, a);
wester committed
430 431 432 433

  { // do simple filter on pixels with hev
    const __m128i m = _mm_andnot_si128(not_hev, *mask);
    const __m128i f = _mm_and_si128(a, m);
a  
Kai Westerkamp committed
434
    DO_SIMPLE_FILTER(*p0, *q0, f);
wester committed
435 436
  }
  { // do strong filter on pixels with not hev
a  
Kai Westerkamp committed
437 438 439
    const __m128i zero = _mm_setzero_si128();
    const __m128i nine = _mm_set1_epi16(0x0900);
    const __m128i sixty_three = _mm_set1_epi16(63);
wester committed
440 441 442 443 444 445

    const __m128i m = _mm_and_si128(not_hev, *mask);
    const __m128i f = _mm_and_si128(a, m);
    const __m128i f_lo = _mm_unpacklo_epi8(zero, f);
    const __m128i f_hi = _mm_unpackhi_epi8(zero, f);

a  
Kai Westerkamp committed
446 447 448 449
    const __m128i f9_lo = _mm_mulhi_epi16(f_lo, nine);   // Filter (lo) * 9
    const __m128i f9_hi = _mm_mulhi_epi16(f_hi, nine);   // Filter (hi) * 9
    const __m128i f18_lo = _mm_add_epi16(f9_lo, f9_lo);  // Filter (lo) * 18
    const __m128i f18_hi = _mm_add_epi16(f9_hi, f9_hi);  // Filter (hi) * 18
wester committed
450

a  
Kai Westerkamp committed
451 452
    const __m128i a2_lo = _mm_add_epi16(f9_lo, sixty_three);  // Filter * 9 + 63
    const __m128i a2_hi = _mm_add_epi16(f9_hi, sixty_three);  // Filter * 9 + 63
wester committed
453

a  
Kai Westerkamp committed
454 455
    const __m128i a1_lo = _mm_add_epi16(f18_lo, sixty_three);  // F... * 18 + 63
    const __m128i a1_hi = _mm_add_epi16(f18_hi, sixty_three);  // F... * 18 + 63
wester committed
456

a  
Kai Westerkamp committed
457 458
    const __m128i a0_lo = _mm_add_epi16(f18_lo, a2_lo);  // Filter * 27 + 63
    const __m128i a0_hi = _mm_add_epi16(f18_hi, a2_hi);  // Filter * 27 + 63
wester committed
459

a  
Kai Westerkamp committed
460 461 462
    UPDATE_2PIXELS(*p2, *q2, a2_lo, a2_hi);
    UPDATE_2PIXELS(*p1, *q1, a1_lo, a1_hi);
    UPDATE_2PIXELS(*p0, *q0, a0_lo, a0_hi);
wester committed
463
  }
a  
Kai Westerkamp committed
464 465 466 467

  // unoffset
  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
  FLIP_SIGN_BIT2(*p2, *q2);
wester committed
468 469 470
}

// reads 8 rows across a vertical edge.
a  
Kai Westerkamp committed
471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506
//
// TODO(somnath): Investigate _mm_shuffle* also see if it can be broken into
// two Load4x4() to avoid code duplication.
static WEBP_INLINE void Load8x4(const uint8_t* b, int stride,
                                __m128i* p, __m128i* q) {
  __m128i t1, t2;

  // Load 0th, 1st, 4th and 5th rows
  __m128i r0 =  _mm_cvtsi32_si128(*((int*)&b[0 * stride]));  // 03 02 01 00
  __m128i r1 =  _mm_cvtsi32_si128(*((int*)&b[1 * stride]));  // 13 12 11 10
  __m128i r4 =  _mm_cvtsi32_si128(*((int*)&b[4 * stride]));  // 43 42 41 40
  __m128i r5 =  _mm_cvtsi32_si128(*((int*)&b[5 * stride]));  // 53 52 51 50

  r0 = _mm_unpacklo_epi32(r0, r4);               // 43 42 41 40 03 02 01 00
  r1 = _mm_unpacklo_epi32(r1, r5);               // 53 52 51 50 13 12 11 10

  // t1 = 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
  t1 = _mm_unpacklo_epi8(r0, r1);

  // Load 2nd, 3rd, 6th and 7th rows
  r0 =  _mm_cvtsi32_si128(*((int*)&b[2 * stride]));          // 23 22 21 22
  r1 =  _mm_cvtsi32_si128(*((int*)&b[3 * stride]));          // 33 32 31 30
  r4 =  _mm_cvtsi32_si128(*((int*)&b[6 * stride]));          // 63 62 61 60
  r5 =  _mm_cvtsi32_si128(*((int*)&b[7 * stride]));          // 73 72 71 70

  r0 = _mm_unpacklo_epi32(r0, r4);               // 63 62 61 60 23 22 21 20
  r1 = _mm_unpacklo_epi32(r1, r5);               // 73 72 71 70 33 32 31 30

  // t2 = 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
  t2 = _mm_unpacklo_epi8(r0, r1);

  // t1 = 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
  // t2 = 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
  r0 = t1;
  t1 = _mm_unpacklo_epi16(t1, t2);
  t2 = _mm_unpackhi_epi16(r0, t2);
wester committed
507 508 509

  // *p = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
  // *q = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
a  
Kai Westerkamp committed
510 511
  *p = _mm_unpacklo_epi32(t1, t2);
  *q = _mm_unpackhi_epi32(t1, t2);
wester committed
512 513
}

a  
Kai Westerkamp committed
514
// Reads 16 rows of 4 bytes across a vertical edge (8 rows starting at 'r0'
// and 8 rows starting at 'r8', all 'stride' bytes apart) and transposes them
// so that each output vector holds one column for the 16 rows.
static WEBP_INLINE void Load16x4(const uint8_t* r0, const uint8_t* r8,
                                 int stride,
                                 __m128i* p1, __m128i* p0,
                                 __m128i* q0, __m128i* q1) {
  __m128i t1, t2;
  // Assume the pixels around the edge (|) are numbered as follows
  //                00 01 | 02 03
  //                10 11 | 12 13
  //                 ...  |  ...
  //                e0 e1 | e2 e3
  //                f0 f1 | f2 f3
  //
  // r0 is pointing to the 0th row (00)
  // r8 is pointing to the 8th row (80)

  // Load
  // p1 = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
  // q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
  // p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
  // q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
  Load8x4(r0, stride, p1, q0);
  Load8x4(r8, stride, p0, q1);

  // Regroup the 64-bit halves so that each vector holds a single column.
  t1 = *p1;
  t2 = *q0;
  // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
  // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
  // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
  // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
  *p1 = _mm_unpacklo_epi64(t1, *p0);
  *p0 = _mm_unpackhi_epi64(t1, *p0);
  *q0 = _mm_unpacklo_epi64(t2, *q1);
  *q1 = _mm_unpackhi_epi64(t2, *q1);
}

a  
Kai Westerkamp committed
549
// Writes the 16 bytes of '*x' as 4 rows of 4 bytes: bytes 0..3 go to 'dst',
// bytes 4..7 to 'dst + stride', and so on. '*x' is consumed in the process
// (it is shifted right by 4 bytes after each row).
static WEBP_INLINE void Store4x4(__m128i* x, uint8_t* dst, int stride) {
  int i;
  for (i = 0; i < 4; ++i, dst += stride) {
    // memcpy() instead of a cast to 'int32_t*': 'dst' has no guaranteed
    // 4-byte alignment and pixel bytes must not be written through an
    // 'int32_t' lvalue.
    const int32_t four_bytes = _mm_cvtsi128_si32(*x);
    memcpy(dst, &four_bytes, sizeof(four_bytes));
    *x = _mm_srli_si128(*x, 4);
  }
}

// Transpose back and store
a  
Kai Westerkamp committed
558 559 560 561
static WEBP_INLINE void Store16x4(uint8_t* r0, uint8_t* r8, int stride,
                                  __m128i* p1, __m128i* p0,
                                  __m128i* q0, __m128i* q1) {
  __m128i t1;
wester committed
562 563 564 565

  // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
  // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
  t1 = *p0;
a  
Kai Westerkamp committed
566 567
  *p0 = _mm_unpacklo_epi8(*p1, t1);
  *p1 = _mm_unpackhi_epi8(*p1, t1);
wester committed
568 569 570 571

  // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
  // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
  t1 = *q0;
a  
Kai Westerkamp committed
572 573
  *q0 = _mm_unpacklo_epi8(t1, *q1);
  *q1 = _mm_unpackhi_epi8(t1, *q1);
wester committed
574 575 576

  // p0 = 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
  // q0 = 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
a  
Kai Westerkamp committed
577 578 579
  t1 = *p0;
  *p0 = _mm_unpacklo_epi16(t1, *q0);
  *q0 = _mm_unpackhi_epi16(t1, *q0);
wester committed
580 581 582

  // p1 = b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
  // q1 = f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
a  
Kai Westerkamp committed
583 584 585
  t1 = *p1;
  *p1 = _mm_unpacklo_epi16(t1, *q1);
  *q1 = _mm_unpackhi_epi16(t1, *q1);
wester committed
586

a  
Kai Westerkamp committed
587
  Store4x4(p0, r0, stride);
wester committed
588
  r0 += 4 * stride;
a  
Kai Westerkamp committed
589
  Store4x4(q0, r0, stride);
wester committed
590

a  
Kai Westerkamp committed
591
  Store4x4(p1, r8, stride);
wester committed
592
  r8 += 4 * stride;
a  
Kai Westerkamp committed
593
  Store4x4(q1, r8, stride);
wester committed
594 595 596 597 598
}

//------------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)

a  
Kai Westerkamp committed
599
// Simple filter across the edge between the rows at p[-stride] and p[0],
// over 16 consecutive pixels. The two rows above and the two rows
// from 'p' down are read; only p[-stride] and p[0] rows are written.
static void SimpleVFilter16SSE2(uint8_t* p, int stride, int thresh) {
  // Load
  __m128i p1 = _mm_loadu_si128((const __m128i*)&p[-2 * stride]);
  __m128i p0 = _mm_loadu_si128((const __m128i*)&p[-stride]);
  __m128i q0 = _mm_loadu_si128((const __m128i*)&p[0]);
  __m128i q1 = _mm_loadu_si128((const __m128i*)&p[stride]);

  DoFilter2(&p1, &p0, &q0, &q1, thresh);

  // Store
  _mm_storeu_si128((__m128i*)&p[-stride], p0);
  _mm_storeu_si128((__m128i*)p, q0);
}

a  
Kai Westerkamp committed
613
// Simple filter across the edge between the columns at p[-1] and p[0],
// over 16 consecutive rows. The 16x4 area starting two pixels to the left of
// 'p' is loaded transposed, filtered, and written back in full.
static void SimpleHFilter16SSE2(uint8_t* p, int stride, int thresh) {
  __m128i p1, p0, q0, q1;

  p -= 2;  // beginning of p1

  Load16x4(p, p + 8 * stride,  stride, &p1, &p0, &q0, &q1);
  DoFilter2(&p1, &p0, &q0, &q1, thresh);
  Store16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
}

a  
Kai Westerkamp committed
623
// Applies SimpleVFilter16SSE2() to the three edges located 4, 8 and 12 rows
// below 'p'.
static void SimpleVFilter16iSSE2(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4 * stride;
    SimpleVFilter16SSE2(p, stride, thresh);
  }
}

a  
Kai Westerkamp committed
631
// Applies SimpleHFilter16SSE2() to the three edges located 4, 8 and 12
// columns to the right of 'p'.
static void SimpleHFilter16iSSE2(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4;
    SimpleHFilter16SSE2(p, stride, thresh);
  }
}

//------------------------------------------------------------------------------
// Complex In-loop filtering (Paragraph 15.3)

a  
Kai Westerkamp committed
642 643
// Initializes 'm' with the byte-wise unsigned maximum of MM_ABS(p3, p2),
// MM_ABS(p2, p1) and MM_ABS(p1, p0). Any previous content of 'm' is discarded.
// MM_ABS is defined earlier in this file (presumably a per-byte absolute
// difference -- confirm there). 'm' must be a modifiable __m128i lvalue.
// Wrapped in do { } while (0) so the macro behaves as a single statement,
// including inside an unbraced if/else.
#define MAX_DIFF1(p3, p2, p1, p0, m) do {                                      \
  m = MM_ABS(p3, p2);                                                          \
  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                         \
  m = _mm_max_epu8(m, MM_ABS(p1, p0));                                         \
} while (0)

// Accumulating variant of MAX_DIFF1: folds MM_ABS(p3, p2), MM_ABS(p2, p1) and
// MM_ABS(p1, p0) into the value already held by 'm' using a byte-wise unsigned
// maximum. 'm' must therefore be initialized before use.
// Wrapped in do { } while (0) so the macro behaves as a single statement,
// including inside an unbraced if/else.
#define MAX_DIFF2(p3, p2, p1, p0, m) do {                                      \
  m = _mm_max_epu8(m, MM_ABS(p3, p2));                                         \
  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                         \
  m = _mm_max_epu8(m, MM_ABS(p1, p0));                                         \
} while (0)
wester committed
653 654 655 656 657 658 659 660

// Loads four consecutive 16-byte rows (unaligned) starting at 'p':
//   e1 <- p[0 * stride], e2 <- p[1 * stride],
//   e3 <- p[2 * stride], e4 <- p[3 * stride]
// 'p' and 'stride' are each evaluated four times, so they must be free of
// side effects. 'stride' is parenthesized so that an expression argument
// (e.g. "a + b") is multiplied as a whole.
#define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) do {                          \
  e1 = _mm_loadu_si128((__m128i*)&(p)[0 * (stride)]);                          \
  e2 = _mm_loadu_si128((__m128i*)&(p)[1 * (stride)]);                          \
  e3 = _mm_loadu_si128((__m128i*)&(p)[2 * (stride)]);                          \
  e4 = _mm_loadu_si128((__m128i*)&(p)[3 * (stride)]);                          \
} while (0)

a  
Kai Westerkamp committed
661 662 663 664
// Packs one 8-byte row of each chroma plane into a single register:
// the low 64 bits of 'p' receive the 8 bytes at u[stride], the high 64 bits
// receive the 8 bytes at v[stride]. 'p' must be a modifiable __m128i lvalue.
#define LOADUV_H_EDGE(p, u, v, stride) do {                                    \
  p = _mm_loadl_epi64((__m128i*)&(u)[(stride)]);                               \
  p = _mm_unpacklo_epi64(p, _mm_loadl_epi64((__m128i*)&(v)[(stride)]));        \
} while (0)

// Loads four consecutive packed u/v rows (see LOADUV_H_EDGE) at byte offsets
// 0, stride, 2 * stride and 3 * stride into e1..e4.
// 'stride' is parenthesized so that an expression argument (e.g. "a + b") is
// multiplied as a whole.
#define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) do {                     \
  LOADUV_H_EDGE(e1, u, v, 0 * (stride));                                       \
  LOADUV_H_EDGE(e2, u, v, 1 * (stride));                                       \
  LOADUV_H_EDGE(e3, u, v, 2 * (stride));                                       \
  LOADUV_H_EDGE(e4, u, v, 3 * (stride));                                       \
} while (0)

// Splits a packed u/v register back into the two chroma planes:
// the low 64 bits of 'p' are written to the 8 bytes at u[stride], the high
// 64 bits to the 8 bytes at v[stride].
// 'u' and 'v' are parenthesized so pointer expressions (e.g. "base + off")
// are indexed as a whole. 'p' is only read (twice); it is no longer
// overwritten with its shifted value as a hidden side effect.
#define STOREUV(p, u, v, stride) do {                                          \
  _mm_storel_epi64((__m128i*)&(u)[(stride)], (p));                             \
  _mm_storel_epi64((__m128i*)&(v)[(stride)], _mm_srli_si128((p), 8));          \
} while (0)

a  
Kai Westerkamp committed
679 680 681 682 683 684 685
// Finalizes the filtering mask for the complex loop filter.
// On entry 'mask' holds per-byte values (the callers in this file fill it with
// MAX_DIFF1/MAX_DIFF2). On exit each byte of 'mask' is 0xff where the incoming
// byte was <= ithresh (unsigned compare, via saturating subtraction) AND the
// corresponding byte produced by NeedsFilter() is set; 0x00 where the incoming
// byte exceeded ithresh.
//   p1, p0, q0, q1  __m128i lvalues; their addresses are passed to
//                   NeedsFilter()
//   thresh          forwarded unchanged to NeedsFilter()
//   ithresh         broadcast to every byte; only its low 8 bits are used
//   mask            __m128i lvalue, read and overwritten
// NOTE: the macro declares locals named 'fl_yes' and 'it'; do not pass
// arguments with those names.
// Wrapped in do { } while (0) so the macro behaves as a single statement.
#define COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask) do {            \
  __m128i fl_yes;                                                              \
  const __m128i it = _mm_set1_epi8((char)(ithresh));                           \
  mask = _mm_subs_epu8(mask, it);                                              \
  mask = _mm_cmpeq_epi8(mask, _mm_setzero_si128());                            \
  NeedsFilter(&(p1), &(p0), &(q0), &(q1), (thresh), &fl_yes);                  \
  mask = _mm_and_si128(mask, fl_yes);                                          \
} while (0)

// on macroblock edges
a  
Kai Westerkamp committed
689 690
// Complex in-loop filter for the horizontal edge between row p[-stride] and
// row p[0], 16 pixels wide.
//   p           first pixel of the row just below the edge
//   stride      distance in bytes between two consecutive rows
//   thresh, ithresh, hev_thresh
//               filter thresholds, forwarded to COMPLEX_FL_MASK / DoFilter6()
// Four rows are read on each side of the edge; the three rows closest to the
// edge on each side are written back.
static void VFilter16SSE2(uint8_t* p, int stride,
                          int thresh, int ithresh, int hev_thresh) {
  uint8_t* const top = p - 4 * stride;  // farthest row above the edge (p3)
  __m128i p3, p2, p1, p0;
  __m128i q0, q1, q2, q3;
  __m128i mask;

  // Rows above the edge start the difference mask...
  LOAD_H_EDGES4(top, stride, p3, p2, p1, p0);
  MAX_DIFF1(p3, p2, p1, p0, mask);

  // ...and rows below the edge are accumulated into it.
  LOAD_H_EDGES4(p, stride, q0, q1, q2, q3);
  MAX_DIFF2(q3, q2, q1, q0, mask);

  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

  // p3 and q3 only contribute to the mask and are not stored.
  _mm_storeu_si128((__m128i*)(top + 1 * stride), p2);
  _mm_storeu_si128((__m128i*)(top + 2 * stride), p1);
  _mm_storeu_si128((__m128i*)(top + 3 * stride), p0);
  _mm_storeu_si128((__m128i*)(p + 0 * stride), q0);
  _mm_storeu_si128((__m128i*)(p + 1 * stride), q1);
  _mm_storeu_si128((__m128i*)(p + 2 * stride), q2);
}

a  
Kai Westerkamp committed
715 716
// Complex in-loop filter for the vertical edge between column p[-1] and
// column p[0], 16 rows tall.
//   p           pixel just right of the edge, on the first row
//   stride      distance in bytes between two consecutive rows
//   thresh, ithresh, hev_thresh
//               filter thresholds, forwarded to COMPLEX_FL_MASK / DoFilter6()
// Load16x4()/Store16x4() are defined earlier in this file; each call receives
// a pointer to the first row and one to the ninth row.
static void HFilter16SSE2(uint8_t* p, int stride,
                          int thresh, int ithresh, int hev_thresh) {
  uint8_t* const left = p - 4;                   // four pixels before the edge
  uint8_t* const left_lower = left + 8 * stride;
  uint8_t* const right_lower = p + 8 * stride;
  __m128i p3, p2, p1, p0;
  __m128i q0, q1, q2, q3;
  __m128i mask;

  // Pixels left of the edge start the difference mask...
  Load16x4(left, left_lower, stride, &p3, &p2, &p1, &p0);
  MAX_DIFF1(p3, p2, p1, p0, mask);

  // ...and pixels right of the edge are accumulated into it.
  Load16x4(p, right_lower, stride, &q0, &q1, &q2, &q3);
  MAX_DIFF2(q3, q2, q1, q0, mask);

  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

  // p3 and q3 are not passed to DoFilter6(); they are handed back to
  // Store16x4() as loaded.
  Store16x4(left, left_lower, stride, &p3, &p2, &p1, &p0);
  Store16x4(p, right_lower, stride, &q0, &q1, &q2, &q3);
}

// on three inner edges
a  
Kai Westerkamp committed
735 736
// Complex in-loop filter for three horizontal edges, located 4, 8 and 12 rows
// below 'p', 16 pixels wide. Edges are processed top to bottom, so each edge
// reads the rows already rewritten by the previous one.
//   p           first pixel of the top row of the area
//   stride      distance in bytes between two consecutive rows
//   thresh, ithresh, hev_thresh
//               filter thresholds, forwarded to COMPLEX_FL_MASK / DoFilter4()
static void VFilter16iSSE2(uint8_t* p, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  uint8_t* above = p;  // first of the four rows preceding the current edge
  int edge;

  for (edge = 0; edge < 3; ++edge) {
    uint8_t* const below = above + 4 * stride;  // first row after the edge
    __m128i p3, p2, p1, p0;
    __m128i q0, q1, q2, q3;
    __m128i mask;

    // Four rows above the edge start the difference mask...
    LOAD_H_EDGES4(above, stride, p3, p2, p1, p0);
    MAX_DIFF1(p3, p2, p1, p0, mask);

    // ...four rows below the edge are accumulated into it.
    LOAD_H_EDGES4(below, stride, q0, q1, q2, q3);
    MAX_DIFF2(q3, q2, q1, q0, mask);

    COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);

    // Only the two rows on each side of the edge are written back.
    _mm_storeu_si128((__m128i*)(below - 2 * stride), p1);
    _mm_storeu_si128((__m128i*)(below - 1 * stride), p0);
    _mm_storeu_si128((__m128i*)(below + 0 * stride), q0);
    _mm_storeu_si128((__m128i*)(below + 1 * stride), q1);

    above = below;
  }
}

a  
Kai Westerkamp committed
763 764
// Complex in-loop filter for three vertical edges, located 4, 8 and 12 pixels
// to the right of 'p', 16 rows tall. Edges are processed left to right, so
// each edge reads the pixels already rewritten by the previous one.
//   p           first pixel of the top row of the area
//   stride      distance in bytes between two consecutive rows
//   thresh, ithresh, hev_thresh
//               filter thresholds, forwarded to COMPLEX_FL_MASK / DoFilter4()
// Load16x4()/Store16x4() are defined earlier in this file; each call receives
// a pointer to the first row and one to the ninth row.
static void HFilter16iSSE2(uint8_t* p, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  int offset;

  for (offset = 0; offset < 12; offset += 4) {
    uint8_t* const left = p + offset;  // four pixels before the edge
    uint8_t* const right = left + 4;   // first pixel after the edge
    uint8_t* const mid = left + 2;     // two pixels before the edge
    __m128i p3, p2, p1, p0;
    __m128i q0, q1, q2, q3;
    __m128i mask;

    // Pixels left of the edge start the difference mask...
    Load16x4(left, left + 8 * stride, stride, &p3, &p2, &p1, &p0);
    MAX_DIFF1(p3, p2, p1, p0, mask);

    // ...pixels right of the edge are accumulated into it.
    Load16x4(right, right + 8 * stride, stride, &q0, &q1, &q2, &q3);
    MAX_DIFF2(q3, q2, q1, q0, mask);

    COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);

    // Only the two pixels on each side of the edge are written back.
    Store16x4(mid, mid + 8 * stride, stride, &p1, &p0, &q0, &q1);
  }
}

// 8-pixels wide variant, for chroma filtering
a  
Kai Westerkamp committed
790 791
// Complex in-loop filter for the horizontal edge between rows [-stride] and
// [0] of two 8-pixel-wide planes at once. Each register holds a row of 'u' in
// its low 8 bytes and the matching row of 'v' in its high 8 bytes.
//   u, v        first pixel of the row just below the edge, in each plane
//   stride      distance in bytes between two consecutive rows (both planes)
//   thresh, ithresh, hev_thresh
//               filter thresholds, forwarded to COMPLEX_FL_MASK / DoFilter6()
static void VFilter8SSE2(uint8_t* u, uint8_t* v, int stride,
                         int thresh, int ithresh, int hev_thresh) {
  uint8_t* const u_top = u - 4 * stride;  // farthest row above the edge
  uint8_t* const v_top = v - 4 * stride;
  __m128i p3, p2, p1, p0;
  __m128i q0, q1, q2, q3;
  __m128i mask;

  // Rows above the edge start the difference mask...
  LOADUV_H_EDGES4(u_top, v_top, stride, p3, p2, p1, p0);
  MAX_DIFF1(p3, p2, p1, p0, mask);

  // ...and rows below the edge are accumulated into it.
  LOADUV_H_EDGES4(u, v, stride, q0, q1, q2, q3);
  MAX_DIFF2(q3, q2, q1, q0, mask);

  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

  // Three rows on each side of the edge are written back; p3/q3 are not.
  STOREUV(p2, u_top, v_top, 1 * stride);
  STOREUV(p1, u_top, v_top, 2 * stride);
  STOREUV(p0, u_top, v_top, 3 * stride);
  STOREUV(q0, u, v, 0 * stride);
  STOREUV(q1, u, v, 1 * stride);
  STOREUV(q2, u, v, 2 * stride);
}

a  
Kai Westerkamp committed
815 816
// Complex in-loop filter for the vertical edge between columns [-1] and [0]
// of two planes at once.
//   u, v        pixel just right of the edge on the first row of each plane
//   stride      distance in bytes between two consecutive rows (both planes)
//   thresh, ithresh, hev_thresh
//               filter thresholds, forwarded to COMPLEX_FL_MASK / DoFilter6()
// Load16x4()/Store16x4() are defined earlier in this file; here their two
// pointer arguments are the 'u' and 'v' positions respectively.
static void HFilter8SSE2(uint8_t* u, uint8_t* v, int stride,
                         int thresh, int ithresh, int hev_thresh) {
  uint8_t* const u_left = u - 4;  // four pixels before the edge
  uint8_t* const v_left = v - 4;
  __m128i p3, p2, p1, p0;
  __m128i q0, q1, q2, q3;
  __m128i mask;

  // Pixels left of the edge start the difference mask...
  Load16x4(u_left, v_left, stride, &p3, &p2, &p1, &p0);
  MAX_DIFF1(p3, p2, p1, p0, mask);

  // ...and pixels right of the edge are accumulated into it.
  Load16x4(u, v, stride, &q0, &q1, &q2, &q3);
  MAX_DIFF2(q3, q2, q1, q0, mask);

  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

  // p3 and q3 are not passed to DoFilter6(); they are handed back to
  // Store16x4() as loaded.
  Store16x4(u_left, v_left, stride, &p3, &p2, &p1, &p0);
  Store16x4(u, v, stride, &q0, &q1, &q2, &q3);
}

a  
Kai Westerkamp committed
835 836
// Complex in-loop filter for the horizontal edge located 4 rows below 'u' and
// 'v', for two 8-pixel-wide planes at once. Each register holds a row of 'u'
// in its low 8 bytes and the matching row of 'v' in its high 8 bytes.
//   u, v        first pixel of the top row of the area, in each plane
//   stride      distance in bytes between two consecutive rows (both planes)
//   thresh, ithresh, hev_thresh
//               filter thresholds, forwarded to COMPLEX_FL_MASK / DoFilter4()
static void VFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
                          int thresh, int ithresh, int hev_thresh) {
  uint8_t* const u_below = u + 4 * stride;  // first row after the edge
  uint8_t* const v_below = v + 4 * stride;
  __m128i p3, p2, p1, p0;
  __m128i q0, q1, q2, q3;
  __m128i mask;

  // Four rows above the edge start the difference mask...
  LOADUV_H_EDGES4(u, v, stride, p3, p2, p1, p0);
  MAX_DIFF1(p3, p2, p1, p0, mask);

  // ...four rows below the edge are accumulated into it.
  LOADUV_H_EDGES4(u_below, v_below, stride, q0, q1, q2, q3);
  MAX_DIFF2(q3, q2, q1, q0, mask);

  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);

  // Only the two rows on each side of the edge are written back.
  STOREUV(p1, u_below, v_below, -2 * stride);
  STOREUV(p0, u_below, v_below, -1 * stride);
  STOREUV(q0, u_below, v_below, 0 * stride);
  STOREUV(q1, u_below, v_below, 1 * stride);
}

a  
Kai Westerkamp committed
861 862
// Complex in-loop filter for the vertical edge located 4 pixels to the right
// of 'u' and 'v', for two planes at once.
//   u, v        first pixel of the top row of the area, in each plane
//   stride      distance in bytes between two consecutive rows (both planes)
//   thresh, ithresh, hev_thresh
//               filter thresholds, forwarded to COMPLEX_FL_MASK / DoFilter4()
// Load16x4()/Store16x4() are defined earlier in this file; here their two
// pointer arguments are the 'u' and 'v' positions respectively.
static void HFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
                          int thresh, int ithresh, int hev_thresh) {
  uint8_t* const u_right = u + 4;  // first pixel after the edge
  uint8_t* const v_right = v + 4;
  uint8_t* const u_mid = u + 2;    // two pixels before the edge
  uint8_t* const v_mid = v + 2;
  __m128i p3, p2, p1, p0;
  __m128i q0, q1, q2, q3;
  __m128i mask;

  // Pixels left of the edge start the difference mask...
  Load16x4(u, v, stride, &p3, &p2, &p1, &p0);
  MAX_DIFF1(p3, p2, p1, p0, mask);

  // ...pixels right of the edge are accumulated into it.
  Load16x4(u_right, v_right, stride, &q0, &q1, &q2, &q3);
  MAX_DIFF2(q3, q2, q1, q0, mask);

  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);

  // Only the two pixels on each side of the edge are written back.
  Store16x4(u_mid, v_mid, stride, &p1, &p0, &q0, &q1);
}

a  
Kai Westerkamp committed
881
#endif   // WEBP_USE_SSE2
wester committed
882 883 884 885 886 887

//------------------------------------------------------------------------------
// Entry point

extern void VP8DspInitSSE2(void);

// Points the decoder's DSP function pointers at the SSE2 implementations
// defined in this file. When WEBP_USE_SSE2 is not defined the function is an
// empty stub and leaves every pointer untouched.
void VP8DspInitSSE2(void) {
#ifdef WEBP_USE_SSE2
  // Inverse transform.
  VP8Transform = TransformSSE2;

  // Simple loop filter: macroblock edges, then inner edges.
  VP8SimpleVFilter16 = SimpleVFilter16SSE2;
  VP8SimpleHFilter16 = SimpleHFilter16SSE2;
  VP8SimpleVFilter16i = SimpleVFilter16iSSE2;
  VP8SimpleHFilter16i = SimpleHFilter16iSSE2;

  // Complex loop filter, 16-pixel variants.
  VP8VFilter16 = VFilter16SSE2;
  VP8HFilter16 = HFilter16SSE2;
  VP8VFilter16i = VFilter16iSSE2;
  VP8HFilter16i = HFilter16iSSE2;

  // Complex loop filter, 8-pixel (u/v) variants.
  VP8VFilter8 = VFilter8SSE2;
  VP8HFilter8 = HFilter8SSE2;
  VP8VFilter8i = VFilter8iSSE2;
  VP8HFilter8i = HFilter8iSSE2;
#endif  // WEBP_USE_SSE2
}

#if defined(__cplusplus) || defined(c_plusplus)
}    // extern "C"
wester committed
910
#endif