[libav-devel] [PATCH 8/8] hevcdsp: add x86 SIMD for MC

Anton Khirnov anton at khirnov.net
Wed Aug 19 21:43:23 CEST 2015


---
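Notes:
- The SIMD luma filters always apply the full 8-tap window (including the
  taps that are zero for a given fractional position), so the qpel edge
  extension constants in hevc.c become uniform (3 rows/columns before, 4
  after) and the temporary MC buffer in hevc.h is enlarged to fit the
  intermediate rows produced by the hv wrappers.
- The interpolation coefficients are exported in two layouts: 16-bit
  (ff_hevc_{epel,qpel}_coeffs, used with pmaddwd on 10-bit pixels and 16-bit
  intermediates) and packed 8-bit (ff_hevc_{epel,qpel}_coeffs8, used with
  pmaddubsw on 8-bit pixels).
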
 libavcodec/hevc.c             |   6 +-
 libavcodec/hevc.h             |   2 +-
 libavcodec/hevcdsp.c          |  24 +-
 libavcodec/hevcdsp.h          |   5 +-
 libavcodec/hevcdsp_template.c |   8 +-
 libavcodec/x86/Makefile       |   3 +-
 libavcodec/x86/hevc_mc.asm    | 816 ++++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/hevcdsp_init.c | 405 +++++++++++++++++++++
 8 files changed, 1258 insertions(+), 11 deletions(-)
 create mode 100644 libavcodec/x86/hevc_mc.asm

diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c
index dd54525..9cae92c 100644
--- a/libavcodec/hevc.c
+++ b/libavcodec/hevc.c
@@ -38,9 +38,9 @@
 #include "golomb.h"
 #include "hevc.h"
 
-const uint8_t ff_hevc_qpel_extra_before[4] = { 0, 3, 3, 2 };
-const uint8_t ff_hevc_qpel_extra_after[4]  = { 0, 3, 4, 4 };
-const uint8_t ff_hevc_qpel_extra[4]        = { 0, 6, 7, 6 };
+const uint8_t ff_hevc_qpel_extra_before[4] = { 0, 3, 3, 3 };
+const uint8_t ff_hevc_qpel_extra_after[4]  = { 0, 4, 4, 4 };
+const uint8_t ff_hevc_qpel_extra[4]        = { 0, 7, 7, 7 };
 
 static const uint8_t scan_1x1[1] = { 0 };
 
diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h
index c6e05bc..7c87a55 100644
--- a/libavcodec/hevc.h
+++ b/libavcodec/hevc.h
@@ -740,7 +740,7 @@ typedef struct HEVCPredContext {
 } HEVCPredContext;
 
 typedef struct HEVCLocalContext {
-    DECLARE_ALIGNED(16, int16_t, mc_buffer[(MAX_PB_SIZE + 7) * MAX_PB_SIZE]);
+    DECLARE_ALIGNED(16, int16_t, mc_buffer[(MAX_PB_SIZE + 24) * MAX_PB_SIZE]);
     uint8_t cabac_state[HEVC_CONTEXTS];
 
     uint8_t first_qp_group;
diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
index ab9ba3b..2b29b19 100644
--- a/libavcodec/hevcdsp.c
+++ b/libavcodec/hevcdsp.c
@@ -89,7 +89,7 @@ static const int8_t transform[32][32] = {
       90, -90,  88, -85,  82, -78,  73, -67,  61, -54,  46, -38,  31, -22,  13,  -4 },
 };
 
-DECLARE_ALIGNED(16, const int8_t, ff_hevc_epel_filters[7][16]) = {
+DECLARE_ALIGNED(16, const int16_t, ff_hevc_epel_coeffs[7][16]) = {
     { -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2 },
     { -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2 },
     { -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4 },
@@ -99,6 +99,28 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_epel_filters[7][16]) = {
     { -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2 },
 };
 
+DECLARE_ALIGNED(16, const int8_t, ff_hevc_epel_coeffs8[7][16]) = {
+    { -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2 },
+    { -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2 },
+    { -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4 },
+    { -4, 36, 36, -4, -4, 36, 36, -4, -4, 36, 36, -4, -4, 36, 36, -4 },
+    { -4, 28, 46, -6, -4, 28, 46, -6, -4, 28, 46, -6, -4, 28, 46, -6 },
+    { -2, 16, 54, -4, -2, 16, 54, -4, -2, 16, 54, -4, -2, 16, 54, -4 },
+    { -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2 },
+};
+
+DECLARE_ALIGNED(16, const int16_t, ff_hevc_qpel_coeffs[3][8]) = {
+    { -1, 4, -10, 58, 17, -5,  1,  0 },
+    { -1, 4, -11, 40, 40, -11, 4, -1 },
+    {  0, 1,  -5, 17, 58, -10, 4, -1 },
+};
+
+DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_coeffs8[3][16]) = {
+    { -1, 4, -10, 58, 17, -5,  1,  0, -1, 4, -10, 58, 17, -5,  1,  0 },
+    { -1, 4, -11, 40, 40, -11, 4, -1, -1, 4, -11, 40, 40, -11, 4, -1 },
+    {  0, 1,  -5, 17, 58, -10, 4, -1,  0, 1,  -5, 17, 58, -10, 4, -1 },
+};
+
 #define BIT_DEPTH 8
 #include "hevcdsp_template.c"
 #undef BIT_DEPTH
diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
index ee3aa70..4daa2e5 100644
--- a/libavcodec/hevcdsp.h
+++ b/libavcodec/hevcdsp.h
@@ -118,6 +118,9 @@ void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
 
 void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth);
 
-extern const int8_t ff_hevc_epel_filters[7][16];
+extern const int16_t ff_hevc_epel_coeffs[7][16];
+extern const int8_t ff_hevc_epel_coeffs8[7][16];
+extern const int16_t ff_hevc_qpel_coeffs[3][8];
+extern const int8_t ff_hevc_qpel_coeffs8[3][16];
 
 #endif /* AVCODEC_HEVCDSP_H */
diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c
index 8bb0a57..1f9adee 100644
--- a/libavcodec/hevcdsp_template.c
+++ b/libavcodec/hevcdsp_template.c
@@ -999,7 +999,7 @@ static inline void FUNC(put_hevc_epel_h)(int16_t *dst, ptrdiff_t dststride,
     int x, y;
     pixel *src = (pixel *)_src;
     ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
-    const int8_t *filter = ff_hevc_epel_filters[mx - 1];
+    const int16_t *filter = ff_hevc_epel_coeffs[mx - 1];
     int8_t filter_0 = filter[0];
     int8_t filter_1 = filter[1];
     int8_t filter_2 = filter[2];
@@ -1021,7 +1021,7 @@ static inline void FUNC(put_hevc_epel_v)(int16_t *dst, ptrdiff_t dststride,
     int x, y;
     pixel *src = (pixel *)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-    const int8_t *filter = ff_hevc_epel_filters[my - 1];
+    const int16_t *filter = ff_hevc_epel_coeffs[my - 1];
     int8_t filter_0 = filter[0];
     int8_t filter_1 = filter[1];
     int8_t filter_2 = filter[2];
@@ -1044,8 +1044,8 @@ static inline void FUNC(put_hevc_epel_hv)(int16_t *dst, ptrdiff_t dststride,
     int x, y;
     pixel *src = (pixel *)_src;
     ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-    const int8_t *filter_h = ff_hevc_epel_filters[mx - 1];
-    const int8_t *filter_v = ff_hevc_epel_filters[my - 1];
+    const int16_t *filter_h = ff_hevc_epel_coeffs[mx - 1];
+    const int16_t *filter_v = ff_hevc_epel_coeffs[my - 1];
     int8_t filter_0 = filter_h[0];
     int8_t filter_1 = filter_h[1];
     int8_t filter_2 = filter_h[2];
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index cc80007..daf7417 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -117,7 +117,8 @@ YASM-OBJS-$(CONFIG_VP8DSP)             += x86/vp8dsp.o                  \
 YASM-OBJS-$(CONFIG_AAC_DECODER)        += x86/sbrdsp.o
 YASM-OBJS-$(CONFIG_APE_DECODER)        += x86/apedsp.o
 YASM-OBJS-$(CONFIG_DCA_DECODER)        += x86/dcadsp.o
-YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_deblock.o
+YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_deblock.o            \
+                                          x86/hevc_mc.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)        += x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER)     += x86/proresdsp.o
 YASM-OBJS-$(CONFIG_RV40_DECODER)       += x86/rv40dsp.o
diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
new file mode 100644
index 0000000..52a89dc
--- /dev/null
+++ b/libavcodec/x86/hevc_mc.asm
@@ -0,0 +1,816 @@
+;*****************************************************************************
+;* x86-optimized HEVC MC
+;* Copyright 2015 Anton Khirnov
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .rodata
+
+pw_1023: times 8 dw 1023
+
+cextern hevc_qpel_coeffs
+cextern hevc_qpel_coeffs8
+
+cextern hevc_epel_coeffs
+cextern hevc_epel_coeffs8
+
+cextern pw_8
+cextern pw_16
+cextern pw_32
+cextern pw_64
+
+SECTION .text
+
+; hevc_get_pixels_<w>_<d>(int16_t *dst, ptrdiff_t dststride,
+;                         pixel   *src, ptrdiff_t srcstride,
+;                         int height, int mx, int my, int16_t *mcbuffer)
+
+; %1: block width
+; %2: bit depth
+; %3: log2 of height unroll
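+;
+; The block is copied into the 16-bit intermediate format used by the
+; prediction code: every sample is shifted left by (14 - bit depth), so
+; full-pel blocks end up at the same scale as the interpolated ones.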
+%macro GET_PIXELS 3
+cglobal hevc_get_pixels_ %+ %1 %+ _ %+ %2, 8, 8, 2, dst, dststride, src, srcstride, height, mx, my, mcbuffer
+
+%assign shift 14 - %2
+
+%if %2 > 8
+    %define LOAD_4PIX movq
+    %define LOAD_8PIX movu
+    %define UNPACK(dest, zero)
+    %define pixel_size 2
+%else
+    %define LOAD_4PIX movd
+    %define LOAD_8PIX movq
+    %define UNPACK(dest, zero) punpcklbw dest, zero
+    %define pixel_size 1
+%endif
+
+    pxor      m0, m0
+
+%if %3
+    shr       heightq, %3
+%endif
+
+.loop:
+
+%assign i 0
+%rep (1 << %3)
+
+%assign j 0
+%rep (%1 + 7) / 8
+
+    %if (j + 1) * 8 > %1
+        %define LOAD  LOAD_4PIX
+        %define STORE movh
+    %else
+        %define LOAD  LOAD_8PIX
+        %define STORE mova
+    %endif
+
+    LOAD    m1, [srcq + 8 * j * pixel_size]
+    UNPACK(m1, m0)
+    psllw   m1, shift
+    STORE   [dstq + 16 * j], m1
+
+%assign j (j + 1)
+%endrep
+
+    add       dstq, dststrideq
+    add       srcq, srcstrideq
+
+%assign i (i + 1)
+%endrep
+
+    dec heightq
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+GET_PIXELS 4,  8, 1
+GET_PIXELS 8,  8, 1
+GET_PIXELS 12, 8, 3
+GET_PIXELS 16, 8, 2
+GET_PIXELS 24, 8, 3
+GET_PIXELS 32, 8, 3
+GET_PIXELS 48, 8, 3
+GET_PIXELS 64, 8, 3
+
+GET_PIXELS 4,  10, 1
+GET_PIXELS 8,  10, 1
+GET_PIXELS 12, 10, 3
+GET_PIXELS 16, 10, 2
+GET_PIXELS 24, 10, 3
+GET_PIXELS 32, 10, 3
+GET_PIXELS 48, 10, 3
+GET_PIXELS 64, 10, 3
+
+; hevc_qpel_h/v_<w>_8(int16_t *dst, ptrdiff_t dststride,
+;                     uint8_t *src, ptrdiff_t srcstride,
+;                     int height, int mx, int my, int16_t *mcbuffer)
+
+; 8-bit qpel interpolation
+; %1: block width
+; %2: 0 - horizontal; 1 - vertical
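+;
+; The 8 taps are applied as 4 coefficient pairs: neighbouring source samples
+; are interleaved with punpcklbw, multiplied by the corresponding pair
+; (broadcast from hevc_qpel_coeffs8) with pmaddubsw, and the partial sums
+; are accumulated with paddsw.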
+%macro QPEL_8 2
+%if %2
+    %define postfix    v
+    %define mvfrac     myq
+    %define pixstride  srcstrideq
+    %define pixstride3 sstride3q
+    %define src_m3     srcm3q
+%else
+    %define postfix    h
+    %define mvfrac     mxq
+    %define pixstride  1
+    %define pixstride3 3
+    %define src_m3     (srcq - 3)
+%endif
+
+cglobal hevc_qpel_ %+ postfix %+ _ %+ %1 %+ _8, 8, 9, 7, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3
+    dec       mvfrac
+    shl       mvfrac, 4
+    mova      m0, [hevc_qpel_coeffs8 + mvfrac]
+
+    SPLATW    m1, m0, 1
+    SPLATW    m2, m0, 2
+    SPLATW    m3, m0, 3
+    SPLATW    m0, m0, 0
+
+%if %2
+    lea       sstride3q, [srcstrideq + 2 * srcstrideq]
+    mov       srcm3q,    srcq
+    sub       srcm3q,    sstride3q
+%endif
+
+.loop:
+
+%assign i 0
+%rep (%1 + 7) / 8
+
+%if (i + 1) * 8 > %1
+    %define LOAD  movd
+    %define STORE movh
+%else
+    %define LOAD  movq
+    %define STORE mova
+%endif
+
+    LOAD m4, [src_m3 + 8 * i]
+    LOAD m5, [src_m3 + 8 * i + 1 * pixstride]
+    punpcklbw m4, m5
+    pmaddubsw m4, m0
+
+    LOAD m5, [src_m3 + 8 * i + 2 * pixstride]
+    LOAD m6, [srcq   + 8 * i]
+    punpcklbw m5, m6
+    pmaddubsw m5, m1
+    paddsw    m4, m5
+
+    LOAD m5, [srcq + 8 * i + 1 * pixstride]
+    LOAD m6, [srcq + 8 * i + 2 * pixstride]
+    punpcklbw m5, m6
+    pmaddubsw m5, m2
+    paddsw    m4, m5
+
+    LOAD m5, [srcq + 8 * i +     pixstride3]
+    LOAD m6, [srcq + 8 * i + 4 * pixstride]
+    punpcklbw m5, m6
+    pmaddubsw m5, m3
+    paddsw    m4, m5
+
+    STORE [dstq + 16 * i], m4
+
+%assign i (i + 1)
+%endrep
+
+    add       dstq,   dststrideq
+    add       srcq,   srcstrideq
+%if %2
+    add       srcm3q, srcstrideq
+%endif
+
+    dec heightq
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM ssse3
+QPEL_8 4,  0
+QPEL_8 8,  0
+QPEL_8 12, 0
+QPEL_8 16, 0
+QPEL_8 24, 0
+QPEL_8 32, 0
+QPEL_8 48, 0
+QPEL_8 64, 0
+
+QPEL_8 4,  1
+QPEL_8 8,  1
+QPEL_8 12, 1
+QPEL_8 16, 1
+QPEL_8 24, 1
+QPEL_8 32, 1
+QPEL_8 48, 1
+QPEL_8 64, 1
+
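+; 16-bit qpel interpolation, shared by the 10-bit h/v and the
+; bit-depth-independent hv entry points below
+; %1: block width
+; %2: shift applied to the accumulated sums (2 for 10-bit pixels, 6 for the
+;     16-bit intermediates processed by the hv functions)
+; %3: 0 - horizontal; 1 - vertical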
+%macro QPEL_16 3
+%if %3
+    %define mvfrac     myq
+    %define pixstride  srcstrideq
+    %define pixstride3 sstride3q
+    %define src_m3     srcm3q
+%else
+    %define mvfrac     mxq
+    %define pixstride  2
+    %define pixstride3 6
+    %define src_m3     (srcq - 6)
+%endif
+
+    dec       mvfrac
+    shl       mvfrac, 4
+    mova      m0, [hevc_qpel_coeffs + mvfrac]
+
+    pshufd    m1, m0, 0x55
+    pshufd    m2, m0, 0xaa
+    pshufd    m3, m0, 0xff
+    pshufd    m0, m0, 0x00
+
+%if %3
+    lea       sstride3q, [srcstrideq + 2 * srcstrideq]
+    mov       srcm3q, srcq
+    sub       srcm3q, sstride3q
+%endif
+
+.loop:
+
+%assign i 0
+%rep (%1 + 7) / 8
+
+    %if (i + 1) * 8 > %1
+        %define LOAD  movq
+    %else
+        %define LOAD  movu
+    %endif
+
+    LOAD m4,  [src_m3 + 16 * i]
+    LOAD m5,  [src_m3 + 16 * i + 1 * pixstride]
+    LOAD m6,  [src_m3 + 16 * i + 2 * pixstride]
+    LOAD m7,  [srcq   + 16 * i + 0 * pixstride]
+    LOAD m8,  [srcq   + 16 * i + 1 * pixstride]
+    LOAD m9,  [srcq   + 16 * i + 2 * pixstride]
+    LOAD m10, [srcq   + 16 * i +     pixstride3]
+    LOAD m11, [srcq   + 16 * i + 4 * pixstride]
+
+    punpcklwd m12, m4, m5
+    pmaddwd   m12, m0
+
+    punpcklwd m13, m6, m7
+    pmaddwd   m13, m1
+    paddd     m12, m13
+
+    punpcklwd m13, m8, m9
+    pmaddwd   m13, m2
+    paddd     m12, m13
+
+    punpcklwd m13, m10, m11
+    pmaddwd   m13, m3
+    paddd     m12, m13
+    psrad     m12, %2
+
+    %if (i + 1) * 8 > %1
+        packssdw m12, m12
+        movq [dstq + 16 * i], m12
+    %else
+        punpckhwd m4, m5
+        pmaddwd   m4, m0
+
+        punpckhwd m6, m7
+        pmaddwd   m6, m1
+        paddd     m4, m6
+
+        punpckhwd m8, m9
+        pmaddwd   m8, m2
+        paddd     m4, m8
+
+        punpckhwd m10, m11
+        pmaddwd   m10, m3
+        paddd     m4, m10
+
+        psrad     m4, %2
+        packssdw  m12, m4
+        mova [dstq + 16 * i], m12
+    %endif
+
+%assign i (i + 1)
+%endrep
+
+    add       dstq,   dststrideq
+    add       srcq,   srcstrideq
+%if %3
+    add       srcm3q, srcstrideq
+%endif
+
+    dec heightq
+    jg .loop
+    RET
+%endmacro
+
+%macro QPEL_H_10 1
+cglobal hevc_qpel_h_ %+ %1 %+ _10, 8, 8, 14, dst, dststride, src, srcstride, height, mx, my, mcbuffer
+QPEL_16 %1, 2, 0
+%endmacro
+
+INIT_XMM avx
+QPEL_H_10 4
+QPEL_H_10 8
+QPEL_H_10 12
+QPEL_H_10 16
+QPEL_H_10 24
+QPEL_H_10 32
+QPEL_H_10 48
+QPEL_H_10 64
+
+%macro QPEL_V_10 1
+cglobal hevc_qpel_v_ %+ %1 %+ _10, 8, 9, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3
+QPEL_16 %1, 2, 1
+%endmacro
+
+INIT_XMM avx
+QPEL_V_10 4
+QPEL_V_10 8
+QPEL_V_10 12
+QPEL_V_10 16
+QPEL_V_10 24
+QPEL_V_10 32
+QPEL_V_10 48
+QPEL_V_10 64
+
+; hevc_qpel_hv_<w>(int16_t *dst, ptrdiff_t dststride,
+;                  int16_t *src, ptrdiff_t srcstride,
+;                  int height, int mx, int my, int16_t *mcbuffer)
+
+%macro QPEL_HV 1
+cglobal hevc_qpel_hv_ %+ %1, 8, 9, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3
+QPEL_16 %1, 6, 1
+%endmacro
+
+INIT_XMM avx
+QPEL_HV 4
+QPEL_HV 8
+QPEL_HV 12
+QPEL_HV 16
+QPEL_HV 24
+QPEL_HV 32
+QPEL_HV 48
+QPEL_HV 64
+
+; hevc_epel_h/v_<w>_8(int16_t *dst, ptrdiff_t dststride,
+;                     uint8_t *src, ptrdiff_t srcstride,
+;                     int height, int mx, int my, int16_t *mcbuffer)
+
+; 8-bit epel interpolation
+; %1: block width
+; %2: 0 - horizontal; 1 - vertical
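+;
+; Same scheme as QPEL_8, with the 4-tap chroma filter: two pmaddubsw on
+; interleaved sample pairs and a single paddsw.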
+%macro EPEL_8 2
+%if %2
+    %define postfix    v
+    %define mvfrac     myq
+    %define pixstride  srcstrideq
+    %define pixstride3 sstride3q
+%else
+    %define postfix    h
+    %define mvfrac     mxq
+    %define pixstride  1
+    %define pixstride3 3
+%endif
+
+cglobal hevc_epel_ %+ postfix %+ _ %+ %1 %+ _8, 8, 9, 6, dst, dststride, src, srcstride, height, mx, my, sstride3
+    dec       mvfrac
+    shl       mvfrac, 4
+    movq      m0, [hevc_epel_coeffs8 + mvfrac]
+
+    SPLATW    m1, m0, 1
+    SPLATW    m0, m0, 0
+
+%if %2
+    lea       sstride3q, [srcstrideq + 2 * srcstrideq]
+%endif
+    sub       srcq, pixstride
+
+.loop:
+
+%assign i 0
+%rep (%1 + 7) / 8
+
+    %if (i + 1) * 8 > %1
+        %define LOAD  movd
+        %define STORE movh
+    %else
+        %define LOAD  movq
+        %define STORE mova
+    %endif
+
+    LOAD m2, [srcq + 8 * i + 0 * pixstride]
+    LOAD m3, [srcq + 8 * i + 1 * pixstride]
+    LOAD m4, [srcq + 8 * i + 2 * pixstride]
+    LOAD m5, [srcq + 8 * i +     pixstride3]
+
+    punpcklbw m2, m3
+    punpcklbw m4, m5
+
+    pmaddubsw m2, m0
+    pmaddubsw m4, m1
+
+    paddsw    m2, m4
+
+    STORE [dstq + 16 * i], m2
+
+%assign i (i + 1)
+%endrep
+
+    add       dstq, dststrideq
+    add       srcq, srcstrideq
+
+    dec heightq
+    jg .loop
+    RET
+%endmacro
+
+INIT_XMM ssse3
+EPEL_8 4,  0
+EPEL_8 8,  0
+EPEL_8 12, 0
+EPEL_8 16, 0
+EPEL_8 24, 0
+EPEL_8 32, 0
+
+EPEL_8 4,  1
+EPEL_8 8,  1
+EPEL_8 12, 1
+EPEL_8 16, 1
+EPEL_8 24, 1
+EPEL_8 32, 1
+
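+; 16-bit epel interpolation, analogous to QPEL_16
+; %1: block width
+; %2: shift applied to the accumulated sums (2 for 10-bit pixels, 6 for the
+;     16-bit intermediates processed by the hv functions)
+; %3: 0 - horizontal; 1 - vertical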
+%macro EPEL_16 3
+%if %3
+    %define mvfrac     myq
+    %define pixstride  srcstrideq
+    %define pixstride3 sstride3q
+%else
+    %define mvfrac     mxq
+    %define pixstride  2
+    %define pixstride3 6
+%endif
+    dec       mvfrac
+    shl       mvfrac, 5
+    mova      m0, [hevc_epel_coeffs + mvfrac]
+
+    pshufd    m1, m0, 0x55
+    pshufd    m0, m0, 0x00
+
+%if %3
+    lea       sstride3q, [srcstrideq + 2 * srcstrideq]
+%endif
+    sub       srcq, pixstride
+
+.loop:
+
+%assign i 0
+%rep (%1 + 7) / 8
+
+    %if (i + 1) * 8 > %1
+        %define LOAD  movq
+    %else
+        %define LOAD  movu
+    %endif
+
+    LOAD m2, [srcq + 16 * i + 0 * pixstride]
+    LOAD m3, [srcq + 16 * i + 1 * pixstride]
+    LOAD m4, [srcq + 16 * i + 2 * pixstride]
+    LOAD m5, [srcq + 16 * i +     pixstride3]
+
+    punpcklwd m6, m2, m3
+    punpcklwd m7, m4, m5
+    pmaddwd   m6, m0
+    pmaddwd   m7, m1
+    paddd     m6, m7
+    psrad     m6, %2
+
+    %if (i + 1) * 8 > %1
+        packssdw  m6, m6
+        movq [dstq + 16 * i], m6
+    %else
+        punpckhwd m2, m3
+        punpckhwd m4, m5
+        pmaddwd   m2, m0
+        pmaddwd   m4, m1
+        paddd     m2, m4
+        psrad     m2, %2
+
+        packssdw  m6, m2
+        mova [dstq + 16 * i], m6
+    %endif
+
+%assign i (i + 1)
+%endrep
+
+    add       dstq,   dststrideq
+    add       srcq,   srcstrideq
+
+    dec heightq
+    jg .loop
+    RET
+%endmacro
+
+%macro EPEL_H_10 1
+cglobal hevc_epel_h_ %+ %1 %+ _10, 8, 8, 8, dst, dststride, src, srcstride, height, mx, my, sstride3
+EPEL_16 %1, 2, 0
+%endmacro
+
+INIT_XMM avx
+EPEL_H_10 4
+EPEL_H_10 8
+EPEL_H_10 12
+EPEL_H_10 16
+EPEL_H_10 24
+EPEL_H_10 32
+
+%macro EPEL_V_10 1
+cglobal hevc_epel_v_ %+ %1 %+ _10, 8, 8, 8, dst, dststride, src, srcstride, height, mx, my, sstride3
+EPEL_16 %1, 2, 1
+%endmacro
+
+INIT_XMM avx
+EPEL_V_10 4
+EPEL_V_10 8
+EPEL_V_10 12
+EPEL_V_10 16
+EPEL_V_10 24
+EPEL_V_10 32
+
+; hevc_epel_hv_<w>_8(int16_t *dst, ptrdiff_t dststride,
+;                    int16_t *src, ptrdiff_t srcstride,
+;                    int height, int mx, int my, int16_t *mcbuffer)
+
+%macro EPEL_HV 1
+cglobal hevc_epel_hv_ %+ %1, 8, 8, 8, dst, dststride, src, srcstride, height, mx, my, sstride3
+EPEL_16 %1, 6, 1
+%endmacro
+
+INIT_XMM avx
+EPEL_HV 4
+EPEL_HV 8
+EPEL_HV 12
+EPEL_HV 16
+EPEL_HV 24
+EPEL_HV 32
+
+; hevc_put_unweighted_pred_<w>_<d>(pixel   *dst, ptrdiff_t dststride,
+;                                  int16_t *src, ptrdiff_t srcstride,
+;                                  int height)
+
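+; helper for PUT_PRED: optionally adds the second source block
+; %1: accumulator register
+; %2: memory operand with the second source
+; %3: 0 - one source (no-op); 1 - two sources
+; %4: remaining block width; 4 selects an 8-byte movq load
+; %5: scratch register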
+%macro AVG 5
+    %if %3
+        %if %4 == 4
+            movq %5, %2
+            paddsw %1, %5
+        %else
+            paddsw %1, %2
+        %endif
+    %endif
+%endmacro
+
+; %1: 0 - one source; 1 - two sources
+; %2: width
+; %3: bit depth
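+;
+; Converts the 14-bit intermediates back to pixels: the second source is
+; added for bipred, a rounding offset of 1 << (shift - 1) is applied, the
+; result is shifted right by (14 + bipred - bit depth) and clipped to the
+; pixel range.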
+%macro PUT_PRED 3
+%if %1
+cglobal hevc_put_unweighted_pred_avg_ %+ %2 %+ _ %+ %3, 6, 6, 4, dst, dststride, src, src2, srcstride, height
+%else
+cglobal hevc_put_unweighted_pred_ %+ %2 %+ _ %+ %3, 5, 5, 4, dst, dststride, src, srcstride, height
+%endif
+
+%assign shift       14 + %1 - %3
+%assign offset      (1 << (shift - 1))
+%define offset_data pw_ %+ offset
+
+    mova        m0, [offset_data]
+
+%if %3 > 8
+    %define STORE_BLOCK movu
+    %define STORE_HALF  movq
+
+    %assign pixel_max ((1 << %3) - 1)
+    %define pw_pixel_max pw_ %+ pixel_max
+    pxor    m1, m1
+    mova    m2, [pw_pixel_max]
+%else
+    %define STORE_BLOCK movq
+    %define STORE_HALF  movd
+%endif
+
+.loop:
+%assign i 0
+%rep (%2 + 7) / 8
+
+    %if (i + 1) * 8 > %2
+        %define LOAD movq
+        %define STORE STORE_HALF
+    %else
+        %define LOAD mova
+        %define STORE STORE_BLOCK
+    %endif
+
+    LOAD m3, [srcq  + 16 * i]
+    AVG  m3, [src2q + 16 * i], %1, %2 - i * 8, m4
+
+    paddsw m3, m0
+    psraw  m3, shift
+
+    %if %3 == 8
+        packuswb m3, m3
+        STORE [dstq + 8 * i], m3
+    %else
+        CLIPW m3, m1, m2
+        STORE [dstq + 16 * i], m3
+    %endif
+%assign i (i + 1)
+%endrep
+
+    add dstq,  dststrideq
+    add srcq,  srcstrideq
+%if %1
+    add src2q, srcstrideq
+%endif
+
+    dec         heightq
+    jg          .loop
+    RET
+%endmacro
+
+INIT_XMM sse2
+PUT_PRED 0, 4,  8
+PUT_PRED 1, 4,  8
+PUT_PRED 0, 8,  8
+PUT_PRED 1, 8,  8
+PUT_PRED 0, 12, 8
+PUT_PRED 1, 12, 8
+PUT_PRED 0, 16, 8
+PUT_PRED 1, 16, 8
+PUT_PRED 0, 24, 8
+PUT_PRED 1, 24, 8
+PUT_PRED 0, 32, 8
+PUT_PRED 1, 32, 8
+PUT_PRED 0, 48, 8
+PUT_PRED 1, 48, 8
+PUT_PRED 0, 64, 8
+PUT_PRED 1, 64, 8
+
+PUT_PRED 0, 4,  10
+PUT_PRED 1, 4,  10
+PUT_PRED 0, 8,  10
+PUT_PRED 1, 8,  10
+PUT_PRED 0, 12, 10
+PUT_PRED 1, 12, 10
+PUT_PRED 0, 16, 10
+PUT_PRED 1, 16, 10
+PUT_PRED 0, 24, 10
+PUT_PRED 1, 24, 10
+PUT_PRED 0, 32, 10
+PUT_PRED 1, 32, 10
+PUT_PRED 0, 48, 10
+PUT_PRED 1, 48, 10
+PUT_PRED 0, 64, 10
+PUT_PRED 1, 64, 10
+
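+; hevc_put_weighted_pred_<w>_<d>(uint8_t denom, int16_t weight0, int16_t offset0,
+;                                pixel *dst, ptrdiff_t dststride,
+;                                int16_t *src0, ptrdiff_t srcstride, int height)
+; hevc_put_weighted_pred_avg_<w>_<d>(uint8_t denom, int16_t weight0, int16_t weight1,
+;                                    int16_t offset0, int16_t offset1,
+;                                    pixel *dst, ptrdiff_t dststride,
+;                                    int16_t *src0, int16_t *src1,
+;                                    ptrdiff_t srcstride, int height)
+
+; %1: 0 - one source; 1 - two sources
+; %2: block width
+; %3: bit depth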
+%macro PUT_WEIGHTED_PRED 3
+%if %1
+cglobal hevc_put_weighted_pred_avg_ %+ %2 %+ _ %+ %3, 11, 11, 8, denom, weight0, weight1, offset0, offset1, dst, dststride, src0, src1, srcstride, height
+%else
+cglobal hevc_put_weighted_pred_ %+ %2 %+ _ %+ %3, 8, 8, 8, denom, weight0, offset0, dst, dststride, src0, srcstride, height
+%endif
+
+    add         denomq, 14 + %1 - %3
+    movq        m0, denomq
+
+%if %3 > 8
+    %assign pixel_max ((1 << %3) - 1)
+    %define pw_pixel_max pw_ %+ pixel_max
+    pxor    m4, m4
+    mova    m5, [pw_pixel_max]
+
+    shl         offset0q, %3 - 8
+%if %1
+    shl         offset1q, %3 - 8
+%endif
+%endif
+
+%if %1
+    lea         offset0q, [offset0q + offset1q + 1]
+%else
+    lea         offset0q, [2 * offset0q + 1]
+%endif
+    movq        m1, offset0q
+    SPLATD      m1
+    pslld       m1, m0
+    psrad       m1, 1
+
+    movq        m2, weight0q
+    SPLATD      m2
+%if %1
+    movq        m3, weight1q
+    SPLATD      m3
+%endif
+
+.loop:
+%assign i 0
+%rep (%2 + 3) / 4
+
+    pmovsxwd   m6, [src0q + 8 * i]
+    pmulld     m6, m2
+
+%if %1
+    pmovsxwd   m7, [src1q + 8 * i]
+    pmulld     m7, m3
+    paddd      m6, m7
+%endif
+
+    paddd      m6, m1
+    psrad      m6, m0
+
+    packssdw   m6, m6
+
+%if %3 > 8
+    CLIPW      m6, m4, m5
+    movq       [dstq + 8 * i], m6
+%else
+    packuswb   m6, m6
+    movd [dstq + 4 * i], m6
+%endif
+
+%assign i (i + 1)
+%endrep
+
+    add dstq,  dststrideq
+    add src0q, srcstrideq
+%if %1
+    add src1q, srcstrideq
+%endif
+
+    dec         heightq
+    jg          .loop
+    RET
+%endmacro
+
+INIT_XMM sse4
+PUT_WEIGHTED_PRED 0, 4,  8
+PUT_WEIGHTED_PRED 1, 4,  8
+PUT_WEIGHTED_PRED 0, 8,  8
+PUT_WEIGHTED_PRED 1, 8,  8
+PUT_WEIGHTED_PRED 0, 12, 8
+PUT_WEIGHTED_PRED 1, 12, 8
+PUT_WEIGHTED_PRED 0, 16, 8
+PUT_WEIGHTED_PRED 1, 16, 8
+PUT_WEIGHTED_PRED 0, 24, 8
+PUT_WEIGHTED_PRED 1, 24, 8
+PUT_WEIGHTED_PRED 0, 32, 8
+PUT_WEIGHTED_PRED 1, 32, 8
+PUT_WEIGHTED_PRED 0, 48, 8
+PUT_WEIGHTED_PRED 1, 48, 8
+PUT_WEIGHTED_PRED 0, 64, 8
+PUT_WEIGHTED_PRED 1, 64, 8
+
+PUT_WEIGHTED_PRED 0, 4,  10
+PUT_WEIGHTED_PRED 1, 4,  10
+PUT_WEIGHTED_PRED 0, 8,  10
+PUT_WEIGHTED_PRED 1, 8,  10
+PUT_WEIGHTED_PRED 0, 12, 10
+PUT_WEIGHTED_PRED 1, 12, 10
+PUT_WEIGHTED_PRED 0, 16, 10
+PUT_WEIGHTED_PRED 1, 16, 10
+PUT_WEIGHTED_PRED 0, 24, 10
+PUT_WEIGHTED_PRED 1, 24, 10
+PUT_WEIGHTED_PRED 0, 32, 10
+PUT_WEIGHTED_PRED 1, 32, 10
+PUT_WEIGHTED_PRED 0, 48, 10
+PUT_WEIGHTED_PRED 1, 48, 10
+PUT_WEIGHTED_PRED 0, 64, 10
+PUT_WEIGHTED_PRED 1, 64, 10
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 04203c2..bcb8ee7 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -45,6 +45,149 @@ LFC_FUNCS(uint8_t, 10)
 LFL_FUNCS(uint8_t, 8)
 LFL_FUNCS(uint8_t, 10)
 
+#define GET_PIXELS(width, depth) \
+void ff_hevc_get_pixels_ ## width ## _ ## depth ## _sse2(int16_t *dst, ptrdiff_t dststride,                \
+                                                         uint8_t *src, ptrdiff_t srcstride,                \
+                                                         int height, int mx, int my, int16_t *mcbuffer);
+
+GET_PIXELS(4,  8)
+GET_PIXELS(8,  8)
+GET_PIXELS(12, 8)
+GET_PIXELS(16, 8)
+GET_PIXELS(24, 8)
+GET_PIXELS(32, 8)
+GET_PIXELS(48, 8)
+GET_PIXELS(64, 8)
+
+GET_PIXELS(4,  10)
+GET_PIXELS(8,  10)
+GET_PIXELS(12, 10)
+GET_PIXELS(16, 10)
+GET_PIXELS(24, 10)
+GET_PIXELS(32, 10)
+GET_PIXELS(48, 10)
+GET_PIXELS(64, 10)
+
+/* the hv kernels operate on 16-bit intermediates, so they are independent of
+ * the bit depth and are declared only once */
+#define INTERP_HV_FUNC(width, cf)                                                         \
+void ff_hevc_qpel_hv_ ## width ## _ ## cf(int16_t *dst, ptrdiff_t dststride,              \
+                                          int16_t *src, ptrdiff_t srcstride,              \
+                                          int height, int mx, int my, int16_t *mcbuffer); \
+void ff_hevc_epel_hv_ ## width ## _ ## cf(int16_t *dst, ptrdiff_t dststride,              \
+                                          int16_t *src, ptrdiff_t srcstride,              \
+                                          int height, int mx, int my, int16_t *mcbuffer);
+
+INTERP_HV_FUNC(4,  avx)
+INTERP_HV_FUNC(8,  avx)
+INTERP_HV_FUNC(12, avx)
+INTERP_HV_FUNC(16, avx)
+INTERP_HV_FUNC(24, avx)
+INTERP_HV_FUNC(32, avx)
+INTERP_HV_FUNC(48, avx)
+INTERP_HV_FUNC(64, avx)
+
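+/*
+ * The hv variants chain the h and hv kernels: the horizontal filter writes
+ * 16-bit intermediates into mcbuffer (over height + taps - 1 rows), and the
+ * bit-depth-independent *_hv function then filters them vertically.
+ */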
+#define QPEL_FUNCS(width, depth, cf_h, cf_v, cf_hv)                                                           \
+void ff_hevc_qpel_h_ ## width ## _ ## depth ## _ ## cf_h(int16_t *dst, ptrdiff_t dststride,                   \
+                                                         uint8_t *src, ptrdiff_t srcstride,                   \
+                                                         int height, int mx, int my, int16_t *mcbuffer);      \
+void ff_hevc_qpel_v_ ## width ## _ ## depth ## _ ## cf_v(int16_t *dst, ptrdiff_t dststride,                   \
+                                                         uint8_t *src, ptrdiff_t srcstride,                   \
+                                                         int height, int mx, int my, int16_t *mcbuffer);      \
+static void hevc_qpel_hv_ ## width ## _ ## depth ## _ ## cf_hv(int16_t *dst, ptrdiff_t dststride,             \
+                                                               uint8_t *src, ptrdiff_t srcstride,             \
+                                                               int height, int mx, int my, int16_t *mcbuffer) \
+{                                                                                                             \
+    const ptrdiff_t stride = FFALIGN(width + 7, 8);                                                           \
+    ff_hevc_qpel_h_ ## width ## _ ## depth ## _ ## cf_h(mcbuffer, 2 * stride, src - 3 * srcstride, srcstride, \
+                                                        height + 7, mx, my, mcbuffer);                        \
+    ff_hevc_qpel_hv_ ## width ## _ ## cf_hv(dst, dststride, mcbuffer + 3 * stride, 2 * stride,                \
+                                            height, mx, my, mcbuffer);                                        \
+}
+QPEL_FUNCS(4,  8, ssse3, ssse3, avx)
+QPEL_FUNCS(8,  8, ssse3, ssse3, avx)
+QPEL_FUNCS(12, 8, ssse3, ssse3, avx)
+QPEL_FUNCS(16, 8, ssse3, ssse3, avx)
+QPEL_FUNCS(24, 8, ssse3, ssse3, avx)
+QPEL_FUNCS(32, 8, ssse3, ssse3, avx)
+QPEL_FUNCS(48, 8, ssse3, ssse3, avx)
+QPEL_FUNCS(64, 8, ssse3, ssse3, avx)
+
+QPEL_FUNCS(4,  10, avx, avx, avx)
+QPEL_FUNCS(8,  10, avx, avx, avx)
+QPEL_FUNCS(12, 10, avx, avx, avx)
+QPEL_FUNCS(16, 10, avx, avx, avx)
+QPEL_FUNCS(24, 10, avx, avx, avx)
+QPEL_FUNCS(32, 10, avx, avx, avx)
+QPEL_FUNCS(48, 10, avx, avx, avx)
+QPEL_FUNCS(64, 10, avx, avx, avx)
+
+#define EPEL_FUNCS(width, depth, cf_h, cf_v, cf_hv)                                                           \
+void ff_hevc_epel_h_ ## width ## _ ## depth ## _ ## cf_h(int16_t *dst, ptrdiff_t dststride,                   \
+                                                         uint8_t *src, ptrdiff_t srcstride,                   \
+                                                         int height, int mx, int my, int16_t *mcbuffer);      \
+void ff_hevc_epel_v_ ## width ## _ ## depth ## _ ## cf_v(int16_t *dst, ptrdiff_t dststride,                   \
+                                                         uint8_t *src, ptrdiff_t srcstride,                   \
+                                                         int height, int mx, int my, int16_t *mcbuffer);      \
+static void hevc_epel_hv_ ## width ## _ ## depth ## _ ## cf_hv(int16_t *dst, ptrdiff_t dststride,             \
+                                                               uint8_t *src, ptrdiff_t srcstride,             \
+                                                               int height, int mx, int my, int16_t *mcbuffer) \
+{                                                                                                             \
+    const ptrdiff_t stride = FFALIGN(width + 3, 8);                                                           \
+    ff_hevc_epel_h_ ## width ## _ ## depth ## _ ## cf_h(mcbuffer, 2 * stride, src - srcstride, srcstride,     \
+                                                        height + 3, mx, my, mcbuffer);                        \
+    ff_hevc_epel_hv_ ## width ## _ ## cf_hv(dst, dststride, mcbuffer + stride, 2 * stride,                    \
+                                            height, mx, my, mcbuffer);                                        \
+}
+
+EPEL_FUNCS(4,  8, ssse3, ssse3, avx)
+EPEL_FUNCS(8,  8, ssse3, ssse3, avx)
+EPEL_FUNCS(12, 8, ssse3, ssse3, avx)
+EPEL_FUNCS(16, 8, ssse3, ssse3, avx)
+EPEL_FUNCS(24, 8, ssse3, ssse3, avx)
+EPEL_FUNCS(32, 8, ssse3, ssse3, avx)
+
+EPEL_FUNCS(4,  10, avx, avx, avx)
+EPEL_FUNCS(8,  10, avx, avx, avx)
+EPEL_FUNCS(12, 10, avx, avx, avx)
+EPEL_FUNCS(16, 10, avx, avx, avx)
+EPEL_FUNCS(24, 10, avx, avx, avx)
+EPEL_FUNCS(32, 10, avx, avx, avx)
+
+#define PUT_PRED(width, depth, cf_uw, cf_w) \
+void ff_hevc_put_unweighted_pred_ ## width ## _ ## depth ## _ ## cf_uw(uint8_t *dst, ptrdiff_t dststride,                   \
+                                                                       int16_t *src, ptrdiff_t srcstride,                   \
+                                                                       int height);                                         \
+void ff_hevc_put_unweighted_pred_avg_ ## width ## _ ## depth ## _ ## cf_uw(uint8_t *dst, ptrdiff_t dststride,               \
+                                                                           int16_t *src1, int16_t *src2,                    \
+                                                                           ptrdiff_t srcstride, int height);                \
+void ff_hevc_put_weighted_pred_ ## width ## _ ## depth ## _ ## cf_w(uint8_t denom, int16_t weight, int16_t offset,          \
+                                                                    uint8_t *dst, ptrdiff_t dststride,                      \
+                                                                    int16_t *src, ptrdiff_t srcstride,                      \
+                                                                    int height);                                            \
+void ff_hevc_put_weighted_pred_avg_ ## width ## _ ## depth ## _ ## cf_w(uint8_t denom, int16_t weight0, int16_t weight1,    \
+                                                                        int16_t offset0, int16_t offset1,                   \
+                                                                        uint8_t *dst, ptrdiff_t dststride,                  \
+                                                                        int16_t *src0, int16_t *src1, ptrdiff_t srcstride,  \
+                                                                        int height);
+
+PUT_PRED(4,  8, sse2, sse4)
+PUT_PRED(8,  8, sse2, sse4)
+PUT_PRED(12, 8, sse2, sse4)
+PUT_PRED(16, 8, sse2, sse4)
+PUT_PRED(24, 8, sse2, sse4)
+PUT_PRED(32, 8, sse2, sse4)
+PUT_PRED(48, 8, sse2, sse4)
+PUT_PRED(64, 8, sse2, sse4)
+
+PUT_PRED(4,  10, sse2, sse4)
+PUT_PRED(8,  10, sse2, sse4)
+PUT_PRED(12, 10, sse2, sse4)
+PUT_PRED(16, 10, sse2, sse4)
+PUT_PRED(24, 10, sse2, sse4)
+PUT_PRED(32, 10, sse2, sse4)
+PUT_PRED(48, 10, sse2, sse4)
+PUT_PRED(64, 10, sse2, sse4)
+
 void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -53,19 +196,281 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
         if (EXTERNAL_SSE2(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
             c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
+
+            c->put_hevc_qpel[0][0][0]  = ff_hevc_get_pixels_4_8_sse2;
+            c->put_hevc_qpel[0][0][1]  = ff_hevc_get_pixels_8_8_sse2;
+            c->put_hevc_qpel[0][0][2]  = ff_hevc_get_pixels_12_8_sse2;
+            c->put_hevc_qpel[0][0][3]  = ff_hevc_get_pixels_16_8_sse2;
+            c->put_hevc_qpel[0][0][4]  = ff_hevc_get_pixels_24_8_sse2;
+            c->put_hevc_qpel[0][0][5]  = ff_hevc_get_pixels_32_8_sse2;
+            c->put_hevc_qpel[0][0][6]  = ff_hevc_get_pixels_48_8_sse2;
+            c->put_hevc_qpel[0][0][7]  = ff_hevc_get_pixels_64_8_sse2;
+
+            c->put_hevc_epel[0][0][1]  = ff_hevc_get_pixels_4_8_sse2;
+            c->put_hevc_epel[0][0][3]  = ff_hevc_get_pixels_8_8_sse2;
+            c->put_hevc_epel[0][0][4]  = ff_hevc_get_pixels_12_8_sse2;
+            c->put_hevc_epel[0][0][5]  = ff_hevc_get_pixels_16_8_sse2;
+            c->put_hevc_epel[0][0][6]  = ff_hevc_get_pixels_24_8_sse2;
+            c->put_hevc_epel[0][0][7]  = ff_hevc_get_pixels_32_8_sse2;
+
+            c->put_unweighted_pred[0] = ff_hevc_put_unweighted_pred_4_8_sse2;
+            c->put_unweighted_pred[1] = ff_hevc_put_unweighted_pred_8_8_sse2;
+            c->put_unweighted_pred[2] = ff_hevc_put_unweighted_pred_12_8_sse2;
+            c->put_unweighted_pred[3] = ff_hevc_put_unweighted_pred_16_8_sse2;
+            c->put_unweighted_pred[4] = ff_hevc_put_unweighted_pred_24_8_sse2;
+            c->put_unweighted_pred[5] = ff_hevc_put_unweighted_pred_32_8_sse2;
+            c->put_unweighted_pred[6] = ff_hevc_put_unweighted_pred_48_8_sse2;
+            c->put_unweighted_pred[7] = ff_hevc_put_unweighted_pred_64_8_sse2;
+
+            c->put_unweighted_pred_avg[0] = ff_hevc_put_unweighted_pred_avg_4_8_sse2;
+            c->put_unweighted_pred_avg[1] = ff_hevc_put_unweighted_pred_avg_8_8_sse2;
+            c->put_unweighted_pred_avg[2] = ff_hevc_put_unweighted_pred_avg_12_8_sse2;
+            c->put_unweighted_pred_avg[3] = ff_hevc_put_unweighted_pred_avg_16_8_sse2;
+            c->put_unweighted_pred_avg[4] = ff_hevc_put_unweighted_pred_avg_24_8_sse2;
+            c->put_unweighted_pred_avg[5] = ff_hevc_put_unweighted_pred_avg_32_8_sse2;
+            c->put_unweighted_pred_avg[6] = ff_hevc_put_unweighted_pred_avg_48_8_sse2;
+            c->put_unweighted_pred_avg[7] = ff_hevc_put_unweighted_pred_avg_64_8_sse2;
+
+            c->put_unweighted_pred_chroma[1] = ff_hevc_put_unweighted_pred_4_8_sse2;
+            c->put_unweighted_pred_chroma[3] = ff_hevc_put_unweighted_pred_8_8_sse2;
+            c->put_unweighted_pred_chroma[4] = ff_hevc_put_unweighted_pred_12_8_sse2;
+            c->put_unweighted_pred_chroma[5] = ff_hevc_put_unweighted_pred_16_8_sse2;
+            c->put_unweighted_pred_chroma[6] = ff_hevc_put_unweighted_pred_24_8_sse2;
+            c->put_unweighted_pred_chroma[7] = ff_hevc_put_unweighted_pred_32_8_sse2;
+
+            c->put_unweighted_pred_avg_chroma[1] = ff_hevc_put_unweighted_pred_avg_4_8_sse2;
+            c->put_unweighted_pred_avg_chroma[3] = ff_hevc_put_unweighted_pred_avg_8_8_sse2;
+            c->put_unweighted_pred_avg_chroma[4] = ff_hevc_put_unweighted_pred_avg_12_8_sse2;
+            c->put_unweighted_pred_avg_chroma[5] = ff_hevc_put_unweighted_pred_avg_16_8_sse2;
+            c->put_unweighted_pred_avg_chroma[6] = ff_hevc_put_unweighted_pred_avg_24_8_sse2;
+            c->put_unweighted_pred_avg_chroma[7] = ff_hevc_put_unweighted_pred_avg_32_8_sse2;
         }
         if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
             c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
             c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
+
+            c->put_hevc_qpel[0][1][0]  = ff_hevc_qpel_h_4_8_ssse3;
+            c->put_hevc_qpel[0][1][1]  = ff_hevc_qpel_h_8_8_ssse3;
+            c->put_hevc_qpel[0][1][2]  = ff_hevc_qpel_h_12_8_ssse3;
+            c->put_hevc_qpel[0][1][3]  = ff_hevc_qpel_h_16_8_ssse3;
+            c->put_hevc_qpel[0][1][4]  = ff_hevc_qpel_h_24_8_ssse3;
+            c->put_hevc_qpel[0][1][5]  = ff_hevc_qpel_h_32_8_ssse3;
+            c->put_hevc_qpel[0][1][6]  = ff_hevc_qpel_h_48_8_ssse3;
+            c->put_hevc_qpel[0][1][7]  = ff_hevc_qpel_h_64_8_ssse3;
+
+            c->put_hevc_qpel[1][0][0]  = ff_hevc_qpel_v_4_8_ssse3;
+            c->put_hevc_qpel[1][0][1]  = ff_hevc_qpel_v_8_8_ssse3;
+            c->put_hevc_qpel[1][0][2]  = ff_hevc_qpel_v_12_8_ssse3;
+            c->put_hevc_qpel[1][0][3]  = ff_hevc_qpel_v_16_8_ssse3;
+            c->put_hevc_qpel[1][0][4]  = ff_hevc_qpel_v_24_8_ssse3;
+            c->put_hevc_qpel[1][0][5]  = ff_hevc_qpel_v_32_8_ssse3;
+            c->put_hevc_qpel[1][0][6]  = ff_hevc_qpel_v_48_8_ssse3;
+            c->put_hevc_qpel[1][0][7]  = ff_hevc_qpel_v_64_8_ssse3;
+
+            c->put_hevc_epel[0][1][1]  = ff_hevc_epel_h_4_8_ssse3;
+            c->put_hevc_epel[0][1][3]  = ff_hevc_epel_h_8_8_ssse3;
+            c->put_hevc_epel[0][1][4]  = ff_hevc_epel_h_12_8_ssse3;
+            c->put_hevc_epel[0][1][5]  = ff_hevc_epel_h_16_8_ssse3;
+            c->put_hevc_epel[0][1][6]  = ff_hevc_epel_h_24_8_ssse3;
+            c->put_hevc_epel[0][1][7]  = ff_hevc_epel_h_32_8_ssse3;
+
+            c->put_hevc_epel[1][0][1]  = ff_hevc_epel_v_4_8_ssse3;
+            c->put_hevc_epel[1][0][3]  = ff_hevc_epel_v_8_8_ssse3;
+            c->put_hevc_epel[1][0][4]  = ff_hevc_epel_v_12_8_ssse3;
+            c->put_hevc_epel[1][0][5]  = ff_hevc_epel_v_16_8_ssse3;
+            c->put_hevc_epel[1][0][6]  = ff_hevc_epel_v_24_8_ssse3;
+            c->put_hevc_epel[1][0][7]  = ff_hevc_epel_v_32_8_ssse3;
+        }
+
+        if (EXTERNAL_SSE4(cpu_flags)) {
+            c->weighted_pred[0] = ff_hevc_put_weighted_pred_4_8_sse4;
+            c->weighted_pred[1] = ff_hevc_put_weighted_pred_8_8_sse4;
+            c->weighted_pred[2] = ff_hevc_put_weighted_pred_12_8_sse4;
+            c->weighted_pred[3] = ff_hevc_put_weighted_pred_16_8_sse4;
+            c->weighted_pred[4] = ff_hevc_put_weighted_pred_24_8_sse4;
+            c->weighted_pred[5] = ff_hevc_put_weighted_pred_32_8_sse4;
+            c->weighted_pred[6] = ff_hevc_put_weighted_pred_48_8_sse4;
+            c->weighted_pred[7] = ff_hevc_put_weighted_pred_64_8_sse4;
+
+            c->weighted_pred_avg[0] = ff_hevc_put_weighted_pred_avg_4_8_sse4;
+            c->weighted_pred_avg[1] = ff_hevc_put_weighted_pred_avg_8_8_sse4;
+            c->weighted_pred_avg[2] = ff_hevc_put_weighted_pred_avg_12_8_sse4;
+            c->weighted_pred_avg[3] = ff_hevc_put_weighted_pred_avg_16_8_sse4;
+            c->weighted_pred_avg[4] = ff_hevc_put_weighted_pred_avg_24_8_sse4;
+            c->weighted_pred_avg[5] = ff_hevc_put_weighted_pred_avg_32_8_sse4;
+            c->weighted_pred_avg[6] = ff_hevc_put_weighted_pred_avg_48_8_sse4;
+            c->weighted_pred_avg[7] = ff_hevc_put_weighted_pred_avg_64_8_sse4;
+
+            c->weighted_pred_chroma[1] = ff_hevc_put_weighted_pred_4_8_sse4;
+            c->weighted_pred_chroma[3] = ff_hevc_put_weighted_pred_8_8_sse4;
+            c->weighted_pred_chroma[4] = ff_hevc_put_weighted_pred_12_8_sse4;
+            c->weighted_pred_chroma[5] = ff_hevc_put_weighted_pred_16_8_sse4;
+            c->weighted_pred_chroma[6] = ff_hevc_put_weighted_pred_24_8_sse4;
+            c->weighted_pred_chroma[7] = ff_hevc_put_weighted_pred_32_8_sse4;
+
+            c->weighted_pred_avg_chroma[1] = ff_hevc_put_weighted_pred_avg_4_8_sse4;
+            c->weighted_pred_avg_chroma[3] = ff_hevc_put_weighted_pred_avg_8_8_sse4;
+            c->weighted_pred_avg_chroma[4] = ff_hevc_put_weighted_pred_avg_12_8_sse4;
+            c->weighted_pred_avg_chroma[5] = ff_hevc_put_weighted_pred_avg_16_8_sse4;
+            c->weighted_pred_avg_chroma[6] = ff_hevc_put_weighted_pred_avg_24_8_sse4;
+            c->weighted_pred_avg_chroma[7] = ff_hevc_put_weighted_pred_avg_32_8_sse4;
+        }
+
+        if (EXTERNAL_AVX(cpu_flags)) {
+            c->put_hevc_qpel[1][1][0]  = hevc_qpel_hv_4_8_avx;
+            c->put_hevc_qpel[1][1][1]  = hevc_qpel_hv_8_8_avx;
+            c->put_hevc_qpel[1][1][2]  = hevc_qpel_hv_12_8_avx;
+            c->put_hevc_qpel[1][1][3]  = hevc_qpel_hv_16_8_avx;
+            c->put_hevc_qpel[1][1][4]  = hevc_qpel_hv_24_8_avx;
+            c->put_hevc_qpel[1][1][5]  = hevc_qpel_hv_32_8_avx;
+            c->put_hevc_qpel[1][1][6]  = hevc_qpel_hv_48_8_avx;
+            c->put_hevc_qpel[1][1][7]  = hevc_qpel_hv_64_8_avx;
+
+            c->put_hevc_epel[1][1][1]  = hevc_epel_hv_4_8_avx;
+            c->put_hevc_epel[1][1][3]  = hevc_epel_hv_8_8_avx;
+            c->put_hevc_epel[1][1][4]  = hevc_epel_hv_12_8_avx;
+            c->put_hevc_epel[1][1][5]  = hevc_epel_hv_16_8_avx;
+            c->put_hevc_epel[1][1][6]  = hevc_epel_hv_24_8_avx;
+            c->put_hevc_epel[1][1][7]  = hevc_epel_hv_32_8_avx;
         }
     } else if (bit_depth == 10) {
         if (EXTERNAL_SSE2(cpu_flags)) {
             c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
             c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;
+
+            c->put_hevc_qpel[0][0][0]  = ff_hevc_get_pixels_4_10_sse2;
+            c->put_hevc_qpel[0][0][1]  = ff_hevc_get_pixels_8_10_sse2;
+            c->put_hevc_qpel[0][0][2]  = ff_hevc_get_pixels_12_10_sse2;
+            c->put_hevc_qpel[0][0][3]  = ff_hevc_get_pixels_16_10_sse2;
+            c->put_hevc_qpel[0][0][4]  = ff_hevc_get_pixels_24_10_sse2;
+            c->put_hevc_qpel[0][0][5]  = ff_hevc_get_pixels_32_10_sse2;
+            c->put_hevc_qpel[0][0][6]  = ff_hevc_get_pixels_48_10_sse2;
+            c->put_hevc_qpel[0][0][7]  = ff_hevc_get_pixels_64_10_sse2;
+
+            c->put_hevc_epel[0][0][1]  = ff_hevc_get_pixels_4_10_sse2;
+            c->put_hevc_epel[0][0][3]  = ff_hevc_get_pixels_8_10_sse2;
+            c->put_hevc_epel[0][0][4]  = ff_hevc_get_pixels_12_10_sse2;
+            c->put_hevc_epel[0][0][5]  = ff_hevc_get_pixels_16_10_sse2;
+            c->put_hevc_epel[0][0][6]  = ff_hevc_get_pixels_24_10_sse2;
+            c->put_hevc_epel[0][0][7]  = ff_hevc_get_pixels_32_10_sse2;
+
+            c->put_unweighted_pred[0] = ff_hevc_put_unweighted_pred_4_10_sse2;
+            c->put_unweighted_pred[1] = ff_hevc_put_unweighted_pred_8_10_sse2;
+            c->put_unweighted_pred[2] = ff_hevc_put_unweighted_pred_12_10_sse2;
+            c->put_unweighted_pred[3] = ff_hevc_put_unweighted_pred_16_10_sse2;
+            c->put_unweighted_pred[4] = ff_hevc_put_unweighted_pred_24_10_sse2;
+            c->put_unweighted_pred[5] = ff_hevc_put_unweighted_pred_32_10_sse2;
+            c->put_unweighted_pred[6] = ff_hevc_put_unweighted_pred_48_10_sse2;
+            c->put_unweighted_pred[7] = ff_hevc_put_unweighted_pred_64_10_sse2;
+
+            c->put_unweighted_pred_avg[0] = ff_hevc_put_unweighted_pred_avg_4_10_sse2;
+            c->put_unweighted_pred_avg[1] = ff_hevc_put_unweighted_pred_avg_8_10_sse2;
+            c->put_unweighted_pred_avg[2] = ff_hevc_put_unweighted_pred_avg_12_10_sse2;
+            c->put_unweighted_pred_avg[3] = ff_hevc_put_unweighted_pred_avg_16_10_sse2;
+            c->put_unweighted_pred_avg[4] = ff_hevc_put_unweighted_pred_avg_24_10_sse2;
+            c->put_unweighted_pred_avg[5] = ff_hevc_put_unweighted_pred_avg_32_10_sse2;
+            c->put_unweighted_pred_avg[6] = ff_hevc_put_unweighted_pred_avg_48_10_sse2;
+            c->put_unweighted_pred_avg[7] = ff_hevc_put_unweighted_pred_avg_64_10_sse2;
+
+            c->put_unweighted_pred_chroma[1] = ff_hevc_put_unweighted_pred_4_10_sse2;
+            c->put_unweighted_pred_chroma[3] = ff_hevc_put_unweighted_pred_8_10_sse2;
+            c->put_unweighted_pred_chroma[4] = ff_hevc_put_unweighted_pred_12_10_sse2;
+            c->put_unweighted_pred_chroma[5] = ff_hevc_put_unweighted_pred_16_10_sse2;
+            c->put_unweighted_pred_chroma[6] = ff_hevc_put_unweighted_pred_24_10_sse2;
+            c->put_unweighted_pred_chroma[7] = ff_hevc_put_unweighted_pred_32_10_sse2;
+
+            c->put_unweighted_pred_avg_chroma[1] = ff_hevc_put_unweighted_pred_avg_4_10_sse2;
+            c->put_unweighted_pred_avg_chroma[3] = ff_hevc_put_unweighted_pred_avg_8_10_sse2;
+            c->put_unweighted_pred_avg_chroma[4] = ff_hevc_put_unweighted_pred_avg_12_10_sse2;
+            c->put_unweighted_pred_avg_chroma[5] = ff_hevc_put_unweighted_pred_avg_16_10_sse2;
+            c->put_unweighted_pred_avg_chroma[6] = ff_hevc_put_unweighted_pred_avg_24_10_sse2;
+            c->put_unweighted_pred_avg_chroma[7] = ff_hevc_put_unweighted_pred_avg_32_10_sse2;
         }
         if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
             c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
             c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
         }
+        if (EXTERNAL_SSE4(cpu_flags)) {
+            c->weighted_pred[0] = ff_hevc_put_weighted_pred_4_10_sse4;
+            c->weighted_pred[1] = ff_hevc_put_weighted_pred_8_10_sse4;
+            c->weighted_pred[2] = ff_hevc_put_weighted_pred_12_10_sse4;
+            c->weighted_pred[3] = ff_hevc_put_weighted_pred_16_10_sse4;
+            c->weighted_pred[4] = ff_hevc_put_weighted_pred_24_10_sse4;
+            c->weighted_pred[5] = ff_hevc_put_weighted_pred_32_10_sse4;
+            c->weighted_pred[6] = ff_hevc_put_weighted_pred_48_10_sse4;
+            c->weighted_pred[7] = ff_hevc_put_weighted_pred_64_10_sse4;
+
+            c->weighted_pred_avg[0] = ff_hevc_put_weighted_pred_avg_4_10_sse4;
+            c->weighted_pred_avg[1] = ff_hevc_put_weighted_pred_avg_8_10_sse4;
+            c->weighted_pred_avg[2] = ff_hevc_put_weighted_pred_avg_12_10_sse4;
+            c->weighted_pred_avg[3] = ff_hevc_put_weighted_pred_avg_16_10_sse4;
+            c->weighted_pred_avg[4] = ff_hevc_put_weighted_pred_avg_24_10_sse4;
+            c->weighted_pred_avg[5] = ff_hevc_put_weighted_pred_avg_32_10_sse4;
+            c->weighted_pred_avg[6] = ff_hevc_put_weighted_pred_avg_48_10_sse4;
+            c->weighted_pred_avg[7] = ff_hevc_put_weighted_pred_avg_64_10_sse4;
+
+            c->weighted_pred_chroma[1] = ff_hevc_put_weighted_pred_4_10_sse4;
+            c->weighted_pred_chroma[3] = ff_hevc_put_weighted_pred_8_10_sse4;
+            c->weighted_pred_chroma[4] = ff_hevc_put_weighted_pred_12_10_sse4;
+            c->weighted_pred_chroma[5] = ff_hevc_put_weighted_pred_16_10_sse4;
+            c->weighted_pred_chroma[6] = ff_hevc_put_weighted_pred_24_10_sse4;
+            c->weighted_pred_chroma[7] = ff_hevc_put_weighted_pred_32_10_sse4;
+
+            c->weighted_pred_avg_chroma[1] = ff_hevc_put_weighted_pred_avg_4_10_sse4;
+            c->weighted_pred_avg_chroma[3] = ff_hevc_put_weighted_pred_avg_8_10_sse4;
+            c->weighted_pred_avg_chroma[4] = ff_hevc_put_weighted_pred_avg_12_10_sse4;
+            c->weighted_pred_avg_chroma[5] = ff_hevc_put_weighted_pred_avg_16_10_sse4;
+            c->weighted_pred_avg_chroma[6] = ff_hevc_put_weighted_pred_avg_24_10_sse4;
+            c->weighted_pred_avg_chroma[7] = ff_hevc_put_weighted_pred_avg_32_10_sse4;
+        }
+        if (EXTERNAL_AVX(cpu_flags)) {
+            c->put_hevc_qpel[0][1][0]  = ff_hevc_qpel_h_4_10_avx;
+            c->put_hevc_qpel[0][1][1]  = ff_hevc_qpel_h_8_10_avx;
+            c->put_hevc_qpel[0][1][2]  = ff_hevc_qpel_h_12_10_avx;
+            c->put_hevc_qpel[0][1][3]  = ff_hevc_qpel_h_16_10_avx;
+            c->put_hevc_qpel[0][1][4]  = ff_hevc_qpel_h_24_10_avx;
+            c->put_hevc_qpel[0][1][5]  = ff_hevc_qpel_h_32_10_avx;
+            c->put_hevc_qpel[0][1][6]  = ff_hevc_qpel_h_48_10_avx;
+            c->put_hevc_qpel[0][1][7]  = ff_hevc_qpel_h_64_10_avx;
+
+            c->put_hevc_qpel[1][0][0]  = ff_hevc_qpel_v_4_10_avx;
+            c->put_hevc_qpel[1][0][1]  = ff_hevc_qpel_v_8_10_avx;
+            c->put_hevc_qpel[1][0][2]  = ff_hevc_qpel_v_12_10_avx;
+            c->put_hevc_qpel[1][0][3]  = ff_hevc_qpel_v_16_10_avx;
+            c->put_hevc_qpel[1][0][4]  = ff_hevc_qpel_v_24_10_avx;
+            c->put_hevc_qpel[1][0][5]  = ff_hevc_qpel_v_32_10_avx;
+            c->put_hevc_qpel[1][0][6]  = ff_hevc_qpel_v_48_10_avx;
+            c->put_hevc_qpel[1][0][7]  = ff_hevc_qpel_v_64_10_avx;
+
+            c->put_hevc_qpel[1][1][0]  = hevc_qpel_hv_4_10_avx;
+            c->put_hevc_qpel[1][1][1]  = hevc_qpel_hv_8_10_avx;
+            c->put_hevc_qpel[1][1][2]  = hevc_qpel_hv_12_10_avx;
+            c->put_hevc_qpel[1][1][3]  = hevc_qpel_hv_16_10_avx;
+            c->put_hevc_qpel[1][1][4]  = hevc_qpel_hv_24_10_avx;
+            c->put_hevc_qpel[1][1][5]  = hevc_qpel_hv_32_10_avx;
+            c->put_hevc_qpel[1][1][6]  = hevc_qpel_hv_48_10_avx;
+            c->put_hevc_qpel[1][1][7]  = hevc_qpel_hv_64_10_avx;
+
+            c->put_hevc_epel[0][1][1]  = ff_hevc_epel_h_4_10_avx;
+            c->put_hevc_epel[0][1][3]  = ff_hevc_epel_h_8_10_avx;
+            c->put_hevc_epel[0][1][4]  = ff_hevc_epel_h_12_10_avx;
+            c->put_hevc_epel[0][1][5]  = ff_hevc_epel_h_16_10_avx;
+            c->put_hevc_epel[0][1][6]  = ff_hevc_epel_h_24_10_avx;
+            c->put_hevc_epel[0][1][7]  = ff_hevc_epel_h_32_10_avx;
+
+            c->put_hevc_epel[1][0][1]  = ff_hevc_epel_v_4_10_avx;
+            c->put_hevc_epel[1][0][3]  = ff_hevc_epel_v_8_10_avx;
+            c->put_hevc_epel[1][0][4]  = ff_hevc_epel_v_12_10_avx;
+            c->put_hevc_epel[1][0][5]  = ff_hevc_epel_v_16_10_avx;
+            c->put_hevc_epel[1][0][6]  = ff_hevc_epel_v_24_10_avx;
+            c->put_hevc_epel[1][0][7]  = ff_hevc_epel_v_32_10_avx;
+
+            c->put_hevc_epel[1][1][1]  = hevc_epel_hv_4_10_avx;
+            c->put_hevc_epel[1][1][3]  = hevc_epel_hv_8_10_avx;
+            c->put_hevc_epel[1][1][4]  = hevc_epel_hv_12_10_avx;
+            c->put_hevc_epel[1][1][5]  = hevc_epel_hv_16_10_avx;
+            c->put_hevc_epel[1][1][6]  = hevc_epel_hv_24_10_avx;
+            c->put_hevc_epel[1][1][7]  = hevc_epel_hv_32_10_avx;
+        }
     }
 }
-- 
2.0.0


