[libav-commits] h264: use templates to avoid excessive inlining

Mans Rullgard git at libav.org
Thu Jul 5 12:55:12 CEST 2012


Module: libav
Branch: master
Commit: 28fff0d9740e00c2ee82f72a4be55bdbb5e0c8c6

Author:    Mans Rullgard <mans at mansr.com>
Committer: Mans Rullgard <mans at mansr.com>
Date:      Tue Jul  3 23:16:11 2012 +0100

h264: use templates to avoid excessive inlining

Instead of inlining everything into ff_h264_hl_decode_mb(), use
explicit templating to create versions of the called functions
with constant parameters filled in.  This greatly speeds up
compilation of h264.c and reduces the code size without any
measurable impact on performance.

Compilation time for h264.c on an i7 goes from 30s to 5.5s.
Code size is reduced by 430kB.

Signed-off-by: Mans Rullgard <mans at mansr.com>

---

 libavcodec/h264.c             |  543 +----------------------------------------
 libavcodec/h264_mb_template.c |  380 ++++++++++++++++++++++++++++
 libavcodec/h264_mc_template.c |  160 ++++++++++++
 3 files changed, 550 insertions(+), 533 deletions(-)

diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index 2d6a08e..a4afcc8 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -714,33 +714,6 @@ static av_always_inline void mc_part_weighted(H264Context *h, int n, int square,
     }
 }
 
-static av_always_inline void mc_part(H264Context *h, int n, int square,
-                                     int height, int delta,
-                                     uint8_t *dest_y, uint8_t *dest_cb,
-                                     uint8_t *dest_cr,
-                                     int x_offset, int y_offset,
-                                     qpel_mc_func *qpix_put,
-                                     h264_chroma_mc_func chroma_put,
-                                     qpel_mc_func *qpix_avg,
-                                     h264_chroma_mc_func chroma_avg,
-                                     h264_weight_func *weight_op,
-                                     h264_biweight_func *weight_avg,
-                                     int list0, int list1,
-                                     int pixel_shift, int chroma_idc)
-{
-    if ((h->use_weight == 2 && list0 && list1 &&
-         (h->implicit_weight[h->ref_cache[0][scan8[n]]][h->ref_cache[1][scan8[n]]][h->s.mb_y & 1] != 32)) ||
-        h->use_weight == 1)
-        mc_part_weighted(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
-                         x_offset, y_offset, qpix_put, chroma_put,
-                         weight_op[0], weight_op[1], weight_avg[0],
-                         weight_avg[1], list0, list1, pixel_shift, chroma_idc);
-    else
-        mc_part_std(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
-                    x_offset, y_offset, qpix_put, chroma_put, qpix_avg,
-                    chroma_avg, list0, list1, pixel_shift, chroma_idc);
-}
-
 static av_always_inline void prefetch_motion(H264Context *h, int list,
                                              int pixel_shift, int chroma_idc)
 {
@@ -768,146 +741,6 @@ static av_always_inline void prefetch_motion(H264Context *h, int list,
     }
 }
 
-static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y,
-                                       uint8_t *dest_cb, uint8_t *dest_cr,
-                                       qpel_mc_func(*qpix_put)[16],
-                                       h264_chroma_mc_func(*chroma_put),
-                                       qpel_mc_func(*qpix_avg)[16],
-                                       h264_chroma_mc_func(*chroma_avg),
-                                       h264_weight_func *weight_op,
-                                       h264_biweight_func *weight_avg,
-                                       int pixel_shift, int chroma_idc)
-{
-    MpegEncContext *const s = &h->s;
-    const int mb_xy   = h->mb_xy;
-    const int mb_type = s->current_picture.f.mb_type[mb_xy];
-
-    assert(IS_INTER(mb_type));
-
-    if (HAVE_THREADS && (s->avctx->active_thread_type & FF_THREAD_FRAME))
-        await_references(h);
-    prefetch_motion(h, 0, pixel_shift, chroma_idc);
-
-    if (IS_16X16(mb_type)) {
-        mc_part(h, 0, 1, 16, 0, dest_y, dest_cb, dest_cr, 0, 0,
-                qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
-                weight_op, weight_avg,
-                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
-                pixel_shift, chroma_idc);
-    } else if (IS_16X8(mb_type)) {
-        mc_part(h, 0, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0,
-                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
-                weight_op, weight_avg,
-                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
-                pixel_shift, chroma_idc);
-        mc_part(h, 8, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4,
-                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
-                weight_op, weight_avg,
-                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
-                pixel_shift, chroma_idc);
-    } else if (IS_8X16(mb_type)) {
-        mc_part(h, 0, 0, 16, 8 * h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
-                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
-                &weight_op[1], &weight_avg[1],
-                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
-                pixel_shift, chroma_idc);
-        mc_part(h, 4, 0, 16, 8 * h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
-                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
-                &weight_op[1], &weight_avg[1],
-                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
-                pixel_shift, chroma_idc);
-    } else {
-        int i;
-
-        assert(IS_8X8(mb_type));
-
-        for (i = 0; i < 4; i++) {
-            const int sub_mb_type = h->sub_mb_type[i];
-            const int n  = 4 * i;
-            int x_offset = (i & 1) << 2;
-            int y_offset = (i & 2) << 1;
-
-            if (IS_SUB_8X8(sub_mb_type)) {
-                mc_part(h, n, 1, 8, 0, dest_y, dest_cb, dest_cr,
-                        x_offset, y_offset,
-                        qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
-                        &weight_op[1], &weight_avg[1],
-                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                        pixel_shift, chroma_idc);
-            } else if (IS_SUB_8X4(sub_mb_type)) {
-                mc_part(h, n, 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr,
-                        x_offset, y_offset,
-                        qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
-                        &weight_op[1], &weight_avg[1],
-                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                        pixel_shift, chroma_idc);
-                mc_part(h, n + 2, 0, 4, 4 << pixel_shift,
-                        dest_y, dest_cb, dest_cr, x_offset, y_offset + 2,
-                        qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
-                        &weight_op[1], &weight_avg[1],
-                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                        pixel_shift, chroma_idc);
-            } else if (IS_SUB_4X8(sub_mb_type)) {
-                mc_part(h, n, 0, 8, 4 * h->mb_linesize,
-                        dest_y, dest_cb, dest_cr, x_offset, y_offset,
-                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
-                        &weight_op[2], &weight_avg[2],
-                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                        pixel_shift, chroma_idc);
-                mc_part(h, n + 1, 0, 8, 4 * h->mb_linesize,
-                        dest_y, dest_cb, dest_cr, x_offset + 2, y_offset,
-                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
-                        &weight_op[2], &weight_avg[2],
-                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                        pixel_shift, chroma_idc);
-            } else {
-                int j;
-                assert(IS_SUB_4X4(sub_mb_type));
-                for (j = 0; j < 4; j++) {
-                    int sub_x_offset = x_offset + 2 * (j & 1);
-                    int sub_y_offset = y_offset + (j & 2);
-                    mc_part(h, n + j, 1, 4, 0,
-                            dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
-                            qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
-                            &weight_op[2], &weight_avg[2],
-                            IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
-                            pixel_shift, chroma_idc);
-                }
-            }
-        }
-    }
-
-    prefetch_motion(h, 1, pixel_shift, chroma_idc);
-}
-
-static av_always_inline void hl_motion_420(H264Context *h, uint8_t *dest_y,
-                                           uint8_t *dest_cb, uint8_t *dest_cr,
-                                           qpel_mc_func(*qpix_put)[16],
-                                           h264_chroma_mc_func(*chroma_put),
-                                           qpel_mc_func(*qpix_avg)[16],
-                                           h264_chroma_mc_func(*chroma_avg),
-                                           h264_weight_func *weight_op,
-                                           h264_biweight_func *weight_avg,
-                                           int pixel_shift)
-{
-    hl_motion(h, dest_y, dest_cb, dest_cr, qpix_put, chroma_put,
-              qpix_avg, chroma_avg, weight_op, weight_avg, pixel_shift, 1);
-}
-
-static av_always_inline void hl_motion_422(H264Context *h, uint8_t *dest_y,
-                                           uint8_t *dest_cb, uint8_t *dest_cr,
-                                           qpel_mc_func(*qpix_put)[16],
-                                           h264_chroma_mc_func(*chroma_put),
-                                           qpel_mc_func(*qpix_avg)[16],
-                                           h264_chroma_mc_func(*chroma_avg),
-                                           h264_weight_func *weight_op,
-                                           h264_biweight_func *weight_avg,
-                                           int pixel_shift)
-{
-    hl_motion(h, dest_y, dest_cb, dest_cr, qpix_put, chroma_put,
-              qpix_avg, chroma_avg, weight_op, weight_avg, pixel_shift, 2);
-}
-
 static void free_tables(H264Context *h, int free_rbsp)
 {
     int i;
@@ -2077,373 +1910,17 @@ static av_always_inline void hl_decode_mb_idct_luma(H264Context *h, int mb_type,
     }
 }
 
-static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple,
-                                                   int pixel_shift)
-{
-    MpegEncContext *const s = &h->s;
-    const int mb_x    = s->mb_x;
-    const int mb_y    = s->mb_y;
-    const int mb_xy   = h->mb_xy;
-    const int mb_type = s->current_picture.f.mb_type[mb_xy];
-    uint8_t *dest_y, *dest_cb, *dest_cr;
-    int linesize, uvlinesize /*dct_offset*/;
-    int i, j;
-    int *block_offset = &h->block_offset[0];
-    const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
-    /* is_h264 should always be true if SVQ3 is disabled. */
-    const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
-    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
-    const int block_h   = 16 >> s->chroma_y_shift;
-    const int chroma422 = CHROMA422;
-
-    dest_y  = s->current_picture.f.data[0] + ((mb_x << pixel_shift)     + mb_y * s->linesize)  * 16;
-    dest_cb = s->current_picture.f.data[1] +  (mb_x << pixel_shift) * 8 + mb_y * s->uvlinesize * block_h;
-    dest_cr = s->current_picture.f.data[2] +  (mb_x << pixel_shift) * 8 + mb_y * s->uvlinesize * block_h;
-
-    s->dsp.prefetch(dest_y  + (s->mb_x & 3) * 4 * s->linesize   + (64 << pixel_shift), s->linesize,       4);
-    s->dsp.prefetch(dest_cb + (s->mb_x & 7)     * s->uvlinesize + (64 << pixel_shift), dest_cr - dest_cb, 2);
-
-    h->list_counts[mb_xy] = h->list_count;
-
-    if (!simple && MB_FIELD) {
-        linesize     = h->mb_linesize = s->linesize * 2;
-        uvlinesize   = h->mb_uvlinesize = s->uvlinesize * 2;
-        block_offset = &h->block_offset[48];
-        if (mb_y & 1) { // FIXME move out of this function?
-            dest_y  -= s->linesize * 15;
-            dest_cb -= s->uvlinesize * (block_h - 1);
-            dest_cr -= s->uvlinesize * (block_h - 1);
-        }
-        if (FRAME_MBAFF) {
-            int list;
-            for (list = 0; list < h->list_count; list++) {
-                if (!USES_LIST(mb_type, list))
-                    continue;
-                if (IS_16X16(mb_type)) {
-                    int8_t *ref = &h->ref_cache[list][scan8[0]];
-                    fill_rectangle(ref, 4, 4, 8, (16 + *ref) ^ (s->mb_y & 1), 1);
-                } else {
-                    for (i = 0; i < 16; i += 4) {
-                        int ref = h->ref_cache[list][scan8[i]];
-                        if (ref >= 0)
-                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2,
-                                           8, (16 + ref) ^ (s->mb_y & 1), 1);
-                    }
-                }
-            }
-        }
-    } else {
-        linesize   = h->mb_linesize   = s->linesize;
-        uvlinesize = h->mb_uvlinesize = s->uvlinesize;
-        // dct_offset = s->linesize * 16;
-    }
-
-    if (!simple && IS_INTRA_PCM(mb_type)) {
-        if (pixel_shift) {
-            const int bit_depth = h->sps.bit_depth_luma;
-            int j;
-            GetBitContext gb;
-            init_get_bits(&gb, (uint8_t *)h->mb,
-                          ff_h264_mb_sizes[h->sps.chroma_format_idc] * bit_depth);
-
-            for (i = 0; i < 16; i++) {
-                uint16_t *tmp_y = (uint16_t *)(dest_y + i * linesize);
-                for (j = 0; j < 16; j++)
-                    tmp_y[j] = get_bits(&gb, bit_depth);
-            }
-            if (simple || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) {
-                if (!h->sps.chroma_format_idc) {
-                    for (i = 0; i < block_h; i++) {
-                        uint16_t *tmp_cb = (uint16_t *)(dest_cb + i * uvlinesize);
-                        for (j = 0; j < 8; j++)
-                            tmp_cb[j] = 1 << (bit_depth - 1);
-                    }
-                    for (i = 0; i < block_h; i++) {
-                        uint16_t *tmp_cr = (uint16_t *)(dest_cr + i * uvlinesize);
-                        for (j = 0; j < 8; j++)
-                            tmp_cr[j] = 1 << (bit_depth - 1);
-                    }
-                } else {
-                    for (i = 0; i < block_h; i++) {
-                        uint16_t *tmp_cb = (uint16_t *)(dest_cb + i * uvlinesize);
-                        for (j = 0; j < 8; j++)
-                            tmp_cb[j] = get_bits(&gb, bit_depth);
-                    }
-                    for (i = 0; i < block_h; i++) {
-                        uint16_t *tmp_cr = (uint16_t *)(dest_cr + i * uvlinesize);
-                        for (j = 0; j < 8; j++)
-                            tmp_cr[j] = get_bits(&gb, bit_depth);
-                    }
-                }
-            }
-        } else {
-            for (i = 0; i < 16; i++)
-                memcpy(dest_y + i * linesize, (uint8_t *)h->mb + i * 16, 16);
-            if (simple || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) {
-                if (!h->sps.chroma_format_idc) {
-                    for (i = 0; i < block_h; i++) {
-                        memset(dest_cb + i * uvlinesize, 128, 8);
-                        memset(dest_cr + i * uvlinesize, 128, 8);
-                    }
-                } else {
-                    uint8_t *src_cb = (uint8_t *)h->mb + 256;
-                    uint8_t *src_cr = (uint8_t *)h->mb + 256 + block_h * 8;
-                    for (i = 0; i < block_h; i++) {
-                        memcpy(dest_cb + i * uvlinesize, src_cb + i * 8, 8);
-                        memcpy(dest_cr + i * uvlinesize, src_cr + i * 8, 8);
-                    }
-                }
-            }
-        }
-    } else {
-        if (IS_INTRA(mb_type)) {
-            if (h->deblocking_filter)
-                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize,
-                               uvlinesize, 1, 0, simple, pixel_shift);
-
-            if (simple || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) {
-                h->hpc.pred8x8[h->chroma_pred_mode](dest_cb, uvlinesize);
-                h->hpc.pred8x8[h->chroma_pred_mode](dest_cr, uvlinesize);
-            }
-
-            hl_decode_mb_predict_luma(h, mb_type, is_h264, simple,
-                                      transform_bypass, pixel_shift,
-                                      block_offset, linesize, dest_y, 0);
-
-            if (h->deblocking_filter)
-                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize,
-                               uvlinesize, 0, 0, simple, pixel_shift);
-        } else if (is_h264) {
-            if (chroma422) {
-                hl_motion_422(h, dest_y, dest_cb, dest_cr,
-                              s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
-                              s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
-                              h->h264dsp.weight_h264_pixels_tab,
-                              h->h264dsp.biweight_h264_pixels_tab,
-                              pixel_shift);
-            } else {
-                hl_motion_420(h, dest_y, dest_cb, dest_cr,
-                              s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
-                              s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
-                              h->h264dsp.weight_h264_pixels_tab,
-                              h->h264dsp.biweight_h264_pixels_tab,
-                              pixel_shift);
-            }
-        }
-
-        hl_decode_mb_idct_luma(h, mb_type, is_h264, simple, transform_bypass,
-                               pixel_shift, block_offset, linesize, dest_y, 0);
-
-        if ((simple || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) &&
-            (h->cbp & 0x30)) {
-            uint8_t *dest[2] = { dest_cb, dest_cr };
-            if (transform_bypass) {
-                if (IS_INTRA(mb_type) && h->sps.profile_idc == 244 &&
-                    (h->chroma_pred_mode == VERT_PRED8x8 ||
-                     h->chroma_pred_mode == HOR_PRED8x8)) {
-                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0],
-                                                            block_offset + 16,
-                                                            h->mb + (16 * 16 * 1 << pixel_shift),
-                                                            uvlinesize);
-                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1],
-                                                            block_offset + 32,
-                                                            h->mb + (16 * 16 * 2 << pixel_shift),
-                                                            uvlinesize);
-                } else {
-                    idct_add = s->dsp.add_pixels4;
-                    for (j = 1; j < 3; j++) {
-                        for (i = j * 16; i < j * 16 + 4; i++)
-                            if (h->non_zero_count_cache[scan8[i]] ||
-                                dctcoef_get(h->mb, pixel_shift, i * 16))
-                                idct_add(dest[j - 1] + block_offset[i],
-                                         h->mb + (i * 16 << pixel_shift),
-                                         uvlinesize);
-                        if (chroma422) {
-                            for (i = j * 16 + 4; i < j * 16 + 8; i++)
-                                if (h->non_zero_count_cache[scan8[i + 4]] ||
-                                    dctcoef_get(h->mb, pixel_shift, i * 16))
-                                    idct_add(dest[j - 1] + block_offset[i + 4],
-                                             h->mb + (i * 16 << pixel_shift),
-                                             uvlinesize);
-                        }
-                    }
-                }
-            } else {
-                if (is_h264) {
-                    int qp[2];
-                    if (chroma422) {
-                        qp[0] = h->chroma_qp[0] + 3;
-                        qp[1] = h->chroma_qp[1] + 3;
-                    } else {
-                        qp[0] = h->chroma_qp[0];
-                        qp[1] = h->chroma_qp[1];
-                    }
-                    if (h->non_zero_count_cache[scan8[CHROMA_DC_BLOCK_INDEX + 0]])
-                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16 * 16 * 1 << pixel_shift),
-                                                               h->dequant4_coeff[IS_INTRA(mb_type) ? 1 : 4][qp[0]][0]);
-                    if (h->non_zero_count_cache[scan8[CHROMA_DC_BLOCK_INDEX + 1]])
-                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16 * 16 * 2 << pixel_shift),
-                                                               h->dequant4_coeff[IS_INTRA(mb_type) ? 2 : 5][qp[1]][0]);
-                    h->h264dsp.h264_idct_add8(dest, block_offset,
-                                              h->mb, uvlinesize,
-                                              h->non_zero_count_cache);
-                } else if (CONFIG_SVQ3_DECODER) {
-                    h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16 * 16 * 1,
-                                                           h->dequant4_coeff[IS_INTRA(mb_type) ? 1 : 4][h->chroma_qp[0]][0]);
-                    h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16 * 16 * 2,
-                                                           h->dequant4_coeff[IS_INTRA(mb_type) ? 2 : 5][h->chroma_qp[1]][0]);
-                    for (j = 1; j < 3; j++) {
-                        for (i = j * 16; i < j * 16 + 4; i++)
-                            if (h->non_zero_count_cache[scan8[i]] || h->mb[i * 16]) {
-                                uint8_t *const ptr = dest[j - 1] + block_offset[i];
-                                ff_svq3_add_idct_c(ptr, h->mb + i * 16,
-                                                   uvlinesize,
-                                                   ff_h264_chroma_qp[0][s->qscale + 12] - 12, 2);
-                            }
-                    }
-                }
-            }
-        }
-    }
-    if (h->cbp || IS_INTRA(mb_type)) {
-        s->dsp.clear_blocks(h->mb);
-        s->dsp.clear_blocks(h->mb + (24 * 16 << pixel_shift));
-    }
-}
-
-static av_always_inline void hl_decode_mb_444_internal(H264Context *h,
-                                                       int simple,
-                                                       int pixel_shift)
-{
-    MpegEncContext *const s = &h->s;
-    const int mb_x    = s->mb_x;
-    const int mb_y    = s->mb_y;
-    const int mb_xy   = h->mb_xy;
-    const int mb_type = s->current_picture.f.mb_type[mb_xy];
-    uint8_t *dest[3];
-    int linesize;
-    int i, j, p;
-    int *block_offset = &h->block_offset[0];
-    const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
-    const int plane_count      = (simple || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) ? 3 : 1;
-
-    for (p = 0; p < plane_count; p++) {
-        dest[p] = s->current_picture.f.data[p] +
-                  ((mb_x << pixel_shift) + mb_y * s->linesize) * 16;
-        s->dsp.prefetch(dest[p] + (s->mb_x & 3) * 4 * s->linesize + (64 << pixel_shift),
-                        s->linesize, 4);
-    }
-
-    h->list_counts[mb_xy] = h->list_count;
-
-    if (!simple && MB_FIELD) {
-        linesize     = h->mb_linesize = h->mb_uvlinesize = s->linesize * 2;
-        block_offset = &h->block_offset[48];
-        if (mb_y & 1) // FIXME move out of this function?
-            for (p = 0; p < 3; p++)
-                dest[p] -= s->linesize * 15;
-        if (FRAME_MBAFF) {
-            int list;
-            for (list = 0; list < h->list_count; list++) {
-                if (!USES_LIST(mb_type, list))
-                    continue;
-                if (IS_16X16(mb_type)) {
-                    int8_t *ref = &h->ref_cache[list][scan8[0]];
-                    fill_rectangle(ref, 4, 4, 8, (16 + *ref) ^ (s->mb_y & 1), 1);
-                } else {
-                    for (i = 0; i < 16; i += 4) {
-                        int ref = h->ref_cache[list][scan8[i]];
-                        if (ref >= 0)
-                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2,
-                                           8, (16 + ref) ^ (s->mb_y & 1), 1);
-                    }
-                }
-            }
-        }
-    } else {
-        linesize = h->mb_linesize = h->mb_uvlinesize = s->linesize;
-    }
-
-    if (!simple && IS_INTRA_PCM(mb_type)) {
-        if (pixel_shift) {
-            const int bit_depth = h->sps.bit_depth_luma;
-            GetBitContext gb;
-            init_get_bits(&gb, (uint8_t *)h->mb, 768 * bit_depth);
-
-            for (p = 0; p < plane_count; p++)
-                for (i = 0; i < 16; i++) {
-                    uint16_t *tmp = (uint16_t *)(dest[p] + i * linesize);
-                    for (j = 0; j < 16; j++)
-                        tmp[j] = get_bits(&gb, bit_depth);
-                }
-        } else {
-            for (p = 0; p < plane_count; p++)
-                for (i = 0; i < 16; i++)
-                    memcpy(dest[p] + i * linesize,
-                           (uint8_t *)h->mb + p * 256 + i * 16, 16);
-        }
-    } else {
-        if (IS_INTRA(mb_type)) {
-            if (h->deblocking_filter)
-                xchg_mb_border(h, dest[0], dest[1], dest[2], linesize,
-                               linesize, 1, 1, simple, pixel_shift);
-
-            for (p = 0; p < plane_count; p++)
-                hl_decode_mb_predict_luma(h, mb_type, 1, simple,
-                                          transform_bypass, pixel_shift,
-                                          block_offset, linesize, dest[p], p);
-
-            if (h->deblocking_filter)
-                xchg_mb_border(h, dest[0], dest[1], dest[2], linesize,
-                               linesize, 0, 1, simple, pixel_shift);
-        } else {
-            hl_motion(h, dest[0], dest[1], dest[2],
-                      s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
-                      s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
-                      h->h264dsp.weight_h264_pixels_tab,
-                      h->h264dsp.biweight_h264_pixels_tab, pixel_shift, 3);
-        }
-
-        for (p = 0; p < plane_count; p++)
-            hl_decode_mb_idct_luma(h, mb_type, 1, simple, transform_bypass,
-                                   pixel_shift, block_offset, linesize,
-                                   dest[p], p);
-    }
-    if (h->cbp || IS_INTRA(mb_type)) {
-        s->dsp.clear_blocks(h->mb);
-        s->dsp.clear_blocks(h->mb + (24 * 16 << pixel_shift));
-    }
-}
-
-/**
- * Process a macroblock; this case avoids checks for expensive uncommon cases.
- */
-#define hl_decode_mb_simple(sh, bits)                          \
-static void hl_decode_mb_simple_ ## bits(H264Context *h)       \
-{                                                              \
-    hl_decode_mb_internal(h, 1, sh);                           \
-}
+#define BITS   8
+#define SIMPLE 1
+#include "h264_mb_template.c"
 
-hl_decode_mb_simple(0, 8)
-hl_decode_mb_simple(1, 16)
+#undef  BITS
+#define BITS   16
+#include "h264_mb_template.c"
 
-/**
- * Process a macroblock; this handles edge cases, such as interlacing.
- */
-static av_noinline void hl_decode_mb_complex(H264Context *h)
-{
-    hl_decode_mb_internal(h, 0, h->pixel_shift);
-}
-
-static av_noinline void hl_decode_mb_444_complex(H264Context *h)
-{
-    hl_decode_mb_444_internal(h, 0, h->pixel_shift);
-}
-
-static av_noinline void hl_decode_mb_444_simple(H264Context *h)
-{
-    hl_decode_mb_444_internal(h, 1, 0);
-}
+#undef  SIMPLE
+#define SIMPLE 0
+#include "h264_mb_template.c"
 
 void ff_h264_hl_decode_mb(H264Context *h)
 {
@@ -2456,7 +1933,7 @@ void ff_h264_hl_decode_mb(H264Context *h)
         if (is_complex || h->pixel_shift)
             hl_decode_mb_444_complex(h);
         else
-            hl_decode_mb_444_simple(h);
+            hl_decode_mb_444_simple_8(h);
     } else if (is_complex) {
         hl_decode_mb_complex(h);
     } else if (h->pixel_shift) {
diff --git a/libavcodec/h264_mb_template.c b/libavcodec/h264_mb_template.c
new file mode 100644
index 0000000..b7856cb
--- /dev/null
+++ b/libavcodec/h264_mb_template.c
@@ -0,0 +1,380 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... decoder
+ * Copyright (c) 2003 Michael Niedermayer <michaelni at gmx.at>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#undef FUNC /* this file is #included several times with different SIMPLE/BITS; drop the previous instantiation's macros first */
+#undef PIXEL_SHIFT
+
+#if SIMPLE
+#   define FUNC(n) AV_JOIN(n ## _simple_, BITS) /* per-bit-depth name; h264.c calls e.g. hl_decode_mb_444_simple_8 */
+#   define PIXEL_SHIFT (BITS >> 4) /* compile-time constant: 0 for BITS == 8, 1 for BITS == 16 */
+#else
+#   define FUNC(n) n ## _complex /* single name per function, e.g. hl_decode_mb_complex */
+#   define PIXEL_SHIFT h->pixel_shift /* run-time value; requires a variable named h in the expanding function */
+#endif
+
+#undef  CHROMA_IDC
+#define CHROMA_IDC 1 /* next include instantiates the *_420 motion functions (see MCFUNC in h264_mc_template.c) */
+#include "h264_mc_template.c"
+
+#undef  CHROMA_IDC
+#define CHROMA_IDC 2 /* next include instantiates the *_422 motion functions */
+#include "h264_mc_template.c"
+
+static av_noinline void FUNC(hl_decode_mb)(H264Context *h) /* Decode the current macroblock into s->current_picture (4:2:0 / 4:2:2 chroma path; the 4:4:4 variant is hl_decode_mb_444). */
+{
+    MpegEncContext *const s = &h->s;
+    const int mb_x    = s->mb_x;
+    const int mb_y    = s->mb_y;
+    const int mb_xy   = h->mb_xy;
+    const int mb_type = s->current_picture.f.mb_type[mb_xy];
+    uint8_t *dest_y, *dest_cb, *dest_cr;
+    int linesize, uvlinesize /*dct_offset*/;
+    int i, j;
+    int *block_offset = &h->block_offset[0];
+    const int transform_bypass = !SIMPLE && (s->qscale == 0 && h->sps.transform_bypass); /* always 0 in SIMPLE instantiations */
+    /* is_h264 should always be true if SVQ3 is disabled. */
+    const int is_h264 = !CONFIG_SVQ3_DECODER || SIMPLE || s->codec_id == CODEC_ID_H264; /* only depends on codec_id when SVQ3 is enabled and SIMPLE is 0 */
+    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
+    const int block_h   = 16 >> s->chroma_y_shift; /* number of chroma rows handled per macroblock */
+    const int chroma422 = CHROMA422;
+
+    dest_y  = s->current_picture.f.data[0] + ((mb_x << PIXEL_SHIFT)     + mb_y * s->linesize)  * 16; /* first luma sample of this macroblock */
+    dest_cb = s->current_picture.f.data[1] +  (mb_x << PIXEL_SHIFT) * 8 + mb_y * s->uvlinesize * block_h;
+    dest_cr = s->current_picture.f.data[2] +  (mb_x << PIXEL_SHIFT) * 8 + mb_y * s->uvlinesize * block_h;
+
+    s->dsp.prefetch(dest_y  + (s->mb_x & 3) * 4 * s->linesize   + (64 << PIXEL_SHIFT), s->linesize,       4);
+    s->dsp.prefetch(dest_cb + (s->mb_x & 7)     * s->uvlinesize + (64 << PIXEL_SHIFT), dest_cr - dest_cb, 2);
+
+    h->list_counts[mb_xy] = h->list_count;
+
+    if (!SIMPLE && MB_FIELD) { /* MB_FIELD case: both strides are doubled below; compiled out when SIMPLE */
+        linesize     = h->mb_linesize = s->linesize * 2;
+        uvlinesize   = h->mb_uvlinesize = s->uvlinesize * 2;
+        block_offset = &h->block_offset[48];
+        if (mb_y & 1) { // FIXME move out of this function?
+            dest_y  -= s->linesize * 15;
+            dest_cb -= s->uvlinesize * (block_h - 1);
+            dest_cr -= s->uvlinesize * (block_h - 1);
+        }
+        if (FRAME_MBAFF) { /* rewrite cached reference indices to (16 + ref) ^ (mb_y & 1) for every list this MB uses */
+            int list;
+            for (list = 0; list < h->list_count; list++) {
+                if (!USES_LIST(mb_type, list))
+                    continue;
+                if (IS_16X16(mb_type)) {
+                    int8_t *ref = &h->ref_cache[list][scan8[0]];
+                    fill_rectangle(ref, 4, 4, 8, (16 + *ref) ^ (s->mb_y & 1), 1);
+                } else {
+                    for (i = 0; i < 16; i += 4) {
+                        int ref = h->ref_cache[list][scan8[i]];
+                        if (ref >= 0)
+                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2,
+                                           8, (16 + ref) ^ (s->mb_y & 1), 1);
+                    }
+                }
+            }
+        }
+    } else {
+        linesize   = h->mb_linesize   = s->linesize;
+        uvlinesize = h->mb_uvlinesize = s->uvlinesize;
+        // dct_offset = s->linesize * 16;
+    }
+
+    if (!SIMPLE && IS_INTRA_PCM(mb_type)) { /* PCM macroblock: samples are copied from h->mb, no prediction or transform calls */
+        if (PIXEL_SHIFT) { /* samples are written as uint16_t, bit_depth bits read per sample */
+            const int bit_depth = h->sps.bit_depth_luma;
+            int j; /* NOTE(review): shadows the function-scope j declared above */
+            GetBitContext gb;
+            init_get_bits(&gb, (uint8_t *)h->mb,
+                          ff_h264_mb_sizes[h->sps.chroma_format_idc] * bit_depth);
+
+            for (i = 0; i < 16; i++) {
+                uint16_t *tmp_y = (uint16_t *)(dest_y + i * linesize);
+                for (j = 0; j < 16; j++)
+                    tmp_y[j] = get_bits(&gb, bit_depth);
+            }
+            if (SIMPLE || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) { /* chroma is skipped only when CONFIG_GRAY is set and CODEC_FLAG_GRAY is requested */
+                if (!h->sps.chroma_format_idc) { /* chroma_format_idc == 0: fill both chroma planes with 1 << (bit_depth - 1) */
+                    for (i = 0; i < block_h; i++) {
+                        uint16_t *tmp_cb = (uint16_t *)(dest_cb + i * uvlinesize);
+                        for (j = 0; j < 8; j++)
+                            tmp_cb[j] = 1 << (bit_depth - 1);
+                    }
+                    for (i = 0; i < block_h; i++) {
+                        uint16_t *tmp_cr = (uint16_t *)(dest_cr + i * uvlinesize);
+                        for (j = 0; j < 8; j++)
+                            tmp_cr[j] = 1 << (bit_depth - 1);
+                    }
+                } else {
+                    for (i = 0; i < block_h; i++) {
+                        uint16_t *tmp_cb = (uint16_t *)(dest_cb + i * uvlinesize);
+                        for (j = 0; j < 8; j++)
+                            tmp_cb[j] = get_bits(&gb, bit_depth);
+                    }
+                    for (i = 0; i < block_h; i++) {
+                        uint16_t *tmp_cr = (uint16_t *)(dest_cr + i * uvlinesize);
+                        for (j = 0; j < 8; j++)
+                            tmp_cr[j] = get_bits(&gb, bit_depth);
+                    }
+                }
+            }
+        } else { /* PIXEL_SHIFT == 0: one byte per sample, plain memcpy/memset */
+            for (i = 0; i < 16; i++)
+                memcpy(dest_y + i * linesize, (uint8_t *)h->mb + i * 16, 16);
+            if (SIMPLE || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) {
+                if (!h->sps.chroma_format_idc) {
+                    for (i = 0; i < block_h; i++) {
+                        memset(dest_cb + i * uvlinesize, 128, 8);
+                        memset(dest_cr + i * uvlinesize, 128, 8);
+                    }
+                } else {
+                    uint8_t *src_cb = (uint8_t *)h->mb + 256; /* Cb bytes follow the 256 luma bytes in h->mb */
+                    uint8_t *src_cr = (uint8_t *)h->mb + 256 + block_h * 8; /* Cr bytes follow the block_h * 8 Cb bytes */
+                    for (i = 0; i < block_h; i++) {
+                        memcpy(dest_cb + i * uvlinesize, src_cb + i * 8, 8);
+                        memcpy(dest_cr + i * uvlinesize, src_cr + i * 8, 8);
+                    }
+                }
+            }
+        }
+    } else {
+        if (IS_INTRA(mb_type)) { /* intra: chroma pred8x8 and luma prediction, bracketed by xchg_mb_border when deblocking is on */
+            if (h->deblocking_filter)
+                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize,
+                               uvlinesize, 1, 0, SIMPLE, PIXEL_SHIFT);
+
+            if (SIMPLE || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) {
+                h->hpc.pred8x8[h->chroma_pred_mode](dest_cb, uvlinesize);
+                h->hpc.pred8x8[h->chroma_pred_mode](dest_cr, uvlinesize);
+            }
+
+            hl_decode_mb_predict_luma(h, mb_type, is_h264, SIMPLE,
+                                      transform_bypass, PIXEL_SHIFT,
+                                      block_offset, linesize, dest_y, 0);
+
+            if (h->deblocking_filter)
+                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize,
+                               uvlinesize, 0, 0, SIMPLE, PIXEL_SHIFT);
+        } else if (is_h264) { /* non-intra: motion compensation via the 4:2:2 or 4:2:0 template instance */
+            if (chroma422) {
+                FUNC(hl_motion_422)(h, dest_y, dest_cb, dest_cr,
+                              s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
+                              s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
+                              h->h264dsp.weight_h264_pixels_tab,
+                              h->h264dsp.biweight_h264_pixels_tab);
+            } else {
+                FUNC(hl_motion_420)(h, dest_y, dest_cb, dest_cr,
+                              s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
+                              s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
+                              h->h264dsp.weight_h264_pixels_tab,
+                              h->h264dsp.biweight_h264_pixels_tab);
+            }
+        }
+
+        hl_decode_mb_idct_luma(h, mb_type, is_h264, SIMPLE, transform_bypass,
+                               PIXEL_SHIFT, block_offset, linesize, dest_y, 0);
+
+        if ((SIMPLE || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) &&
+            (h->cbp & 0x30)) { /* presumably the chroma bits of the coded block pattern - TODO confirm */
+            uint8_t *dest[2] = { dest_cb, dest_cr };
+            if (transform_bypass) {
+                if (IS_INTRA(mb_type) && h->sps.profile_idc == 244 &&
+                    (h->chroma_pred_mode == VERT_PRED8x8 ||
+                     h->chroma_pred_mode == HOR_PRED8x8)) {
+                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0],
+                                                            block_offset + 16,
+                                                            h->mb + (16 * 16 * 1 << PIXEL_SHIFT),
+                                                            uvlinesize);
+                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1],
+                                                            block_offset + 32,
+                                                            h->mb + (16 * 16 * 2 << PIXEL_SHIFT),
+                                                            uvlinesize);
+                } else {
+                    idct_add = s->dsp.add_pixels4; /* bypass path: dsp.add_pixels4 is used in place of an IDCT routine */
+                    for (j = 1; j < 3; j++) { /* j = 1 writes dest[0] (Cb), j = 2 writes dest[1] (Cr) */
+                        for (i = j * 16; i < j * 16 + 4; i++)
+                            if (h->non_zero_count_cache[scan8[i]] ||
+                                dctcoef_get(h->mb, PIXEL_SHIFT, i * 16))
+                                idct_add(dest[j - 1] + block_offset[i],
+                                         h->mb + (i * 16 << PIXEL_SHIFT),
+                                         uvlinesize);
+                        if (chroma422) { /* 4:2:2 has four more blocks per plane; their cache/offset index is i + 4 */
+                            for (i = j * 16 + 4; i < j * 16 + 8; i++)
+                                if (h->non_zero_count_cache[scan8[i + 4]] ||
+                                    dctcoef_get(h->mb, PIXEL_SHIFT, i * 16))
+                                    idct_add(dest[j - 1] + block_offset[i + 4],
+                                             h->mb + (i * 16 << PIXEL_SHIFT),
+                                             uvlinesize);
+                        }
+                    }
+                }
+            } else {
+                if (is_h264) {
+                    int qp[2];
+                    if (chroma422) { /* 4:2:2 indexes the DC dequant table with chroma QP + 3 */
+                        qp[0] = h->chroma_qp[0] + 3;
+                        qp[1] = h->chroma_qp[1] + 3;
+                    } else {
+                        qp[0] = h->chroma_qp[0];
+                        qp[1] = h->chroma_qp[1];
+                    }
+                    if (h->non_zero_count_cache[scan8[CHROMA_DC_BLOCK_INDEX + 0]])
+                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16 * 16 * 1 << PIXEL_SHIFT),
+                                                               h->dequant4_coeff[IS_INTRA(mb_type) ? 1 : 4][qp[0]][0]);
+                    if (h->non_zero_count_cache[scan8[CHROMA_DC_BLOCK_INDEX + 1]])
+                        h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + (16 * 16 * 2 << PIXEL_SHIFT),
+                                                               h->dequant4_coeff[IS_INTRA(mb_type) ? 2 : 5][qp[1]][0]);
+                    h->h264dsp.h264_idct_add8(dest, block_offset,
+                                              h->mb, uvlinesize,
+                                              h->non_zero_count_cache);
+                } else if (CONFIG_SVQ3_DECODER) { /* non-H.264 (SVQ3) path: h->mb offsets here are not scaled by PIXEL_SHIFT */
+                    h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16 * 16 * 1,
+                                                           h->dequant4_coeff[IS_INTRA(mb_type) ? 1 : 4][h->chroma_qp[0]][0]);
+                    h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16 * 16 * 2,
+                                                           h->dequant4_coeff[IS_INTRA(mb_type) ? 2 : 5][h->chroma_qp[1]][0]);
+                    for (j = 1; j < 3; j++) {
+                        for (i = j * 16; i < j * 16 + 4; i++)
+                            if (h->non_zero_count_cache[scan8[i]] || h->mb[i * 16]) {
+                                uint8_t *const ptr = dest[j - 1] + block_offset[i];
+                                ff_svq3_add_idct_c(ptr, h->mb + i * 16,
+                                                   uvlinesize,
+                                                   ff_h264_chroma_qp[0][s->qscale + 12] - 12, 2);
+                            }
+                    }
+                }
+            }
+        }
+    }
+    if (h->cbp || IS_INTRA(mb_type)) { /* reset h->mb through dsp.clear_blocks: two calls, the second at offset 24 * 16 << PIXEL_SHIFT */
+        s->dsp.clear_blocks(h->mb);
+        s->dsp.clear_blocks(h->mb + (24 * 16 << PIXEL_SHIFT));
+    }
+}
+
+#if !SIMPLE || BITS == 8 /* the 4:4:4 variant is emitted for simple_8 and complex only, not for SIMPLE with BITS == 16 */
+
+#undef  CHROMA_IDC
+#define CHROMA_IDC 3 /* next include instantiates the *_444 motion functions */
+#include "h264_mc_template.c"
+
+static av_noinline void FUNC(hl_decode_mb_444)(H264Context *h) /* Decode the current macroblock when all planes use luma geometry (16x16 block, s->linesize stride for every plane). */
+{
+    MpegEncContext *const s = &h->s;
+    const int mb_x    = s->mb_x;
+    const int mb_y    = s->mb_y;
+    const int mb_xy   = h->mb_xy;
+    const int mb_type = s->current_picture.f.mb_type[mb_xy];
+    uint8_t *dest[3];
+    int linesize;
+    int i, j, p;
+    int *block_offset = &h->block_offset[0];
+    const int transform_bypass = !SIMPLE && (s->qscale == 0 && h->sps.transform_bypass); /* always 0 in SIMPLE instantiations */
+    const int plane_count      = (SIMPLE || !CONFIG_GRAY || !(s->flags & CODEC_FLAG_GRAY)) ? 3 : 1; /* 1 (first plane only) when gray-only decoding is requested */
+
+    for (p = 0; p < plane_count; p++) { /* same address computation for every plane */
+        dest[p] = s->current_picture.f.data[p] +
+                  ((mb_x << PIXEL_SHIFT) + mb_y * s->linesize) * 16;
+        s->dsp.prefetch(dest[p] + (s->mb_x & 3) * 4 * s->linesize + (64 << PIXEL_SHIFT),
+                        s->linesize, 4);
+    }
+
+    h->list_counts[mb_xy] = h->list_count;
+
+    if (!SIMPLE && MB_FIELD) { /* MB_FIELD case: stride is doubled below; compiled out when SIMPLE */
+        linesize     = h->mb_linesize = h->mb_uvlinesize = s->linesize * 2;
+        block_offset = &h->block_offset[48];
+        if (mb_y & 1) // FIXME move out of this function?
+            for (p = 0; p < 3; p++) /* NOTE(review): iterates 3 planes although only plane_count entries of dest[] were set above */
+                dest[p] -= s->linesize * 15;
+        if (FRAME_MBAFF) { /* rewrite cached reference indices to (16 + ref) ^ (mb_y & 1) for every list this MB uses */
+            int list;
+            for (list = 0; list < h->list_count; list++) {
+                if (!USES_LIST(mb_type, list))
+                    continue;
+                if (IS_16X16(mb_type)) {
+                    int8_t *ref = &h->ref_cache[list][scan8[0]];
+                    fill_rectangle(ref, 4, 4, 8, (16 + *ref) ^ (s->mb_y & 1), 1);
+                } else {
+                    for (i = 0; i < 16; i += 4) {
+                        int ref = h->ref_cache[list][scan8[i]];
+                        if (ref >= 0)
+                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2,
+                                           8, (16 + ref) ^ (s->mb_y & 1), 1);
+                    }
+                }
+            }
+        }
+    } else {
+        linesize = h->mb_linesize = h->mb_uvlinesize = s->linesize;
+    }
+
+    if (!SIMPLE && IS_INTRA_PCM(mb_type)) { /* PCM macroblock: samples are copied from h->mb */
+        if (PIXEL_SHIFT) {
+            const int bit_depth = h->sps.bit_depth_luma;
+            GetBitContext gb;
+            init_get_bits(&gb, (uint8_t *)h->mb, 768 * bit_depth); /* 768 == 3 * 16 * 16 samples */
+
+            for (p = 0; p < plane_count; p++)
+                for (i = 0; i < 16; i++) {
+                    uint16_t *tmp = (uint16_t *)(dest[p] + i * linesize);
+                    for (j = 0; j < 16; j++)
+                        tmp[j] = get_bits(&gb, bit_depth);
+                }
+        } else {
+            for (p = 0; p < plane_count; p++)
+                for (i = 0; i < 16; i++)
+                    memcpy(dest[p] + i * linesize,
+                           (uint8_t *)h->mb + p * 256 + i * 16, 16); /* plane p starts at byte p * 256 of h->mb */
+        }
+    } else {
+        if (IS_INTRA(mb_type)) {
+            if (h->deblocking_filter)
+                xchg_mb_border(h, dest[0], dest[1], dest[2], linesize,
+                               linesize, 1, 1, SIMPLE, PIXEL_SHIFT);
+
+            for (p = 0; p < plane_count; p++) /* the luma predictor is run on every plane */
+                hl_decode_mb_predict_luma(h, mb_type, 1, SIMPLE,
+                                          transform_bypass, PIXEL_SHIFT,
+                                          block_offset, linesize, dest[p], p);
+
+            if (h->deblocking_filter)
+                xchg_mb_border(h, dest[0], dest[1], dest[2], linesize,
+                               linesize, 0, 1, SIMPLE, PIXEL_SHIFT);
+        } else {
+            FUNC(hl_motion_444)(h, dest[0], dest[1], dest[2],
+                      s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
+                      s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
+                      h->h264dsp.weight_h264_pixels_tab,
+                      h->h264dsp.biweight_h264_pixels_tab);
+        }
+
+        for (p = 0; p < plane_count; p++) /* the luma residual routine is likewise run once per plane */
+            hl_decode_mb_idct_luma(h, mb_type, 1, SIMPLE, transform_bypass,
+                                   PIXEL_SHIFT, block_offset, linesize,
+                                   dest[p], p);
+    }
+    if (h->cbp || IS_INTRA(mb_type)) { /* reset h->mb through dsp.clear_blocks: two calls, the second at offset 24 * 16 << PIXEL_SHIFT */
+        s->dsp.clear_blocks(h->mb);
+        s->dsp.clear_blocks(h->mb + (24 * 16 << PIXEL_SHIFT));
+    }
+}
+
+#endif /* !SIMPLE || BITS == 8 */
diff --git a/libavcodec/h264_mc_template.c b/libavcodec/h264_mc_template.c
new file mode 100644
index 0000000..a3af39b
--- /dev/null
+++ b/libavcodec/h264_mc_template.c
@@ -0,0 +1,160 @@
+/*
+ * H.26L/H.264/AVC/JVT/14496-10/... decoder
+ * Copyright (c) 2003 Michael Niedermayer <michaelni at gmx.at>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#undef MCFUNC /* this file is #included once per CHROMA_IDC value; drop the previous definition first */
+
+#if   CHROMA_IDC == 1
+#   define MCFUNC(n) FUNC(n ## _420) /* appends _420, then the includer's FUNC suffix */
+#elif CHROMA_IDC == 2
+#   define MCFUNC(n) FUNC(n ## _422)
+#elif CHROMA_IDC == 3
+#   define MCFUNC(n) FUNC(n ## _444)
+#endif /* NOTE(review): no #else/#error, so MCFUNC stays undefined for any other CHROMA_IDC */
+
+#undef  mc_part
+#define mc_part MCFUNC(mc_part) /* every use of mc_part below names this instantiation's own copy */
+
+static void mc_part(H264Context *h, int n, int square, /* Dispatch one partition to mc_part_weighted() or mc_part_std(), passing PIXEL_SHIFT and CHROMA_IDC of this instantiation. */
+                    int height, int delta,
+                    uint8_t *dest_y, uint8_t *dest_cb,
+                    uint8_t *dest_cr,
+                    int x_offset, int y_offset,
+                    qpel_mc_func *qpix_put,
+                    h264_chroma_mc_func chroma_put,
+                    qpel_mc_func *qpix_avg,
+                    h264_chroma_mc_func chroma_avg,
+                    h264_weight_func *weight_op,
+                    h264_biweight_func *weight_avg,
+                    int list0, int list1)
+{
+    if ((h->use_weight == 2 && list0 && list1 && /* use_weight == 2: weighted only if both lists are used and the implicit weight is not 32 */
+         (h->implicit_weight[h->ref_cache[0][scan8[n]]][h->ref_cache[1][scan8[n]]][h->s.mb_y & 1] != 32)) ||
+        h->use_weight == 1) /* use_weight == 1: always take the weighted path */
+        mc_part_weighted(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
+                         x_offset, y_offset, qpix_put, chroma_put,
+                         weight_op[0], weight_op[1], weight_avg[0],
+                         weight_avg[1], list0, list1, PIXEL_SHIFT, CHROMA_IDC);
+    else /* unweighted path: qpix_avg and chroma_avg are only used here */
+        mc_part_std(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
+                    x_offset, y_offset, qpix_put, chroma_put, qpix_avg,
+                    chroma_avg, list0, list1, PIXEL_SHIFT, CHROMA_IDC);
+}
+
+static void MCFUNC(hl_motion)(H264Context *h, uint8_t *dest_y, /* Motion-compensate the current macroblock: one mc_part() call per partition, chosen from mb_type / sub_mb_type. */
+                              uint8_t *dest_cb, uint8_t *dest_cr,
+                              qpel_mc_func(*qpix_put)[16],
+                              h264_chroma_mc_func(*chroma_put),
+                              qpel_mc_func(*qpix_avg)[16],
+                              h264_chroma_mc_func(*chroma_avg),
+                              h264_weight_func *weight_op,
+                              h264_biweight_func *weight_avg)
+{
+    MpegEncContext *const s = &h->s;
+    const int mb_xy   = h->mb_xy;
+    const int mb_type = s->current_picture.f.mb_type[mb_xy];
+
+    assert(IS_INTER(mb_type));
+
+    if (HAVE_THREADS && (s->avctx->active_thread_type & FF_THREAD_FRAME)) /* only with frame threading active */
+        await_references(h); /* presumably waits until the referenced frames are available - confirm in await_references() */
+    prefetch_motion(h, 0, PIXEL_SHIFT, CHROMA_IDC);
+
+    if (IS_16X16(mb_type)) { /* one partition covering the whole macroblock */
+        mc_part(h, 0, 1, 16, 0, dest_y, dest_cb, dest_cr, 0, 0,
+                qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
+                weight_op, weight_avg,
+                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
+    } else if (IS_16X8(mb_type)) { /* two partitions: n = 0 at y_offset 0 and n = 8 at y_offset 4 */
+        mc_part(h, 0, 0, 8, 8 << PIXEL_SHIFT, dest_y, dest_cb, dest_cr, 0, 0,
+                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
+                weight_op, weight_avg,
+                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
+        mc_part(h, 8, 0, 8, 8 << PIXEL_SHIFT, dest_y, dest_cb, dest_cr, 0, 4,
+                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
+                weight_op, weight_avg,
+                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
+    } else if (IS_8X16(mb_type)) { /* two partitions: n = 0 at x_offset 0 and n = 4 at x_offset 4 */
+        mc_part(h, 0, 0, 16, 8 * h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
+                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
+                &weight_op[1], &weight_avg[1],
+                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
+        mc_part(h, 4, 0, 16, 8 * h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
+                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
+                &weight_op[1], &weight_avg[1],
+                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
+    } else { /* remaining case (asserted 8x8): four quadrants, each with its own sub_mb_type */
+        int i;
+
+        assert(IS_8X8(mb_type));
+
+        for (i = 0; i < 4; i++) {
+            const int sub_mb_type = h->sub_mb_type[i];
+            const int n  = 4 * i;
+            int x_offset = (i & 1) << 2; /* 0 for i = 0, 2 and 4 for i = 1, 3 */
+            int y_offset = (i & 2) << 1; /* 0 for i = 0, 1 and 4 for i = 2, 3 */
+
+            if (IS_SUB_8X8(sub_mb_type)) {
+                mc_part(h, n, 1, 8, 0, dest_y, dest_cb, dest_cr,
+                        x_offset, y_offset,
+                        qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
+                        &weight_op[1], &weight_avg[1],
+                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
+            } else if (IS_SUB_8X4(sub_mb_type)) { /* two halves: n and n + 2, the second at y_offset + 2 */
+                mc_part(h, n, 0, 4, 4 << PIXEL_SHIFT, dest_y, dest_cb, dest_cr,
+                        x_offset, y_offset,
+                        qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
+                        &weight_op[1], &weight_avg[1],
+                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
+                mc_part(h, n + 2, 0, 4, 4 << PIXEL_SHIFT,
+                        dest_y, dest_cb, dest_cr, x_offset, y_offset + 2,
+                        qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
+                        &weight_op[1], &weight_avg[1],
+                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1)); /* every sub-partition call in this loop passes the IS_DIR(sub_mb_type, 0, ...) flags */
+            } else if (IS_SUB_4X8(sub_mb_type)) { /* two halves: n and n + 1, the second at x_offset + 2 */
+                mc_part(h, n, 0, 8, 4 * h->mb_linesize,
+                        dest_y, dest_cb, dest_cr, x_offset, y_offset,
+                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
+                        &weight_op[2], &weight_avg[2],
+                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
+                mc_part(h, n + 1, 0, 8, 4 * h->mb_linesize,
+                        dest_y, dest_cb, dest_cr, x_offset + 2, y_offset,
+                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
+                        &weight_op[2], &weight_avg[2],
+                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
+            } else { /* remaining case (asserted 4x4): four blocks n + 0 .. n + 3 */
+                int j;
+                assert(IS_SUB_4X4(sub_mb_type));
+                for (j = 0; j < 4; j++) {
+                    int sub_x_offset = x_offset + 2 * (j & 1); /* +2 for j = 1, 3 */
+                    int sub_y_offset = y_offset + (j & 2); /* +2 for j = 2, 3 */
+                    mc_part(h, n + j, 1, 4, 0,
+                            dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
+                            qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
+                            &weight_op[2], &weight_avg[2],
+                            IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
+                }
+            }
+        }
+    }
+
+    prefetch_motion(h, 1, PIXEL_SHIFT, CHROMA_IDC);
+}
+



More information about the libav-commits mailing list