Untitled

From aa1a8435000228c4d9e74da0f9fd3d16e85a3e80 Mon Sep 17 00:00:00 2001
From: Loren Merritt <pengvado@akuvian.org>
Date: Sat, 26 Jun 2010 20:55:59 -0700
Subject: [PATCH 1/7] Simplify pixel_ads

---
 common/macroblock.c    |    2 +-
 common/x86/pixel-a.asm |  175 +++++++++++++++++------------------------------
 encoder/me.c           |    2 +-
 3 files changed, 65 insertions(+), 114 deletions(-)

diff --git a/common/macroblock.c b/common/macroblock.c
index 8e9b06d..4561d8a 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -341,7 +341,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
         int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
         int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
         int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
-            ((me_range*2+18) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
+            ((me_range*2+24) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
         scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa );
     }
     int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+3)&~3) * sizeof(int);
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 78ca4c7..1756f86 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -2142,34 +2142,24 @@ cglobal pixel_ssim_end4_sse2, 3,3,7
 ; Successive Elimination ADS
 ;=============================================================================

-%macro ADS_START 1 ; unroll_size
-%ifdef ARCH_X86_64
-    %define t0 r6
+%macro ADS_START 0
 %ifdef WIN64
-    mov     r4,  r4mp
-    movsxd  r5,  dword r5m
+    movsxd  r5,  r5d
 %endif
-    mov     r10, rsp
-%else
-    %define t0 r4
-    mov     rbp, rsp
-%endif
-    mov     r0d, r5m
-    sub     rsp, r0
-    sub     rsp, %1*4-1
-    and     rsp, ~15
-    mov     t0,  rsp
+    mov     r0d, r5d
+    lea     r6,  [r4+r5+15]
+    and     r6,  ~15;
     shl     r2d,  1
 %endmacro

-%macro ADS_END 1
+%macro ADS_END 1 ; unroll_size
     add     r1, 8*%1
     add     r3, 8*%1
-    add     t0, 4*%1
+    add     r6, 4*%1
     sub     r0d, 4*%1
     jg .loop
 %ifdef WIN64
-    RESTORE_XMM r10
+    RESTORE_XMM rsp
 %endif
     jmp ads_mvs
 %endmacro
@@ -2180,14 +2170,14 @@ cglobal pixel_ssim_end4_sse2, 3,3,7
 ; int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
 ;                 uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
 ;-----------------------------------------------------------------------------
-cglobal pixel_ads4_mmxext, 4,7
+cglobal pixel_ads4_mmxext, 6,7
     movq    mm6, [r0]
     movq    mm4, [r0+8]
     pshufw  mm7, mm6, 0
     pshufw  mm6, mm6, 0xAA
     pshufw  mm5, mm4, 0
     pshufw  mm4, mm4, 0xAA
-    ADS_START 1
+    ADS_START
 .loop:
     movq    mm0, [r1]
     movq    mm1, [r1+16]
@@ -2204,25 +2194,19 @@ cglobal pixel_ads4_mmxext, 4,7
     ABS1    mm3, mm1
     paddw   mm0, mm2
     paddw   mm0, mm3
-%ifdef WIN64
-    pshufw  mm1, [r10+stack_offset+56], 0
-%elifdef ARCH_X86_64
-    pshufw  mm1, [r10+8], 0
-%else
-    pshufw  mm1, [ebp+stack_offset+28], 0
-%endif
+    pshufw  mm1, r6m, 0
     paddusw mm0, [r3]
     psubusw mm1, mm0
     packsswb mm1, mm1
-    movd    [t0], mm1
+    movd    [r6], mm1
     ADS_END 1

-cglobal pixel_ads2_mmxext, 4,7
+cglobal pixel_ads2_mmxext, 6,7
     movq    mm6, [r0]
     pshufw  mm5, r6m, 0
     pshufw  mm7, mm6, 0
     pshufw  mm6, mm6, 0xAA
-    ADS_START 1
+    ADS_START
 .loop:
     movq    mm0, [r1]
     movq    mm1, [r1+r2]
@@ -2235,13 +2219,13 @@ cglobal pixel_ads2_mmxext, 4,7
     movq    mm4, mm5
     psubusw mm4, mm0
     packsswb mm4, mm4
-    movd    [t0], mm4
+    movd    [r6], mm4
     ADS_END 1

-cglobal pixel_ads1_mmxext, 4,7
+cglobal pixel_ads1_mmxext, 6,7
     pshufw  mm7, [r0], 0
     pshufw  mm6, r6m, 0
-    ADS_START 2
+    ADS_START
 .loop:
     movq    mm0, [r1]
     movq    mm1, [r1+8]
@@ -2256,11 +2240,11 @@ cglobal pixel_ads1_mmxext, 4,7
     psubusw mm4, mm0
     psubusw mm5, mm1
     packsswb mm4, mm5
-    movq    [t0], mm4
+    movq    [r6], mm4
     ADS_END 2

 %macro ADS_SSE2 1
-cglobal pixel_ads4_%1, 4,7,12
+cglobal pixel_ads4_%1, 6,7,12
     movdqa  xmm4, [r0]
     pshuflw xmm7, xmm4, 0
     pshuflw xmm6, xmm4, 0xAA
@@ -2273,7 +2257,7 @@ cglobal pixel_ads4_%1, 4,7,12
 %ifdef ARCH_X86_64
     pshuflw xmm8, r6m, 0
     punpcklqdq xmm8, xmm8
-    ADS_START 2
+    ADS_START
     movdqu  xmm10, [r1]
     movdqu  xmm11, [r1+r2]
 .loop:
@@ -2299,9 +2283,9 @@ cglobal pixel_ads4_%1, 4,7,12
     movdqa  xmm1, xmm8
     psubusw xmm1, xmm0
     packsswb xmm1, xmm1
-    movq    [t0], xmm1
+    movq    [r6], xmm1
 %else
-    ADS_START 2
+    ADS_START
 .loop:
     movdqu  xmm0, [r1]
     movdqu  xmm1, [r1+16]
@@ -2318,18 +2302,18 @@ cglobal pixel_ads4_%1, 4,7,12
     ABS1    xmm3, xmm1
     paddw   xmm0, xmm2
     paddw   xmm0, xmm3
-    movd    xmm1, [ebp+stack_offset+28]
+    movd    xmm1, r6m
     movdqu  xmm2, [r3]
     pshuflw xmm1, xmm1, 0
     punpcklqdq xmm1, xmm1
     paddusw xmm0, xmm2
     psubusw xmm1, xmm0
     packsswb xmm1, xmm1
-    movq    [t0], xmm1
+    movq    [r6], xmm1
 %endif ; ARCH
     ADS_END 2

-cglobal pixel_ads2_%1, 4,7,8
+cglobal pixel_ads2_%1, 6,7,8
     movq    xmm6, [r0]
     movd    xmm5, r6m
     pshuflw xmm7, xmm6, 0
@@ -2338,7 +2322,7 @@ cglobal pixel_ads2_%1, 4,7,8
     punpcklqdq xmm7, xmm7
     punpcklqdq xmm6, xmm6
     punpcklqdq xmm5, xmm5
-    ADS_START 2
+    ADS_START
 .loop:
     movdqu  xmm0, [r1]
     movdqu  xmm1, [r1+r2]
@@ -2352,17 +2336,17 @@ cglobal pixel_ads2_%1, 4,7,8
     movdqa  xmm1, xmm5
     psubusw xmm1, xmm0
     packsswb xmm1, xmm1
-    movq    [t0], xmm1
+    movq    [r6], xmm1
     ADS_END 2

-cglobal pixel_ads1_%1, 4,7,8
+cglobal pixel_ads1_%1, 6,7,8
     movd    xmm7, [r0]
     movd    xmm6, r6m
     pshuflw xmm7, xmm7, 0
     pshuflw xmm6, xmm6, 0
     punpcklqdq xmm7, xmm7
     punpcklqdq xmm6, xmm6
-    ADS_START 4
+    ADS_START
 .loop:
     movdqu  xmm0, [r1]
     movdqu  xmm1, [r1+16]
@@ -2379,7 +2363,7 @@ cglobal pixel_ads1_%1, 4,7,8
     psubusw xmm4, xmm0
     psubusw xmm5, xmm1
     packsswb xmm4, xmm5
-    movdqa  [t0], xmm4
+    movdqa  [r6], xmm4
     ADS_END 4
 %endmacro

@@ -2401,90 +2385,57 @@ ADS_SSE2 ssse3
 ;     }
 ;     return nmv;
 ; }
+
+%macro TEST 1
+    mov     [r4+r0*2], r1w
+    test    r2d, 0xff<<(%1*8)
+    setne   r3b
+    add     r0d, r3d
+    inc     r1d
+%endmacro
+
 cglobal pixel_ads_mvs, 0,7,0
 ads_mvs:
-%ifdef ARCH_X86_64
+    lea     r6,  [r4+r5+15]
+    and     r6,  ~15;
     ; mvs = r4
-    ; masks = rsp
+    ; masks = r6
     ; width = r5
     ; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.)
-%ifdef WIN64
-    mov     r8, r4
-    mov     r9, r5
-%endif
-    xor     eax, eax
-    xor     esi, esi
-    mov     dword [rsp+r9], 0
+    xor     r0d, r0d
+    xor     r1d, r1d
+    mov     [r6+r5], r0d
     jmp .loopi
+ALIGN 16
 .loopi0:
-    add     esi, 8
-    cmp     esi, r9d
+    add     r1d, 8
+    cmp     r1d, r5d
     jge .end
 .loopi:
-    mov     rdi, [rsp+rsi]
-    test    rdi, rdi
+    mov     r2,  [r6+r1]
+%ifdef ARCH_X86_64
+    test    r2,  r2
+%else
+    mov     r3,  r2
+    or      r3d, [r6+r1+4]
+%endif
     jz .loopi0
-    xor     ecx, ecx
-%macro TEST 1
-    mov     [r8+rax*2], si
-    test    edi, 0xff<<(%1*8)
-    setne   cl
-    add     eax, ecx
-    inc     esi
-%endmacro
+    xor     r3d, r3d
     TEST 0
     TEST 1
     TEST 2
     TEST 3
-    shr     rdi, 32
+%ifdef ARCH_X86_64
+    shr     r2,  32
+%else
+    mov     r2d, [r6+r1]
+%endif
     TEST 0
     TEST 1
     TEST 2
     TEST 3
-    cmp     esi, r9d
-    jl .loopi
-.end:
-    mov     rsp, r10
-    RET
-
-%else
-    xor     eax, eax
-    xor     esi, esi
-    mov     ebx, [ebp+stack_offset+20] ; mvs
-    mov     edi, [ebp+stack_offset+24] ; width
-    mov     dword [esp+edi], 0
-    push    ebp
-    jmp .loopi
-.loopi0:
-    add     esi, 8
-    cmp     esi, edi
-    jge .end
-.loopi:
-    mov     ebp, [esp+esi+4]
-    mov     edx, [esp+esi+8]
-    mov     ecx, ebp
-    or      ecx, edx
-    jz .loopi0
-    xor     ecx, ecx
-%macro TEST 2
-    mov     [ebx+eax*2], si
-    test    %2, 0xff<<(%1*8)
-    setne   cl
-    add     eax, ecx
-    inc     esi
-%endmacro
-    TEST 0, ebp
-    TEST 1, ebp
-    TEST 2, ebp
-    TEST 3, ebp
-    TEST 0, edx
-    TEST 1, edx
-    TEST 2, edx
-    TEST 3, edx
-    cmp     esi, edi
+    cmp     r1d, r5d
     jl .loopi
 .end:
-    pop     esp
+    movifnidn eax, r0d
     RET
-%endif ; ARCH
-
diff --git a/encoder/me.c b/encoder/me.c
index 291104a..19c5b2b 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -609,7 +609,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
             if( h->mb.i_me_method == X264_ME_TESA )
             {
                 // ADS threshold, then SAD threshold, then keep the best few SADs, then SATD
-                mvsad_t *mvsads = (mvsad_t *)(xs + ((width+15)&~15));
+                mvsad_t *mvsads = (mvsad_t *)(xs + ((width+15)&~15) + 4);
                 int nmvsad = 0, limit;
                 int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 11 : 12;
                 int bsad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref_w+bmy*stride+bmx, stride )
--
1.7.1


From 4f74306c2f266bfc671ad99e9027b816dd423ece Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Mon, 28 Jun 2010 15:02:33 -0700
Subject: [PATCH 2/7] Callback feature for low-latency per-slice output
 Add a callback to allow the calling application to send slices immediately after they are encoded.
 Also add some extra information to the x264_nal_t structure to help inform such a calling application how the NAL units should be ordered.

Full documentation is in x264.h.
---
 common/bitstream.c |    7 ++-
 common/bitstream.h |    1 -
 encoder/encoder.c  |   26 ++++++++---
 x264.h             |  128 +++++++++++++++++++++++++++++++++-------------------
 4 files changed, 105 insertions(+), 57 deletions(-)

diff --git a/common/bitstream.c b/common/bitstream.c
index 0aaac21..ad8c16e 100644
--- a/common/bitstream.c
+++ b/common/bitstream.c
@@ -44,7 +44,7 @@ uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
 /****************************************************************************
  * x264_nal_encode:
  ****************************************************************************/
-int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startcode )
+void x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal )
 {
     uint8_t *src = nal->p_payload;
     uint8_t *end = nal->p_payload + nal->i_payload;
@@ -52,7 +52,7 @@ int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startc

     if( h->param.b_annexb )
     {
-        if( b_long_startcode )
+        if( nal->b_long_startcode )
             *dst++ = 0x00;
         *dst++ = 0x00;
         *dst++ = 0x00;
@@ -77,7 +77,8 @@ int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startc
         orig_dst[3] = size>> 0;
     }

-    return size+4;
+    nal->i_payload = size+4;
+    nal->p_payload = orig_dst;
 }

 void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
diff --git a/common/bitstream.h b/common/bitstream.h
index 9ce5bd7..dd8118d 100644
--- a/common/bitstream.h
+++ b/common/bitstream.h
@@ -68,7 +68,6 @@ typedef struct
     uint8_t *(*nal_escape) ( uint8_t *dst, uint8_t *src, uint8_t *end );
 } x264_bitstream_function_t;

-int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startcode );
 void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf );

 /* A larger level table size theoretically could help a bit at extremely
diff --git a/encoder/encoder.c b/encoder/encoder.c
index f54fe85..fe97aef 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -427,6 +427,8 @@ static int x264_validate_parameters( x264_t *h )
     else
         h->param.b_sliced_threads = 0;
     h->i_thread_frames = h->param.b_sliced_threads ? 1 : h->param.i_threads;
+    if( h->i_thread_frames > 1 )
+        h->param.nalu_process = NULL;

     if( h->param.b_interlaced )
     {
@@ -1253,8 +1255,9 @@ static void x264_nal_start( x264_t *h, int i_type, int i_ref_idc )
 {
     x264_nal_t *nal = &h->out.nal[h->out.i_nal];

-    nal->i_ref_idc = i_ref_idc;
-    nal->i_type    = i_type;
+    nal->i_ref_idc        = i_ref_idc;
+    nal->i_type           = i_type;
+    nal->b_long_startcode = 1;

     nal->i_payload= 0;
     nal->p_payload= &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8];
@@ -1280,6 +1283,8 @@ static int x264_nal_end( x264_t *h )
 {
     x264_nal_t *nal = &h->out.nal[h->out.i_nal];
     nal->i_payload = &h->out.p_bitstream[bs_pos( &h->out.bs ) / 8] - nal->p_payload;
+    if( h->param.nalu_process )
+        h->param.nalu_process( h, nal );
     h->out.i_nal++;

     return x264_nal_check_buffer( h );
@@ -1289,6 +1294,13 @@ static int x264_encoder_encapsulate_nals( x264_t *h, int start )
 {
     int nal_size = 0, previous_nal_size = 0;

+    if( h->param.nalu_process )
+    {
+        for( int i = start; i < h->out.i_nal; i++ )
+            nal_size += h->out.nal[i].i_payload;
+        return nal_size;
+    }
+
     for( int i = 0; i < start; i++ )
         previous_nal_size += h->out.nal[i].i_payload;

@@ -1311,11 +1323,9 @@ static int x264_encoder_encapsulate_nals( x264_t *h, int start )

     for( int i = start; i < h->out.i_nal; i++ )
     {
-        int long_startcode = !i || h->out.nal[i].i_type == NAL_SPS || h->out.nal[i].i_type == NAL_PPS;
-        int size = x264_nal_encode( h, nal_buffer, &h->out.nal[i], long_startcode );
-        h->out.nal[i].i_payload = size;
-        h->out.nal[i].p_payload = nal_buffer;
-        nal_buffer += size;
+        h->out.nal[i].b_long_startcode = !i || h->out.nal[i].i_type == NAL_SPS || h->out.nal[i].i_type == NAL_PPS;
+        x264_nal_encode( h, nal_buffer, &h->out.nal[i] );
+        nal_buffer += h->out.nal[i].i_payload;
     }

     x264_emms();
@@ -1805,6 +1815,7 @@ static int x264_slice_write( x264_t *h )

     /* Slice */
     x264_nal_start( h, h->i_nal_type, h->i_nal_ref_idc );
+    h->out.nal[h->out.i_nal].i_first_mb = h->sh.i_first_mb;

     /* Slice header */
     x264_macroblock_thread_init( h );
@@ -2020,6 +2031,7 @@ static int x264_slice_write( x264_t *h )
             i_mb_x = 0;
         }
     }
+    h->out.nal[h->out.i_nal].i_last_mb = h->sh.i_last_mb;

     if( h->param.b_cabac )
     {
diff --git a/x264.h b/x264.h
index 1138a8b..e1ae084 100644
--- a/x264.h
+++ b/x264.h
@@ -35,13 +35,61 @@

 #include <stdarg.h>

-#define X264_BUILD 100
+#define X264_BUILD 101

 /* x264_t:
  *      opaque handler for encoder */
 typedef struct x264_t x264_t;

 /****************************************************************************
+ * NAL structure and functions
+ ****************************************************************************/
+
+enum nal_unit_type_e
+{
+    NAL_UNKNOWN     = 0,
+    NAL_SLICE       = 1,
+    NAL_SLICE_DPA   = 2,
+    NAL_SLICE_DPB   = 3,
+    NAL_SLICE_DPC   = 4,
+    NAL_SLICE_IDR   = 5,    /* ref_idc != 0 */
+    NAL_SEI         = 6,    /* ref_idc == 0 */
+    NAL_SPS         = 7,
+    NAL_PPS         = 8,
+    NAL_AUD         = 9,
+    NAL_FILLER      = 12,
+    /* ref_idc == 0 for 6,9,10,11,12 */
+};
+enum nal_priority_e
+{
+    NAL_PRIORITY_DISPOSABLE = 0,
+    NAL_PRIORITY_LOW        = 1,
+    NAL_PRIORITY_HIGH       = 2,
+    NAL_PRIORITY_HIGHEST    = 3,
+};
+
+/* The data within the payload is already NAL-encapsulated; the ref_idc and type
+ * are merely in the struct for easy access by the calling application.
+ * All data returned in an x264_nal_t, including the data in p_payload, is no longer
+ * valid after the next call to x264_encoder_encode.  Thus it must be used or copied
+ * before calling x264_encoder_encode or x264_encoder_headers again. */
+typedef struct
+{
+    int i_ref_idc;  /* nal_priority_e */
+    int i_type;     /* nal_unit_type_e */
+    int b_long_startcode;
+    int i_first_mb; /* If this NAL is a slice, the index of the first MB in the slice. */
+    int i_last_mb;  /* If this NAL is a slice, the index of the last MB in the slice. */
+
+    /* Size of payload in bytes. */
+    int     i_payload;
+    /* If param->b_annexb is set, Annex-B bytestream with startcode.
+     * Otherwise, startcode is replaced with a 4-byte size.
+     * This size is the size used in mp4/similar muxing; it is equal to i_payload-4 */
+    uint8_t *p_payload;
+} x264_nal_t;
+
+/****************************************************************************
  * Encoder parameters
  ****************************************************************************/
 /* CPU flags
@@ -377,8 +425,41 @@ typedef struct x264_param_t
      * i.e. when an x264_param_t is passed to x264_t in an x264_picture_t or in zones.
      * Not used when x264_encoder_reconfig is called directly. */
     void (*param_free)( void* );
+
+    /* Optional low-level callback for low-latency encoding.  Called for each output NAL unit
+     * immediately after the NAL unit is finished encoding.  This allows the calling application
+     * to begin processing video data (e.g. by sending packets over a network) before the frame
+     * is done encoding.
+     *
+     * This callback MUST do the following in order to work correctly:
+     * 1) Have available an output buffer of at least size nal->i_payload*3/2 + 5 + 16.
+     * 2) Call x264_nal_encode( h, dst, nal ), where dst is the output buffer.
+     * After these steps, the content of nal is valid and can be used in the same way as if
+     * the NAL unit were output by x264_encoder_encode.
+     *
+     * This does not need to be synchronous with the encoding process: the data pointed to
+     * by nal (both before and after x264_nal_encode) will remain valid until the next
+     * x264_encoder_encode call.  The callback must be re-entrant.
+     *
+     * This callback does not work with frame-based threads; threads must be disabled
+     * or sliced-threads enabled.  This callback also does not work as one would expect
+     * with HRD -- since the buffering period SEI cannot be calculated until the frame
+     * is finished encoding, it will not be sent via this callback.
+     *
+     * Note also that the NALs are not necessarily returned in order when sliced threads is
+     * enabled.  Accordingly, the variables i_first_mb and i_last_mb are available in
+     * x264_nal_t to help the calling application reorder the slices if necessary.
+     *
+     * When this callback is enabled, x264_encoder_encode does not return valid NALs;
+     * the calling application is expected to acquire all output NALs through the callback.
+     *
+     * It is generally sensible to combine this callback with a use of slice-max-mbs or
+     * slice-max-size. */
+    void (*nalu_process) ( x264_t *h, x264_nal_t *nal );
 } x264_param_t;

+void x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal );
+
 /****************************************************************************
  * H.264 level restriction information
  ****************************************************************************/
@@ -586,51 +667,6 @@ int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_heigh
 void x264_picture_clean( x264_picture_t *pic );

 /****************************************************************************
- * NAL structure and functions
- ****************************************************************************/
-
-enum nal_unit_type_e
-{
-    NAL_UNKNOWN     = 0,
-    NAL_SLICE       = 1,
-    NAL_SLICE_DPA   = 2,
-    NAL_SLICE_DPB   = 3,
-    NAL_SLICE_DPC   = 4,
-    NAL_SLICE_IDR   = 5,    /* ref_idc != 0 */
-    NAL_SEI         = 6,    /* ref_idc == 0 */
-    NAL_SPS         = 7,
-    NAL_PPS         = 8,
-    NAL_AUD         = 9,
-    NAL_FILLER      = 12,
-    /* ref_idc == 0 for 6,9,10,11,12 */
-};
-enum nal_priority_e
-{
-    NAL_PRIORITY_DISPOSABLE = 0,
-    NAL_PRIORITY_LOW        = 1,
-    NAL_PRIORITY_HIGH       = 2,
-    NAL_PRIORITY_HIGHEST    = 3,
-};
-
-/* The data within the payload is already NAL-encapsulated; the ref_idc and type
- * are merely in the struct for easy access by the calling application.
- * All data returned in an x264_nal_t, including the data in p_payload, is no longer
- * valid after the next call to x264_encoder_encode.  Thus it must be used or copied
- * before calling x264_encoder_encode or x264_encoder_headers again. */
-typedef struct
-{
-    int i_ref_idc;  /* nal_priority_e */
-    int i_type;     /* nal_unit_type_e */
-
-    /* Size of payload in bytes. */
-    int     i_payload;
-    /* If param->b_annexb is set, Annex-B bytestream with 4-byte startcode.
-     * Otherwise, startcode is replaced with a 4-byte size.
-     * This size is the size used in mp4/similar muxing; it is equal to i_payload-4 */
-    uint8_t *p_payload;
-} x264_nal_t;
-
-/****************************************************************************
  * Encoder functions
  ****************************************************************************/

--
1.7.1


From 22bf1672adafa4e938a13952b8f71cd7548d31f1 Mon Sep 17 00:00:00 2001
From: Lamont Alston <wewk584@gmail.com>
Date: Tue, 29 Jun 2010 10:11:42 -0700
Subject: [PATCH 3/7] Make open-GOP Blu-ray compatible
 Blu-ray is even more braindamaged than we thought.
 Accordingly, open-gop options are now "normal" and "bluray", as opposed to display and coded.
 Normal should be used in all cases besides Blu-ray authoring.

---
 encoder/encoder.c   |    2 +-
 encoder/slicetype.c |   28 +++++++---------------------
 x264.c              |    8 ++++----
 x264.h              |    8 ++++----
 4 files changed, 16 insertions(+), 30 deletions(-)

diff --git a/encoder/encoder.c b/encoder/encoder.c
index fe97aef..5cd3307 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -577,7 +577,7 @@ static int x264_validate_parameters( x264_t *h )
         h->param.analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL;
     }
     h->param.i_bframe = x264_clip3( h->param.i_bframe, 0, X264_MIN( X264_BFRAME_MAX, h->param.i_keyint_max-1 ) );
-    h->param.i_open_gop = x264_clip3( h->param.i_open_gop, X264_OPEN_GOP_NONE, X264_OPEN_GOP_CODED_ORDER );
+    h->param.i_open_gop = x264_clip3( h->param.i_open_gop, X264_OPEN_GOP_NONE, X264_OPEN_GOP_BLURAY );
     if( h->param.i_keyint_max == 1 )
         h->param.b_intra_refresh = 0;
     h->param.i_bframe_bias = x264_clip3( h->param.i_bframe_bias, -90, 100 );
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index 2703f02..4ede8cf 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -1233,17 +1233,11 @@ void x264_slicetype_analyse( x264_t *h, int keyframe )
     if( !h->param.b_intra_refresh )
         for( int i = keyint_limit+1; i <= num_frames; i += h->param.i_keyint_max )
         {
-            int j = i;
-            if( h->param.i_open_gop == X264_OPEN_GOP_CODED_ORDER )
-            {
-                while( IS_X264_TYPE_B( frames[i]->i_type ) )
-                    i++;
-                while( IS_X264_TYPE_B( frames[j-1]->i_type ) )
-                    j--;
-            }
             frames[i]->i_type = X264_TYPE_I;
             reset_start = X264_MIN( reset_start, i+1 );
-            i = j;
+            if( h->param.i_open_gop == X264_OPEN_GOP_BLURAY )
+                while( IS_X264_TYPE_B( frames[i-1]->i_type ) )
+                    i--;
         }

     if( vbv_lookahead )
@@ -1337,16 +1331,8 @@ void x264_slicetype_decide( x264_t *h )
             if( frm->i_type == X264_TYPE_AUTO || frm->i_type == X264_TYPE_I )
                 frm->i_type = h->param.i_open_gop && h->lookahead->i_last_keyframe >= 0 ? X264_TYPE_I : X264_TYPE_IDR;
             int warn = frm->i_type != X264_TYPE_IDR;
-            if( warn && h->param.i_open_gop == X264_OPEN_GOP_DISPLAY_ORDER )
-                warn &= frm->i_type != X264_TYPE_I && frm->i_type != X264_TYPE_KEYFRAME;
-            if( warn && h->param.i_open_gop == X264_OPEN_GOP_CODED_ORDER )
-            {
-                /* if this minigop ends with i, it's not a violation */
-                int j = bframes;
-                while( IS_X264_TYPE_B( h->lookahead->next.list[j]->i_type ) )
-                    j++;
-                warn = h->lookahead->next.list[j]->i_type != X264_TYPE_I && h->lookahead->next.list[j]->i_type != X264_TYPE_KEYFRAME;
-            }
+            if( warn && h->param.i_open_gop )
+                warn &= frm->i_type != X264_TYPE_I;
             if( warn )
                 x264_log( h, X264_LOG_WARNING, "specified frame type (%d) at %d is not compatible with keyframe interval\n", frm->i_type, frm->i_frame );
         }
@@ -1355,8 +1341,8 @@ void x264_slicetype_decide( x264_t *h )
             if( h->param.i_open_gop )
             {
                 h->lookahead->i_last_keyframe = frm->i_frame; // Use display order
-                if( h->param.i_open_gop == X264_OPEN_GOP_CODED_ORDER )
-                    h->lookahead->i_last_keyframe -= bframes; // Use coded order
+                if( h->param.i_open_gop == X264_OPEN_GOP_BLURAY )
+                    h->lookahead->i_last_keyframe -= bframes; // Use bluray order
                 frm->b_keyframe = 1;
             }
             else
diff --git a/x264.c b/x264.c
index df04385..f08ab41 100644
--- a/x264.c
+++ b/x264.c
@@ -382,10 +382,10 @@ static void Help( x264_param_t *defaults, int longhelp )
         "                                  - normal: Non-strict (not Blu-ray compatible)\n",
         strtable_lookup( x264_b_pyramid_names, defaults->i_bframe_pyramid ) );
     H1( "      --open-gop <string>     Use recovery points to close GOPs [none]\n"
-        "                                  - none: Use standard closed GOPs\n"
-        "                                  - display: Base GOP length on display order\n"
-        "                                             (not Blu-ray compatible)\n"
-        "                                  - coded: Base GOP length on coded order\n"
+        "                                  - none: closed GOPs only\n"
+        "                                  - normal: standard open GOPs\n"
+        "                                            (not Blu-ray compatible)\n"
+        "                                  - bluray: Blu-ray-compatible open GOPs\n"
         "                              Only available with b-frames\n" );
     H1( "      --no-cabac              Disable CABAC\n" );
     H1( "  -r, --ref <integer>         Number of reference frames [%d]\n", defaults->i_frame_reference );
diff --git a/x264.h b/x264.h
index e1ae084..86f7426 100644
--- a/x264.h
+++ b/x264.h
@@ -153,8 +153,8 @@ typedef struct
 #define X264_B_PYRAMID_NORMAL        2
 #define X264_KEYINT_MIN_AUTO         0
 #define X264_OPEN_GOP_NONE           0
-#define X264_OPEN_GOP_DISPLAY_ORDER  1
-#define X264_OPEN_GOP_CODED_ORDER    2
+#define X264_OPEN_GOP_NORMAL         1
+#define X264_OPEN_GOP_BLURAY         2

 static const char * const x264_direct_pred_names[] = { "none", "spatial", "temporal", "auto", 0 };
 static const char * const x264_motion_est_names[] = { "dia", "hex", "umh", "esa", "tesa", 0 };
@@ -166,7 +166,7 @@ static const char * const x264_colorprim_names[] = { "", "bt709", "undef", "", "
 static const char * const x264_transfer_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "linear", "log100", "log316", 0 };
 static const char * const x264_colmatrix_names[] = { "GBR", "bt709", "undef", "", "fcc", "bt470bg", "smpte170m", "smpte240m", "YCgCo", 0 };
 static const char * const x264_nal_hrd_names[] = { "none", "vbr", "cbr", 0 };
-static const char * const x264_open_gop_names[] = { "none", "display", "coded", 0 };
+static const char * const x264_open_gop_names[] = { "none", "normal", "bluray", 0 };

 /* Colorspace type
  * legacy only; nothing other than I420 is really supported. */
@@ -276,7 +276,7 @@ typedef struct x264_param_t
     int         i_bframe_adaptive;
     int         i_bframe_bias;
     int         i_bframe_pyramid;   /* Keep some B-frames as references: 0=off, 1=strict hierarchical, 2=normal */
-    int         i_open_gop;         /* Open gop: 1=display order, 2=coded order to determine gop size */
+    int         i_open_gop;         /* Open gop: 1=normal (display order), 2=bluray compatibility braindamage mode */

     int         b_deblocking_filter;
     int         i_deblocking_filter_alphac0;    /* [-6, 6] -6 light filter, 6 strong */
--
1.7.1


From ae5c32e10d6b500366d1d638c52b75e65aad1d9f Mon Sep 17 00:00:00 2001
From: Steven Walters <kemuri9@gmail.com>
Date: Sat, 26 Jun 2010 16:28:49 -0400
Subject: [PATCH 4/7] Centralize logging within x264cli
 x264cli messages will now respect the log level they pertain to.
 Slightly reduces binary size.

---
 input/avs.c             |   88 +++++-------------
 input/ffms.c            |   58 +++--------
 input/input.h           |    2 +
 input/lavf.c            |   55 +++--------
 input/thread.c          |    9 +-
 input/timecode.c        |  111 ++++++----------------
 input/y4m.c             |   23 +----
 input/yuv.c             |    8 +-
 muxers.h                |   61 ------------
 output/flv.c            |   10 +-
 output/flv_bytestream.c |    2 +-
 output/matroska.c       |    2 +-
 output/matroska_ebml.c  |    2 +-
 output/mp4.c            |   12 +--
 output/output.h         |    2 +
 output/raw.c            |    2 +-
 x264.c                  |  246 +++++++++++++++++++++--------------------------
 x264cli.h               |   67 +++++++++++++
 18 files changed, 289 insertions(+), 471 deletions(-)
 delete mode 100644 muxers.h
 create mode 100644 x264cli.h

diff --git a/input/avs.c b/input/avs.c
index 07add40..b83f715 100644
--- a/input/avs.c
+++ b/input/avs.c
@@ -20,8 +20,9 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  *****************************************************************************/

-#include "muxers.h"
+#include "input.h"
 #include <windows.h>
+#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "avs", __VA_ARGS__ )

 /* the AVS interface currently uses __declspec to link function declarations to their definitions in the dll.
    this has a side effect of preventing program execution if the avisynth dll is not found,
@@ -131,27 +132,15 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
     FILE *fh = fopen( psz_filename, "r" );
     if( !fh )
         return -1;
-    else if( !x264_is_regular_file( fh ) )
-    {
-        fprintf( stderr, "avs [error]: AVS input is incompatible with non-regular file `%s'\n", psz_filename );
-        return -1;
-    }
+    FAIL_IF_ERROR( !x264_is_regular_file( fh ), "AVS input is incompatible with non-regular file `%s'\n", psz_filename );
     fclose( fh );

     avs_hnd_t *h = malloc( sizeof(avs_hnd_t) );
     if( !h )
         return -1;
-    if( avs_load_library( h ) )
-    {
-        fprintf( stderr, "avs [error]: failed to load avisynth\n" );
-        return -1;
-    }
+    FAIL_IF_ERROR( avs_load_library( h ), "failed to load avisynth\n" )
     h->env = h->func.avs_create_script_environment( AVS_INTERFACE_YV12 );
-    if( !h->env )
-    {
-        fprintf( stderr, "avs [error]: failed to initiate avisynth\n" );
-        return -1;
-    }
+    FAIL_IF_ERROR( !h->env, "failed to initiate avisynth\n" )
     AVS_Value arg = avs_new_value_string( psz_filename );
     AVS_Value res;
     char *filename_ext = get_filename_extension( psz_filename );
@@ -159,11 +148,7 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
     if( !strcasecmp( filename_ext, "avs" ) )
     {
         res = h->func.avs_invoke( h->env, "Import", arg, NULL );
-        if( avs_is_error( res ) )
-        {
-            fprintf( stderr, "avs [error]: %s\n", avs_as_string( res ) );
-            return -1;
-        }
+        FAIL_IF_ERROR( avs_is_error( res ), "%s\n", avs_as_string( res ) )
         /* check if the user is using a multi-threaded script and apply distributor if necessary.
            adapted from avisynth's vfw interface */
         AVS_Value mt_test = h->func.avs_invoke( h->env, "GetMTMode", avs_new_value_bool( 0 ), NULL );
@@ -184,78 +169,55 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
         int i;
         for( i = 0; filter[i]; i++ )
         {
-            fprintf( stderr, "avs [info]: trying %s... ", filter[i] );
+            x264_cli_log( "avs", X264_LOG_INFO, "trying %s... ", filter[i] );
             if( !h->func.avs_function_exists( h->env, filter[i] ) )
             {
-                fprintf( stderr, "not found\n" );
+                x264_cli_printf( X264_LOG_INFO, "not found\n" );
                 continue;
             }
             if( !strncasecmp( filter[i], "FFmpegSource", 12 ) )
             {
-                fprintf( stderr, "indexing... " );
+                x264_cli_printf( X264_LOG_INFO, "indexing... " );
                 fflush( stderr );
             }
             res = h->func.avs_invoke( h->env, filter[i], arg, NULL );
             if( !avs_is_error( res ) )
             {
-                fprintf( stderr, "succeeded\n" );
+                x264_cli_printf( X264_LOG_INFO, "succeeded\n" );
                 break;
             }
-            fprintf( stderr, "failed\n" );
-        }
-        if( !filter[i] )
-        {
-            fprintf( stderr, "avs [error]: unable to find source filter to open `%s'\n", psz_filename );
-            return -1;
+            x264_cli_printf( X264_LOG_INFO, "failed\n" );
         }
+        FAIL_IF_ERROR( !filter[i], "unable to find source filter to open `%s'\n", psz_filename )
     }
-    if( !avs_is_clip( res ) )
-    {
-        fprintf( stderr, "avs [error]: `%s' didn't return a video clip\n", psz_filename );
-        return -1;
-    }
+    FAIL_IF_ERROR( !avs_is_clip( res ), "`%s' didn't return a video clip\n", psz_filename )
     h->clip = h->func.avs_take_clip( res, h->env );
     int avs_version = h->func.avs_get_version( h->clip );
     const AVS_VideoInfo *vi = h->func.avs_get_video_info( h->clip );
-    if( !avs_has_video( vi ) )
-    {
-        fprintf( stderr, "avs [error]: `%s' has no video data\n", psz_filename );
-        return -1;
-    }
+    FAIL_IF_ERROR( !avs_has_video( vi ), "`%s' has no video data\n", psz_filename )
     /* if the clip is made of fields instead of frames, call weave to make them frames */
     if( avs_is_field_based( vi ) )
     {
-        fprintf( stderr, "avs [warning]: detected fieldbased (separated) input, weaving to frames\n" );
+        x264_cli_log( "avs", X264_LOG_WARNING, "detected fieldbased (separated) input, weaving to frames\n" );
         AVS_Value tmp = h->func.avs_invoke( h->env, "Weave", res, NULL );
-        if( avs_is_error( tmp ) )
-        {
-            fprintf( stderr, "avs [error]: couldn't weave fields into frames\n" );
-            return -1;
-        }
+        FAIL_IF_ERROR( avs_is_error( tmp ), "couldn't weave fields into frames\n" )
         res = update_clip( h, &vi, tmp, res );
         info->interlaced = 1;
         info->tff = avs_is_tff( vi );
     }
-    if( vi->width&1 || vi->height&1 )
-    {
-        fprintf( stderr, "avs [error]: input clip width or height not divisible by 2 (%dx%d)\n",
-                 vi->width, vi->height );
-        return -1;
-    }
+    FAIL_IF_ERROR( vi->width&1 || vi->height&1, "input clip width or height not divisible by 2 (%dx%d)\n", vi->width, vi->height )
     /* always call ConvertToYV12 to convert non YV12 planar colorspaces to YV12 when user's AVS supports them,
        as all planar colorspaces are flagged as YV12. If it is already YV12 in this case, the call does nothing */
     if( !avs_is_yv12( vi ) || avs_version >= AVS_INTERFACE_OTHER_PLANAR )
     {
-        fprintf( stderr, "avs %s\n", !avs_is_yv12( vi ) ? "[warning]: converting input clip to YV12"
-               : "[info]: avisynth 2.6+ detected, forcing conversion to YV12" );
+        if( !avs_is_yv12( vi ) )
+            x264_cli_log( "avs", X264_LOG_WARNING, "converting input clip to YV12\n" );
+        else
+            x264_cli_log( "avs", X264_LOG_INFO, "avisynth 2.6+ detected, forcing conversion to YV12\n" );
         const char *arg_name[2] = { NULL, "interlaced" };
         AVS_Value arg_arr[2] = { res, avs_new_value_bool( info->interlaced ) };
         AVS_Value res2 = h->func.avs_invoke( h->env, "ConvertToYV12", avs_new_value_array( arg_arr, 2 ), arg_name );
-        if( avs_is_error( res2 ) )
-        {
-            fprintf( stderr, "avs [error]: couldn't convert input clip to YV12\n" );
-            return -1;
-        }
+        FAIL_IF_ERROR( avs_is_error( res2 ), "couldn't convert input clip to YV12\n" )
         res = update_clip( h, &vi, res2, res );
     }
     h->func.avs_release_value( res );
@@ -294,11 +256,7 @@ static int read_frame( x264_picture_t *p_pic, hnd_t handle, int i_frame )
         return -1;
     AVS_VideoFrame *frm = p_pic->opaque = h->func.avs_get_frame( h->clip, i_frame );
     const char *err = h->func.avs_clip_get_error( h->clip );
-    if( err )
-    {
-        fprintf( stderr, "avs [error]: %s occurred while reading frame %d\n", err, i_frame );
-        return -1;
-    }
+    FAIL_IF_ERROR( err, "%s occurred while reading frame %d\n", err, i_frame )
     for( int i = 0; i < 3; i++ )
     {
         /* explicitly cast away the const attribute to avoid a warning */
diff --git a/input/ffms.c b/input/ffms.c
index b2a253e..fe8bf7e 100644
--- a/input/ffms.c
+++ b/input/ffms.c
@@ -21,8 +21,10 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  *****************************************************************************/

-#include "muxers.h"
+#include "input.h"
 #include <ffms.h>
+#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "ffms", __VA_ARGS__ )
+
 #undef DECLARE_ALIGNED
 #include <libavcodec/avcodec.h>
 #include <libswscale/swscale.h>
@@ -86,28 +88,16 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
     {
         idx = FFMS_MakeIndex( psz_filename, 0, 0, NULL, NULL, 0, update_progress, NULL, &e );
         fprintf( stderr, "                                            \r" );
-        if( !idx )
-        {
-            fprintf( stderr, "ffms [error]: could not create index\n" );
-            return -1;
-        }
+        FAIL_IF_ERROR( !idx, "could not create index\n" )
         if( opt->index_file && FFMS_WriteIndex( opt->index_file, idx, &e ) )
-            fprintf( stderr, "ffms [warning]: could not write index file\n" );
+            x264_cli_log( "ffms", X264_LOG_WARNING, "could not write index file\n" );
     }

     int trackno = FFMS_GetFirstTrackOfType( idx, FFMS_TYPE_VIDEO, &e );
-    if( trackno < 0 )
-    {
-        fprintf( stderr, "ffms [error]: could not find video track\n" );
-        return -1;
-    }
+    FAIL_IF_ERROR( trackno < 0, "could not find video track\n" )

     h->video_source = FFMS_CreateVideoSource( psz_filename, trackno, idx, 1, seekmode, &e );
-    if( !h->video_source )
-    {
-        fprintf( stderr, "ffms [error]: could not create video source\n" );
-        return -1;
-    }
+    FAIL_IF_ERROR( !h->video_source, "could not create video source\n" )

     h->track = FFMS_GetTrackFromVideo( h->video_source );

@@ -121,11 +111,7 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
     h->vfr_input       = info->vfr;

     const FFMS_Frame *frame = FFMS_GetFrame( h->video_source, 0, &e );
-    if( !frame )
-    {
-        fprintf( stderr, "ffms [error]: could not read frame 0\n" );
-        return -1;
-    }
+    FAIL_IF_ERROR( !frame, "could not read frame 0\n" )

     h->init_width  = h->cur_width  = info->width  = frame->EncodedWidth;
     h->init_height = h->cur_height = info->height = frame->EncodedHeight;
@@ -134,8 +120,8 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
     info->tff        = frame->TopFieldFirst;

     if( h->cur_pix_fmt != PIX_FMT_YUV420P )
-        fprintf( stderr, "ffms [warning]: converting from %s to YV12\n",
-                 avcodec_get_pix_fmt_name( h->cur_pix_fmt ) );
+        x264_cli_log( "ffms", X264_LOG_WARNING, "converting from %s to YV12\n",
+                       avcodec_get_pix_fmt_name( h->cur_pix_fmt ) );

     /* ffms timestamps are in milliseconds. ffms also uses int64_ts for timebase,
      * so we need to reduce large timebases to prevent overflow */
@@ -173,19 +159,15 @@ static int check_swscale( ffms_hnd_t *h, const FFMS_Frame *frame, int i_frame )
     if( h->scaler )
     {
         sws_freeContext( h->scaler );
-        fprintf( stderr, "ffms [warning]: stream properties changed to %dx%d, %s at frame %d  \n", frame->EncodedWidth,
-                 frame->EncodedHeight, avcodec_get_pix_fmt_name( frame->EncodedPixelFormat ), i_frame );
+        x264_cli_log( "ffms", X264_LOG_WARNING, "stream properties changed to %dx%d, %s at frame %d  \n", frame->EncodedWidth,
+                      frame->EncodedHeight, avcodec_get_pix_fmt_name( frame->EncodedPixelFormat ), i_frame );
         h->cur_width   = frame->EncodedWidth;
         h->cur_height  = frame->EncodedHeight;
         h->cur_pix_fmt = frame->EncodedPixelFormat;
     }
     h->scaler = sws_getContext( h->cur_width, h->cur_height, h->cur_pix_fmt, h->init_width, h->init_height,
                                 PIX_FMT_YUV420P, SWS_BICUBIC, NULL, NULL, NULL );
-    if( !h->scaler )
-    {
-        fprintf( stderr, "ffms [error]: could not open swscale context\n" );
-        return -1;
-    }
+    FAIL_IF_ERROR( !h->scaler, "could not open swscale context\n" )
     return 0;
 }

@@ -195,11 +177,7 @@ static int read_frame( x264_picture_t *p_pic, hnd_t handle, int i_frame )
     FFMS_ErrorInfo e;
     e.BufferSize = 0;
     const FFMS_Frame *frame = FFMS_GetFrame( h->video_source, i_frame, &e );
-    if( !frame )
-    {
-        fprintf( stderr, "ffms [error]: could not read frame %d\n", i_frame );
-        return -1;
-    }
+    FAIL_IF_ERROR( !frame, "could not read frame %d\n", i_frame )

     if( check_swscale( h, frame, i_frame ) )
         return -1;
@@ -214,12 +192,8 @@ static int read_frame( x264_picture_t *p_pic, hnd_t handle, int i_frame )

     if( h->vfr_input )
     {
-        if( info->PTS == AV_NOPTS_VALUE )
-        {
-            fprintf( stderr, "ffms [error]: invalid timestamp. "
-                     "Use --force-cfr and specify a framerate with --fps\n" );
-            return -1;
-        }
+        FAIL_IF_ERROR( info->PTS == AV_NOPTS_VALUE, "invalid timestamp. "
+                       "Use --force-cfr and specify a framerate with --fps\n" )

         if( !h->pts_offset_flag )
         {
diff --git a/input/input.h b/input/input.h
index f89b13b..f588f3c 100644
--- a/input/input.h
+++ b/input/input.h
@@ -25,6 +25,8 @@
 #ifndef X264_INPUT_H
 #define X264_INPUT_H

+#include "x264cli.h"
+
 /* options that are used by only some demuxers */
 typedef struct
 {
diff --git a/input/lavf.c b/input/lavf.c
index 4b0375f..54a275f 100644
--- a/input/lavf.c
+++ b/input/lavf.c
@@ -21,7 +21,8 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  *****************************************************************************/

-#include "muxers.h"
+#include "input.h"
+#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "lavf", __VA_ARGS__ )
 #undef DECLARE_ALIGNED
 #include <libavformat/avformat.h>
 #include <libswscale/swscale.h>
@@ -59,19 +60,15 @@ static int check_swscale( lavf_hnd_t *h, AVCodecContext *c, int i_frame )
     if( h->scaler )
     {
         sws_freeContext( h->scaler );
-        fprintf( stderr, "lavf [warning]: stream properties changed to %dx%d, %s at frame %d  \n",
-                 c->width, c->height, avcodec_get_pix_fmt_name( c->pix_fmt ), i_frame );
+        x264_cli_log( "lavf", X264_LOG_WARNING, "stream properties changed to %dx%d, %s at frame %d  \n",
+                      c->width, c->height, avcodec_get_pix_fmt_name( c->pix_fmt ), i_frame );
         h->cur_width   = c->width;
         h->cur_height  = c->height;
         h->cur_pix_fmt = c->pix_fmt;
     }
     h->scaler = sws_getContext( h->cur_width, h->cur_height, h->cur_pix_fmt, h->init_width, h->init_height,
                                 PIX_FMT_YUV420P, SWS_BICUBIC, NULL, NULL, NULL );
-    if( !h->scaler )
-    {
-        fprintf( stderr, "lavf [error]: could not open swscale context\n" );
-        return -1;
-    }
+    FAIL_IF_ERROR( !h->scaler, "could not open swscale context\n" )
     return 0;
 }

@@ -106,12 +103,12 @@ static int read_frame_internal( x264_picture_t *p_pic, lavf_hnd_t *h, int i_fram
             {
                 c->reordered_opaque = pkt->pts;
                 if( avcodec_decode_video2( c, frame, &finished, pkt ) < 0 )
-                    fprintf( stderr, "lavf [warning]: video decoding failed on frame %d\n", h->next_frame );
+                    x264_cli_log( "lavf", X264_LOG_WARNING, "video decoding failed on frame %d\n", h->next_frame );
             }
         if( !finished )
         {
             if( avcodec_decode_video2( c, frame, &finished, pkt ) < 0 )
-                fprintf( stderr, "lavf [warning]: video decoding failed on frame %d\n", h->next_frame );
+                x264_cli_log( "lavf", X264_LOG_WARNING, "video decoding failed on frame %d\n", h->next_frame );
             if( !finished )
                 return -1;
         }
@@ -166,26 +163,13 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
     if( !strcmp( psz_filename, "-" ) )
         psz_filename = "pipe:";

-    if( av_open_input_file( &h->lavf, psz_filename, NULL, 0, NULL ) )
-    {
-        fprintf( stderr, "lavf [error]: could not open input file\n" );
-        return -1;
-    }
-
-    if( av_find_stream_info( h->lavf ) < 0 )
-    {
-        fprintf( stderr, "lavf [error]: could not find input stream info\n" );
-        return -1;
-    }
+    FAIL_IF_ERROR( av_open_input_file( &h->lavf, psz_filename, NULL, 0, NULL ), "could not open input file\n" )
+    FAIL_IF_ERROR( av_find_stream_info( h->lavf ) < 0, "could not find input stream info\n" )

     int i = 0;
     while( i < h->lavf->nb_streams && h->lavf->streams[i]->codec->codec_type != CODEC_TYPE_VIDEO )
         i++;
-    if( i == h->lavf->nb_streams )
-    {
-        fprintf( stderr, "lavf [error]: could not find video stream\n" );
-        return -1;
-    }
+    FAIL_IF_ERROR( i == h->lavf->nb_streams, "could not find video stream\n" )
     h->stream_id       = i;
     h->next_frame      = 0;
     h->pts_offset_flag = 0;
@@ -207,22 +191,15 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
         info->csp |= X264_CSP_VFLIP;

     if( h->cur_pix_fmt != PIX_FMT_YUV420P )
-        fprintf( stderr, "lavf [warning]: converting from %s to YV12\n",
-                 avcodec_get_pix_fmt_name( h->cur_pix_fmt ) );
-
-    if( avcodec_open( c, avcodec_find_decoder( c->codec_id ) ) )
-    {
-        fprintf( stderr, "lavf [error]: could not find decoder for video stream\n" );
-        return -1;
-    }
+        x264_cli_log( "lavf", X264_LOG_WARNING, "converting from %s to YV12\n",
+                      avcodec_get_pix_fmt_name( h->cur_pix_fmt ) );
+    FAIL_IF_ERROR( avcodec_open( c, avcodec_find_decoder( c->codec_id ) ),
+                   "could not find decoder for video stream\n" )

     /* prefetch the first frame and set/confirm flags */
     h->first_pic = malloc( sizeof(x264_picture_t) );
-    if( !h->first_pic || lavf_input.picture_alloc( h->first_pic, info->csp, info->width, info->height ) )
-    {
-        fprintf( stderr, "lavf [error]: malloc failed\n" );
-        return -1;
-    }
+    FAIL_IF_ERROR( !h->first_pic || lavf_input.picture_alloc( h->first_pic, info->csp, info->width, info->height ),
+                   "malloc failed\n" )
     else if( read_frame_internal( h->first_pic, h, 0, info ) )
         return -1;

diff --git a/input/thread.c b/input/thread.c
index c4b07fa..98af22b 100644
--- a/input/thread.c
+++ b/input/thread.c
@@ -21,7 +21,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  *****************************************************************************/

-#include "muxers.h"
+#include "input.h"

 extern cli_input_t input;

@@ -47,11 +47,8 @@ typedef struct thread_input_arg_t
 static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, cli_input_opt_t *opt )
 {
     thread_hnd_t *h = malloc( sizeof(thread_hnd_t) );
-    if( !h || input.picture_alloc( &h->pic, info->csp, info->width, info->height ) )
-    {
-        fprintf( stderr, "x264 [error]: malloc failed\n" );
-        return -1;
-    }
+    FAIL_IF_ERR( !h || input.picture_alloc( &h->pic, info->csp, info->width, info->height ),
+                 "x264", "malloc failed\n" )
     h->input = input;
     h->p_handle = *p_handle;
     h->next_frame = -1;
diff --git a/input/timecode.c b/input/timecode.c
index a307327..7821e76 100644
--- a/input/timecode.c
+++ b/input/timecode.c
@@ -20,7 +20,8 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  *****************************************************************************/

-#include "muxers.h"
+#include "input.h"
+#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "timecode", __VA_ARGS__ )
 #include <math.h>

 extern cli_input_t input;
@@ -61,12 +62,8 @@ static double correct_fps( double fps, timecode_hnd_t *h )
     {
         fps_den = i * h->timebase_num;
         fps_num = round( fps_den * fps_sig ) * exponent;
-        if( fps_num > UINT32_MAX )
-        {
-            fprintf( stderr, "timecode [error]: tcfile fps correction failed.\n"
-                             "                  Specify an appropriate timebase manually or remake tcfile.\n" );
-            return -1;
-        }
+        FAIL_IF_ERROR( fps_num > UINT32_MAX, "tcfile fps correction failed.\n"
+                       "                  Specify an appropriate timebase manually or remake tcfile.\n" )
         if( fabs( ((double)fps_num / fps_den) / exponent - fps_sig ) < DOUBLE_EPSILON )
             break;
         ++i;
@@ -91,12 +88,8 @@ static int try_mkv_timebase_den( double *fpss, timecode_hnd_t *h, int loop_num )
         double fps_sig = sigexp10( fpss[num], &exponent );
         fps_den = round( MKV_TIMEBASE_DEN / fps_sig ) / exponent;
         h->timebase_num = fps_den && h->timebase_num ? gcd( h->timebase_num, fps_den ) : fps_den;
-        if( h->timebase_num > UINT32_MAX || !h->timebase_num )
-        {
-            fprintf( stderr, "timecode [error]: automatic timebase generation failed.\n"
-                             "                  Specify timebase manually.\n" );
-            return -1;
-        }
+        FAIL_IF_ERROR( h->timebase_num > UINT32_MAX || !h->timebase_num, "automatic timebase generation failed.\n"
+                       "                  Specify timebase manually.\n" )
     }
     return 0;
 }
@@ -110,11 +103,7 @@ static int parse_tcfile( FILE *tcfile_in, timecode_hnd_t *h, video_info_t *info
     double *fpss = NULL;

     ret = fscanf( tcfile_in, "# timecode format v%d", &tcfv );
-    if( ret != 1 || (tcfv != 1 && tcfv != 2) )
-    {
-        fprintf( stderr, "timecode [error]: unsupported timecode format\n" );
-        return -1;
-    }
+    FAIL_IF_ERROR( ret != 1 || (tcfv != 1 && tcfv != 2), "unsupported timecode format\n" )

     if( tcfv == 1 )
     {
@@ -128,18 +117,11 @@ static int parse_tcfile( FILE *tcfile_in, timecode_hnd_t *h, video_info_t *info
         {
             if( buff[0] == '#' || buff[0] == '\n' || buff[0] == '\r' )
                 continue;
-            if( sscanf( buff, "assume %lf", &h->assume_fps ) != 1 && sscanf( buff, "Assume %lf", &h->assume_fps ) != 1 )
-            {
-                fprintf( stderr, "timecode [error]: tcfile parsing error: assumed fps not found\n" );
-                return -1;
-            }
+            FAIL_IF_ERROR( sscanf( buff, "assume %lf", &h->assume_fps ) != 1 && sscanf( buff, "Assume %lf", &h->assume_fps ) != 1,
+                           "tcfile parsing error: assumed fps not found\n" )
             break;
         }
-        if( h->assume_fps <= 0 )
-        {
-            fprintf( stderr, "timecode [error]: invalid assumed fps %.6f\n", h->assume_fps );
-            return -1;
-        }
+        FAIL_IF_ERROR( h->assume_fps <= 0, "invalid assumed fps %.6f\n", h->assume_fps )

         file_pos = ftell( tcfile_in );
         h->stored_pts_num = 0;
@@ -152,16 +134,9 @@ static int parse_tcfile( FILE *tcfile_in, timecode_hnd_t *h, video_info_t *info
                 continue;
             }
             ret = sscanf( buff, "%d,%d,%lf", &start, &end, &seq_fps );
-            if( ret != 3 && ret != EOF )
-            {
-                fprintf( stderr, "timecode [error]: invalid input tcfile\n" );
-                return -1;
-            }
-            if( start > end || start <= prev_start || end <= prev_end || seq_fps <= 0 )
-            {
-                fprintf( stderr, "timecode [error]: invalid input tcfile at line %d: %s\n", num, buff );
-                return -1;
-            }
+            FAIL_IF_ERROR( ret != 3 && ret != EOF, "invalid input tcfile\n" )
+            FAIL_IF_ERROR( start > end || start <= prev_start || end <= prev_end || seq_fps <= 0,
+                           "invalid input tcfile at line %d: %s\n", num, buff )
             prev_start = start;
             prev_end = end;
             if( h->auto_timebase_den || h->auto_timebase_num )
@@ -259,11 +234,7 @@ static int parse_tcfile( FILE *tcfile_in, timecode_hnd_t *h, video_info_t *info
             ++num;
         }
         timecodes_num = h->stored_pts_num + h->seek;
-        if( !timecodes_num )
-        {
-            fprintf( stderr, "timecode [error]: input tcfile doesn't have any timecodes!\n" );
-            return -1;
-        }
+        FAIL_IF_ERROR( !timecodes_num, "input tcfile doesn't have any timecodes!\n" )
         fseek( tcfile_in, file_pos, SEEK_SET );

         timecodes = malloc( timecodes_num * sizeof(double) );
@@ -272,11 +243,7 @@ static int parse_tcfile( FILE *tcfile_in, timecode_hnd_t *h, video_info_t *info

         fgets( buff, sizeof(buff), tcfile_in );
         ret = sscanf( buff, "%lf", &timecodes[0] );
-        if( ret != 1 )
-        {
-            fprintf( stderr, "timecode [error]: invalid input tcfile for frame 0\n" );
-            goto fail;
-        }
+        if( ret != 1 ) { x264_cli_log( "timecode", X264_LOG_ERROR, "invalid input tcfile for frame 0\n" ); goto fail; }
         for( num = 1; num < timecodes_num; )
         {
             fgets( buff, sizeof(buff), tcfile_in );
@@ -284,11 +251,8 @@ static int parse_tcfile( FILE *tcfile_in, timecode_hnd_t *h, video_info_t *info
                 continue;
             ret = sscanf( buff, "%lf", &timecodes[num] );
             timecodes[num] *= 1e-3;         /* Timecode format v2 is expressed in milliseconds. */
-            if( ret != 1 || timecodes[num] <= timecodes[num - 1] )
-            {
-                fprintf( stderr, "timecode [error]: invalid input tcfile for frame %d\n", num );
-                goto fail;
-            }
+            if( ret != 1 || timecodes[num] <= timecodes[num - 1] )
+            {   x264_cli_log( "timecode", X264_LOG_ERROR, "invalid input tcfile for frame %d\n", num ); goto fail; }
             ++num;
         }

@@ -342,14 +306,10 @@ static int parse_tcfile( FILE *tcfile_in, timecode_hnd_t *h, video_info_t *info
         uint64_t i = gcd( h->timebase_num, h->timebase_den );
         h->timebase_num /= i;
         h->timebase_den /= i;
-        fprintf( stderr, "timecode [info]: automatic timebase generation %"PRIu64"/%"PRIu64"\n", h->timebase_num, h->timebase_den );
-    }
-    else if( h->timebase_den > UINT32_MAX || !h->timebase_den )
-    {
-        fprintf( stderr, "timecode [error]: automatic timebase generation failed.\n"
-                         "                  Specify an appropriate timebase manually.\n" );
-        goto fail;
+        x264_cli_log( "timecode", X264_LOG_INFO, "automatic timebase generation %"PRIu64"/%"PRIu64"\n", h->timebase_num, h->timebase_den );
     }
+    else if( h->timebase_den > UINT32_MAX || !h->timebase_den )
+    {   x264_cli_log( "timecode", X264_LOG_ERROR, "automatic timebase generation failed.\n" "                  Specify an appropriate timebase manually.\n" ); goto fail; }

     h->pts = malloc( h->stored_pts_num * sizeof(int64_t) );
     if( !h->pts )
@@ -360,11 +320,7 @@ static int parse_tcfile( FILE *tcfile_in, timecode_hnd_t *h, video_info_t *info
     {
         h->pts[num] = (int64_t)( timecodes[h->seek + num] * ((double)h->timebase_den / h->timebase_num) + 0.5 );
         h->pts[num] -= pts_seek_offset;
-        if( h->pts[num] <= h->pts[num - 1] )
-        {
-            fprintf( stderr, "timecode [error]: invalid timebase or timecode for frame %d\n", num );
-            goto fail;
-        }
+        if( h->pts[num] <= h->pts[num - 1] ) { x264_cli_log( "timecode", X264_LOG_ERROR, "invalid timebase or timecode for frame %d\n", num ); goto fail; }
     }

     free( timecodes );
@@ -386,11 +342,7 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
     int ret = 0;
     FILE *tcfile_in;
     timecode_hnd_t *h = malloc( sizeof(timecode_hnd_t) );
-    if( !h )
-    {
-        fprintf( stderr, "timecode [error]: malloc failed\n" );
-        return -1;
-    }
+    FAIL_IF_ERROR( !h, "malloc failed\n" )
     h->input = input;
     h->p_handle = *p_handle;
     h->frame_total = input.get_frame_total( h->p_handle );
@@ -400,11 +352,8 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
         ret = sscanf( opt->timebase, "%"SCNu64"/%"SCNu64, &h->timebase_num, &h->timebase_den );
         if( ret == 1 )
             h->timebase_num = strtoul( opt->timebase, NULL, 10 );
-        if( h->timebase_num > UINT32_MAX || h->timebase_den > UINT32_MAX )
-        {
-            fprintf( stderr, "timecode [error]: timebase you specified exceeds H.264 maximum\n" );
-            return -1;
-        }
+        FAIL_IF_ERROR( h->timebase_num > UINT32_MAX || h->timebase_den > UINT32_MAX,
+                       "timebase you specified exceeds H.264 maximum\n" )
     }
     h->auto_timebase_num = !ret;
     h->auto_timebase_den = ret < 2;
@@ -418,14 +367,10 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
     *p_handle = h;

     tcfile_in = fopen( psz_filename, "rb" );
-    if( !tcfile_in )
-    {
-        fprintf( stderr, "timecode [error]: can't open `%s'\n", psz_filename );
-        return -1;
-    }
+    FAIL_IF_ERROR( !tcfile_in, "can't open `%s'\n", psz_filename )
     else if( !x264_is_regular_file( tcfile_in ) )
     {
-        fprintf( stderr, "timecode [error]: tcfile input incompatible with non-regular file `%s'\n", psz_filename );
+        x264_cli_log( "timecode", X264_LOG_ERROR, "tcfile input incompatible with non-regular file `%s'\n", psz_filename );
         fclose( tcfile_in );
         return -1;
     }
@@ -466,8 +411,8 @@ static int read_frame( x264_picture_t *p_pic, hnd_t handle, int i_frame )
     {
         if( h->pts )
         {
-            fprintf( stderr, "timecode [info]: input timecode file missing data for frame %d and later\n"
-                             "                 assuming constant fps %.6f\n", i_frame, h->assume_fps );
+            x264_cli_log( "timecode", X264_LOG_INFO, "input timecode file missing data for frame %d and later\n"
+                          "                 assuming constant fps %.6f\n", i_frame, h->assume_fps );
             free( h->pts );
             h->pts = NULL;
         }
diff --git a/input/y4m.c b/input/y4m.c
index fd42140..9b39d2f 100644
--- a/input/y4m.c
+++ b/input/y4m.c
@@ -21,7 +21,8 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  *****************************************************************************/

-#include "muxers.h"
+#include "input.h"
+#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "y4m", __VA_ARGS__ )

 typedef struct
 {
@@ -162,11 +163,7 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
     if( colorspace == X264_CSP_NONE )
         colorspace = X264_CSP_I420;

-    if( colorspace != X264_CSP_I420 )
-    {
-        fprintf( stderr, "y4m [error]: colorspace unhandled\n" );
-        return -1;
-    }
+    FAIL_IF_ERROR( colorspace != X264_CSP_I420, "colorspace unhandled\n" )

     *p_handle = h;
     return 0;
@@ -202,21 +199,13 @@ static int read_frame_internal( x264_picture_t *p_pic, y4m_hnd_t *h )
         return -1;

     header[slen] = 0;
-    if( strncmp( header, Y4M_FRAME_MAGIC, slen ) )
-    {
-        fprintf( stderr, "y4m [error]: bad header magic (%"PRIx32" <=> %s)\n",
-                 M32(header), header );
-        return -1;
-    }
+    FAIL_IF_ERROR( strncmp( header, Y4M_FRAME_MAGIC, slen ), "bad header magic (%"PRIx32" <=> %s)\n",
+                   M32(header), header )

     /* Skip most of it */
     while( i < MAX_FRAME_HEADER && fgetc( h->fh ) != '\n' )
         i++;
-    if( i == MAX_FRAME_HEADER )
-    {
-        fprintf( stderr, "y4m [error]: bad frame header!\n" );
-        return -1;
-    }
+    FAIL_IF_ERROR( i == MAX_FRAME_HEADER, "bad frame header!\n" )
     h->frame_header_len = i+slen+1;

     if( fread( p_pic->img.plane[0], h->width * h->height, 1, h->fh ) <= 0
diff --git a/input/yuv.c b/input/yuv.c
index cbed7fc..613662c 100644
--- a/input/yuv.c
+++ b/input/yuv.c
@@ -21,7 +21,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  *****************************************************************************/

-#include "muxers.h"
+#include "input.h"

 typedef struct
 {
@@ -45,11 +45,7 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
     }
     else
         sscanf( opt->resolution, "%ux%u", &info->width, &info->height );
-    if( !info->width || !info->height )
-    {
-        fprintf( stderr, "yuv [error]: rawyuv input requires a resolution.\n" );
-        return -1;
-    }
+    FAIL_IF_ERR( !info->width || !info->height, "yuv", "rawyuv input requires a resolution.\n" )

     h->next_frame = 0;
     info->vfr     = 0;
diff --git a/muxers.h b/muxers.h
deleted file mode 100644
index b309320..0000000
--- a/muxers.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*****************************************************************************
- * muxers.h: h264 file i/o modules
- *****************************************************************************
- * Copyright (C) 2003-2009 x264 project
- *
- * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Loren Merritt <lorenm@u.washington.edu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *****************************************************************************/
-
-#ifndef X264_MUXERS_H
-#define X264_MUXERS_H
-
-#include "common/common.h"
-#include "x264.h"
-
-typedef void *hnd_t;
-
-static inline int64_t gcd( int64_t a, int64_t b )
-{
-    while( 1 )
-    {
-        int64_t c = a % b;
-        if( !c )
-            return b;
-        a = b;
-        b = c;
-    }
-}
-
-static inline int64_t lcm( int64_t a, int64_t b )
-{
-    return ( a / gcd( a, b ) ) * b;
-}
-
-static inline char *get_filename_extension( char *filename )
-{
-    char *ext = filename + strlen( filename );
-    while( *ext != '.' && ext > filename )
-        ext--;
-    ext += *ext == '.';
-    return ext;
-}
-
-#include "input/input.h"
-#include "output/output.h"
-
-#endif
diff --git a/output/flv.c b/output/flv.c
index e441b6d..9831a5b 100644
--- a/output/flv.c
+++ b/output/flv.c
@@ -18,7 +18,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  *****************************************************************************/

-#include "muxers.h"
+#include "output.h"
 #include "flv_bytestream.h"

 #define CHECK(x)\
@@ -223,14 +223,14 @@ static int write_frame( hnd_t handle, uint8_t *p_nalu, int i_size, x264_picture_
         if( prev_dts == dts )
         {
             double fps = ((double)p_flv->i_timebase_den / p_flv->i_timebase_num) / (p_picture->i_dts - p_flv->i_prev_dts);
-            fprintf( stderr, "flv [warning]: duplicate DTS %"PRId64" generated by rounding\n"
-                             "               current internal decoding framerate: %.6f fps\n", dts, fps );
+            x264_cli_log( "flv", X264_LOG_WARNING, "duplicate DTS %"PRId64" generated by rounding\n"
+                          "               current internal decoding framerate: %.6f fps\n", dts, fps );
         }
         if( prev_cts == cts )
         {
             double fps = ((double)p_flv->i_timebase_den / p_flv->i_timebase_num) / (p_picture->i_pts - p_flv->i_prev_pts);
-            fprintf( stderr, "flv [warning]: duplicate CTS %"PRId64" generated by rounding\n"
-                             "               current internal composition framerate: %.6f fps\n", cts, fps );
+            x264_cli_log( "flv", X264_LOG_WARNING, "duplicate CTS %"PRId64" generated by rounding\n"
+                          "               current internal composition framerate: %.6f fps\n", cts, fps );
         }
     }
     p_flv->i_prev_dts = p_picture->i_dts;
diff --git a/output/flv_bytestream.c b/output/flv_bytestream.c
index 316114c..e02476c 100644
--- a/output/flv_bytestream.c
+++ b/output/flv_bytestream.c
@@ -18,7 +18,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  *****************************************************************************/

-#include "muxers.h"
+#include "output.h"
 #include "flv_bytestream.h"

 uint64_t dbl2int( double value )
diff --git a/output/matroska.c b/output/matroska.c
index 0304c84..a1219d0 100644
--- a/output/matroska.c
+++ b/output/matroska.c
@@ -18,7 +18,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  *****************************************************************************/

-#include "muxers.h"
+#include "output.h"
 #include "matroska_ebml.h"

 typedef struct
diff --git a/output/matroska_ebml.c b/output/matroska_ebml.c
index 31b62f8..adfcaa8 100644
--- a/output/matroska_ebml.c
+++ b/output/matroska_ebml.c
@@ -18,7 +18,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  *****************************************************************************/

-#include "muxers.h"
+#include "output.h"
 #include "matroska_ebml.h"

 #define CLSIZE 1048576
diff --git a/output/mp4.c b/output/mp4.c
index 0e3c2fc..f2fc5f5 100644
--- a/output/mp4.c
+++ b/output/mp4.c
@@ -21,7 +21,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  *****************************************************************************/

-#include "muxers.h"
+#include "output.h"
 #include <gpac/isomedia.h>

 #if HAVE_GF_MALLOC
@@ -61,12 +61,12 @@ static void recompute_bitrate_mp4( GF_ISOFile *p_file, int i_track )

     timescale = gf_isom_get_media_timescale( p_file, i_track );
     count = gf_isom_get_sample_count( p_file, i_track );
-    for( int i = 0; i < count; i++ )
+    for( u32 i = 0; i < count; i++ )
     {
         GF_ISOSample *samp = gf_isom_get_sample_info( p_file, i_track, i+1, &di, &offset );
         if( !samp )
         {
-            fprintf( stderr, "mp4 [error]: failure reading back frame %u\n", i );
+            x264_cli_log( "mp4", X264_LOG_ERROR, "failure reading back frame %u\n", i );
             break;
         }

@@ -163,11 +163,7 @@ static int open_file( char *psz_filename, hnd_t *p_handle )
     FILE *fh = fopen( psz_filename, "w" );
     if( !fh )
         return -1;
-    else if( !x264_is_regular_file( fh ) )
-    {
-        fprintf( stderr, "mp4 [error]: MP4 output is incompatible with non-regular file `%s'\n", psz_filename );
-        return -1;
-    }
+    FAIL_IF_ERR( !x264_is_regular_file( fh ), "mp4", "MP4 output is incompatible with non-regular file `%s'\n", psz_filename )
     fclose( fh );

     if( !(p_mp4 = malloc( sizeof(mp4_hnd_t) )) )
diff --git a/output/output.h b/output/output.h
index c79b48e..094fefc 100644
--- a/output/output.h
+++ b/output/output.h
@@ -24,6 +24,8 @@
 #ifndef X264_OUTPUT_H
 #define X264_OUTPUT_H

+#include "x264cli.h"
+
 typedef struct
 {
     int (*open_file)( char *psz_filename, hnd_t *p_handle );
diff --git a/output/raw.c b/output/raw.c
index 02e4c56..fc418fb 100644
--- a/output/raw.c
+++ b/output/raw.c
@@ -21,7 +21,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  *****************************************************************************/

-#include "muxers.h"
+#include "output.h"

 static int open_file( char *psz_filename, hnd_t *p_handle )
 {
diff --git a/x264.c b/x264.c
index f08ab41..741570c 100644
--- a/x264.c
+++ b/x264.c
@@ -31,9 +31,11 @@
 #include <getopt.h>

 #include "common/common.h"
-#include "common/cpu.h"
-#include "x264.h"
-#include "muxers.h"
+#include "x264cli.h"
+#include "input/input.h"
+#include "output/output.h"
+
+#define FAIL_IF_ERROR( cond, ... ) FAIL_IF_ERR( cond, "x264", __VA_ARGS__ )

 #ifdef _WIN32
 #include <windows.h>
@@ -96,6 +98,7 @@ static const char * const muxer_names[] =
 };

 static const char * const pulldown_names[] = { "none", "22", "32", "64", "double", "triple", "euro", 0 };
+static const char * const log_level_names[] = { "none", "error", "warning", "info", "debug", 0 };

 typedef struct{
     int mod;
@@ -141,6 +144,53 @@ static void Help( x264_param_t *defaults, int longhelp );
 static int  Parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt );
 static int  Encode( x264_param_t *param, cli_opt_t *opt );

+/* logging and printing for within the cli system */
+static int cli_log_level;
+
+/* Print "<name> [<level>]: " followed by the printf-style message to stderr.
+ * Nothing is printed when i_level is above the current cli_log_level. */
+void x264_cli_log( const char *name, int i_level, const char *fmt, ... )
+{
+    if( i_level > cli_log_level )
+        return;
+    const char *s_level; /* only ever points at string literals */
+    switch( i_level )
+    {
+        case X264_LOG_ERROR:
+            s_level = "error";
+            break;
+        case X264_LOG_WARNING:
+            s_level = "warning";
+            break;
+        case X264_LOG_INFO:
+            s_level = "info";
+            break;
+        case X264_LOG_DEBUG:
+            s_level = "debug";
+            break;
+        default:
+            s_level = "unknown";
+            break;
+    }
+    fprintf( stderr, "%s [%s]: ", name, s_level );
+    va_list arg;
+    va_start( arg, fmt );
+    vfprintf( stderr, fmt, arg );
+    va_end( arg );
+}
+
+/* Print the printf-style message to stderr without any prefix, subject to the
+ * same cli_log_level filtering as x264_cli_log(). */
+void x264_cli_printf( int i_level, const char *fmt, ... )
+{
+    if( i_level > cli_log_level )
+        return;
+    va_list arg;
+    va_start( arg, fmt );
+    vfprintf( stderr, fmt, arg );
+    va_end( arg );
+}
+
 /****************************************************************************
  * main:
  ****************************************************************************/
@@ -571,6 +616,9 @@ static void Help( x264_param_t *defaults, int longhelp )
     H1( "  -v, --verbose               Print stats for each frame\n" );
     H1( "      --no-progress           Don't show the progress indicator while encoding\n" );
     H0( "      --quiet                 Quiet Mode\n" );
+    H1( "      --log-level <string>    Specify the maximum level of logging [\"%s\"]\n"
+        "                                  - %s\n", strtable_lookup( log_level_names, cli_log_level - X264_LOG_NONE ),
+                                       stringify_names( buf, log_level_names ) );
     H1( "      --psnr                  Enable PSNR computation\n" );
     H1( "      --ssim                  Enable SSIM computation\n" );
     H1( "      --threads <integer>     Force a specific number of threads\n" );
@@ -616,6 +664,7 @@ enum {
     OPT_TCFILE_OUT,
     OPT_TIMEBASE,
     OPT_PULLDOWN,
+    OPT_LOG_LEVEL
 } OptionsOPT;

 static char short_options[] = "8A:B:b:f:hI:i:m:o:p:q:r:t:Vvw";
@@ -729,6 +778,7 @@ static struct option long_options[] =
     { "ssim",              no_argument, NULL, 0 },
     { "quiet",             no_argument, NULL, OPT_QUIET },
     { "verbose",           no_argument, NULL, 'v' },
+    { "log-level",   required_argument, NULL, OPT_LOG_LEVEL },
     { "no-progress",       no_argument, NULL, OPT_NOPROGRESS },
     { "visualize",         no_argument, NULL, OPT_VISUALIZE },
     { "dump-yuv",    required_argument, NULL, 0 },
@@ -780,11 +830,11 @@ static int select_output( const char *muxer, char *filename, x264_param_t *param
         param->b_repeat_headers = 0;
         if( param->i_nal_hrd == X264_NAL_HRD_CBR )
         {
-            fprintf( stderr, "x264 [warning]: cbr nal-hrd is not compatible with mp4\n" );
+            x264_cli_log( "x264", X264_LOG_WARNING, "cbr nal-hrd is not compatible with mp4\n" );
             param->i_nal_hrd = X264_NAL_HRD_VBR;
         }
 #else
-        fprintf( stderr, "x264 [error]: not compiled with MP4 output support\n" );
+        x264_cli_log( "x264", X264_LOG_ERROR, "not compiled with MP4 output support\n" );
         return -1;
 #endif
     }
@@ -833,7 +883,7 @@ static int select_input( const char *demuxer, char *used_demuxer, char *filename
         input = avs_input;
         module = "avs";
 #else
-        fprintf( stderr, "x264 [error]: not compiled with AVS input support\n" );
+        x264_cli_log( "x264", X264_LOG_ERROR, "not compiled with AVS input support\n" );
         return -1;
 #endif
     }
@@ -877,11 +927,7 @@ static int select_input( const char *demuxer, char *used_demuxer, char *filename
             input = yuv_input;
         }

-        if( !(*p_handle) )
-        {
-            fprintf( stderr, "x264 [error]: could not open input file `%s' via any method!\n", filename );
-            return -1;
-        }
+        FAIL_IF_ERROR( !(*p_handle), "could not open input file `%s' via any method!\n", filename )
     }
     strcpy( used_demuxer, module );

@@ -932,6 +978,7 @@ static int Parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt )
     char *tune = NULL;

     x264_param_default( &defaults );
+    cli_log_level = defaults.i_log_level;

     memset( opt, 0, sizeof(cli_opt_t) );
     memset( &input_opt, 0, sizeof(cli_input_opt_t) );
@@ -1004,32 +1051,20 @@ static int Parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt )
                 output_filename = optarg;
                 break;
             case OPT_MUXER:
-                if( parse_enum_name( optarg, muxer_names, &muxer ) < 0 )
-                {
-                    fprintf( stderr, "x264 [error]: Unknown muxer `%s'\n", optarg );
-                    return -1;
-                }
+                FAIL_IF_ERROR( parse_enum_name( optarg, muxer_names, &muxer ), "Unknown muxer `%s'\n", optarg )
                 break;
             case OPT_DEMUXER:
-                if( parse_enum_name( optarg, demuxer_names, &demuxer ) < 0 )
-                {
-                    fprintf( stderr, "x264 [error]: Unknown demuxer `%s'\n", optarg );
-                    return -1;
-                }
+                FAIL_IF_ERROR( parse_enum_name( optarg, demuxer_names, &demuxer ), "Unknown demuxer `%s'\n", optarg )
                 break;
             case OPT_INDEX:
                 input_opt.index_file = optarg;
                 break;
             case OPT_QPFILE:
                 opt->qpfile = fopen( optarg, "rb" );
-                if( !opt->qpfile )
-                {
-                    fprintf( stderr, "x264 [error]: can't open qpfile `%s'\n", optarg );
-                    return -1;
-                }
-                else if( !x264_is_regular_file( opt->qpfile ) )
+                FAIL_IF_ERROR( !opt->qpfile, "can't open qpfile `%s'\n", optarg )
+                if( !x264_is_regular_file( opt->qpfile ) )
                 {
-                    fprintf( stderr, "x264 [error]: qpfile incompatible with non-regular file `%s'\n", optarg );
+                    x264_cli_log( "x264", X264_LOG_ERROR, "qpfile incompatible with non-regular file `%s'\n", optarg );
                     fclose( opt->qpfile );
                     return -1;
                 }
@@ -1038,11 +1073,19 @@ static int Parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt )
                 b_thread_input = 1;
                 break;
             case OPT_QUIET:
-                param->i_log_level = X264_LOG_NONE;
+                cli_log_level = param->i_log_level = X264_LOG_NONE;
                 break;
             case 'v':
-                param->i_log_level = X264_LOG_DEBUG;
+                cli_log_level = param->i_log_level = X264_LOG_DEBUG;
                 break;
+            case OPT_LOG_LEVEL:
+                /* try a level name first (table index 0 is "none"); otherwise take a raw integer */
+                if( !parse_enum_value( optarg, log_level_names, &cli_log_level ) )
+                    cli_log_level += X264_LOG_NONE;
+                else
+                    cli_log_level = atoi( optarg );
+                param->i_log_level = cli_log_level;
+                break;
             case OPT_NOPROGRESS:
                 opt->b_progress = 0;
                 break;
@@ -1051,7 +1092,7 @@ static int Parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt )
                 param->b_visualize = 1;
                 b_exit_on_ctrl_c = 1;
 #else
-                fprintf( stderr, "x264 [warning]: not compiled with visualization support\n" );
+                x264_cli_log( "x264", X264_LOG_WARNING, "not compiled with visualization support\n" );
 #endif
                 break;
             case OPT_TUNE:
@@ -1078,18 +1119,13 @@ static int Parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt )
                 break;
             case OPT_TCFILE_OUT:
                 opt->tcfile_out = fopen( optarg, "wb" );
-                if( !opt->tcfile_out )
-                {
-                    fprintf( stderr, "x264 [error]: can't open `%s'\n", optarg );
-                    return -1;
-                }
+                FAIL_IF_ERROR( !opt->tcfile_out, "can't open `%s'\n", optarg )
                 break;
             case OPT_TIMEBASE:
                 input_opt.timebase = optarg;
                 break;
             case OPT_PULLDOWN:
-                if( parse_enum_value( optarg, pulldown_names, &opt->i_pulldown ) < 0 )
-                    return -1;
+                FAIL_IF_ERROR( parse_enum_value( optarg, pulldown_names, &opt->i_pulldown ), "Unknown pulldown `%s'\n", optarg )
                 break;
             default:
 generic_option:
@@ -1116,7 +1152,7 @@ generic_option:
         if( b_error )
         {
             const char *name = long_options_index > 0 ? long_options[long_options_index].name : argv[optind-2];
-            fprintf( stderr, "x264 [error]: invalid argument: %s = %s\n", name, optarg );
+            x264_cli_log( "x264", X264_LOG_ERROR, "invalid argument: %s = %s\n", name, optarg );
             return -1;
         }
     }
@@ -1130,20 +1166,12 @@ generic_option:
         return -1;

     /* Get the file name */
-    if( optind > argc - 1 || !output_filename )
-    {
-        fprintf( stderr, "x264 [error]: No %s file. Run x264 --help for a list of options.\n",
-                 optind > argc - 1 ? "input" : "output" );
-        return -1;
-    }
+    FAIL_IF_ERROR( optind > argc - 1 || !output_filename, "No %s file. Run x264 --help for a list of options.\n",
+                   optind > argc - 1 ? "input" : "output" )

     if( select_output( muxer, output_filename, param ) )
         return -1;
-    if( output.open_file( output_filename, &opt->hout ) )
-    {
-        fprintf( stderr, "x264 [error]: could not open output file `%s'\n", output_filename );
-        return -1;
-    }
+    FAIL_IF_ERROR( output.open_file( output_filename, &opt->hout ), "could not open output file `%s'\n", output_filename )

     input_filename = argv[optind++];
     input_opt.resolution = optind < argc ? argv[optind++] : NULL;
@@ -1163,39 +1191,22 @@ generic_option:
     if( select_input( demuxer, demuxername, input_filename, &opt->hin, &info, &input_opt ) )
         return -1;

-    if( !opt->hin && input.open_file( input_filename, &opt->hin, &info, &input_opt ) )
-    {
-        fprintf( stderr, "x264 [error]: could not open input file `%s'\n", input_filename );
-        return -1;
-    }
+    FAIL_IF_ERROR( !opt->hin && input.open_file( input_filename, &opt->hin, &info, &input_opt ),
+                   "could not open input file `%s'\n", input_filename )

     x264_reduce_fraction( &info.sar_width, &info.sar_height );
     x264_reduce_fraction( &info.fps_num, &info.fps_den );
-    if( param->i_log_level >= X264_LOG_INFO )
-        fprintf( stderr, "%s [info]: %dx%d%c %d:%d @ %d/%d fps (%cfr)\n", demuxername, info.width,
-                 info.height, info.interlaced ? 'i' : 'p', info.sar_width, info.sar_height,
-                 info.fps_num, info.fps_den, info.vfr ? 'v' : 'c' );
+    x264_cli_log( demuxername, X264_LOG_INFO, "%dx%d%c %d:%d @ %d/%d fps (%cfr)\n", info.width,
+                  info.height, info.interlaced ? 'i' : 'p', info.sar_width, info.sar_height,
+                  info.fps_num, info.fps_den, info.vfr ? 'v' : 'c' );

     if( tcfile_name )
     {
-        if( b_user_fps )
-        {
-            fprintf( stderr, "x264 [error]: --fps + --tcfile-in is incompatible.\n" );
-            return -1;
-        }
-        if( timecode_input.open_file( tcfile_name, &opt->hin, &info, &input_opt ) )
-        {
-            fprintf( stderr, "x264 [error]: timecode input failed\n" );
-            return -1;
-        }
-        else
-            input = timecode_input;
-    }
-    else if( !info.vfr && input_opt.timebase )
-    {
-        fprintf( stderr, "x264 [error]: --timebase is incompatible with cfr input\n" );
-        return -1;
+        FAIL_IF_ERROR( b_user_fps, "--fps + --tcfile-in is incompatible.\n" )
+        FAIL_IF_ERROR( timecode_input.open_file( tcfile_name, &opt->hin, &info, &input_opt ), "timecode input failed\n" )
+        input = timecode_input;
     }
+    else FAIL_IF_ERROR( !info.vfr && input_opt.timebase, "--timebase is incompatible with cfr input\n" )

     /* set param flags from the info flags as necessary */
     param->i_csp       = info.csp;
@@ -1204,9 +1215,9 @@ generic_option:
     param->i_width     = info.width;
     if( !b_user_interlaced && info.interlaced )
     {
-        fprintf( stderr, "x264 [warning]: input appears to be interlaced, enabling %cff interlaced mode.\n"
-                         "                If you want otherwise, use --no-interlaced or --%cff\n",
-                 info.tff ? 't' : 'b', info.tff ? 'b' : 't' );
+        x264_cli_log( "x264", X264_LOG_WARNING, "input appears to be interlaced, enabling %cff interlaced mode.\n"
+                      "                If you want otherwise, use --no-interlaced or --%cff\n",
+                      info.tff ? 't' : 'b', info.tff ? 'b' : 't' );
         param->b_interlaced = 1;
         param->b_tff = !!info.tff;
     }
@@ -1230,21 +1241,14 @@ generic_option:
         uint64_t i_user_timebase_num;
         uint64_t i_user_timebase_den;
         int ret = sscanf( input_opt.timebase, "%"SCNu64"/%"SCNu64, &i_user_timebase_num, &i_user_timebase_den );
-        if( !ret )
-        {
-            fprintf( stderr, "x264 [error]: invalid argument: timebase = %s\n", input_opt.timebase );
-            return -1;
-        }
+        FAIL_IF_ERROR( !ret, "invalid argument: timebase = %s\n", input_opt.timebase )
         else if( ret == 1 )
         {
             i_user_timebase_num = param->i_timebase_num;
             i_user_timebase_den = strtoul( input_opt.timebase, NULL, 10 );
         }
-        if( i_user_timebase_num > UINT32_MAX || i_user_timebase_den > UINT32_MAX )
-        {
-            fprintf( stderr, "x264 [error]: timebase you specified exceeds H.264 maximum\n" );
-            return -1;
-        }
+        FAIL_IF_ERROR( i_user_timebase_num > UINT32_MAX || i_user_timebase_den > UINT32_MAX,
+                       "timebase you specified exceeds H.264 maximum\n" )
         opt->timebase_convert_multiplier = ((double)i_user_timebase_den / param->i_timebase_den)
                                          * ((double)param->i_timebase_num / i_user_timebase_num);
         param->i_timebase_num = i_user_timebase_num;
@@ -1261,13 +1265,8 @@ generic_option:
     if( b_thread_input || param->i_threads > 1
         || (param->i_threads == X264_THREADS_AUTO && x264_cpu_num_processors() > 1) )
     {
-        if( thread_input.open_file( NULL, &opt->hin, &info, NULL ) )
-        {
-            fprintf( stderr, "x264 [error]: threaded input failed\n" );
-            return -1;
-        }
-        else
-            input = thread_input;
+        FAIL_IF_ERROR( thread_input.open_file( NULL, &opt->hin, &info, NULL ), "threaded input failed\n" )
+        input = thread_input;
     }
 #endif

@@ -1321,7 +1320,7 @@ static void parse_qpfile( cli_opt_t *opt, x264_picture_t *pic, int i_frame )
         else ret = 0;
         if( ret != 3 || qp < -1 || qp > 51 )
         {
-            fprintf( stderr, "x264 [error]: can't parse qpfile for frame %d\n", i_frame );
+            x264_cli_log( "x264", X264_LOG_ERROR, "can't parse qpfile for frame %d\n", i_frame );
             fclose( opt->qpfile );
             opt->qpfile = NULL;
             pic->i_type = X264_TYPE_AUTO;
@@ -1344,11 +1343,7 @@ static int  Encode_frame( x264_t *h, hnd_t hout, x264_picture_t *pic, int64_t *l

     i_frame_size = x264_encoder_encode( h, &nal, &i_nal, pic, &pic_out );

-    if( i_frame_size < 0 )
-    {
-        fprintf( stderr, "x264 [error]: x264_encoder_encode failed\n" );
-        return -1;
-    }
+    FAIL_IF_ERROR( i_frame_size < 0, "x264_encoder_encode failed\n" );

     if( i_frame_size )
     {
@@ -1424,17 +1419,14 @@ static int  Encode( x264_param_t *param, cli_opt_t *opt )
         param->b_pic_struct = 1;
         pulldown = &pulldown_values[opt->i_pulldown];
         param->i_timebase_num = param->i_fps_den;
-        if( fmod( param->i_fps_num * pulldown->fps_factor, 1 ) )
-        {
-            fprintf( stderr, "x264 [error]: unsupported framerate for chosen pulldown\n" );
-            return -1;
-        }
+        FAIL_IF_ERROR( fmod( param->i_fps_num * pulldown->fps_factor, 1 ),
+                       "unsupported framerate for chosen pulldown\n" )
         param->i_timebase_den = param->i_fps_num * pulldown->fps_factor;
     }

     if( ( h = x264_encoder_open( param ) ) == NULL )
     {
-        fprintf( stderr, "x264 [error]: x264_encoder_open failed\n" );
+        x264_cli_log( "x264", X264_LOG_ERROR, "x264_encoder_open failed\n" );
         input.close_file( opt->hin );
         return -1;
     }
@@ -1445,27 +1437,19 @@ static int  Encode( x264_param_t *param, cli_opt_t *opt )

     if( output.set_param( opt->hout, param ) )
     {
-        fprintf( stderr, "x264 [error]: can't set outfile param\n" );
+        x264_cli_log( "x264", X264_LOG_ERROR, "can't set outfile param\n" );
         input.close_file( opt->hin );
         output.close_file( opt->hout, largest_pts, second_largest_pts );
         return -1;
     }

     /* Create a new pic */
-    if( input.picture_alloc( &pic, param->i_csp, param->i_width, param->i_height ) )
-    {
-        fprintf( stderr, "x264 [error]: malloc failed\n" );
-        return -1;
-    }
+    FAIL_IF_ERROR( input.picture_alloc( &pic, param->i_csp, param->i_width, param->i_height ), "malloc failed\n" )

     i_start = x264_mdate();
     /* ticks/frame = ticks/second / frames/second */
     ticks_per_frame = (int64_t)param->i_timebase_den * param->i_fps_den / param->i_timebase_num / param->i_fps_num;
-    if( ticks_per_frame < 1 )
-    {
-        fprintf( stderr, "x264 [error]: ticks_per_frame invalid: %"PRId64"\n", ticks_per_frame );
-        return -1;
-    }
+    FAIL_IF_ERROR( ticks_per_frame < 1, "ticks_per_frame invalid: %"PRId64"\n", ticks_per_frame )

     if( !param->b_repeat_headers )
     {
@@ -1473,12 +1457,7 @@ static int  Encode( x264_param_t *param, cli_opt_t *opt )
         x264_nal_t *headers;
         int i_nal;

-        if( x264_encoder_headers( h, &headers, &i_nal ) < 0 )
-        {
-            fprintf( stderr, "x264 [error]: x264_encoder_headers failed\n" );
-            return -1;
-        }
-
+        FAIL_IF_ERROR( x264_encoder_headers( h, &headers, &i_nal ) < 0, "x264_encoder_headers failed\n" )
         if( (i_file = output.write_headers( opt->hout, headers )) < 0 )
             return -1;
     }
@@ -1508,15 +1487,12 @@ static int  Encode( x264_param_t *param, cli_opt_t *opt )

         if( pic.i_pts <= largest_pts )
         {
-            if( param->i_log_level >= X264_LOG_WARNING )
-            {
-                if( param->i_log_level >= X264_LOG_DEBUG || pts_warning_cnt < MAX_PTS_WARNING )
-                    fprintf( stderr, "x264 [warning]: non-strictly-monotonic pts at frame %d (%"PRId64" <= %"PRId64")\n",
+            if( cli_log_level >= X264_LOG_DEBUG || pts_warning_cnt < MAX_PTS_WARNING )
+                x264_cli_log( "x264", X264_LOG_WARNING, "non-strictly-monotonic pts at frame %d (%"PRId64" <= %"PRId64")\n",
                              i_frame, output_pts, largest_pts * dts_compress_multiplier );
-                else if( pts_warning_cnt == MAX_PTS_WARNING )
-                    fprintf( stderr, "x264 [warning]: too many nonmonotonic pts warnings, suppressing further ones\n" );
-                pts_warning_cnt++;
-            }
+            else if( pts_warning_cnt == MAX_PTS_WARNING )
+                x264_cli_log( "x264", X264_LOG_WARNING, "too many nonmonotonic pts warnings, suppressing further ones\n" );
+            pts_warning_cnt++;
             pic.i_pts = largest_pts + ticks_per_frame;
             output_pts = pic.i_pts * dts_compress_multiplier;
         }
@@ -1573,8 +1549,8 @@ static int  Encode( x264_param_t *param, cli_opt_t *opt )
         if( opt->b_progress && i_frame_output % i_update_interval == 0 && i_frame_output )
             Print_status( i_start, i_frame_output, i_frame_total, i_file, param, 2 * last_dts - prev_dts - first_dts );
     }
-    if( pts_warning_cnt >= MAX_PTS_WARNING && param->i_log_level < X264_LOG_DEBUG )
-        fprintf( stderr, "x264 [warning]: %d suppressed nonmonotonic pts warnings\n", pts_warning_cnt-MAX_PTS_WARNING );
+    if( pts_warning_cnt >= MAX_PTS_WARNING && cli_log_level < X264_LOG_DEBUG )
+        x264_cli_log( "x264", X264_LOG_WARNING, "%d suppressed nonmonotonic pts warnings\n", pts_warning_cnt-MAX_PTS_WARNING );

     /* duration algorithm fails when only 1 frame is output */
     if( i_frame_output == 1 )
diff --git a/x264cli.h b/x264cli.h
new file mode 100644
index 0000000..1acca56
--- /dev/null
+++ b/x264cli.h
@@ -0,0 +1,76 @@
+/*****************************************************************************
+ * x264cli.h: x264cli common
+ *****************************************************************************
+ * Copyright (C) 2003-2010 x264 project
+ *
+ * Authors: Laurent Aimar <fenrir@via.ecp.fr>
+ *          Loren Merritt <lorenm@u.washington.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *****************************************************************************/
+
+#ifndef X264_CLI_H
+#define X264_CLI_H
+
+#include "common/common.h"
+
+typedef void *hnd_t;
+
+/* Greatest common divisor via the Euclidean algorithm.
+ * b must be nonzero: the first step evaluates a % b. */
+static inline int64_t gcd( int64_t a, int64_t b )
+{
+    while( 1 )
+    {
+        int64_t c = a % b;
+        if( !c )
+            return b;
+        a = b;
+        b = c;
+    }
+}
+
+/* Least common multiple; divides a by gcd( a, b ) before multiplying by b.
+ * Inherits gcd()'s requirement that b be nonzero. */
+static inline int64_t lcm( int64_t a, int64_t b )
+{
+    return ( a / gcd( a, b ) ) * b;
+}
+
+/* Returns a pointer to the text after the last '.' in filename.
+ * When there is no '.', the backwards scan stops at the start and filename itself is returned. */
+static inline char *get_filename_extension( char *filename )
+{
+    char *ext = filename + strlen( filename );
+    while( *ext != '.' && ext > filename )
+        ext--;
+    ext += *ext == '.';
+    return ext;
+}
+
+/* CLI logging helpers; only declared here (the definitions live in x264.c). */
+void x264_cli_log( const char *name, int i_level, const char *fmt, ... );
+void x264_cli_printf( int i_level, const char *fmt, ... );
+
+/* If cond is true, log an error tagged with name and return -1 from the enclosing function.
+ * Expands to a bare if-block (not do/while(0)): call sites omit the trailing semicolon and some chain an else after it. */
+#define FAIL_IF_ERR( cond, name, ... )\
+if( cond )\
+{\
+    x264_cli_log( name, X264_LOG_ERROR, __VA_ARGS__ );\
+    return -1;\
+}
+
+#endif
--
1.7.1


From e9d3c46276c11f98f3819d8faf1fd0402bcb6a08 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Wed, 30 Jun 2010 13:06:22 -0700
Subject: [PATCH 5/7] Don't check i16x16 planar mode unless previous modes were useful
 Saves ~160 clocks per MB at subme=1, ~270 per MB at subme>1 (measured on Core i7).
 Negligible effect on compression.

Also make a few more arrays static.
---
 encoder/analyse.c |   29 +++++++++++++++++++----------
 encoder/set.c     |    3 ++-
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/encoder/analyse.c b/encoder/analyse.c
index 696c78f..cdbdd1e 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -646,16 +646,27 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
     /* 16x16 prediction selection */
     const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );

+    /* Not heavily tuned */
+    static const uint8_t i16x16_thresh_lut[11] = { 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4 };
+    int i16x16_thresh = a->b_fast_intra ? (i16x16_thresh_lut[h->mb.i_subpel_refine]*i_satd_inter)>>1 : COST_MAX;
+
     if( !h->mb.b_lossless && predict_mode[3] >= 0 )
     {
         h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
-        h->predict_16x16[I_PRED_16x16_P]( p_dst );
-        a->i_satd_i16x16_dir[I_PRED_16x16_P] =
-            h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
-        for( int i = 0; i < 4; i++ )
+        a->i_satd_i16x16_dir[0] += lambda * bs_size_ue(0);
+        a->i_satd_i16x16_dir[1] += lambda * bs_size_ue(1);
+        a->i_satd_i16x16_dir[2] += lambda * bs_size_ue(2);
+        COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[0], a->i_predict16x16, 0 );
+        COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[1], a->i_predict16x16, 1 );
+        COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[2], a->i_predict16x16, 2 );
+
+        /* Plane is expensive, so don't check it unless one of the previous modes was useful. */
+        if( a->i_satd_i16x16 <= i16x16_thresh )
         {
-            int cost = a->i_satd_i16x16_dir[i] += lambda * bs_size_ue(i);
-            COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i );
+            h->predict_16x16[I_PRED_16x16_P]( p_dst );
+            a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
+            a->i_satd_i16x16_dir[I_PRED_16x16_P] += lambda * bs_size_ue(3);
+            COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[I_PRED_16x16_P], a->i_predict16x16, 3 );
         }
     }
     else
@@ -681,9 +692,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
         /* cavlc mb type prefix */
         a->i_satd_i16x16 += lambda * i_mb_b_cost_table[I_16x16];

-    /* Not heavily tuned */
-    const uint8_t i16x16_thresh[11] = { 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4 };
-    if( a->b_fast_intra && a->i_satd_i16x16 > (i16x16_thresh[h->mb.i_subpel_refine]*i_satd_inter)>>1 )
+    if( a->i_satd_i16x16 > i16x16_thresh )
         return;

     /* 8x8 prediction selection */
@@ -784,7 +793,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
             i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
         }
         /* Not heavily tuned */
-        const uint8_t i8x8_thresh[11] = { 4, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6 };
+        static const uint8_t i8x8_thresh[11] = { 4, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6 };
         if( X264_MIN(i_cost, a->i_satd_i16x16) > (i_satd_inter*i8x8_thresh[h->mb.i_subpel_refine])>>2 )
             return;
     }
diff --git a/encoder/set.c b/encoder/set.c
index 8d007aa..8ea6eac 100644
--- a/encoder/set.c
+++ b/encoder/set.c
@@ -534,7 +534,8 @@ int x264_sei_version_write( x264_t *h, bs_t *s )
 {
     int i;
     // random ID number generated according to ISO-11578
-    const uint8_t uuid[16] = {
+    static const uint8_t uuid[16] =
+    {
         0xdc, 0x45, 0xe9, 0xbd, 0xe6, 0xd9, 0x48, 0xb7,
         0x96, 0x2c, 0xd8, 0x20, 0xd9, 0x23, 0xee, 0xef
     };
--
1.7.1


From 470c853a3c0817573139ab387b1c3fe207d62a17 Mon Sep 17 00:00:00 2001
From: Jason Garrett-Glaser <darkshikari@gmail.com>
Date: Wed, 30 Jun 2010 13:55:46 -0700
Subject: [PATCH 6/7] Support infinite keyint (--keyint infinite).
 This just means x264 won't insert non-scenecut keyframes.
 Useful for streaming when using interactive error recovery or some other mechanism that makes keyframes unnecessary.

Also change POC logic to limit POC/framenum LSB size (to save bits per slice).
Also fix a bug in the CPB underflow detection code (didn't affect the bitstream, just resulted in the failure to print certain warning messages).
---
 common/common.c       |    7 ++++---
 encoder/encoder.c     |    8 ++++----
 encoder/ratecontrol.c |   10 +++++-----
 encoder/set.c         |   29 +++++++++++++++++------------
 encoder/slicetype.c   |    2 +-
 x264.c                |    2 +-
 x264.h                |    3 ++-
 7 files changed, 34 insertions(+), 27 deletions(-)

diff --git a/common/common.c b/common/common.c
index 8c7cf3c..14dd716 100644
--- a/common/common.c
+++ b/common/common.c
@@ -638,9 +638,10 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
         p->i_dpb_size = atoi(value);
     OPT("keyint")
     {
-        p->i_keyint_max = atoi(value);
-        if( p->i_keyint_min > p->i_keyint_max )
-            p->i_keyint_min = p->i_keyint_max;
+        if( strstr( value, "infinite" ) )
+            p->i_keyint_max = X264_KEYINT_MAX_INFINITE;
+        else
+            p->i_keyint_max = atoi(value);
     }
     OPT2("min-keyint", "keyint-min")
     {
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 5cd3307..31cb84a 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -567,8 +567,7 @@ static int x264_validate_parameters( x264_t *h )

     h->param.i_frame_reference = x264_clip3( h->param.i_frame_reference, 1, 16 );
     h->param.i_dpb_size = x264_clip3( h->param.i_dpb_size, 1, 16 );
-    if( h->param.i_keyint_max <= 0 )
-        h->param.i_keyint_max = 1;
+    h->param.i_keyint_max = x264_clip3( h->param.i_keyint_max, 1, X264_KEYINT_MAX_INFINITE );
     if( h->param.i_scenecut_threshold < 0 )
         h->param.i_scenecut_threshold = 0;
     if( !h->param.analyse.i_subpel_refine && h->param.analyse.i_direct_mv_pred > X264_DIRECT_PRED_SPATIAL )
@@ -627,9 +626,10 @@ static int x264_validate_parameters( x264_t *h )
     h->param.rc.f_qcompress = x264_clip3f( h->param.rc.f_qcompress, 0.0, 1.0 );
     if( h->param.i_keyint_max == 1 || h->param.rc.f_qcompress == 1 )
         h->param.rc.b_mb_tree = 0;
-    if( !h->param.rc.i_lookahead && !h->param.b_intra_refresh && h->param.rc.b_mb_tree )
+    if( (!h->param.b_intra_refresh && h->param.i_keyint_max != X264_KEYINT_MAX_INFINITE) &&
+        !h->param.rc.i_lookahead && h->param.rc.b_mb_tree )
     {
-        x264_log( h, X264_LOG_WARNING, "lookaheadless mb-tree requires intra refresh\n" );
+        x264_log( h, X264_LOG_WARNING, "lookaheadless mb-tree requires intra refresh or infinite keyint\n" );
         h->param.rc.b_mb_tree = 0;
     }
     if( h->param.rc.b_stat_read )
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index 1030ef2..6fdaa98 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -492,13 +492,13 @@ void x264_ratecontrol_init_reconfigurable( x264_t *h, int b_init )
             // arbitrary
             #define MAX_DURATION 0.5

-            int max_cpb_output_delay = h->param.i_keyint_max * MAX_DURATION * h->sps->vui.i_time_scale / h->sps->vui.i_num_units_in_tick;
+            int max_cpb_output_delay = X264_MIN( h->param.i_keyint_max * MAX_DURATION * h->sps->vui.i_time_scale / h->sps->vui.i_num_units_in_tick, INT_MAX );
             int max_dpb_output_delay = h->sps->vui.i_max_dec_frame_buffering * MAX_DURATION * h->sps->vui.i_time_scale / h->sps->vui.i_num_units_in_tick;
             int max_delay = (int)(90000.0 * (double)h->sps->vui.hrd.i_cpb_size_unscaled / h->sps->vui.hrd.i_bit_rate_unscaled + 0.5);

             h->sps->vui.hrd.i_initial_cpb_removal_delay_length = 2 + x264_clip3( 32 - x264_clz( max_delay ), 4, 22 );
-            h->sps->vui.hrd.i_cpb_removal_delay_length = x264_clip3( 32 - x264_clz( max_cpb_output_delay ), 4, 32 );
-            h->sps->vui.hrd.i_dpb_output_delay_length  = x264_clip3( 32 - x264_clz( max_dpb_output_delay ), 4, 32 );
+            h->sps->vui.hrd.i_cpb_removal_delay_length = x264_clip3( 32 - x264_clz( max_cpb_output_delay ), 4, 31 );
+            h->sps->vui.hrd.i_dpb_output_delay_length  = x264_clip3( 32 - x264_clz( max_dpb_output_delay ), 4, 31 );

             #undef MAX_DURATION

@@ -1781,10 +1781,10 @@ void x264_hrd_fullness( x264_t *h )
     uint64_t cpb_size = (uint64_t)h->sps->vui.hrd.i_cpb_size_unscaled * h->sps->vui.i_time_scale;
     uint64_t multiply_factor = 180000 / rct->hrd_multiply_denom;

-    if( cpb_state < 0 || cpb_state > cpb_size )
+    if( rct->buffer_fill_final < 0 || rct->buffer_fill_final > cpb_size )
     {
          x264_log( h, X264_LOG_WARNING, "CPB %s: %.0lf bits in a %.0lf-bit buffer\n",
-                   cpb_state < 0 ? "underflow" : "overflow", (float)cpb_state/denom, (float)cpb_size/denom );
+                   rct->buffer_fill_final < 0 ? "underflow" : "overflow", (float)rct->buffer_fill_final/denom, (float)cpb_size/denom );
     }

     h->initial_cpb_removal_delay = (multiply_factor * cpb_state + denom) / (2*denom);
diff --git a/encoder/set.c b/encoder/set.c
index 8ea6eac..9e6e736 100644
--- a/encoder/set.c
+++ b/encoder/set.c
@@ -99,6 +99,7 @@ static void x264_sei_write( bs_t *s, uint8_t *p_start )
 void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
 {
     sps->i_id = i_id;
+    int max_frame_num;

     sps->b_qpprime_y_zero_transform_bypass = param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant == 0;
     if( sps->b_qpprime_y_zero_transform_bypass )
@@ -118,15 +119,27 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
     /* Never set constraint_set2, it is not necessary and not used in real world. */
     sps->b_constraint_set2  = 0;

-    sps->i_log2_max_frame_num = 4;  /* at least 4 */
-    while( (1 << sps->i_log2_max_frame_num) <= param->i_keyint_max && sps->i_log2_max_frame_num < 10 )
+    sps->vui.i_num_reorder_frames = param->i_bframe_pyramid ? 2 : param->i_bframe ? 1 : 0;
+    /* extra slot with pyramid so that we don't have to override the
+     * order of forgetting old pictures */
+    sps->vui.i_max_dec_frame_buffering =
+    sps->i_num_ref_frames = X264_MIN(16, X264_MAX4(param->i_frame_reference, 1 + sps->vui.i_num_reorder_frames,
+                            param->i_bframe_pyramid ? 4 : 1, param->i_dpb_size));
+    sps->i_num_ref_frames -= param->i_bframe_pyramid == X264_B_PYRAMID_STRICT;
+
+    /* number of refs + current frame */
+    max_frame_num = sps->vui.i_max_dec_frame_buffering * (!!param->i_bframe_pyramid+1) + 1;
+    sps->i_log2_max_frame_num = 4;
+    while( (1 << sps->i_log2_max_frame_num) <= max_frame_num )
         sps->i_log2_max_frame_num++;
-    sps->i_log2_max_frame_num++;

     sps->i_poc_type = 0;
     if( sps->i_poc_type == 0 )
     {
-        sps->i_log2_max_poc_lsb = sps->i_log2_max_frame_num + 1;    /* max poc = 2*frame_num */
+        int max_delta_poc = (param->i_bframe + 2) * (!!param->i_bframe_pyramid + 1) * 2;
+        sps->i_log2_max_poc_lsb = 4;
+        while( (1 << sps->i_log2_max_poc_lsb) <= max_delta_poc * 2 )
+            sps->i_log2_max_poc_lsb++;
     }
     else if( sps->i_poc_type == 1 )
     {
@@ -219,14 +232,6 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )

     // NOTE: HRD related parts of the SPS are initialised in x264_ratecontrol_init_reconfigurable

-    sps->vui.i_num_reorder_frames = param->i_bframe_pyramid ? 2 : param->i_bframe ? 1 : 0;
-    /* extra slot with pyramid so that we don't have to override the
-     * order of forgetting old pictures */
-    sps->vui.i_max_dec_frame_buffering =
-    sps->i_num_ref_frames = X264_MIN(16, X264_MAX4(param->i_frame_reference, 1 + sps->vui.i_num_reorder_frames,
-                            param->i_bframe_pyramid ? 4 : 1, param->i_dpb_size));
-    sps->i_num_ref_frames -= param->i_bframe_pyramid == X264_B_PYRAMID_STRICT;
-
     sps->vui.b_bitstream_restriction = 1;
     if( sps->vui.b_bitstream_restriction )
     {
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index 4ede8cf..7d69b71 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -1009,7 +1009,7 @@ static int scenecut_internal( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **f
     float f_thresh_max = h->param.i_scenecut_threshold / 100.0;
     /* magic numbers pulled out of thin air */
     float f_thresh_min = f_thresh_max * h->param.i_keyint_min
-                         / ( h->param.i_keyint_max * 4 );
+                         / ( h->param.i_keyint_max * 4. );
     int res;

     if( h->param.i_keyint_min == h->param.i_keyint_max )
diff --git a/x264.c b/x264.c
index 741570c..0bede93 100644
--- a/x264.c
+++ b/x264.c
@@ -409,7 +409,7 @@ static void Help( x264_param_t *defaults, int longhelp )
     H0( "\n" );
     H0( "Frame-type options:\n" );
     H0( "\n" );
-    H0( "  -I, --keyint <integer>      Maximum GOP size [%d]\n", defaults->i_keyint_max );
+    H0( "  -I, --keyint <integer or \"infinite\"> Maximum GOP size [%d]\n", defaults->i_keyint_max );
     H2( "  -i, --min-keyint <integer>  Minimum GOP size [auto]\n" );
     H2( "      --no-scenecut           Disable adaptive I-frame decision\n" );
     H2( "      --scenecut <integer>    How aggressively to insert extra I-frames [%d]\n", defaults->i_scenecut_threshold );
diff --git a/x264.h b/x264.h
index 86f7426..097365a 100644
--- a/x264.h
+++ b/x264.h
@@ -35,7 +35,7 @@

 #include <stdarg.h>

-#define X264_BUILD 101
+#define X264_BUILD 102

 /* x264_t:
  *      opaque handler for encoder */
@@ -152,6 +152,7 @@ typedef struct
 #define X264_B_PYRAMID_STRICT        1
 #define X264_B_PYRAMID_NORMAL        2
 #define X264_KEYINT_MIN_AUTO         0
+#define X264_KEYINT_MAX_INFINITE     (1<<30)
 #define X264_OPEN_GOP_NONE           0
 #define X264_OPEN_GOP_NORMAL         1
 #define X264_OPEN_GOP_BLURAY         2
--
1.7.1


From 1b4e95140832b569f81984dcc36fea50452380f9 Mon Sep 17 00:00:00 2001
From: Oskar Arvidsson <oskar@irock.se>
Date: Fri, 2 Jul 2010 04:06:08 +0200
Subject: [PATCH 7/7] Support for 9 and 10-bit encoding
 Output bit depth is specified at compile time via --bit-depth.
 There is currently almost no assembly code available for high-bit-depth modes, so encoding will be very slow.
 Input is still 8-bit only; this will change in the future.

Note that very few H.264 decoders support >8 bit depth currently.
---
 common/arm/mc-c.c      |   42 +++++++-----
 common/arm/predict-c.c |    8 ++
 common/bitstream.h     |    2 +-
 common/common.c        |   17 ++++-
 common/common.h        |   39 +++++++----
 common/dct.c           |   15 +++-
 common/deblock.c       |   27 +++++---
 common/macroblock.c    |    2 +-
 common/macroblock.h    |   66 ++++++++++++-------
 common/mc.c            |   33 ++++-----
 common/mc.h            |    2 +-
 common/pixel.c         |   14 +++-
 common/ppc/dct.c       |    2 +
 common/ppc/deblock.c   |    2 +
 common/ppc/mc.c        |    4 +
 common/ppc/pixel.c     |    4 +
 common/ppc/predict.c   |    6 ++
 common/ppc/quant.c     |    2 +
 common/predict.c       |   63 ++++++++++--------
 common/quant.c         |   14 ++---
 common/set.c           |   25 ++++++-
 common/x86/mc-c.c      |   12 +++-
 common/x86/predict-c.c |   10 +++
 configure              |   16 +++++
 encoder/analyse.c      |   80 ++++++++++++-----------
 encoder/cabac.c        |   25 ++++---
 encoder/cavlc.c        |   24 +++----
 encoder/encoder.c      |   45 +++++++++----
 encoder/macroblock.h   |    4 +-
 encoder/me.h           |    2 +-
 encoder/ratecontrol.c  |   20 +++---
 encoder/rdo.c          |   10 +--
 encoder/set.c          |    8 ++-
 encoder/slicetype.c    |   10 ++--
 tools/checkasm.c       |  169 +++++++++++++++++++++++++----------------------
 x264.c                 |   24 ++++---
 x264.h                 |    4 +-
 37 files changed, 516 insertions(+), 336 deletions(-)

diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
index d294eff..b1106dd 100644
--- a/common/arm/mc-c.c
+++ b/common/arm/mc-c.c
@@ -64,6 +64,19 @@ MC_WEIGHT(_nodenom)
 MC_WEIGHT(_offsetadd)
 MC_WEIGHT(_offsetsub)

+void x264_mc_copy_w4_neon( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w8_neon( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w16_neon( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w16_aligned_neon( uint8_t *, int, uint8_t *, int, int );
+
+void x264_mc_chroma_neon( uint8_t *, int, uint8_t *, int, int, int, int, int );
+void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int, int, int);
+
+void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, int, int );
+void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
+void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );
+
+#if !X264_HIGH_BIT_DEPTH
 static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
 {
     if( w->i_scale == 1<<w->i_denom )
@@ -85,14 +98,6 @@ static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
         w->weightfn = x264_mc_wtab_neon;
 }

-void x264_mc_copy_w4_neon( uint8_t *, int, uint8_t *, int, int );
-void x264_mc_copy_w8_neon( uint8_t *, int, uint8_t *, int, int );
-void x264_mc_copy_w16_neon( uint8_t *, int, uint8_t *, int, int );
-void x264_mc_copy_w16_aligned_neon( uint8_t *, int, uint8_t *, int, int );
-
-void x264_mc_chroma_neon( uint8_t *, int, uint8_t *, int, int, int, int, int );
-void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int, int, int);
-
 static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =
 {
     NULL,
@@ -174,10 +179,6 @@ static uint8_t *get_ref_neon( uint8_t *dst,   int *i_dst_stride,
     }
 }

-void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, int, int );
-void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
-void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );
-
 static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
                               int stride, int width, int height, int16_t *buf )
 {
@@ -198,18 +199,22 @@ static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8
         src  += stride;
     }
 }
+#endif // !X264_HIGH_BIT_DEPTH

 void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
 {
     if( !(cpu&X264_CPU_ARMV6) )
         return;

+#if !X264_HIGH_BIT_DEPTH
     pf->prefetch_fenc = x264_prefetch_fenc_arm;
     pf->prefetch_ref  = x264_prefetch_ref_arm;
+#endif // !X264_HIGH_BIT_DEPTH

     if( !(cpu&X264_CPU_NEON) )
         return;

+#if !X264_HIGH_BIT_DEPTH
     pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;
     pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_neon;
     pf->copy[PIXEL_8x8]   = x264_mc_copy_w8_neon;
@@ -229,15 +234,16 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
     pf->offsetsub = x264_mc_offsetsub_wtab_neon;
     pf->weight_cache = x264_weight_cache_neon;

-// Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
-#ifndef SYS_MACOSX
-    pf->memcpy_aligned  = x264_memcpy_aligned_neon;
-#endif
-    pf->memzero_aligned = x264_memzero_aligned_neon;
-
     pf->mc_chroma = x264_mc_chroma_neon;
     pf->mc_luma = mc_luma_neon;
     pf->get_ref = get_ref_neon;
     pf->hpel_filter = hpel_filter_neon;
     pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;
+#endif // !X264_HIGH_BIT_DEPTH
+
+// Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
+#ifndef SYS_MACOSX
+    pf->memcpy_aligned  = x264_memcpy_aligned_neon;
+#endif
+    pf->memzero_aligned = x264_memzero_aligned_neon;
 }
diff --git a/common/arm/predict-c.c b/common/arm/predict-c.c
index fa7b9f7..b40dc9a 100644
--- a/common/arm/predict-c.c
+++ b/common/arm/predict-c.c
@@ -51,6 +51,7 @@ void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] )
     if (!(cpu&X264_CPU_ARMV6))
         return;

+#if !X264_HIGH_BIT_DEPTH
     pf[I_PRED_4x4_H]   = x264_predict_4x4_h_armv6;
     pf[I_PRED_4x4_DC]  = x264_predict_4x4_dc_armv6;
     pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_armv6;
@@ -59,6 +60,7 @@ void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] )
         return;

     pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_neon;
+#endif // !X264_HIGH_BIT_DEPTH
 }

 void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] )
@@ -66,12 +68,14 @@ void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] )
     if (!(cpu&X264_CPU_NEON))
         return;

+#if !X264_HIGH_BIT_DEPTH
     pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_neon;
     pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_neon;
     pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon;
     pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_neon;
     pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_neon;
     pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_neon;
+#endif // !X264_HIGH_BIT_DEPTH
 }

 void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
@@ -79,8 +83,10 @@ void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_
     if (!(cpu&X264_CPU_NEON))
         return;

+#if !X264_HIGH_BIT_DEPTH
     pf[I_PRED_8x8_DC]  = x264_predict_8x8_dc_neon;
     pf[I_PRED_8x8_H]   = x264_predict_8x8_h_neon;
+#endif // !X264_HIGH_BIT_DEPTH
 }

 void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] )
@@ -88,10 +94,12 @@ void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] )
     if (!(cpu&X264_CPU_NEON))
         return;

+#if !X264_HIGH_BIT_DEPTH
     pf[I_PRED_16x16_DC ]    = x264_predict_16x16_dc_neon;
     pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_neon;
     pf[I_PRED_16x16_DC_LEFT]= x264_predict_16x16_dc_left_neon;
     pf[I_PRED_16x16_H ]     = x264_predict_16x16_h_neon;
     pf[I_PRED_16x16_V ]     = x264_predict_16x16_v_neon;
     pf[I_PRED_16x16_P ]     = x264_predict_16x16_p_neon;
+#endif // !X264_HIGH_BIT_DEPTH
 }
diff --git a/common/bitstream.h b/common/bitstream.h
index dd8118d..318c790 100644
--- a/common/bitstream.h
+++ b/common/bitstream.h
@@ -53,7 +53,7 @@ typedef struct bs_s
 typedef struct
 {
     int     last;
-    int16_t level[16];
+    dctcoef level[16];
     uint8_t run[16];
 } x264_run_level_t;

diff --git a/common/common.c b/common/common.c
index 14dd716..728dfab 100644
--- a/common/common.c
+++ b/common/common.c
@@ -91,10 +91,10 @@ void x264_param_default( x264_param_t *param )
     param->rc.i_vbv_max_bitrate = 0;
     param->rc.i_vbv_buffer_size = 0;
     param->rc.f_vbv_buffer_init = 0.9;
-    param->rc.i_qp_constant = 23;
-    param->rc.f_rf_constant = 23;
+    param->rc.i_qp_constant = 23 + QP_BD_OFFSET;
+    param->rc.f_rf_constant = 23 + QP_BD_OFFSET;
     param->rc.i_qp_min = 10;
-    param->rc.i_qp_max = 51;
+    param->rc.i_qp_max = QP_MAX;
     param->rc.i_qp_step = 4;
     param->rc.f_ip_factor = 1.4;
     param->rc.f_pb_factor = 1.3;
@@ -418,6 +418,15 @@ int x264_param_apply_profile( x264_param_t *param, const char *profile )
     if( !profile )
         return 0;

+#if BIT_DEPTH > 8
+    if( !strcasecmp( profile, "baseline" ) || !strcasecmp( profile, "main" ) ||
+        !strcasecmp( profile, "high" ) )
+    {
+        x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support a bit depth of %d.\n", profile, BIT_DEPTH );
+        return -1;
+    }
+#endif
+
     if( !strcasecmp( profile, "baseline" ) )
     {
         param->analyse.b_transform_8x8 = 0;
@@ -441,7 +450,7 @@ int x264_param_apply_profile( x264_param_t *param, const char *profile )
         param->analyse.b_transform_8x8 = 0;
         param->i_cqm_preset = X264_CQM_FLAT;
     }
-    else if( !strcasecmp( profile, "high" ) )
+    else if( !strcasecmp( profile, "high" ) || !strcasecmp( profile, "high10" ) )
     {
         /* Default */
     }
diff --git a/common/common.h b/common/common.h
index 7b60811..a218d35 100644
--- a/common/common.h
+++ b/common/common.h
@@ -54,8 +54,13 @@ do {\
 #define X264_THREAD_MAX 128
 #define X264_PCM_COST (386*8)
 #define X264_LOOKAHEAD_MAX 250
+#define QP_BD_OFFSET (6*(BIT_DEPTH-8))
+#define QP_MAX (51+QP_BD_OFFSET)
+#define QP_MAX_MAX (51+2*6)
+#define LAMBDA_MAX (91 << (BIT_DEPTH-8))
+#define PIXEL_MAX ((1 << BIT_DEPTH)-1)
 // arbitrary, but low because SATD scores are 1/4 normal
-#define X264_LOOKAHEAD_QP 12
+#define X264_LOOKAHEAD_QP (12+QP_BD_OFFSET)

 // number of pixels (per thread) in progress at any given time.
 // 16 for the macroblock in progress + 3 for deblocking + 3 for motion compensation filter + 2 for extra safety
@@ -101,17 +106,23 @@ typedef union { x264_uint128_t i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; u
 #define CP64(dst,src) M64(dst) = M64(src)
 #define CP128(dst,src) M128(dst) = M128(src)

-typedef uint8_t pixel;
-typedef uint32_t pixel4;
-typedef int16_t dctcoef;
+#if X264_HIGH_BIT_DEPTH
+    typedef uint16_t pixel;
+    typedef uint64_t pixel4;
+    typedef int32_t  dctcoef;

-#define PIXEL_SPLAT_X4(x) ((x)*0x01010101U)
-#define MPIXEL_X4(src) M32(src)
-#define CPPIXEL_X4(dst,src) CP32(dst,src)
-#define CPPIXEL_X8(dst,src) CP64(dst,src)
-#define MDCT_X2(dct) M32(dct)
-#define CPDCT_X2(dst,src) CP32(dst,src)
-#define CPDCT_X4(dst,src) CP64(dst,src)
+#   define PIXEL_SPLAT_X4(x) ((x)*0x0001000100010001ULL)
+#   define MPIXEL_X4(src) M64(src)
+#else
+    typedef uint8_t  pixel;
+    typedef uint32_t pixel4;
+    typedef int16_t  dctcoef;
+
+#   define PIXEL_SPLAT_X4(x) ((x)*0x01010101U)
+#   define MPIXEL_X4(src) M32(src)
+#endif
+
+#define CPPIXEL_X4(dst,src) MPIXEL_X4(dst) = MPIXEL_X4(src)

 #define X264_SCAN8_SIZE (6*8)
 #define X264_SCAN8_LUMA_SIZE (5*8)
@@ -189,7 +200,7 @@ void x264_init_vlc_tables();

 static ALWAYS_INLINE pixel x264_clip_pixel( int x )
 {
-    return x&(~255) ? (-x)>>31 : x;
+    return ( (x & ~PIXEL_MAX) ? (-x)>>31 & PIXEL_MAX : x );
 }

 static ALWAYS_INLINE int x264_clip3( int v, int i_min, int i_max )
@@ -449,8 +460,8 @@ struct x264_t
     /* mv/ref cost arrays.  Indexed by lambda instead of
      * qp because, due to rounding, some quantizers share
      * lambdas.  This saves memory. */
-    uint16_t *cost_mv[92];
-    uint16_t *cost_mv_fpel[92][4];
+    uint16_t *cost_mv[LAMBDA_MAX+1];
+    uint16_t *cost_mv_fpel[LAMBDA_MAX+1][4];

     const uint8_t   *chroma_qp_table; /* includes both the nonlinear luma->chroma mapping and chroma_qp_offset */

diff --git a/common/dct.c b/common/dct.c
index 60dbd55..cd27363 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -418,6 +418,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
     dctf->dct4x4dc  = dct4x4dc;
     dctf->idct4x4dc = idct4x4dc;

+#if !X264_HIGH_BIT_DEPTH
 #if HAVE_MMX
     if( cpu&X264_CPU_MMX )
     {
@@ -515,6 +516,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->add16x16_idct8= x264_add16x16_idct8_neon;
     }
 #endif
+#endif // !X264_HIGH_BIT_DEPTH
 }

 void x264_dct_init_weights( void )
@@ -599,11 +601,9 @@ static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] )

 static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
 {
-    CPDCT_X2( level, dct );
+    memcpy( level, dct, 2 * sizeof(dctcoef) );
     ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
-    CPDCT_X2( level+6, dct+6 );
-    CPDCT_X4( level+8, dct+8 );
-    CPDCT_X4( level+12, dct+12 );
+    memcpy( level+6, dct+6, 10 * sizeof(dctcoef) );
 }

 #undef ZIG
@@ -618,6 +618,7 @@ static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
     CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
     CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
     CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
+#define CPPIXEL_X8(dst,src) ( CPPIXEL_X4(dst,src), CPPIXEL_X4(dst+4,src+4) )
 #define COPY8x8\
     CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
     CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
@@ -709,6 +710,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
         pf->sub_8x8    = zigzag_sub_8x8_field;
         pf->sub_4x4    = zigzag_sub_4x4_field;
         pf->sub_4x4ac  = zigzag_sub_4x4ac_field;
+#if !X264_HIGH_BIT_DEPTH
 #if HAVE_MMX
         if( cpu&X264_CPU_MMXEXT )
         {
@@ -726,6 +728,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
         if( cpu&X264_CPU_ALTIVEC )
             pf->scan_4x4   = x264_zigzag_scan_4x4_field_altivec;
 #endif
+#endif // !X264_HIGH_BIT_DEPTH
     }
     else
     {
@@ -734,6 +737,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
         pf->sub_8x8    = zigzag_sub_8x8_frame;
         pf->sub_4x4    = zigzag_sub_4x4_frame;
         pf->sub_4x4ac  = zigzag_sub_4x4ac_frame;
+#if !X264_HIGH_BIT_DEPTH
 #if HAVE_MMX
         if( cpu&X264_CPU_MMX )
             pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
@@ -759,13 +763,16 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
         if( cpu&X264_CPU_NEON )
             pf->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
 #endif
+#endif // !X264_HIGH_BIT_DEPTH
     }

     pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
+#if !X264_HIGH_BIT_DEPTH
 #if HAVE_MMX
     if( cpu&X264_CPU_MMX )
         pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
     if( cpu&X264_CPU_SHUFFLE_IS_FAST )
         pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
 #endif
+#endif // !X264_HIGH_BIT_DEPTH
 }
diff --git a/common/deblock.c b/common/deblock.c
index db9c95d..0b3b6df 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -265,18 +265,19 @@ static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264

 static inline void deblock_edge( x264_t *h, pixel *pix1, pixel *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
 {
-    int index_a = i_qp + h->sh.i_alpha_c0_offset;
-    int alpha = alpha_table(index_a);
-    int beta  = beta_table(i_qp + h->sh.i_beta_offset);
+    int index_a = x264_clip3((i_qp-QP_BD_OFFSET + h->sh.i_alpha_c0_offset), 0, 51);
+    int index_b = x264_clip3((i_qp-QP_BD_OFFSET + h->sh.i_beta_offset), 0, 51);
+    int alpha = alpha_table(index_a) << (BIT_DEPTH-8);
+    int beta  = beta_table(index_b) << (BIT_DEPTH-8);
     int8_t tc[4];

     if( !M32(bS) || !alpha || !beta )
         return;

-    tc[0] = tc0_table(index_a)[bS[0]] + b_chroma;
-    tc[1] = tc0_table(index_a)[bS[1]] + b_chroma;
-    tc[2] = tc0_table(index_a)[bS[2]] + b_chroma;
-    tc[3] = tc0_table(index_a)[bS[3]] + b_chroma;
+    tc[0] = (tc0_table(index_a)[bS[0]] << (BIT_DEPTH-8)) + b_chroma;
+    tc[1] = (tc0_table(index_a)[bS[1]] << (BIT_DEPTH-8)) + b_chroma;
+    tc[2] = (tc0_table(index_a)[bS[2]] << (BIT_DEPTH-8)) + b_chroma;
+    tc[3] = (tc0_table(index_a)[bS[3]] << (BIT_DEPTH-8)) + b_chroma;

     pf_inter( pix1, i_stride, alpha, beta, tc );
     if( b_chroma )
@@ -285,8 +286,10 @@ static inline void deblock_edge( x264_t *h, pixel *pix1, pixel *pix2, int i_stri

 static inline void deblock_edge_intra( x264_t *h, pixel *pix1, pixel *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra )
 {
-    int alpha = alpha_table(i_qp + h->sh.i_alpha_c0_offset);
-    int beta  = beta_table(i_qp + h->sh.i_beta_offset);
+    int index_a = x264_clip3((i_qp-QP_BD_OFFSET + h->sh.i_alpha_c0_offset), 0, 51);
+    int index_b = x264_clip3((i_qp-QP_BD_OFFSET + h->sh.i_beta_offset), 0, 51);
+    int alpha = alpha_table(index_a) << (BIT_DEPTH-8);
+    int beta  = beta_table(index_b) << (BIT_DEPTH-8);

     if( !alpha || !beta )
         return;
@@ -450,6 +453,7 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
 #if HAVE_MMX
     if( cpu&X264_CPU_MMXEXT )
     {
+#if !X264_HIGH_BIT_DEPTH
         pf->deblock_chroma[1] = x264_deblock_v_chroma_mmxext;
         pf->deblock_chroma[0] = x264_deblock_h_chroma_mmxext;
         pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmxext;
@@ -460,10 +464,12 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
         pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmxext;
         pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmxext;
 #endif
+#endif // !X264_HIGH_BIT_DEPTH
         pf->deblock_strength = x264_deblock_strength_mmxext;
         if( cpu&X264_CPU_SSE2 )
         {
             pf->deblock_strength = x264_deblock_strength_sse2;
+#if !X264_HIGH_BIT_DEPTH
             if( !(cpu&X264_CPU_STACK_MOD4) )
             {
                 pf->deblock_luma[1] = x264_deblock_v_luma_sse2;
@@ -471,12 +477,14 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
                 pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2;
                 pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2;
             }
+#endif // !X264_HIGH_BIT_DEPTH
         }
         if( cpu&X264_CPU_SSSE3 )
             pf->deblock_strength = x264_deblock_strength_ssse3;
     }
 #endif

+#if !X264_HIGH_BIT_DEPTH
 #if HAVE_ALTIVEC
     if( cpu&X264_CPU_ALTIVEC )
     {
@@ -494,4 +502,5 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
         pf->deblock_chroma[0] = x264_deblock_h_chroma_neon;
    }
 #endif
+#endif // !X264_HIGH_BIT_DEPTH
 }
diff --git a/common/macroblock.c b/common/macroblock.c
index 4561d8a..f0a624f 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -337,7 +337,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
     int scratch_size = 0;
     if( !b_lookahead )
     {
-        int buf_hpel = (h->thread[0]->fdec->i_width[0]+48) * sizeof(int16_t);
+        int buf_hpel = (h->thread[0]->fdec->i_width[0]+48) * sizeof(dctcoef);
         int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
         int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
         int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
diff --git a/common/macroblock.h b/common/macroblock.h
index 1a4992f..e09cd55 100644
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -238,17 +238,30 @@ static const uint16_t block_idx_xy_fdec[16] =
     2*4 + 3*4*FDEC_STRIDE, 3*4 + 3*4*FDEC_STRIDE
 };

-static const uint8_t i_chroma_qp_table[52+12*2] =
+#define QP(qP) ( (qP)+QP_BD_OFFSET ) /* adds QP_BD_OFFSET to a table entry; helper local to this table, #undef'd right after it */
+static const uint8_t i_chroma_qp_table[QP_MAX+1+12*2] =
 {
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
-    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
-    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
-    29, 30, 31, 32, 32, 33, 34, 34, 35, 35,
-    36, 36, 37, 37, 37, 38, 38, 38, 39, 39,
-    39, 39,
-    39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39,
+         0,      0,      0,      0,      0,      0,
+         0,      0,      0,      0,      0,      0,
+#if BIT_DEPTH > 9
+   QP(-12),QP(-11),QP(-10), QP(-9), QP(-8), QP(-7),
+#endif
+#if BIT_DEPTH > 8
+    QP(-6), QP(-5), QP(-4), QP(-3), QP(-2), QP(-1),
+#endif
+     QP(0),  QP(1),  QP(2),  QP(3),  QP(4),  QP(5),
+     QP(6),  QP(7),  QP(8),  QP(9), QP(10), QP(11),
+    QP(12), QP(13), QP(14), QP(15), QP(16), QP(17),
+    QP(18), QP(19), QP(20), QP(21), QP(22), QP(23),
+    QP(24), QP(25), QP(26), QP(27), QP(28), QP(29),
+    QP(29), QP(30), QP(31), QP(32), QP(32), QP(33),
+    QP(34), QP(34), QP(35), QP(35), QP(36), QP(36),
+    QP(37), QP(37), QP(37), QP(38), QP(38), QP(38),
+    QP(39), QP(39), QP(39), QP(39),
+    QP(39), QP(39), QP(39), QP(39), QP(39), QP(39),
+    QP(39), QP(39), QP(39), QP(39), QP(39), QP(39),
 };
+#undef QP

 enum cabac_ctx_block_cat_e
 {
@@ -340,26 +353,31 @@ static ALWAYS_INLINE uint32_t pack16to32_mask( int a, int b )
    return (a&0xFFFF) + (b<<16);
 #endif
 }
+static ALWAYS_INLINE uint64_t pack32to64( uint32_t a, uint32_t b ) /* join two 32-bit words into one 64-bit value; WORDS_BIGENDIAN selects which argument forms the high half */
+{
+#ifdef WORDS_BIGENDIAN
+   return ((uint64_t)a<<32) | b;
+#else
+   return ((uint64_t)b<<32) | a;
+#endif
+}

-#define pack_pixel_1to2 pack8to16
-#define pack_pixel_2to4 pack16to32
+#if X264_HIGH_BIT_DEPTH
+#   define pack_pixel_1to2 pack16to32
+#   define pack_pixel_2to4 pack32to64
+#else
+#   define pack_pixel_1to2 pack8to16
+#   define pack_pixel_2to4 pack16to32
+#endif

-#define array_non_zero(a) array_non_zero_int(a, sizeof(a))
+#define array_non_zero(a) array_non_zero_int(a, sizeof(a)/sizeof(dctcoef))
 #define array_non_zero_int array_non_zero_int
 static ALWAYS_INLINE int array_non_zero_int( dctcoef *v, int i_count )
 {
-    if(i_count == 8)
-        return !!M64( &v[0] );
-    else if(i_count == 16)
-        return !!(M64( &v[0] ) | M64( &v[4] ));
-    else if(i_count == 32)
-        return !!(M64( &v[0] ) | M64( &v[4] ) | M64( &v[8] ) | M64( &v[12] ));
-    else
-    {
-        for( int i = 0; i < i_count; i+=4 )
-            if( M64( &v[i] ) ) return 1;
-        return 0;
-    }
+    int pos = 0;                            /* index of the coefficient being examined */
+    while( pos < i_count && !v[pos] )       /* advance past zero coefficients */
+        pos++;
+    return pos < i_count;                   /* stopped before the end => some coefficient is nonzero */
 }
 static ALWAYS_INLINE int x264_mb_predict_intra4x4_mode( x264_t *h, int idx )
 {
diff --git a/common/mc.c b/common/mc.c
index 9776bec..5ef0682 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -117,11 +117,14 @@ static void x264_weight_cache( x264_t *h, x264_weight_t *w )
 {
     w->weightfn = h->mc.weight;
 }
-#define opscale(x) dst[x] = x264_clip_pixel( ((src[x] * weight->i_scale + (1<<(weight->i_denom - 1))) >> weight->i_denom) + weight->i_offset )
-#define opscale_noden(x) dst[x] = x264_clip_pixel( src[x] * weight->i_scale + weight->i_offset )
-static inline void mc_weight( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, const x264_weight_t *weight, int i_width, int i_height )
+#define opscale(x) dst[x] = x264_clip_pixel( ((src[x] * scale + (1<<(denom - 1))) >> denom) + offset )
+#define opscale_noden(x) dst[x] = x264_clip_pixel( src[x] * scale + offset )
+static void mc_weight( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, const x264_weight_t *weight, int i_width, int i_height )
 {
-    if( weight->i_denom >= 1 )
+    int offset = weight->i_offset * (1 << (BIT_DEPTH-8)); /* scale the offset to the working bit depth; multiply rather than << because left-shifting a negative int is undefined in C */
+    int scale = weight->i_scale;
+    int denom = weight->i_denom;
+    if( denom >= 1 ) /* the rounding term 1<<(denom-1) used by opscale needs denom >= 1 */
     {
         for( int y = 0; y < i_height; y++, dst += i_dst_stride, src += i_src_stride )
             for( int x = 0; x < i_width; x++ )
@@ -135,21 +138,10 @@ static inline void mc_weight( pixel *dst, int i_dst_stride, pixel *src, int i_sr
     }
 }

-#define MC_WEIGHT_C( name, lx ) \
+#define MC_WEIGHT_C( name, width ) \
     static void name( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, const x264_weight_t *weight, int height ) \
 { \
-    if( weight->i_denom >= 1 ) \
-    { \
-        for( int y = 0; y < height; y++, dst += i_dst_stride, src += i_src_stride ) \
-            for( int x = 0; x < lx; x++ ) \
-                opscale( x ); \
-    } \
-    else \
-    { \
-        for( int y = 0; y < height; y++, dst += i_dst_stride, src += i_src_stride ) \
-            for( int x = 0; x < lx; x++ ) \
-                opscale_noden( x ); \
-    } \
+    mc_weight( dst, i_dst_stride, src, i_src_stride, weight, width, height );\
 }

 MC_WEIGHT_C( mc_weight_w20, 20 )
@@ -182,7 +174,7 @@ static void mc_copy( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride,

 #define TAPFILTER(pix, d) ((pix)[x-2*d] + (pix)[x+3*d] - 5*((pix)[x-d] + (pix)[x+2*d]) + 20*((pix)[x] + (pix)[x+d]))
 static void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
-                         int stride, int width, int height, int16_t *buf )
+                         int stride, int width, int height, dctcoef *buf )
 {
     for( int y = 0; y < height; y++ )
     {
@@ -301,7 +293,12 @@ void x264_plane_copy_c( pixel *dst, int i_dst,
 {
     while( h-- )
     {
+#if X264_HIGH_BIT_DEPTH
+        for( int i = 0; i < w; i++ )
+            dst[i] = src[i] << (BIT_DEPTH-8);
+#else
         memcpy( dst, src, w );
+#endif
         dst += i_dst;
         src += i_src;
     }
diff --git a/common/mc.h b/common/mc.h
index bb16d13..cbdf1a6 100644
--- a/common/mc.h
+++ b/common/mc.h
@@ -82,7 +82,7 @@ typedef struct
                         uint8_t *src, int i_src, int w, int h);

     void (*hpel_filter)( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
-                         int i_stride, int i_width, int i_height, int16_t *buf );
+                         int i_stride, int i_width, int i_height, dctcoef *buf );

     /* prefetch the next few macroblocks of fenc or fdec */
     void (*prefetch_fenc)( pixel *pix_y, int stride_y,
diff --git a/common/pixel.c b/common/pixel.c
index 8441c7a..069589f 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -177,7 +177,7 @@ static int pixel_var2_8x8( pixel *pix1, int i_stride1, pixel *pix2, int i_stride
         pix2 += i_stride2;
     }
     sum = abs(sum);
-    var = sqr - (sum * sum >> 6);
+    var = sqr - ((uint64_t)sum * sum >> 6);
     *ssd = sqr;
     return var;
 }
@@ -406,12 +406,14 @@ SAD_X( 8x4 )
 SAD_X( 4x8 )
 SAD_X( 4x4 )

+#if !X264_HIGH_BIT_DEPTH
 #if ARCH_UltraSparc
 SAD_X( 16x16_vis )
 SAD_X( 16x8_vis )
 SAD_X( 8x16_vis )
 SAD_X( 8x8_vis )
 #endif
+#endif // !X264_HIGH_BIT_DEPTH

 /****************************************************************************
  * pixel_satd_x4
@@ -444,6 +446,7 @@ SATD_X_DECL6( cpu )\
 SATD_X( 4x4, cpu )

 SATD_X_DECL7()
+#if !X264_HIGH_BIT_DEPTH
 #if HAVE_MMX
 SATD_X_DECL7( _mmxext )
 SATD_X_DECL6( _sse2 )
@@ -454,6 +457,7 @@ SATD_X_DECL7( _sse4 )
 #if HAVE_ARMV6
 SATD_X_DECL7( _neon )
 #endif
+#endif // !X264_HIGH_BIT_DEPTH

 #define INTRA_MBCMP_8x8( mbcmp )\
 void x264_intra_##mbcmp##_x3_8x8( pixel *fenc, pixel edge[33], int res[3] )\
@@ -520,8 +524,8 @@ static void ssim_4x4x2_core( const pixel *pix1, int stride1,

 static float ssim_end1( int s1, int s2, int ss, int s12 )
 {
-    static const int ssim_c1 = (int)(.01*.01*255*255*64 + .5);
-    static const int ssim_c2 = (int)(.03*.03*255*255*64*63 + .5);
+    static const int ssim_c1 = (int)(.01*.01*PIXEL_MAX*PIXEL_MAX*64 + .5);
+    static const int ssim_c2 = (int)(.03*.03*PIXEL_MAX*PIXEL_MAX*64*63 + .5);
     int vars = ss*64 - s1*s1 - s2*s2;
     int covar = s12*64 - s1*s2;
     return (float)(2*s1*s2 + ssim_c1) * (float)(2*covar + ssim_c2)
@@ -678,6 +682,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16;
     pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16;

+#if !X264_HIGH_BIT_DEPTH
 #if HAVE_MMX
     if( cpu&X264_CPU_MMX )
     {
@@ -903,17 +908,20 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         }
     }
 #endif
+#endif // !X264_HIGH_BIT_DEPTH
 #if HAVE_ALTIVEC
     if( cpu&X264_CPU_ALTIVEC )
     {
         x264_pixel_altivec_init( pixf );
     }
 #endif
+#if !X264_HIGH_BIT_DEPTH
 #if ARCH_UltraSparc
     INIT4( sad, _vis );
     INIT4( sad_x3, _vis );
     INIT4( sad_x4, _vis );
 #endif
+#endif // !X264_HIGH_BIT_DEPTH

     pixf->ads[PIXEL_8x16] =
     pixf->ads[PIXEL_8x4] =
diff --git a/common/ppc/dct.c b/common/ppc/dct.c
index eb223ae..85d5ce7 100644
--- a/common/ppc/dct.c
+++ b/common/ppc/dct.c
@@ -24,6 +24,7 @@
 #include "common/common.h"
 #include "ppccommon.h"

+#if !X264_HIGH_BIT_DEPTH
 #define VEC_DCT(a0,a1,a2,a3,b0,b1,b2,b3) \
     b1 = vec_add( a0, a3 );              \
     b3 = vec_add( a1, a2 );              \
@@ -482,4 +483,5 @@ void x264_zigzag_scan_4x4_field_altivec( int16_t level[16], int16_t dct[4][4] )
     vec_st( tmp0v, 0x00, level );
     vec_st( tmp1v, 0x10, level );
 }
+#endif // !X264_HIGH_BIT_DEPTH

diff --git a/common/ppc/deblock.c b/common/ppc/deblock.c
index 0c8d2d4..986710d 100644
--- a/common/ppc/deblock.c
+++ b/common/ppc/deblock.c
@@ -21,6 +21,7 @@
 #include "common/common.h"
 #include "ppccommon.h"

+#if !X264_HIGH_BIT_DEPTH
 #define transpose4x16(r0, r1, r2, r3)        \
 {                                            \
     register vec_u8_t r4;                    \
@@ -292,3 +293,4 @@ void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta,
     transpose4x16(line1, line2, line3, line4);
     write16x4(pix-2, stride, line1, line2, line3, line4);
 }
+#endif // !X264_HIGH_BIT_DEPTH
diff --git a/common/ppc/mc.c b/common/ppc/mc.c
index 7ad8050..744a804 100644
--- a/common/ppc/mc.c
+++ b/common/ppc/mc.c
@@ -33,6 +33,7 @@
 #include "mc.h"
 #include "ppccommon.h"

+#if !X264_HIGH_BIT_DEPTH
 typedef void (*pf_mc_t)( uint8_t *src, int i_src,
                          uint8_t *dst, int i_dst, int i_height );

@@ -792,9 +793,11 @@ static void frame_init_lowres_core_altivec( uint8_t *src0, uint8_t *dst0, uint8_
         dstc += dst_stride;
     }
 }
+#endif // !X264_HIGH_BIT_DEPTH

 void x264_mc_altivec_init( x264_mc_functions_t *pf )
 {
+#if !X264_HIGH_BIT_DEPTH
     pf->mc_luma   = mc_luma_altivec;
     pf->get_ref   = get_ref_altivec;
     pf->mc_chroma = mc_chroma_altivec;
@@ -804,4 +807,5 @@ void x264_mc_altivec_init( x264_mc_functions_t *pf )

     pf->hpel_filter = x264_hpel_filter_altivec;
     pf->frame_init_lowres_core = frame_init_lowres_core_altivec;
+#endif // !X264_HIGH_BIT_DEPTH
 }
diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c
index 3f99606..bd5f547 100644
--- a/common/ppc/pixel.c
+++ b/common/ppc/pixel.c
@@ -24,6 +24,7 @@
 #include "common/common.h"
 #include "ppccommon.h"

+#if !X264_HIGH_BIT_DEPTH
 /***********************************************************************
  * SAD routines
  **********************************************************************/
@@ -1979,12 +1980,14 @@ static void ssim_4x4x2_core_altivec( const uint8_t *pix1, int stride1,
     sums[0][3] = temp[0];
     sums[1][3] = temp[1];
 }
+#endif // !X264_HIGH_BIT_DEPTH

 /****************************************************************************
  * x264_pixel_init:
  ****************************************************************************/
 void x264_pixel_altivec_init( x264_pixel_function_t *pixf )
 {
+#if !X264_HIGH_BIT_DEPTH
     pixf->sad[PIXEL_16x16]  = pixel_sad_16x16_altivec;
     pixf->sad[PIXEL_8x16]   = pixel_sad_8x16_altivec;
     pixf->sad[PIXEL_16x8]   = pixel_sad_16x8_altivec;
@@ -2023,4 +2026,5 @@ void x264_pixel_altivec_init( x264_pixel_function_t *pixf )
     pixf->hadamard_ac[PIXEL_8x8]   = x264_pixel_hadamard_ac_8x8_altivec;

     pixf->ssim_4x4x2_core = ssim_4x4x2_core_altivec;
+#endif // !X264_HIGH_BIT_DEPTH
 }
diff --git a/common/ppc/predict.c b/common/ppc/predict.c
index 3fb1a2b..c71dbb5 100644
--- a/common/ppc/predict.c
+++ b/common/ppc/predict.c
@@ -23,6 +23,7 @@
 #include "pixel.h"
 #include "ppccommon.h"

+#if !X264_HIGH_BIT_DEPTH
 static void predict_8x8c_p_altivec( uint8_t *src )
 {
     int H = 0, V = 0;
@@ -194,6 +195,7 @@ static void predict_16x16_v_altivec( uint8_t *src )
         src += FDEC_STRIDE;
     }
 }
+#endif // !X264_HIGH_BIT_DEPTH


 /****************************************************************************
@@ -201,6 +203,7 @@ static void predict_16x16_v_altivec( uint8_t *src )
  ****************************************************************************/
 void x264_predict_16x16_init_altivec( x264_predict_t pf[7] )
 {
+#if !X264_HIGH_BIT_DEPTH
     pf[I_PRED_16x16_V ]      = predict_16x16_v_altivec;
     pf[I_PRED_16x16_H ]      = predict_16x16_h_altivec;
     pf[I_PRED_16x16_DC]      = predict_16x16_dc_altivec;
@@ -208,9 +211,12 @@ void x264_predict_16x16_init_altivec( x264_predict_t pf[7] )
     pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left_altivec;
     pf[I_PRED_16x16_DC_TOP ] = predict_16x16_dc_top_altivec;
     pf[I_PRED_16x16_DC_128 ] = predict_16x16_dc_128_altivec;
+#endif // !X264_HIGH_BIT_DEPTH
 }

 void x264_predict_8x8c_init_altivec( x264_predict_t pf[7] )
 {
+#if !X264_HIGH_BIT_DEPTH
     pf[I_PRED_CHROMA_P]       = predict_8x8c_p_altivec;
+#endif // !X264_HIGH_BIT_DEPTH
 }
diff --git a/common/ppc/quant.c b/common/ppc/quant.c
index 6f41a06..ffd6a1b 100644
--- a/common/ppc/quant.c
+++ b/common/ppc/quant.c
@@ -22,6 +22,7 @@
 #include "ppccommon.h"
 #include "quant.h"

+#if !X264_HIGH_BIT_DEPTH
 // quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
 #define QUANT_16_U( idx0, idx1 )                                    \
 {                                                                   \
@@ -360,4 +361,5 @@ void x264_dequant_8x8_altivec( int16_t dct[8][8], int dequant_mf[6][8][8], int i
             DEQUANT_SHR();
     }
 }
+#endif // !X264_HIGH_BIT_DEPTH

diff --git a/common/predict.c b/common/predict.c
index 79ec1fc..dc92083 100644
--- a/common/predict.c
+++ b/common/predict.c
@@ -53,40 +53,40 @@

 void x264_predict_16x16_dc_c( pixel *src )
 {
-    pixel4 dc = 0;
+    int dc = 0;

     for( int i = 0; i < 16; i++ )
     {
         dc += src[-1 + i * FDEC_STRIDE];
         dc += src[i - FDEC_STRIDE];
     }
-    dc = PIXEL_SPLAT_X4( ( dc + 16 ) >> 5 );
+    pixel4 dcsplat = PIXEL_SPLAT_X4( ( dc + 16 ) >> 5 );

-    PREDICT_16x16_DC( dc );
+    PREDICT_16x16_DC( dcsplat );
 }
 static void x264_predict_16x16_dc_left_c( pixel *src )
 {
-    pixel4 dc = 0;
+    int dc = 0;

     for( int i = 0; i < 16; i++ )
         dc += src[-1 + i * FDEC_STRIDE];
-    dc = PIXEL_SPLAT_X4( ( dc + 8 ) >> 4 );
+    pixel4 dcsplat = PIXEL_SPLAT_X4( ( dc + 8 ) >> 4 );

-    PREDICT_16x16_DC( dc );
+    PREDICT_16x16_DC( dcsplat );
 }
 static void x264_predict_16x16_dc_top_c( pixel *src )
 {
-    pixel4 dc = 0;
+    int dc = 0;

     for( int i = 0; i < 16; i++ )
         dc += src[i - FDEC_STRIDE];
-    dc = PIXEL_SPLAT_X4( ( dc + 8 ) >> 4 );
+    pixel4 dcsplat = PIXEL_SPLAT_X4( ( dc + 8 ) >> 4 );

-    PREDICT_16x16_DC( dc );
+    PREDICT_16x16_DC( dcsplat );
 }
 static void x264_predict_16x16_dc_128_c( pixel *src )
 {
-    PREDICT_16x16_DC( PIXEL_SPLAT_X4( 0x80 ) );
+    PREDICT_16x16_DC( PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ) );
 }
 void x264_predict_16x16_h_c( pixel *src )
 {
@@ -155,53 +155,53 @@ static void x264_predict_8x8c_dc_128_c( pixel *src )
 {
     for( int y = 0; y < 8; y++ )
     {
-        MPIXEL_X4( src+0 ) = PIXEL_SPLAT_X4( 0x80 );
-        MPIXEL_X4( src+4 ) = PIXEL_SPLAT_X4( 0x80 );
+        MPIXEL_X4( src+0 ) = PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) );
+        MPIXEL_X4( src+4 ) = PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) );
         src += FDEC_STRIDE;
     }
 }
 static void x264_predict_8x8c_dc_left_c( pixel *src )
 {
-    pixel4 dc0 = 0, dc1 = 0;
+    int dc0 = 0, dc1 = 0;

     for( int y = 0; y < 4; y++ )
     {
         dc0 += src[y * FDEC_STRIDE     - 1];
         dc1 += src[(y+4) * FDEC_STRIDE - 1];
     }
-    dc0 = PIXEL_SPLAT_X4( ( dc0 + 2 ) >> 2 );
-    dc1 = PIXEL_SPLAT_X4( ( dc1 + 2 ) >> 2 );
+    pixel4 dc0splat = PIXEL_SPLAT_X4( ( dc0 + 2 ) >> 2 );
+    pixel4 dc1splat = PIXEL_SPLAT_X4( ( dc1 + 2 ) >> 2 );

     for( int y = 0; y < 4; y++ )
     {
-        MPIXEL_X4( src+0 ) = dc0;
-        MPIXEL_X4( src+4 ) = dc0;
+        MPIXEL_X4( src+0 ) = dc0splat;
+        MPIXEL_X4( src+4 ) = dc0splat;
         src += FDEC_STRIDE;
     }
     for( int y = 0; y < 4; y++ )
     {
-        MPIXEL_X4( src+0 ) = dc1;
-        MPIXEL_X4( src+4 ) = dc1;
+        MPIXEL_X4( src+0 ) = dc1splat;
+        MPIXEL_X4( src+4 ) = dc1splat;
         src += FDEC_STRIDE;
     }

 }
 static void x264_predict_8x8c_dc_top_c( pixel *src )
 {
-    pixel4 dc0 = 0, dc1 = 0;
+    int dc0 = 0, dc1 = 0;

     for( int x = 0; x < 4; x++ )
     {
         dc0 += src[x     - FDEC_STRIDE];
         dc1 += src[x + 4 - FDEC_STRIDE];
     }
-    dc0 = PIXEL_SPLAT_X4( ( dc0 + 2 ) >> 2 );
-    dc1 = PIXEL_SPLAT_X4( ( dc1 + 2 ) >> 2 );
+    pixel4 dc0splat = PIXEL_SPLAT_X4( ( dc0 + 2 ) >> 2 );
+    pixel4 dc1splat = PIXEL_SPLAT_X4( ( dc1 + 2 ) >> 2 );

     for( int y = 0; y < 8; y++ )
     {
-        MPIXEL_X4( src+0 ) = dc0;
-        MPIXEL_X4( src+4 ) = dc1;
+        MPIXEL_X4( src+0 ) = dc0splat;
+        MPIXEL_X4( src+4 ) = dc1splat;
         src += FDEC_STRIDE;
     }
 }
@@ -306,7 +306,7 @@ static void x264_predict_8x8c_p_c( pixel *src )

 static void x264_predict_4x4_dc_128_c( pixel *src )
 {
-    PREDICT_4x4_DC( PIXEL_SPLAT_X4( 0x80 ) );
+    PREDICT_4x4_DC( PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ) );
 }
 static void x264_predict_4x4_dc_left_c( pixel *src )
 {
@@ -491,7 +491,8 @@ static void x264_predict_8x8_filter_c( pixel *src, pixel edge[33], int i_neighbo
             }
             else
             {
-                M64( edge+24 ) = SRC(7,-1) * 0x0101010101010101ULL;
+                MPIXEL_X4( edge+24 ) = PIXEL_SPLAT_X4( SRC(7,-1) );
+                MPIXEL_X4( edge+28 ) = PIXEL_SPLAT_X4( SRC(7,-1) );
                 edge[32] = SRC(7,-1);
             }
         }
@@ -523,7 +524,7 @@ static void x264_predict_8x8_filter_c( pixel *src, pixel edge[33], int i_neighbo

 static void x264_predict_8x8_dc_128_c( pixel *src, pixel edge[33] )
 {
-    PREDICT_8x8_DC( PIXEL_SPLAT_X4( 0x80 ) );
+    PREDICT_8x8_DC( PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ) );
 }
 static void x264_predict_8x8_dc_left_c( pixel *src, pixel edge[33] )
 {
@@ -554,9 +555,13 @@ void x264_predict_8x8_h_c( pixel *src, pixel edge[33] )
 }
 void x264_predict_8x8_v_c( pixel *src, pixel edge[33] )
 {
-    uint64_t top = M64( edge+16 );
+    const pixel4 top_lo = MPIXEL_X4( edge+16 ); /* group read at edge+16, replicated at column offset 0 of every row */
+    const pixel4 top_hi = MPIXEL_X4( edge+20 ); /* group read at edge+20, replicated at column offset 4 of every row */
     for( int y = 0; y < 8; y++ )
-        M64( src+y*FDEC_STRIDE ) = top;
+    {
+        MPIXEL_X4( src+y*FDEC_STRIDE+0 ) = top_lo;
+        MPIXEL_X4( src+y*FDEC_STRIDE+4 ) = top_hi;
+    }
 }
 static void x264_predict_8x8_ddl_c( pixel *src, pixel edge[33] )
 {
diff --git a/common/quant.c b/common/quant.c
index ece52f9..a7b72cf 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -142,7 +142,7 @@ static void x264_denoise_dct( dctcoef *dct, uint32_t *sum, uint16_t *offset, int
     for( int i = 1; i < size; i++ )
     {
         int level = dct[i];
-        int sign = level>>15;
+        int sign = level>>31;
         level = (level+sign)^sign;
         sum[i] += level;
         level -= offset[i];
@@ -177,10 +177,7 @@ static int ALWAYS_INLINE x264_decimate_score_internal( dctcoef *dct, int i_max )
     int i_score = 0;
     int idx = i_max - 1;

-    /* Yes, dct[idx-1] is guaranteed to be 32-bit aligned.  idx>=0 instead of 1 works correctly for the same reason */
-    while( idx >= 0 && MDCT_X2( &dct[idx-1] ) == 0 )
-        idx -= 2;
-    if( idx >= 0 && dct[idx] == 0 )
+    while( idx >= 0 && dct[idx] == 0 )
         idx--;
     while( idx >= 0 )
     {
@@ -216,10 +213,7 @@ static int x264_decimate_score64( dctcoef *dct )

 static int ALWAYS_INLINE x264_coeff_last_internal( dctcoef *l, int i_count )
 {
-    int i_last;
-    for( i_last = i_count-1; i_last >= 3; i_last -= 4 )
-        if( M64( l+i_last-3 ) )
-            break;
+    int i_last = i_count-1;
     while( i_last >= 0 && l[i_last] == 0 )
         i_last--;
     return i_last;
@@ -287,6 +281,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
     pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15;
     pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16;

+#if !X264_HIGH_BIT_DEPTH
 #if HAVE_MMX
     if( cpu&X264_CPU_MMX )
     {
@@ -425,6 +420,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
     }
 #endif
+#endif // !X264_HIGH_BIT_DEPTH
     pf->coeff_last[  DCT_LUMA_DC] = pf->coeff_last[DCT_LUMA_4x4];
     pf->coeff_last[DCT_CHROMA_AC] = pf->coeff_last[ DCT_LUMA_AC];
     pf->coeff_level_run[  DCT_LUMA_DC] = pf->coeff_level_run[DCT_LUMA_4x4];
diff --git a/common/set.c b/common/set.c
index 16cff8e..86f3854 100644
--- a/common/set.c
+++ b/common/set.c
@@ -78,6 +78,7 @@ int x264_cqm_init( x264_t *h )
                         32 - 11, 32 - 21 };
     int max_qp_err = -1;
     int max_chroma_qp_err = -1;
+    int min_qp_err = QP_MAX+1;

     for( int i = 0; i < 6; i++ )
     {
@@ -94,9 +95,9 @@ int x264_cqm_init( x264_t *h )
         }
         else
         {
-            CHECKED_MALLOC( h->  quant4_mf[i], 52*size*sizeof(uint16_t) );
+            CHECKED_MALLOC( h->  quant4_mf[i], (QP_MAX+1)*size*sizeof(uint16_t) );
             CHECKED_MALLOC( h->dequant4_mf[i],  6*size*sizeof(int) );
-            CHECKED_MALLOC( h->unquant4_mf[i], 52*size*sizeof(int) );
+            CHECKED_MALLOC( h->unquant4_mf[i], (QP_MAX+1)*size*sizeof(int) );
         }

         for( j = (i<4 ? 0 : 4); j < i; j++ )
@@ -106,7 +107,7 @@ int x264_cqm_init( x264_t *h )
         if( j < i )
             h->quant4_bias[i] = h->quant4_bias[j];
         else
-            CHECKED_MALLOC( h->quant4_bias[i], 52*size*sizeof(uint16_t) );
+            CHECKED_MALLOC( h->quant4_bias[i], (QP_MAX+1)*size*sizeof(uint16_t) );
     }

     for( int q = 0; q < 6; q++ )
@@ -140,7 +141,7 @@ int x264_cqm_init( x264_t *h )
                      quant8_mf[i_list][q][i] = DIV(def_quant8[q][i] * 16, h->pps->scaling_list[4+i_list][i]);
             }
     }
-    for( int q = 0; q < 52; q++ )
+    for( int q = 0; q < QP_MAX+1; q++ )
     {
         int j;
         for( int i_list = 0; i_list < 4; i_list++ )
@@ -148,6 +149,11 @@ int x264_cqm_init( x264_t *h )
             {
                 h->unquant4_mf[i_list][q][i] = (1ULL << (q/6 + 15 + 8)) / quant4_mf[i_list][q%6][i];
                 h->quant4_mf[i_list][q][i] = j = SHIFT(quant4_mf[i_list][q%6][i], q/6 - 1);
+                if( !j ) /* the quant4_mf entry just stored for this QP is zero */
+                {
+                    min_qp_err = X264_MIN( min_qp_err, q ); /* track the lowest QP at which an entry becomes zero */
+                    continue; /* skip the bias computation below: it divides by j */
+                }
                 // round to nearest, unless that would cause the deadzone to be negative
                 h->quant4_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j );
                 if( j > 0xffff && q > max_qp_err && (i_list == CQM_4IY || i_list == CQM_4PY) )
@@ -161,6 +167,11 @@ int x264_cqm_init( x264_t *h )
                 {
                     h->unquant8_mf[i_list][q][i] = (1ULL << (q/6 + 16 + 8)) / quant8_mf[i_list][q%6][i];
                     h->quant8_mf[i_list][q][i] = j = SHIFT(quant8_mf[i_list][q%6][i], q/6);
+                    if( !j ) /* the quant8_mf entry just stored for this QP is zero */
+                    {
+                        min_qp_err = X264_MIN( min_qp_err, q ); /* track the lowest QP at which an entry becomes zero */
+                        continue; /* skip the bias computation below: it divides by j */
+                    }
                     h->quant8_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j );
                     if( j > 0xffff && q > max_qp_err )
                         max_qp_err = q;
@@ -179,6 +190,12 @@ int x264_cqm_init( x264_t *h )
         x264_log( h, X264_LOG_ERROR, "but min chroma QP is implied to be %d.\n", h->chroma_qp_table[h->param.rc.i_qp_min] );
         return -1;
     }
+    if( !h->mb.b_lossless && min_qp_err <= h->param.rc.i_qp_max )
+    {
+        x264_log( h, X264_LOG_ERROR, "Quantization underflow.  Your CQM is incompatible with QP > %d,\n", min_qp_err-1 );
+        x264_log( h, X264_LOG_ERROR, "but max QP is implied to be %d.\n", h->param.rc.i_qp_max );
+        return -1;
+    }
     return 0;
 fail:
     x264_cqm_delete( h );
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 2dcd671..4bb5f33 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -125,6 +125,7 @@ PIXEL_AVG_WALL(sse2)
 PIXEL_AVG_WALL(sse2_misalign)
 PIXEL_AVG_WALL(cache64_ssse3)

+#if !X264_HIGH_BIT_DEPTH
 #define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
 static void (* const x264_pixel_avg_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =\
 {\
@@ -355,24 +356,28 @@ static void x264_plane_copy_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i
         x264_plane_copy_core_mmxext( dst+i_dst, i_dst, src+i_src, i_src, (w+15)&~15, h-1 );
     }
 }
+#endif // !X264_HIGH_BIT_DEPTH

 void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
 {
     if( !(cpu&X264_CPU_MMX) )
         return;

+    pf->memcpy_aligned = x264_memcpy_aligned_mmx;
+    pf->memzero_aligned = x264_memzero_aligned_mmx;
+#if !X264_HIGH_BIT_DEPTH
     pf->copy_16x16_unaligned = x264_mc_copy_w16_mmx;
     pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx;
     pf->copy[PIXEL_8x8]   = x264_mc_copy_w8_mmx;
     pf->copy[PIXEL_4x4]   = x264_mc_copy_w4_mmx;
-    pf->memcpy_aligned = x264_memcpy_aligned_mmx;
-    pf->memzero_aligned = x264_memzero_aligned_mmx;
     pf->integral_init4v = x264_integral_init4v_mmx;
     pf->integral_init8v = x264_integral_init8v_mmx;
+#endif // !X264_HIGH_BIT_DEPTH

     if( !(cpu&X264_CPU_MMXEXT) )
         return;

+#if !X264_HIGH_BIT_DEPTH
     pf->mc_luma = mc_luma_mmxext;
     pf->get_ref = get_ref_mmxext;
     pf->mc_chroma = x264_mc_chroma_mmxext;
@@ -412,12 +417,14 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmxext;
     }
 #endif
+#endif // !X264_HIGH_BIT_DEPTH

     if( !(cpu&X264_CPU_SSE2) )
         return;

     pf->memcpy_aligned = x264_memcpy_aligned_sse2;
     pf->memzero_aligned = x264_memzero_aligned_sse2;
+#if !X264_HIGH_BIT_DEPTH
     pf->integral_init4v = x264_integral_init4v_sse2;
     pf->integral_init8v = x264_integral_init8v_sse2;
     pf->hpel_filter = x264_hpel_filter_sse2_amd;
@@ -492,4 +499,5 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )

     pf->integral_init4h = x264_integral_init4h_sse4;
     pf->integral_init8h = x264_integral_init8h_sse4;
+#endif // !X264_HIGH_BIT_DEPTH
 }
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index e771431..4004265 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -75,6 +75,7 @@
  void x264_predict_16x16_v_sse2( uint8_t *src );
  void x264_predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c );

+#if !X264_HIGH_BIT_DEPTH
 ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8};
 ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
 ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
@@ -364,6 +365,7 @@ INTRA_SA8D_X3(ssse3)
 #else
 INTRA_SA8D_X3(mmxext)
 #endif
+#endif // !X264_HIGH_BIT_DEPTH

 /****************************************************************************
  * Exported functions:
@@ -372,6 +374,7 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
 {
     if( !(cpu&X264_CPU_MMX) )
         return;
+#if !X264_HIGH_BIT_DEPTH
     pf[I_PRED_16x16_V]       = x264_predict_16x16_v_mmx;
     if( !(cpu&X264_CPU_MMXEXT) )
         return;
@@ -397,12 +400,14 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
 #ifdef __GNUC__
     pf[I_PRED_16x16_P]       = x264_predict_16x16_p_ssse3;
 #endif
+#endif // !X264_HIGH_BIT_DEPTH
 }

 void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
 {
     if( !(cpu&X264_CPU_MMX) )
         return;
+#if !X264_HIGH_BIT_DEPTH
 #if ARCH_X86_64
     pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left;
 #endif
@@ -424,12 +429,14 @@ void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
 #ifdef __GNUC__
     pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_ssse3;
 #endif
+#endif // !X264_HIGH_BIT_DEPTH
 }

 void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter )
 {
     if( !(cpu&X264_CPU_MMXEXT) )
         return;
+#if !X264_HIGH_BIT_DEPTH
     pf[I_PRED_8x8_V]      = x264_predict_8x8_v_mmxext;
     pf[I_PRED_8x8_H]      = x264_predict_8x8_h_mmxext;
     pf[I_PRED_8x8_DC]     = x264_predict_8x8_dc_mmxext;
@@ -456,12 +463,14 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_
     pf[I_PRED_8x8_HD]   = x264_predict_8x8_hd_ssse3;
     pf[I_PRED_8x8_HU]   = x264_predict_8x8_hu_ssse3;
     *predict_8x8_filter = x264_predict_8x8_filter_ssse3;
+#endif // !X264_HIGH_BIT_DEPTH
 }

 void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
 {
     if( !(cpu&X264_CPU_MMXEXT) )
         return;
+#if !X264_HIGH_BIT_DEPTH
     pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_mmxext;
     pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_mmxext;
     pf[I_PRED_4x4_VL]  = x264_predict_4x4_vl_mmxext;
@@ -474,4 +483,5 @@ void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
     pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_ssse3;
     pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_ssse3;
     pf[I_PRED_4x4_HD]  = x264_predict_4x4_hd_ssse3;
+#endif // !X264_HIGH_BIT_DEPTH
 }
diff --git a/configure b/configure
index 24d15ad..5cdd82d 100755
--- a/configure
+++ b/configure
@@ -18,6 +18,7 @@ echo "  --enable-gprof           adds -pg, doesn't strip"
 echo "  --enable-visualize       enables visualization (X11 only)"
 echo "  --enable-pic             build position-independent code"
 echo "  --enable-shared          build libx264.so"
+echo "  --bit-depth=BIT_DEPTH    sets output bit depth (8-10), default 8"
 echo "  --extra-asflags=EASFLAGS add EASFLAGS to ASFLAGS"
 echo "  --extra-cflags=ECFLAGS   add ECFLAGS to CFLAGS"
 echo "  --extra-ldflags=ELDFLAGS add ELDFLAGS to LDFLAGS"
@@ -124,6 +125,7 @@ gprof="no"
 pic="no"
 vis="no"
 shared="no"
+bit_depth="8"

 CFLAGS="$CFLAGS -Wall -I."
 LDFLAGS="$LDFLAGS"
@@ -208,6 +210,13 @@ for opt do
             CFLAGS="$CFLAGS --sysroot=${opt#--sysroot=}"
             LDFLAGS="$LDFLAGS --sysroot=${opt#--sysroot=}"
             ;;
+        --bit-depth=*)
+            bit_depth="${opt#--bit-depth=}"
+            if [ "$bit_depth" -lt "8" ] || [ "$bit_depth" -gt "10" ]; then
+                echo "Supplied bit depth must be in range [8,10]."
+                exit 1
+            fi
+            ;;
         *)
             echo "Unknown option $opt, ignored"
             ;;
@@ -644,6 +653,12 @@ if cc_check '' -Wshadow ; then
     CFLAGS="-Wshadow $CFLAGS"
 fi

+if [ "$bit_depth" -gt "8" ]; then
+    define X264_HIGH_BIT_DEPTH
+fi
+
+define BIT_DEPTH $bit_depth
+
 rm -f conftest*

 # generate config files
@@ -724,6 +739,7 @@ gprof:      $gprof
 PIC:        $pic
 shared:     $shared
 visualize:  $vis
+bit depth:  $bit_depth
 EOF

 echo >> config.log
diff --git a/encoder/analyse.c b/encoder/analyse.c
index cdbdd1e..93f7eed 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -134,25 +134,27 @@ typedef struct
 } x264_mb_analysis_t;

 /* lambda = pow(2,qp/6-2) */
-const uint8_t x264_lambda_tab[52] = {
-   1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
-   1, 1, 1, 1,              /*  8-11 */
-   1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
-   3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
-   6, 7, 8, 9,10,11,13,14,  /* 28-35 */
-  16,18,20,23,25,29,32,36,  /* 36-43 */
-  40,45,51,57,64,72,81,91   /* 44-51 */
+const uint16_t x264_lambda_tab[QP_MAX_MAX+1] = {
+   1,   1,   1,   1,   1,   1,   1,   1, /*  0- 7 */
+   1,   1,   1,   1,   1,   1,   1,   1, /*  8-15 */
+   2,   2,   2,   2,   3,   3,   3,   4, /* 16-23 */
+   4,   4,   5,   6,   6,   7,   8,   9, /* 24-31 */
+  10,  11,  13,  14,  16,  18,  20,  23, /* 32-39 */
+  25,  29,  32,  36,  40,  45,  51,  57, /* 40-47 */
+  64,  72,  81,  91, 102, 114, 128, 144, /* 48-55 */
+ 161, 181, 203, 228, 256, 287, 323, 362, /* 56-63 */
 };

 /* lambda2 = pow(lambda,2) * .9 * 256 */
-const int x264_lambda2_tab[52] = {
-    14,      18,      22,      28,     36,     45,     57,     72, /*  0 -  7 */
-    91,     115,     145,     182,    230,    290,    365,    460, /*  8 - 15 */
-   580,     731,     921,    1161,   1462,   1843,   2322,   2925, /* 16 - 23 */
-  3686,    4644,    5851,    7372,   9289,  11703,  14745,  18578, /* 24 - 31 */
- 23407,   29491,   37156,   46814,  58982,  74313,  93628, 117964, /* 32 - 39 */
-148626,  187257,  235929,  297252, 374514, 471859, 594505, 749029, /* 40 - 47 */
-943718, 1189010, 1498059, 1887436                                  /* 48 - 51 */
+const int x264_lambda2_tab[QP_MAX_MAX+1] = {
+     14,     18,     22,      28,      36,      45,      57,      72, /*  0- 7 */
+     91,    115,    145,     182,     230,     290,     365,     460, /*  8-15 */
+    580,    731,    921,    1161,    1462,    1843,    2322,    2925, /* 16-23 */
+   3686,   4644,   5851,    7372,    9289,   11703,   14745,   18578, /* 24-31 */
+  23407,  29491,  37156,   46814,   58982,   74313,   93628,  117964, /* 32-39 */
+ 148626, 187257, 235929,  297252,  374514,  471859,  594505,  749029, /* 40-47 */
+ 943718,1189010,1498059, 1887436, 2378021, 2996119, 3774873, 4756042, /* 48-55 */
+5992238,7549747,9512085,11984476,15099494,19024170,23968953,30198988, /* 56-63 */
 };

 const uint8_t x264_exp2_lut[64] = {
@@ -188,27 +190,31 @@ const float x264_log2_lz_lut[32] = {

 // should the intra and inter lambdas be different?
 // I'm just matching the behaviour of deadzone quant.
-static const int x264_trellis_lambda2_tab[2][52] = {
+static const int x264_trellis_lambda2_tab[2][QP_MAX_MAX+1] = {
     // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
-    {    46,      58,      73,      92,     117,     147,
-        185,     233,     294,     370,     466,     587,
-        740,     932,    1174,    1480,    1864,    2349,
-       2959,    3728,    4697,    5918,    7457,    9395,
-      11837,   14914,   18790,   23674,   29828,   37581,
-      47349,   59656,   75163,   94699,  119313,  150326,
-     189399,  238627,  300652,  378798,  477255,  601304,
-     757596,  954511, 1202608, 1515192, 1909022, 2405217,
-    3030384, 3818045, 4810435, 6060769 },
+    {      46,      58,      73,      92,     117,     147,
+          185,     233,     294,     370,     466,     587,
+          740,     932,    1174,    1480,    1864,    2349,
+         2959,    3728,    4697,    5918,    7457,    9395,
+        11837,   14914,   18790,   23674,   29828,   37581,
+        47349,   59656,   75163,   94699,  119313,  150326,
+       189399,  238627,  300652,  378798,  477255,  601304,
+       757596,  954511, 1202608, 1515192, 1909022, 2405217,
+      3030384, 3818045, 4810435, 6060769, 7636091, 9620872,
+     12121539,15272182,19241743,24243077,30544363,38483486,
+     48486154,61088726,76966972,96972308 },
     // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
-    {    27,      34,      43,      54,      68,      86,
-        108,     136,     172,     216,     273,     343,
-        433,     545,     687,     865,    1090,    1374,
-       1731,    2180,    2747,    3461,    4361,    5494,
-       6922,    8721,   10988,   13844,   17442,   21976,
-      27688,   34885,   43953,   55377,   69771,   87906,
-     110755,  139543,  175813,  221511,  279087,  351627,
-     443023,  558174,  703255,  886046, 1116348, 1406511,
-    1772093, 2232697, 2813022, 3544186 }
+    {      27,      34,      43,      54,      68,      86,
+          108,     136,     172,     216,     273,     343,
+          433,     545,     687,     865,    1090,    1374,
+         1731,    2180,    2747,    3461,    4361,    5494,
+         6922,    8721,   10988,   13844,   17442,   21976,
+        27688,   34885,   43953,   55377,   69771,   87906,
+       110755,  139543,  175813,  221511,  279087,  351627,
+       443023,  558174,  703255,  886046, 1116348, 1406511,
+      1772093, 2232697, 2813022, 3544186, 4465396, 5626046,
+      7088374, 8930791,11252092,14176748,17861583,22504184,
+     28353495,35723165,45008368,56706990 }
 };

 static const uint16_t x264_chroma_lambda2_offset_tab[] = {
@@ -237,7 +243,7 @@ static const uint8_t i_sub_mb_p_cost_table[4] = {

 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );

-static uint16_t x264_cost_ref[92][3][33];
+static uint16_t x264_cost_ref[LAMBDA_MAX+1][3][33];
 static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;

 int x264_analyse_init_costs( x264_t *h, int qp )
@@ -275,7 +281,7 @@ fail:

 void x264_analyse_free_costs( x264_t *h )
 {
-    for( int i = 0; i < 92; i++ )
+    for( int i = 0; i < LAMBDA_MAX+1; i++ )
     {
         if( h->cost_mv[i] )
             x264_free( h->cost_mv[i] - 2*4*2048 );
diff --git a/encoder/cabac.c b/encoder/cabac.c
index 8bd40f1..e82d7e9 100644
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -262,9 +262,9 @@ static void x264_cabac_mb_qp_delta( x264_t *h, x264_cabac_t *cb )
     if( i_dqp != 0 )
     {
         int val = i_dqp <= 0 ? (-2*i_dqp) : (2*i_dqp - 1);
-        /* dqp is interpreted modulo 52 */
-        if( val >= 51 && val != 52 )
-            val = 103 - val;
+        /* dqp is interpreted modulo (QP_MAX+1) */
+        if( val >= QP_MAX && val != QP_MAX+1 )
+            val = 2*QP_MAX+1 - val;
         do
         {
             x264_cabac_encode_decision( cb, 60 + ctx, 1 );
@@ -767,15 +767,18 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
         i_mb_pos_tex = x264_cabac_pos( cb );
         h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;

-        memcpy( cb->p, h->mb.pic.p_fenc[0], 256 );
-        cb->p += 256;
-        for( int i = 0; i < 8; i++ )
-            memcpy( cb->p + i*8, h->mb.pic.p_fenc[1] + i*FENC_STRIDE, 8 );
-        cb->p += 64;
-        for( int i = 0; i < 8; i++ )
-            memcpy( cb->p + i*8, h->mb.pic.p_fenc[2] + i*FENC_STRIDE, 8 );
-        cb->p += 64;
+        bs_t s;
+        bs_init( &s, cb->p, cb->p_end - cb->p );

+        for( int i = 0; i < 256; i++ )
+            bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[0][i] );
+        for( int ch = 1; ch < 3; ch++ ) /* chroma: p_fenc[1], p_fenc[2] (8x8 each) */
+            for( int i = 0; i < 8; i++ )
+                for( int j = 0; j < 8; j++ )
+                    bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] );
+
+        bs_flush( &s );
+        cb->p = s.p;
         x264_cabac_encode_init_core( cb );

         h->stat.frame.i_tex_bits += x264_cabac_pos( cb ) - i_mb_pos_tex;
diff --git a/encoder/cavlc.c b/encoder/cavlc.c
index e2f60b1..632ed41 100644
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -66,7 +66,7 @@ static inline int block_residual_write_cavlc_escape( x264_t *h, int i_suffix_len
     bs_t *s = &h->out.bs;
     static const uint16_t next_suffix[7] = { 0, 3, 6, 12, 24, 48, 0xffff };
     int i_level_prefix = 15;
-    int mask = level >> 15;
+    int mask = level >> 31;
     int abs_level = (level^mask)-mask;
     int i_level_code = abs_level*2-mask-2;
     if( ( i_level_code >> i_suffix_length ) < 15 )
@@ -219,10 +219,10 @@ static void cavlc_qp_delta( x264_t *h )

     if( i_dqp )
     {
-        if( i_dqp < -26 )
-            i_dqp += 52;
-        else if( i_dqp > 25 )
-            i_dqp -= 52;
+        if( i_dqp < -(QP_MAX+1)/2 )
+            i_dqp += QP_MAX+1;
+        else if( i_dqp > QP_MAX/2 )
+            i_dqp -= QP_MAX+1;
     }
     bs_write_se( s, i_dqp );
 }
@@ -309,14 +309,12 @@ void x264_macroblock_write_cavlc( x264_t *h )

         bs_align_0( s );

-        memcpy( s->p, h->mb.pic.p_fenc[0], 256 );
-        s->p += 256;
-        for( int i = 0; i < 8; i++ )
-            memcpy( s->p + i*8, h->mb.pic.p_fenc[1] + i*FENC_STRIDE, 8 );
-        s->p += 64;
-        for( int i = 0; i < 8; i++ )
-            memcpy( s->p + i*8, h->mb.pic.p_fenc[2] + i*FENC_STRIDE, 8 );
-        s->p += 64;
+        for( int i = 0; i < 256; i++ )
+            bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[0][i] );
+        for( int ch = 1; ch < 3; ch++ ) /* chroma: p_fenc[1], p_fenc[2] (8x8 each) */
+            for( int i = 0; i < 8; i++ )
+                for( int j = 0; j < 8; j++ )
+                    bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] );

         bs_init( s, s->p, s->p_end - s->p );
         s->p_start = p_start;
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 31cb84a..f7e0e38 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -51,7 +51,7 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
  ****************************************************************************/
 static float x264_psnr( int64_t i_sqe, int64_t i_size )
 {
-    double f_mse = (double)i_sqe / ((double)65025.0 * (double)i_size);
+    double f_mse = (double)i_sqe / (PIXEL_MAX*PIXEL_MAX * (double)i_size);
     if( f_mse <= 0.0000000001 ) /* Max 100dB */
         return 100;

@@ -68,11 +68,13 @@ static void x264_frame_dump( x264_t *h )
     FILE *f = fopen( h->param.psz_dump_yuv, "r+b" );
     if( !f )
         return;
+    int bytes_per_pixel = (BIT_DEPTH+7)/8;
     /* Write the frame in display order */
-    fseek( f, (uint64_t)h->fdec->i_frame * h->param.i_height * h->param.i_width * 3/2, SEEK_SET );
+    fseek( f, (uint64_t)h->fdec->i_frame * h->param.i_height * h->param.i_width * 3/2 * bytes_per_pixel, SEEK_SET );
     for( int i = 0; i < h->fdec->i_plane; i++ )
         for( int y = 0; y < h->param.i_height >> !!i; y++ )
-            fwrite( &h->fdec->plane[i][y*h->fdec->i_stride[i]], 1, h->param.i_width >> !!i, f );
+            for( int j = 0; j < h->param.i_width >> !!i; j++ )
+                fwrite( &h->fdec->plane[i][y*h->fdec->i_stride[i]]+j, bytes_per_pixel, 1, f );
     fclose( f );
 }

@@ -469,8 +471,8 @@ static int x264_validate_parameters( x264_t *h )
         x264_log( h, X264_LOG_ERROR, "no ratecontrol method specified\n" );
         return -1;
     }
-    h->param.rc.f_rf_constant = x264_clip3f( h->param.rc.f_rf_constant, 0, 51 );
-    h->param.rc.i_qp_constant = x264_clip3( h->param.rc.i_qp_constant, 0, 51 );
+    h->param.rc.f_rf_constant = x264_clip3f( h->param.rc.f_rf_constant, 0, QP_MAX );
+    h->param.rc.i_qp_constant = x264_clip3( h->param.rc.i_qp_constant, 0, QP_MAX );
     if( h->param.rc.i_rc_method == X264_RC_CRF )
     {
         h->param.rc.i_qp_constant = h->param.rc.f_rf_constant;
@@ -502,12 +504,12 @@ static int x264_validate_parameters( x264_t *h )
         float qp_p = h->param.rc.i_qp_constant;
         float qp_i = qp_p - 6*log2f( h->param.rc.f_ip_factor );
         float qp_b = qp_p + 6*log2f( h->param.rc.f_pb_factor );
-        h->param.rc.i_qp_min = x264_clip3( (int)(X264_MIN3( qp_p, qp_i, qp_b )), 0, 51 );
-        h->param.rc.i_qp_max = x264_clip3( (int)(X264_MAX3( qp_p, qp_i, qp_b ) + .999), 0, 51 );
+        h->param.rc.i_qp_min = x264_clip3( (int)(X264_MIN3( qp_p, qp_i, qp_b )), 0, QP_MAX );
+        h->param.rc.i_qp_max = x264_clip3( (int)(X264_MAX3( qp_p, qp_i, qp_b ) + .999), 0, QP_MAX );
         h->param.rc.i_aq_mode = 0;
         h->param.rc.b_mb_tree = 0;
     }
-    h->param.rc.i_qp_max = x264_clip3( h->param.rc.i_qp_max, 0, 51 );
+    h->param.rc.i_qp_max = x264_clip3( h->param.rc.i_qp_max, 0, QP_MAX );
     h->param.rc.i_qp_min = x264_clip3( h->param.rc.i_qp_min, 0, h->param.rc.i_qp_max );
     if( h->param.rc.i_vbv_buffer_size )
     {
@@ -1054,8 +1056,9 @@ x264_t *x264_encoder_open( x264_param_t *param )
     if( x264_analyse_init_costs( h, X264_LOOKAHEAD_QP ) )
         goto fail;

+    static const uint16_t cost_mv_correct[7] = { 24, 47, 95, 189, 379, 757, 1515 };
     /* Checks for known miscompilation issues. */
-    if( h->cost_mv[1][2013] != 24 )
+    if( h->cost_mv[x264_lambda_tab[X264_LOOKAHEAD_QP]][2013] != cost_mv_correct[BIT_DEPTH-8] )
     {
         x264_log( h, X264_LOG_ERROR, "MV cost test failed: x264 has been miscompiled!\n" );
         goto fail;
@@ -1147,11 +1150,22 @@ x264_t *x264_encoder_open( x264_param_t *param )
         fclose( f );
     }

-    x264_log( h, X264_LOG_INFO, "profile %s, level %d.%d\n",
-        h->sps->i_profile_idc == PROFILE_BASELINE ? "Baseline" :
-        h->sps->i_profile_idc == PROFILE_MAIN ? "Main" :
-        h->sps->i_profile_idc == PROFILE_HIGH ? "High" :
-        "High 4:4:4 Predictive", h->sps->i_level_idc/10, h->sps->i_level_idc%10 );
+    const char *profile = h->sps->i_profile_idc == PROFILE_BASELINE ? "Baseline" :
+                          h->sps->i_profile_idc == PROFILE_MAIN ? "Main" :
+                          h->sps->i_profile_idc == PROFILE_HIGH ? "High" :
+                          h->sps->i_profile_idc == PROFILE_HIGH10 ? "High 10" :
+                          "High 4:4:4 Predictive";
+
+    if( h->sps->i_profile_idc < PROFILE_HIGH10 )
+    {
+        x264_log( h, X264_LOG_INFO, "profile %s, level %d.%d\n",
+            profile, h->sps->i_level_idc/10, h->sps->i_level_idc%10 );
+    }
+    else
+    {
+        x264_log( h, X264_LOG_INFO, "profile %s, level %d.%d, bit depth %d\n",
+            profile, h->sps->i_level_idc/10, h->sps->i_level_idc%10, BIT_DEPTH );
+    }

     return h;
 fail:
@@ -1836,7 +1850,7 @@ static int x264_slice_write( x264_t *h )
         bs_align_1( &h->out.bs );

         /* init cabac */
-        x264_cabac_context_init( &h->cabac, h->sh.i_type, h->sh.i_qp, h->sh.i_cabac_init_idc );
+        x264_cabac_context_init( &h->cabac, h->sh.i_type, x264_clip3( h->sh.i_qp-QP_BD_OFFSET, 0, 51 ), h->sh.i_cabac_init_idc );
         x264_cabac_encode_init ( &h->cabac, h->out.bs.p, h->out.bs.p_end );
     }
     h->mb.i_last_qp = h->sh.i_qp;
@@ -2705,6 +2719,7 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
     for( int i = 0; i < 3; i++ )
     {
         pic_out->img.i_stride[i] = h->fdec->i_stride[i];
+        // FIXME This breaks the API when pixel != uint8_t.
         pic_out->img.plane[i] = h->fdec->plane[i];
     }

diff --git a/encoder/macroblock.h b/encoder/macroblock.h
index b1b02fa..7c83344 100644
--- a/encoder/macroblock.h
+++ b/encoder/macroblock.h
@@ -26,8 +26,8 @@

 #include "common/macroblock.h"

-extern const int x264_lambda2_tab[52];
-extern const uint8_t x264_lambda_tab[52];
+extern const int x264_lambda2_tab[QP_MAX_MAX+1];
+extern const uint16_t x264_lambda_tab[QP_MAX_MAX+1];

 void x264_rdo_init( void );

diff --git a/encoder/me.h b/encoder/me.h
index 912b05d..b125f3d 100644
--- a/encoder/me.h
+++ b/encoder/me.h
@@ -68,7 +68,7 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei
 void x264_me_refine_bidir_satd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight );
 uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel );

-extern uint16_t *x264_cost_mv_fpel[92][4];
+extern uint16_t *x264_cost_mv_fpel[LAMBDA_MAX+1][4];

 #define COPY1_IF_LT(x,y)\
 if((y)<(x))\
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index 6fdaa98..bdf44dc 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -219,7 +219,7 @@ static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x2
     uint32_t ssd = res >> 32;
     frame->i_pixel_sum[i] += sum;
     frame->i_pixel_ssd[i] += ssd;
-    return ssd - (sum * sum >> shift);
+    return ssd - ((uint64_t)sum * sum >> shift);
 }

 // Find the total AC energy of the block in all planes.
@@ -300,7 +300,7 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_off
             avg_adj /= h->mb.i_mb_count;
             avg_adj_pow2 /= h->mb.i_mb_count;
             strength = h->param.rc.f_aq_strength * avg_adj;
-            avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - 14.f) / avg_adj;
+            avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (14.f + 2*(BIT_DEPTH-8))) / avg_adj;
         }
         else
             strength = h->param.rc.f_aq_strength * 1.0397f;
@@ -318,7 +318,7 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_off
                 else
                 {
                     uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
-                    qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
+                    qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - (14.427f + 2*(BIT_DEPTH-8)));
                 }
                 if( quant_offsets )
                     qp_adj += quant_offsets[mb_xy];
@@ -620,8 +620,8 @@ int x264_ratecontrol_new( x264_t *h )
     rc->ip_offset = 6.0 * log2f( h->param.rc.f_ip_factor );
     rc->pb_offset = 6.0 * log2f( h->param.rc.f_pb_factor );
     rc->qp_constant[SLICE_TYPE_P] = h->param.rc.i_qp_constant;
-    rc->qp_constant[SLICE_TYPE_I] = x264_clip3( h->param.rc.i_qp_constant - rc->ip_offset + 0.5, 0, 51 );
-    rc->qp_constant[SLICE_TYPE_B] = x264_clip3( h->param.rc.i_qp_constant + rc->pb_offset + 0.5, 0, 51 );
+    rc->qp_constant[SLICE_TYPE_I] = x264_clip3( h->param.rc.i_qp_constant - rc->ip_offset + 0.5, 0, QP_MAX );
+    rc->qp_constant[SLICE_TYPE_B] = x264_clip3( h->param.rc.i_qp_constant + rc->pb_offset + 0.5, 0, QP_MAX );
     h->mb.ip_offset = rc->ip_offset + 0.5;

     rc->lstep = pow( 2, h->param.rc.i_qp_step / 6.0 );
@@ -1231,7 +1231,7 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp, int overhead )

     rc->qpa_rc =
     rc->qpa_aq = 0;
-    rc->qp = x264_clip3( (int)(q + 0.5), 0, 51 );
+    rc->qp = x264_clip3( (int)(q + 0.5), 0, QP_MAX );
     h->fdec->f_qp_avg_rc =
     h->fdec->f_qp_avg_aq =
     rc->qpm = q;
@@ -1416,9 +1416,9 @@ int x264_ratecontrol_slice_type( x264_t *h, int frame_num )
              * So just calculate the average QP used so far. */
             h->param.rc.i_qp_constant = (h->stat.i_frame_count[SLICE_TYPE_P] == 0) ? 24
                                       : 1 + h->stat.f_frame_qp[SLICE_TYPE_P] / h->stat.i_frame_count[SLICE_TYPE_P];
-            rc->qp_constant[SLICE_TYPE_P] = x264_clip3( h->param.rc.i_qp_constant, 0, 51 );
-            rc->qp_constant[SLICE_TYPE_I] = x264_clip3( (int)( qscale2qp( qp2qscale( h->param.rc.i_qp_constant ) / fabs( h->param.rc.f_ip_factor )) + 0.5 ), 0, 51 );
-            rc->qp_constant[SLICE_TYPE_B] = x264_clip3( (int)( qscale2qp( qp2qscale( h->param.rc.i_qp_constant ) * fabs( h->param.rc.f_pb_factor )) + 0.5 ), 0, 51 );
+            rc->qp_constant[SLICE_TYPE_P] = x264_clip3( h->param.rc.i_qp_constant, 0, QP_MAX );
+            rc->qp_constant[SLICE_TYPE_I] = x264_clip3( (int)( qscale2qp( qp2qscale( h->param.rc.i_qp_constant ) / fabs( h->param.rc.f_ip_factor )) + 0.5 ), 0, QP_MAX );
+            rc->qp_constant[SLICE_TYPE_B] = x264_clip3( (int)( qscale2qp( qp2qscale( h->param.rc.i_qp_constant ) * fabs( h->param.rc.f_pb_factor )) + 0.5 ), 0, QP_MAX );

             x264_log(h, X264_LOG_ERROR, "2nd pass has more frames than 1st pass (%d)\n", rc->num_entries);
             x264_log(h, X264_LOG_ERROR, "continuing anyway, at constant QP=%d\n", h->param.rc.i_qp_constant);
@@ -2652,7 +2652,7 @@ static int init_pass2( x264_t *h )
         }
         else if( expected_bits > all_available_bits && avgq > h->param.rc.i_qp_max - 2 )
         {
-            if( h->param.rc.i_qp_max < 51 )
+            if( h->param.rc.i_qp_max < QP_MAX )
                 x264_log( h, X264_LOG_WARNING, "try increasing target bitrate or increasing qp_max (currently %d)\n", h->param.rc.i_qp_max );
             else
                 x264_log( h, X264_LOG_WARNING, "try increasing target bitrate\n");
diff --git a/encoder/rdo.c b/encoder/rdo.c
index afaa894..4fae811 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -443,10 +443,7 @@ static ALWAYS_INLINE int quant_trellis_cabac( x264_t *h, dctcoef *dct,
         /* We only need to zero an empty 4x4 block. 8x8 can be
            implicitly emptied via zero nnz, as can dc. */
         if( i_coefs == 16 && !dc )
-        {
-            M128( &dct[0] ) = M128_ZERO;
-            M128( &dct[8] ) = M128_ZERO;
-        }
+            memset( dct, 0, 16 * sizeof(dctcoef) );
         return 0;
     }

@@ -613,10 +610,7 @@ static ALWAYS_INLINE int quant_trellis_cabac( x264_t *h, dctcoef *dct,
     if( bnode == &nodes_cur[0] )
     {
         if( i_coefs == 16 && !dc )
-        {
-            M128( &dct[0] ) = M128_ZERO;
-            M128( &dct[8] ) = M128_ZERO;
-        }
+            memset( dct, 0, 16 * sizeof(dctcoef) );
         return 0;
     }

diff --git a/encoder/set.c b/encoder/set.c
index 9e6e736..55d6df7 100644
--- a/encoder/set.c
+++ b/encoder/set.c
@@ -104,6 +104,8 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
     sps->b_qpprime_y_zero_transform_bypass = param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant == 0;
     if( sps->b_qpprime_y_zero_transform_bypass )
         sps->i_profile_idc  = PROFILE_HIGH444_PREDICTIVE;
+    else if( BIT_DEPTH > 8 )
+        sps->i_profile_idc  = PROFILE_HIGH10;
     else if( param->analyse.b_transform_8x8 || param->i_cqm_preset != X264_CQM_FLAT )
         sps->i_profile_idc  = PROFILE_HIGH;
     else if( param->b_cabac || param->i_bframe > 0 || param->b_interlaced || param->b_fake_interlaced || param->analyse.i_weighted_pred > 0 )
@@ -260,8 +262,8 @@ void x264_sps_write( bs_t *s, x264_sps_t *sps )
     if( sps->i_profile_idc >= PROFILE_HIGH )
     {
         bs_write_ue( s, 1 ); // chroma_format_idc = 4:2:0
-        bs_write_ue( s, 0 ); // bit_depth_luma_minus8
-        bs_write_ue( s, 0 ); // bit_depth_chroma_minus8
+        bs_write_ue( s, BIT_DEPTH-8 ); // bit_depth_luma_minus8
+        bs_write_ue( s, BIT_DEPTH-8 ); // bit_depth_chroma_minus8
         bs_write( s, 1, sps->b_qpprime_y_zero_transform_bypass );
         bs_write( s, 1, 0 ); // seq_scaling_matrix_present_flag
     }
@@ -488,7 +490,7 @@ void x264_pps_write( bs_t *s, x264_pps_t *pps )
     bs_write( s, 1, pps->b_weighted_pred );
     bs_write( s, 2, pps->b_weighted_bipred );

-    bs_write_se( s, pps->i_pic_init_qp - 26 );
+    bs_write_se( s, pps->i_pic_init_qp - 26 - QP_BD_OFFSET );
     bs_write_se( s, pps->i_pic_init_qs - 26 );
     bs_write_se( s, pps->i_chroma_qp_index_offset );

diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index 7d69b71..ad2a8c2 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -303,7 +303,7 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
                                   (mv1)[0], (mv1)[1], 8, 8, w ); \
             h->mc.avg[PIXEL_8x8]( pix1, 16, src1, stride1, src2, stride2, i_bipred_weight ); \
         } \
-        i_cost = penalty + h->pixf.mbcmp[PIXEL_8x8]( \
+        i_cost = penalty * a->i_lambda + h->pixf.mbcmp[PIXEL_8x8]( \
                            m[0].p_fenc[0], FENC_STRIDE, pix1, 16 ); \
         COPY2_IF_LT( i_bcost, i_cost, list_used, 3 ); \
     }
@@ -393,9 +393,9 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
             }

             x264_me_search( h, &m[l], mvc, i_mvc );
-            m[l].cost -= 2; // remove mvcost from skip mbs
+            m[l].cost -= 2 * a->i_lambda; // remove mvcost from skip mbs
             if( M32( m[l].mv ) )
-                m[l].cost += 5;
+                m[l].cost += 5 * a->i_lambda;

 skip_motionest:
             CP32( fenc_mvs[l], m[l].mv );
@@ -418,7 +418,7 @@ lowres_intra_mb:
         ALIGNED_ARRAY_16( pixel, edge,[33] );
         pixel *pix = &pix1[8+FDEC_STRIDE - 1];
         pixel *src = &fenc->lowres[0][i_pel_offset - 1];
-        const int intra_penalty = 5;
+        const int intra_penalty = 5 * a->i_lambda;
         int satds[3];

         memcpy( pix-FDEC_STRIDE, src-i_stride, 17 * sizeof(pixel) );
@@ -496,7 +496,7 @@ lowres_intra_mb:
         }
     }

-    fenc->lowres_costs[b-p0][p1-b][i_mb_xy] = i_bcost + (list_used << LOWRES_COST_SHIFT);
+    fenc->lowres_costs[b-p0][p1-b][i_mb_xy] = X264_MIN( i_bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT);
 }
 #undef TRY_BIDIR

diff --git a/tools/checkasm.c b/tools/checkasm.c
index 7fa2c0c..a5ffa17 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -40,8 +40,10 @@
 uint8_t *buf1, *buf2;
 /* buf3, buf4: used to store output */
 uint8_t *buf3, *buf4;
-/* pbuf*: point to the same memory as above, just for type convenience */
-pixel *pbuf1, *pbuf2, *pbuf3, *pbuf4;
+/* pbuf1, pbuf2: initialised to random pixel data; tests shouldn't write into them. */
+pixel *pbuf1, *pbuf2;
+/* pbuf3, pbuf4: point to buf3, buf4, just for type convenience */
+pixel *pbuf3, *pbuf4;

 int quiet = 0;

@@ -256,11 +258,15 @@ static int check_pixel( int cpu_ref, int cpu_new )
         int z = i|(i>>4);
         z ^= z>>2;
         z ^= z>>1;
-        buf3[i] = ~(buf4[i] = -(z&1));
+        pbuf4[i] = -(z&1) & PIXEL_MAX;
+        pbuf3[i] = ~pbuf4[i] & PIXEL_MAX;
     }
     // random pattern made of maxed pixel differences, in case an intermediate value overflows
     for( int i = 256; i < 0x1000; i++ )
-        buf3[i] = ~(buf4[i] = -(buf1[i&~0x88]&1));
+    {
+        pbuf4[i] = -(pbuf1[i&~0x88]&1) & PIXEL_MAX;
+        pbuf3[i] = ~(pbuf4[i]) & PIXEL_MAX;
+    }

 #define TEST_PIXEL( name, align ) \
     ok = 1, used_asm = 0; \
@@ -535,22 +541,22 @@ static int check_dct( int cpu_ref, int cpu_new )
         used_asm = 1; \
         call_c( dct_c.name, t1, pbuf1, pbuf2 ); \
         call_a( dct_asm.name, t2, pbuf1, pbuf2 ); \
-        if( memcmp( t1, t2, size ) ) \
+        if( memcmp( t1, t2, size*sizeof(dctcoef) ) ) \
         { \
             ok = 0; \
             fprintf( stderr, #name " [FAILED]\n" ); \
         } \
     }
     ok = 1; used_asm = 0;
-    TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16*2 );
-    TEST_DCT( sub8x8_dct, dct1, dct2, 16*2*4 );
-    TEST_DCT( sub8x8_dct_dc, dctdc[0], dctdc[1], 4*2 );
-    TEST_DCT( sub16x16_dct, dct1, dct2, 16*2*16 );
+    TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16 );
+    TEST_DCT( sub8x8_dct, dct1, dct2, 16*4 );
+    TEST_DCT( sub8x8_dct_dc, dctdc[0], dctdc[1], 4 );
+    TEST_DCT( sub16x16_dct, dct1, dct2, 16*16 );
     report( "sub_dct4 :" );

     ok = 1; used_asm = 0;
-    TEST_DCT( sub8x8_dct8, (void*)dct1[0], (void*)dct2[0], 64*2 );
-    TEST_DCT( sub16x16_dct8, (void*)dct1, (void*)dct2, 64*2*4 );
+    TEST_DCT( sub8x8_dct8, (void*)dct1[0], (void*)dct2[0], 64 );
+    TEST_DCT( sub16x16_dct8, (void*)dct1, (void*)dct2, 64*4 );
     report( "sub_dct8 :" );
 #undef TEST_DCT

@@ -574,13 +580,13 @@ static int check_dct( int cpu_ref, int cpu_new )
     { \
         set_func_name( #name ); \
         used_asm = 1; \
-        memcpy( buf3, buf1, 32*32 * sizeof(pixel) ); \
-        memcpy( buf4, buf1, 32*32 * sizeof(pixel) ); \
-        memcpy( dct1, src, 512 * sizeof(pixel) ); \
-        memcpy( dct2, src, 512 * sizeof(pixel) ); \
+        memcpy( pbuf3, pbuf1, 32*32 * sizeof(pixel) ); \
+        memcpy( pbuf4, pbuf1, 32*32 * sizeof(pixel) ); \
+        memcpy( dct1, src, 256 * sizeof(dctcoef) ); \
+        memcpy( dct2, src, 256 * sizeof(dctcoef) ); \
         call_c1( dct_c.name, pbuf3, (void*)dct1 ); \
         call_a1( dct_asm.name, pbuf4, (void*)dct2 ); \
-        if( memcmp( buf3, buf4, 32*32 * sizeof(pixel) ) ) \
+        if( memcmp( pbuf3, pbuf4, 32*32 * sizeof(pixel) ) ) \
         { \
             ok = 0; \
             fprintf( stderr, #name " [FAILED]\n" ); \
@@ -615,10 +621,10 @@ static int check_dct( int cpu_ref, int cpu_new )
                 dct1[0][j] = !i ? (j^j>>1^j>>2^j>>3)&1 ? 4080 : -4080 /* max dc */\
                            : i<8 ? (*p++)&1 ? 4080 : -4080 /* max elements */\
                            : ((*p++)&0x1fff)-0x1000; /* general case */\
-            memcpy( dct2, dct1, 32 );\
+            memcpy( dct2, dct1, 16 * sizeof(dctcoef) );\
             call_c1( dct_c.name, dct1[0] );\
             call_a1( dct_asm.name, dct2[0] );\
-            if( memcmp( dct1, dct2, 32 ) )\
+            if( memcmp( dct1, dct2, 16 * sizeof(dctcoef) ) )\
                 ok = 0;\
         }\
         call_c2( dct_c.name, dct1[0] );\
@@ -658,11 +664,11 @@ static int check_dct( int cpu_ref, int cpu_new )
         int nz_a, nz_c; \
         set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
         used_asm = 1; \
-        memcpy( buf3, buf1, 16*FDEC_STRIDE * sizeof(pixel) ); \
-        memcpy( buf4, buf1, 16*FDEC_STRIDE * sizeof(pixel) ); \
+        memcpy( pbuf3, pbuf1, 16*FDEC_STRIDE * sizeof(pixel) ); \
+        memcpy( pbuf4, pbuf1, 16*FDEC_STRIDE * sizeof(pixel) ); \
         nz_c = call_c1( zigzag_c.name, t1, pbuf2, pbuf3 ); \
         nz_a = call_a1( zigzag_asm.name, t2, pbuf2, pbuf4 ); \
-        if( memcmp( t1, t2, size*sizeof(dctcoef) )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) || nz_c != nz_a ) \
+        if( memcmp( t1, t2, size*sizeof(dctcoef) ) || memcmp( pbuf3, pbuf4, 16*FDEC_STRIDE*sizeof(pixel) ) || nz_c != nz_a ) \
         { \
             ok = 0; \
             fprintf( stderr, #name " [FAILED]\n" ); \
@@ -680,8 +686,8 @@ static int check_dct( int cpu_ref, int cpu_new )
         used_asm = 1; \
         for( int i = 0; i < 2; i++ ) \
         { \
-            memcpy( buf3, buf2, 16*FDEC_STRIDE * sizeof(pixel) ); \
-            memcpy( buf4, buf2, 16*FDEC_STRIDE * sizeof(pixel) ); \
+            memcpy( pbuf3, pbuf2, 16*FDEC_STRIDE * sizeof(pixel) ); \
+            memcpy( pbuf4, pbuf2, 16*FDEC_STRIDE * sizeof(pixel) ); \
             for( int j = 0; j < 4; j++ ) \
             { \
                 memcpy( pbuf3 + j*FDEC_STRIDE, (i?pbuf1:pbuf2) + j*FENC_STRIDE, 4 * sizeof(pixel) ); \
@@ -689,7 +695,7 @@ static int check_dct( int cpu_ref, int cpu_new )
             } \
             nz_c = call_c1( zigzag_c.name, t1, pbuf2, pbuf3, &dc_c ); \
             nz_a = call_a1( zigzag_asm.name, t2, pbuf2, pbuf4, &dc_a ); \
-            if( memcmp( t1+1, t2+1, 15*sizeof(dctcoef) ) || memcmp( buf3, buf4, 16*FDEC_STRIDE * sizeof(pixel) ) || nz_c != nz_a || dc_c != dc_a ) \
+            if( memcmp( t1+1, t2+1, 15*sizeof(dctcoef) ) || memcmp( pbuf3, pbuf4, 16*FDEC_STRIDE * sizeof(pixel) ) || nz_c != nz_a || dc_c != dc_a ) \
             { \
                 ok = 0; \
                 fprintf( stderr, #name " [FAILED]\n" ); \
@@ -779,11 +785,11 @@ static int check_mc( int cpu_ref, int cpu_new )
             const x264_weight_t *weight = weight_none; \
             set_func_name( "mc_luma_%dx%d", w, h ); \
             used_asm = 1; \
-            memset( buf3, 0xCD, 1024 ); \
-            memset( buf4, 0xCD, 1024 ); \
+            for( int i = 0; i < 1024; i++ ) \
+                pbuf3[i] = pbuf4[i] = 0xCD; \
             call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h, weight ); \
             call_a( mc_a.mc_luma, dst2, 32, src2, 64, dx, dy, w, h, weight ); \
-            if( memcmp( buf3, buf4, 1024 ) ) \
+            if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
             { \
                 fprintf( stderr, "mc_luma[mv(%d,%d) %2dx%-2d]     [FAILED]\n", dx, dy, w, h ); \
                 ok = 0; \
@@ -796,8 +802,8 @@ static int check_mc( int cpu_ref, int cpu_new )
             const x264_weight_t *weight = weight_none; \
             set_func_name( "get_ref_%dx%d", w, h ); \
             used_asm = 1; \
-            memset( buf3, 0xCD, 1024 ); \
-            memset( buf4, 0xCD, 1024 ); \
+            for( int i = 0; i < 1024; i++ ) \
+                pbuf3[i] = pbuf4[i] = 0xCD; \
             call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h, weight ); \
             ref = (pixel*)call_a( mc_a.get_ref, ref, &ref_stride, src2, 64, dx, dy, w, h, weight ); \
             for( int i = 0; i < h; i++ ) \
@@ -814,15 +820,15 @@ static int check_mc( int cpu_ref, int cpu_new )
         { \
             set_func_name( "mc_chroma_%dx%d", w, h ); \
             used_asm = 1; \
-            memset( buf3, 0xCD, 1024 ); \
-            memset( buf4, 0xCD, 1024 ); \
+            for( int i = 0; i < 1024; i++ ) \
+                pbuf3[i] = pbuf4[i] = 0xCD; \
             call_c( mc_c.mc_chroma, dst1, 16, src, 64, dx, dy, w, h ); \
             call_a( mc_a.mc_chroma, dst2, 16, src, 64, dx, dy, w, h ); \
             /* mc_chroma width=2 may write garbage to the right of dst. ignore that. */ \
             for( int j = 0; j < h; j++ ) \
                 for( int i = w; i < 4; i++ ) \
                     dst2[i+j*16] = dst1[i+j*16]; \
-            if( memcmp( buf3, buf4, 1024 ) ) \
+            if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
             { \
                 fprintf( stderr, "mc_chroma[mv(%d,%d) %2dx%-2d]     [FAILED]\n", dx, dy, w, h ); \
                 ok = 0; \
@@ -867,15 +873,15 @@ static int check_mc( int cpu_ref, int cpu_new )
     ok = 1, used_asm = 0; \
     for( int i = 0; i < 10; i++ ) \
     { \
-        memcpy( buf3, pbuf1+320, 320 * sizeof(pixel) ); \
-        memcpy( buf4, pbuf1+320, 320 * sizeof(pixel) ); \
+        memcpy( pbuf3, pbuf1+320, 320 * sizeof(pixel) ); \
+        memcpy( pbuf4, pbuf1+320, 320 * sizeof(pixel) ); \
         if( mc_a.name[i] != mc_ref.name[i] ) \
         { \
             set_func_name( "%s_%s", #name, pixel_names[i] ); \
             used_asm = 1; \
             call_c1( mc_c.name[i], pbuf3, 16, pbuf2+1, 16, pbuf1+18, 16, weight ); \
             call_a1( mc_a.name[i], pbuf4, 16, pbuf2+1, 16, pbuf1+18, 16, weight ); \
-            if( memcmp( buf3, buf4, 320 * sizeof(pixel) ) ) \
+            if( memcmp( pbuf3, pbuf4, 320 * sizeof(pixel) ) ) \
             { \
                 ok = 0; \
                 fprintf( stderr, #name "[%d]: [FAILED]\n", i ); \
@@ -971,8 +977,8 @@ static int check_mc( int cpu_ref, int cpu_new )
         void *tmp = pbuf3+49*64;
         set_func_name( "hpel_filter" );
         ok = 1; used_asm = 1;
-        memset( buf3, 0, 4096 * sizeof(pixel) );
-        memset( buf4, 0, 4096 * sizeof(pixel) );
+        memset( pbuf3, 0, 4096 * sizeof(pixel) );
+        memset( pbuf4, 0, 4096 * sizeof(pixel) );
         call_c( mc_c.hpel_filter, dstc[0], dstc[1], dstc[2], srchpel, 64, 48, 10, tmp );
         call_a( mc_a.hpel_filter, dsta[0], dsta[1], dsta[2], srchpel, 64, 48, 10, tmp );
         for( int i = 0; i < 3; i++ )
@@ -1030,13 +1036,13 @@ static int check_mc( int cpu_ref, int cpu_new )
         int stride = 80;\
         set_func_name( #name );\
         used_asm = 1;\
-        memcpy( buf3, buf1, size*2*stride * sizeof(pixel) );\
-        memcpy( buf4, buf1, size*2*stride * sizeof(pixel) );\
-        uint16_t *sum = (uint16_t*)buf3;\
+        memcpy( pbuf3, pbuf1, size*2*stride * sizeof(pixel) );\
+        memcpy( pbuf4, pbuf1, size*2*stride * sizeof(pixel) );\
+        uint16_t *sum = (uint16_t*)pbuf3;\
         call_c1( mc_c.name, __VA_ARGS__ );\
-        sum = (uint16_t*)buf4;\
+        sum = (uint16_t*)pbuf4;\
         call_a1( mc_a.name, __VA_ARGS__ );\
-        if( memcmp( buf3, buf4, (stride-8)*2 * sizeof(pixel) )\
+        if( memcmp( pbuf3, pbuf4, (stride-8)*2 * sizeof(pixel) )\
             || (size>9 && memcmp( pbuf3+18*stride, pbuf4+18*stride, (stride-8)*2 * sizeof(pixel) )))\
             ok = 0;\
         call_c2( mc_c.name, __VA_ARGS__ );\
@@ -1096,11 +1102,11 @@ static int check_deblock( int cpu_ref, int cpu_new )
     /* not exactly the real values of a,b,tc but close enough */
     for( int i = 35, a = 255, c = 250; i >= 0; i-- )
     {
-        alphas[i] = a;
-        betas[i] = (i+1)/2;
-        tcs[i][0] = tcs[i][3] = (c+6)/10;
-        tcs[i][1] = (c+7)/15;
-        tcs[i][2] = (c+9)/20;
+        alphas[i] = a << (BIT_DEPTH-8);
+        betas[i] = (i+1)/2 << (BIT_DEPTH-8);
+        tcs[i][0] = tcs[i][3] = (c+6)/10 << (BIT_DEPTH-8);
+        tcs[i][1] = (c+7)/15 << (BIT_DEPTH-8);
+        tcs[i][2] = (c+9)/20 << (BIT_DEPTH-8);
         a = a*9/10;
         c = c*9/10;
     }
@@ -1111,15 +1117,15 @@ static int check_deblock( int cpu_ref, int cpu_new )
         int off = 8*32 + (i&15)*4*!align; /* benchmark various alignments of h filter */ \
         for( int j = 0; j < 1024; j++ ) \
             /* two distributions of random to excersize different failure modes */ \
-            buf3[j] = rand() & (i&1 ? 0xf : 0xff ); \
-        memcpy( buf4, buf3, 1024 * sizeof(pixel) ); \
+            pbuf3[j] = rand() & (i&1 ? 0xf : PIXEL_MAX ); \
+        memcpy( pbuf4, pbuf3, 1024 * sizeof(pixel) ); \
         if( db_a.name != db_ref.name ) \
         { \
             set_func_name( #name ); \
             used_asm = 1; \
             call_c1( db_c.name, pbuf3+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
             call_a1( db_a.name, pbuf4+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
-            if( memcmp( buf3, buf4, 1024 * sizeof(pixel) ) ) \
+            if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
             { \
                 ok = 0; \
                 fprintf( stderr, #name "(a=%d, b=%d): [FAILED]\n", alphas[i], betas[i] ); \
@@ -1200,7 +1206,7 @@ static int check_quant( int cpu_ref, int cpu_new )
     h->pps = h->pps_array;
     x264_param_default( &h->param );
     h->chroma_qp_table = i_chroma_qp_table + 12;
-    h->param.rc.i_qp_min = 26;
+    h->param.rc.i_qp_min = 26 + QP_BD_OFFSET;
     h->param.analyse.b_transform_8x8 = 1;

     for( int i_cqm = 0; i_cqm < 4; i_cqm++ )
@@ -1219,9 +1225,10 @@ static int check_quant( int cpu_ref, int cpu_new )
         }
         else
         {
+            int max_scale = BIT_DEPTH < 10 ? 255 : 228;
             if( i_cqm == 2 )
                 for( int i = 0; i < 64; i++ )
-                    cqm_buf[i] = 10 + rand() % 246;
+                    cqm_buf[i] = 10 + rand() % (max_scale - 9);
             else
                 for( int i = 0; i < 64; i++ )
                     cqm_buf[i] = 1;
@@ -1260,7 +1267,7 @@ static int check_quant( int cpu_ref, int cpu_new )
         { \
             set_func_name( #name ); \
             used_asms[0] = 1; \
-            for( int qp = 51; qp > 0; qp-- ) \
+            for( int qp = QP_MAX; qp > 0; qp-- ) \
             { \
                 for( int j = 0; j < 2; j++ ) \
                 { \
@@ -1269,7 +1276,7 @@ static int check_quant( int cpu_ref, int cpu_new )
                         dct1[i] = dct2[i] = j ? (rand() & 0x1fff) - 0xfff : 0; \
                     result_c = call_c1( qf_c.name, dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
                     result_a = call_a1( qf_a.name, dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
-                    if( memcmp( dct1, dct2, 16*2 ) || result_c != result_a ) \
+                    if( memcmp( dct1, dct2, 16*sizeof(dctcoef) ) || result_c != result_a ) \
                     { \
                         oks[0] = 0; \
                         fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \
@@ -1286,14 +1293,14 @@ static int check_quant( int cpu_ref, int cpu_new )
         { \
             set_func_name( #qname ); \
             used_asms[0] = 1; \
-            for( int qp = 51; qp > 0; qp-- ) \
+            for( int qp = QP_MAX; qp > 0; qp-- ) \
             { \
                 for( int j = 0; j < 2; j++ ) \
                 { \
                     INIT_QUANT##w(j) \
                     int result_c = call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
                     int result_a = call_a1( qf_a.qname, dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
-                    if( memcmp( dct1, dct2, w*w*2 ) || result_c != result_a ) \
+                    if( memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) || result_c != result_a ) \
                     { \
                         oks[0] = 0; \
                         fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
@@ -1317,14 +1324,14 @@ static int check_quant( int cpu_ref, int cpu_new )
         { \
             set_func_name( "%s_%s", #dqname, i_cqm?"cqm":"flat" ); \
             used_asms[1] = 1; \
-            for( int qp = 51; qp > 0; qp-- ) \
+            for( int qp = QP_MAX; qp > 0; qp-- ) \
             { \
                 INIT_QUANT##w(1) \
                 call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
-                memcpy( dct2, dct1, w*w*2 ); \
+                memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \
                 call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
                 call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
-                if( memcmp( dct1, dct2, w*w*2 ) ) \
+                if( memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) ) \
                 { \
                     oks[1] = 0; \
                     fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
@@ -1345,15 +1352,15 @@ static int check_quant( int cpu_ref, int cpu_new )
         { \
             set_func_name( "%s_%s", #dqname, i_cqm?"cqm":"flat" ); \
             used_asms[1] = 1; \
-            for( int qp = 51; qp > 0; qp-- ) \
+            for( int qp = QP_MAX; qp > 0; qp-- ) \
             { \
                 for( int i = 0; i < 16; i++ ) \
                     dct1[i] = rand(); \
                 call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \
-                memcpy( dct2, dct1, w*w*2 ); \
+                memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \
                 call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
                 call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
-                if( memcmp( dct1, dct2, w*w*2 ) ) \
+                if( memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) ) \
                 { \
                     oks[1] = 0; \
                     fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
@@ -1381,12 +1388,12 @@ static int check_quant( int cpu_ref, int cpu_new )
         for( int size = 16; size <= 64; size += 48 )
         {
             set_func_name( "denoise_dct" );
-            memcpy( dct1, buf1, size*2 );
-            memcpy( dct2, buf1, size*2 );
+            memcpy( dct1, buf1, size*sizeof(dctcoef) );
+            memcpy( dct2, buf1, size*sizeof(dctcoef) );
             memcpy( buf3+256, buf3, 256 );
             call_c1( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (uint16_t*)buf2, size );
             call_a1( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (uint16_t*)buf2, size );
-            if( memcmp( dct1, dct2, size*2 ) || memcmp( buf3+4, buf3+256+4, (size-1)*sizeof(uint32_t) ) )
+            if( memcmp( dct1, dct2, size*sizeof(dctcoef) ) || memcmp( buf3+4, buf3+256+4, (size-1)*sizeof(uint32_t) ) )
                 ok = 0;
             call_c2( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (uint16_t*)buf2, size );
             call_a2( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (uint16_t*)buf2, size );
@@ -1431,7 +1438,7 @@ static int check_quant( int cpu_ref, int cpu_new )
         { \
             int nnz = 0; \
             int max = rand() & (w*w-1); \
-            memset( dct1, 0, w*w*2 ); \
+            memset( dct1, 0, w*w*sizeof(dctcoef) ); \
             for( int idx = ac; idx < max; idx++ ) \
                 nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \
             if( !nnz ) \
@@ -1464,7 +1471,7 @@ static int check_quant( int cpu_ref, int cpu_new )
             x264_run_level_t runlevel_c, runlevel_a; \
             int nnz = 0; \
             int max = rand() & (w*w-1); \
-            memset( dct1, 0, w*w*2 ); \
+            memset( dct1, 0, w*w*sizeof(dctcoef) ); \
             memcpy( &runlevel_a, buf1+i, sizeof(x264_run_level_t) ); \
             memcpy( &runlevel_c, buf1+i, sizeof(x264_run_level_t) ); \
             for( int idx = ac; idx < max; idx++ ) \
@@ -1474,7 +1481,7 @@ static int check_quant( int cpu_ref, int cpu_new )
             int result_c = call_c( qf_c.lastname, dct1+ac, &runlevel_c ); \
             int result_a = call_a( qf_a.lastname, dct1+ac, &runlevel_a ); \
             if( result_c != result_a || runlevel_c.last != runlevel_a.last || \
-                memcmp(runlevel_c.level, runlevel_a.level, sizeof(int16_t)*result_c) || \
+                memcmp(runlevel_c.level, runlevel_a.level, sizeof(dctcoef)*result_c) || \
                 memcmp(runlevel_c.run, runlevel_a.run, sizeof(uint8_t)*(result_c-1)) ) \
             { \
                 ok = 0; \
@@ -1529,11 +1536,11 @@ static int check_intra( int cpu_ref, int cpu_new )
     {\
         set_func_name( "intra_%s_%s", #name, intra_##name##_names[dir] );\
         used_asm = 1;\
-        memcpy( buf3, buf1, 32*20 * sizeof(pixel) );\
-        memcpy( buf4, buf1, 32*20 * sizeof(pixel) );\
+        memcpy( pbuf3, pbuf1, 32*20 * sizeof(pixel) );\
+        memcpy( pbuf4, pbuf1, 32*20 * sizeof(pixel) );\
         call_c( ip_c.name[dir], pbuf3+48, ##__VA_ARGS__ );\
         call_a( ip_a.name[dir], pbuf4+48, ##__VA_ARGS__ );\
-        if( memcmp( buf3, buf4, 32*20 * sizeof(pixel) ) )\
+        if( memcmp( pbuf3, pbuf4, 32*20 * sizeof(pixel) ) )\
         {\
             fprintf( stderr, #name "[%d] :  [FAILED]\n", dir );\
             ok = 0;\
@@ -1544,7 +1551,7 @@ static int check_intra( int cpu_ref, int cpu_new )
             {\
                 printf( "%2x ", edge[14-j] );\
                 for( int k = 0; k < w; k++ )\
-                    printf( "%2x ", buf4[48+k+j*32] );\
+                    printf( "%2x ", pbuf4[48+k+j*32] );\
                 printf( "\n" );\
             }\
             printf( "\n" );\
@@ -1552,7 +1559,7 @@ static int check_intra( int cpu_ref, int cpu_new )
             {\
                 printf( "   " );\
                 for( int k = 0; k < w; k++ )\
-                    printf( "%2x ", buf3[48+k+j*32] );\
+                    printf( "%2x ", pbuf3[48+k+j*32] );\
                 printf( "\n" );\
             }\
         }\
@@ -1831,8 +1838,9 @@ int main(int argc, char *argv[])
     fprintf( stderr, "x264: using random seed %u\n", seed );
     srand( seed );

-    buf1 = x264_malloc( 0x3e00 + 16*BENCH_ALIGNS );
-    if( !buf1 )
+    buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) + 16*BENCH_ALIGNS );
+    pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) + 16*BENCH_ALIGNS );
+    if( !buf1 || !pbuf1 )
     {
         fprintf( stderr, "malloc failed, unable to initiate tests!\n" );
         return -1;
@@ -1840,15 +1848,17 @@ int main(int argc, char *argv[])
 #define INIT_POINTER_OFFSETS\
     buf2 = buf1 + 0xf00;\
     buf3 = buf2 + 0xf00;\
-    buf4 = buf3 + 0x1000;\
-    pbuf1 = (pixel*)buf1;\
-    pbuf2 = (pixel*)buf2;\
+    buf4 = buf3 + 0x1000*sizeof(pixel);\
+    pbuf2 = pbuf1 + 0xf00;\
     pbuf3 = (pixel*)buf3;\
     pbuf4 = (pixel*)buf4;
     INIT_POINTER_OFFSETS;
     for( int i = 0; i < 0x1e00; i++ )
+    {
         buf1[i] = rand() & 0xFF;
-    memset( buf1+0x1e00, 0, 0x2000 );
+        pbuf1[i] = rand() & PIXEL_MAX;
+    }
+    memset( buf1+0x1e00, 0, 0x2000*sizeof(pixel) );

     /* 16-byte alignment is guaranteed whenever it's useful, but some functions also vary in speed depending on %64 */
     if( do_bench )
@@ -1857,6 +1867,7 @@ int main(int argc, char *argv[])
             INIT_POINTER_OFFSETS;
             ret |= x264_stack_pagealign( check_all_flags, i*16 );
             buf1 += 16;
+            pbuf1 += 16;
             quiet = 1;
             fprintf( stderr, "%d/%d\r", i+1, BENCH_ALIGNS );
         }
diff --git a/x264.c b/x264.c
index 0bede93..c1141ab 100644
--- a/x264.c
+++ b/x264.c
@@ -262,6 +262,7 @@ static void Help( x264_param_t *defaults, int longhelp )
         " .mkv -> Matroska\n"
         " .flv -> Flash Video\n"
         " .mp4 -> MP4 if compiled with GPAC support (%s)\n"
+        "Output bit depth: %d (configured at compile time)\n"
         "\n"
         "Options:\n"
         "\n"
@@ -286,10 +287,11 @@ static void Help( x264_param_t *defaults, int longhelp )
         "no",
 #endif
 #if HAVE_GPAC
-        "yes"
+        "yes",
 #else
-        "no"
+        "no",
 #endif
+        BIT_DEPTH
       );
     H0( "Example usage:\n" );
     H0( "\n" );
@@ -311,8 +313,8 @@ static void Help( x264_param_t *defaults, int longhelp )
     H0( "\n" );
     H0( "Presets:\n" );
     H0( "\n" );
-    H0( "      --profile               Force the limits of an H.264 profile [high]\n"
-        "                                  Overrides all settings.\n" );
+    H0( "      --profile               Force the limits of an H.264 profile [%s]\n"
+        "                                  Overrides all settings.\n", BIT_DEPTH > 8 ? "high10" : "high" );
     H2( "                                  - baseline:\n"
         "                                    --no-8x8dct --bframes 0 --no-cabac\n"
         "                                    --cqm flat --weightp 0\n"
@@ -322,8 +324,11 @@ static void Help( x264_param_t *defaults, int longhelp )
         "                                    --no-8x8dct --cqm flat\n"
         "                                    No lossless.\n"
         "                                  - high:\n"
-        "                                    No lossless.\n" );
-        else H0( "                                  - baseline,main,high\n" );
+        "                                    No lossless.\n"
+        "                                  - high10:\n"
+        "                                    No lossless.\n"
+        "                                    Support for bit depth 8-10.\n" );
+        else H0( "                                  - baseline,main,high,high10\n" );
     H0( "      --preset                Use a preset to select encoding settings [medium]\n"
         "                                  Overridden by user settings.\n" );
     H2( "                                  - ultrafast:\n"
@@ -453,9 +458,9 @@ static void Help( x264_param_t *defaults, int longhelp )
     H0( "\n" );
     H0( "Ratecontrol:\n" );
     H0( "\n" );
-    H1( "  -q, --qp <integer>          Force constant QP (0-51, 0=lossless)\n" );
+    H1( "  -q, --qp <integer>          Force constant QP (0-%d, 0=lossless)\n", QP_MAX );
     H0( "  -B, --bitrate <integer>     Set bitrate (kbit/s)\n" );
-    H0( "      --crf <float>           Quality-based VBR (0-51, 0=lossless) [%.1f]\n", defaults->rc.f_rf_constant );
+    H0( "      --crf <float>           Quality-based VBR (0-%d, 0=lossless) [%.1f]\n", QP_MAX, defaults->rc.f_rf_constant );
     H1( "      --rc-lookahead <integer> Number of frames for frametype lookahead [%d]\n", defaults->rc.i_lookahead );
     H0( "      --vbv-maxrate <integer> Max local bitrate (kbit/s) [%d]\n", defaults->rc.i_vbv_max_bitrate );
     H0( "      --vbv-bufsize <integer> Set size of the VBV buffer (kbit) [%d]\n", defaults->rc.i_vbv_buffer_size );
@@ -1040,6 +1045,7 @@ static int Parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt )
 #else
                 printf( "using a non-gcc compiler\n" );
 #endif
+                printf( "configuration: --bit-depth=%d\n", BIT_DEPTH );
                 exit(0);
             case OPT_FRAMES:
                 param->i_frame_total = X264_MAX( atoi( optarg ), 0 );
@@ -1318,7 +1324,7 @@ static void parse_qpfile( cli_opt_t *opt, x264_picture_t *pic, int i_frame )
         else if( type == 'B' ) pic->i_type = X264_TYPE_BREF;
         else if( type == 'b' ) pic->i_type = X264_TYPE_B;
         else ret = 0;
-        if( ret != 3 || qp < -1 || qp > 51 )
+        if( ret != 3 || qp < -1 || qp > QP_MAX )
         {
             x264_cli_log( "x264", X264_LOG_ERROR, "can't parse qpfile for frame %d\n", i_frame );
             fclose( opt->qpfile );
diff --git a/x264.h b/x264.h
index 097365a..4d9b9ca 100644
--- a/x264.h
+++ b/x264.h
@@ -344,7 +344,7 @@ typedef struct x264_param_t
     {
         int         i_rc_method;    /* X264_RC_* */

-        int         i_qp_constant;  /* 0-51 */
+        int         i_qp_constant;  /* 0 to (51 + 6*(BIT_DEPTH-8)) */
         int         i_qp_min;       /* min allowed QP value */
         int         i_qp_max;       /* max allowed QP value */
         int         i_qp_step;      /* max QP step between frames */
@@ -550,7 +550,7 @@ void    x264_param_apply_fastfirstpass( x264_param_t * );
 /* x264_param_apply_profile:
  *      Applies the restrictions of the given profile.
  *      Currently available profiles are, from most to least restrictive: */
-static const char * const x264_profile_names[] = { "baseline", "main", "high", 0 };
+static const char * const x264_profile_names[] = { "baseline", "main", "high", "high10", 0 };

 /*      (can be NULL, in which case the function will do nothing)
  *
--
1.7.1