Untitled

diff --git a/common/common.h b/common/common.h
index 4307619..5c0ef1d 100644
--- a/common/common.h
+++ b/common/common.h
@@ -59,12 +59,13 @@ do {\
 #define X264_PCM_COST (384*BIT_DEPTH+16)
 #define X264_LOOKAHEAD_MAX 250
 #define QP_BD_OFFSET (6*(BIT_DEPTH-8))
-#define QP_MAX (51+QP_BD_OFFSET)
-#define QP_MAX_MAX (51+2*6)
-#define LAMBDA_MAX (91 << (BIT_DEPTH-8))
+#define QP_MAX_SPEC (51+QP_BD_OFFSET)
+#define QP_MAX (QP_MAX_SPEC+24)
+#define QP_MAX_MAX (51+2*6+24)
 #define PIXEL_MAX ((1 << BIT_DEPTH)-1)
 // arbitrary, but low because SATD scores are 1/4 normal
 #define X264_LOOKAHEAD_QP (12+QP_BD_OFFSET)
+#define SPEC_QP(x) X264_MIN((x), QP_MAX_SPEC)

 // number of pixels (per thread) in progress at any given time.
 // 16 for the macroblock in progress + 3 for deblocking + 3 for motion compensation filter + 2 for extra safety
@@ -458,12 +459,11 @@ struct x264_t
     udctcoef        (*quant8_mf[2])[64];     /* [2][52][64] */
     udctcoef        (*quant4_bias[4])[16];   /* [4][52][16] */
     udctcoef        (*quant8_bias[2])[64];   /* [2][52][64] */
+    udctcoef        (*nr_offset_emergency)[3][64];

-    /* mv/ref cost arrays.  Indexed by lambda instead of
-     * qp because, due to rounding, some quantizers share
-     * lambdas.  This saves memory. */
-    uint16_t *cost_mv[LAMBDA_MAX+1];
-    uint16_t *cost_mv_fpel[LAMBDA_MAX+1][4];
+    /* mv/ref cost arrays, indexed by qp (one table per qp, 0..QP_MAX). */
+    uint16_t *cost_mv[QP_MAX+1];
+    uint16_t *cost_mv_fpel[QP_MAX+1][4];

     const uint8_t   *chroma_qp_table; /* includes both the nonlinear luma->chroma mapping and chroma_qp_offset */

@@ -811,9 +811,14 @@ struct x264_t

     } stat;

-    ALIGNED_16( uint32_t nr_residual_sum[2][64] );
-    ALIGNED_16( udctcoef nr_offset[2][64] );
-    uint32_t        nr_count[2];
+    /* 0 = luma 4x4, 1 = luma 8x8, 2 = chroma 4x4 */
+    udctcoef (*nr_offset)[64];
+    uint32_t (*nr_residual_sum)[64];
+    uint32_t *nr_count;
+
+    ALIGNED_16( udctcoef nr_offset_denoise[3][64] );
+    ALIGNED_16( uint32_t nr_residual_sum_buf[2][3][64] );
+    uint32_t nr_count_buf[2][3];

     /* Buffers that are allocated per-thread even in sliced threads. */
     void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
diff --git a/common/macroblock.c b/common/macroblock.c
index 5c76d3f..8013957 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -1171,7 +1171,7 @@ void x264_macroblock_cache_save( x264_t *h )
         if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 )
             h->mb.i_qp = h->mb.i_last_qp;
         h->mb.qp[i_mb_xy] = h->mb.i_qp;
-        h->mb.i_last_dqp = h->mb.i_qp - h->mb.i_last_qp;
+        h->mb.i_last_dqp = SPEC_QP( h->mb.i_qp ) - SPEC_QP( h->mb.i_last_qp );
         h->mb.i_last_qp = h->mb.i_qp;
     }

diff --git a/common/quant.c b/common/quant.c
index 816e60a..36e59ee 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -143,7 +143,7 @@ static void dequant_4x4_dc( dctcoef dct[16], int dequant_mf[6][16], int i_qp )

 static void x264_denoise_dct( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size )
 {
-    for( int i = 1; i < size; i++ )
+    for( int i = 0; i < size; i++ )
     {
         int level = dct[i];
         int sign = level>>31;
diff --git a/common/set.c b/common/set.c
index 92b6b7a..c4d5549 100644
--- a/common/set.c
+++ b/common/set.c
@@ -23,6 +23,8 @@
  * For more information, contact us at licensing@x264.com.
  *****************************************************************************/

+#define _ISOC99_SOURCE
+#include <math.h>
 #include "common.h"

 #define SHIFT(x,s) ((s)<=0 ? (x)<<-(s) : ((x)+(1<<((s)-1)))>>(s))
@@ -146,21 +148,22 @@ int x264_cqm_init( x264_t *h )
                      quant8_mf[i_list][q][i] = DIV(def_quant8[q][i] * 16, h->pps->scaling_list[4+i_list][i]);
             }
     }
-    for( int q = 0; q < QP_MAX+1; q++ )
+    for( int qm = 0; qm <= QP_MAX; qm++ )
     {
         int j;
+        int q = SPEC_QP( qm );
         for( int i_list = 0; i_list < 4; i_list++ )
             for( int i = 0; i < 16; i++ )
             {
-                h->unquant4_mf[i_list][q][i] = (1ULL << (q/6 + 15 + 8)) / quant4_mf[i_list][q%6][i];
-                h->quant4_mf[i_list][q][i] = j = SHIFT(quant4_mf[i_list][q%6][i], q/6 - 1);
+                h->unquant4_mf[i_list][qm][i] = (1ULL << (q/6 + 15 + 8)) / quant4_mf[i_list][q%6][i];
+                h->quant4_mf[i_list][qm][i] = j = SHIFT(quant4_mf[i_list][q%6][i], q/6 - 1);
                 if( !j )
                 {
                     min_qp_err = X264_MIN( min_qp_err, q );
                     continue;
                 }
                 // round to nearest, unless that would cause the deadzone to be negative
-                h->quant4_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j );
+                h->quant4_bias[i_list][qm][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j );
                 if( j > 0xffff && q > max_qp_err && (i_list == CQM_4IY || i_list == CQM_4PY) )
                     max_qp_err = q;
                 if( j > 0xffff && q > max_chroma_qp_err && (i_list == CQM_4IC || i_list == CQM_4PC) )
@@ -170,20 +173,62 @@ int x264_cqm_init( x264_t *h )
             for( int i_list = 0; i_list < 2; i_list++ )
                 for( int i = 0; i < 64; i++ )
                 {
-                    h->unquant8_mf[i_list][q][i] = (1ULL << (q/6 + 16 + 8)) / quant8_mf[i_list][q%6][i];
+                    h->unquant8_mf[i_list][qm][i] = (1ULL << (q/6 + 16 + 8)) / quant8_mf[i_list][q%6][i];
                     j = SHIFT(quant8_mf[i_list][q%6][i], q/6);
-                    h->quant8_mf[i_list][q][i] = (uint16_t)j;
+                    h->quant8_mf[i_list][qm][i] = (uint16_t)j;

                     if( !j )
                     {
                         min_qp_err = X264_MIN( min_qp_err, q );
                         continue;
                     }
-                    h->quant8_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j );
+                    h->quant8_bias[i_list][qm][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j );
                     if( j > 0xffff && q > max_qp_err )
                         max_qp_err = q;
                 }
     }
+    x264_emms();
+    CHECKED_MALLOC( h->nr_offset_emergency, sizeof(*h->nr_offset_emergency)*(QP_MAX-QP_MAX_SPEC) );
+    for( int q = 0; q < QP_MAX - QP_MAX_SPEC; q++ )
+        for( int cat = 0; cat <= 2; cat++ )
+        {
+            int dct8x8 = cat == 1;
+            int size = dct8x8 ? 64 : 16;
+            udctcoef *nr_offset = h->nr_offset_emergency[q][cat];
+            int lowest_dc = (QP_MAX-QP_MAX_SPEC)/2;
+
+            for( int i = 0; i < size; i++ )
+            {
+                /* True "emergency mode": remove all DCT coefficients */
+                if( q == QP_MAX - QP_MAX_SPEC - 1 )
+                {
+                    nr_offset[i] = (1 << (7 + BIT_DEPTH)) - 1;
+                    continue;
+                }
+
+                double pos;
+                if( i == 0 )
+                {
+                    /* Only reduce DC at very high QPs */
+                    if( q < lowest_dc )
+                    {
+                        nr_offset[i] = 0;
+                        continue;
+                    }
+                    pos = (double)(q-lowest_dc+1) / (QP_MAX - QP_MAX_SPEC - lowest_dc);
+                }
+                else
+                    pos = (double)(q+1) / (QP_MAX - QP_MAX_SPEC);
+
+                /* Exponentially interpolate between "start" and "end" */
+                double start = dct8x8 ? h->unquant8_mf[CQM_8PY][QP_MAX_SPEC][i] / 4096.0
+                                      : h->unquant4_mf[CQM_4PY][QP_MAX_SPEC][i] / 4096.0;
+                double end = start * 32;
+                double scale = log(end / start);
+                double bias = exp(scale * pos) * start;
+                nr_offset[i] = bias + 0.5;
+            }
+        }

     if( !h->mb.b_lossless && max_qp_err >= h->param.rc.i_qp_min )
     {
@@ -233,6 +278,7 @@ void x264_cqm_delete( x264_t *h )
 {
     CQM_DELETE( 4, 4 );
     CQM_DELETE( 8, 2 );
+    x264_free( h->nr_offset_emergency );
 }

 static int x264_cqm_parse_jmlist( x264_t *h, const char *buf, const char *name,
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 5d7a15e..e09f6c9 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -699,8 +699,7 @@ DEQUANT_DC sse2
 ; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
 ;-----------------------------------------------------------------------------
 %macro DENOISE_DCT 1-2 0
-cglobal denoise_dct_%1, 4,5,%2
-    mov       r4d, [r0] ; backup DC coefficient
+cglobal denoise_dct_%1, 4,4,%2
     pxor      m6, m6
 .loop:
     sub       r3, mmsize/2
@@ -727,8 +726,7 @@ cglobal denoise_dct_%1, 4,5,%2
     mova      [r1+r3*4+0*mmsize], m4
     mova      [r1+r3*4+1*mmsize], m5
     jg .loop
-    mov       [r0], r4d ; restore DC coefficient
-    RET
+    REP_RET
 %endmacro

 %define PABSD PABSD_MMX
@@ -749,8 +747,7 @@ DENOISE_DCT ssse3, 8
 ; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
 ;-----------------------------------------------------------------------------
 %macro DENOISE_DCT 1-2 0
-cglobal denoise_dct_%1, 4,5,%2
-    movzx     r4d, word [r0]
+cglobal denoise_dct_%1, 4,4,%2
     pxor      m6, m6
 .loop:
     sub       r3, mmsize
@@ -781,8 +778,7 @@ cglobal denoise_dct_%1, 4,5,%2
     mova      [r1+r3*4+2*mmsize], m5
     mova      [r1+r3*4+3*mmsize], m3
     jg .loop
-    mov       [r0], r4w
-    RET
+    REP_RET
 %endmacro

 %define PABSW PABSW_MMX
diff --git a/encoder/analyse.c b/encoder/analyse.c
index a997425..d2ec63f 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -147,18 +147,26 @@ const uint16_t x264_lambda_tab[QP_MAX_MAX+1] = {
   25,  29,  32,  36,  40,  45,  51,  57, /* 40-47 */
   64,  72,  81,  91, 102, 114, 128, 144, /* 48-55 */
  161, 181, 203, 228, 256, 287, 323, 362, /* 56-63 */
+ 406, 456, 512, 575, 645, 724, 813, 912, /* 64-71 */
+1024,1149,1290,1448,1625,1825,2048,2299, /* 72-79 */
+2580,2896,3251,3649,4096,4598,5161,5793, /* 80-87 */
 };

 /* lambda2 = pow(lambda,2) * .9 * 256 */
+/* Capped at 2^27-1 (134217727) to avoid overflow */
 const int x264_lambda2_tab[QP_MAX_MAX+1] = {
-     14,     18,     22,      28,      36,      45,      57,      72, /*  0- 7 */
-     91,    115,    145,     182,     230,     290,     365,     460, /*  8-15 */
-    580,    731,    921,    1161,    1462,    1843,    2322,    2925, /* 16-23 */
-   3686,   4644,   5851,    7372,    9289,   11703,   14745,   18578, /* 24-31 */
-  23407,  29491,  37156,   46814,   58982,   74313,   93628,  117964, /* 32-39 */
- 148626, 187257, 235929,  297252,  374514,  471859,  594505,  749029, /* 40-47 */
- 943718,1189010,1498059, 1887436, 2378021, 2996119, 3774873, 4756042, /* 48-55 */
-5992238,7549747,9512085,11984476,15099494,19024170,23968953,30198988, /* 56-63 */
+       14,       18,       22,       28,       36,       45,      57,      72, /*  0- 7 */
+       91,      115,      145,      182,      230,      290,     365,     460, /*  8-15 */
+      580,      731,      921,     1161,     1462,     1843,    2322,    2925, /* 16-23 */
+     3686,     4644,     5851,     7372,     9289,    11703,   14745,   18578, /* 24-31 */
+    23407,    29491,    37156,    46814,    58982,    74313,   93628,  117964, /* 32-39 */
+   148626,   187257,   235929,   297252,   374514,   471859,  594505,  749029, /* 40-47 */
+   943718,  1189010,  1498059,  1887436,  2378021,  2996119, 3774873, 4756042, /* 48-55 */
+  5992238,  7549747,  9512085, 11984476, 15099494, 19024170,23968953,30198988, /* 56-63 */
+ 38048341, 47937906, 60397977, 76096683, 95875813,120795955,                   /* 64-69 */
+134217727,134217727,134217727,134217727,134217727,134217727,                   /* 70-75 */
+134217727,134217727,134217727,134217727,134217727,134217727,                   /* 76-81 */
+134217727,134217727,134217727,134217727,134217727,134217727,                   /* 82-87 */
 };

 const uint8_t x264_exp2_lut[64] = {
@@ -196,29 +204,40 @@ const float x264_log2_lz_lut[32] = {
 // I'm just matching the behaviour of deadzone quant.
 static const int x264_trellis_lambda2_tab[2][QP_MAX_MAX+1] = {
     // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
-    {      46,      58,      73,      92,     117,     147,
-          185,     233,     294,     370,     466,     587,
-          740,     932,    1174,    1480,    1864,    2349,
-         2959,    3728,    4697,    5918,    7457,    9395,
-        11837,   14914,   18790,   23674,   29828,   37581,
-        47349,   59656,   75163,   94699,  119313,  150326,
-       189399,  238627,  300652,  378798,  477255,  601304,
-       757596,  954511, 1202608, 1515192, 1909022, 2405217,
-      3030384, 3818045, 4810435, 6060769, 7636091, 9620872,
-     12121539,15272182,19241743,24243077,30544363,38483486,
-     48486154,61088726,76966972,96972308 },
+    {
+               46,       58,       73,       92,      117,      147,
+              185,      233,      294,      370,      466,      587,
+              740,      932,     1174,     1480,     1864,     2349,
+             2959,     3728,     4697,     5918,     7457,     9395,
+            11837,    14914,    18790,    23674,    29828,    37581,
+            47349,    59656,    75163,    94699,   119313,   150326,
+           189399,   238627,   300652,   378798,   477255,   601304,
+           757596,   954511,  1202608,  1515192,  1909022,  2405217,
+          3030384,  3818045,  4810435,  6060769,  7636091,  9620872,
+         12121539, 15272182, 19241743, 24243077, 30544363, 38483486,
+         48486154, 61088726, 76966972, 96972308,
+        122177453,134217727,134217727,134217727,134217727,134217727,
+        134217727,134217727,134217727,134217727,134217727,134217727,
+        134217727,134217727,134217727,134217727,134217727,134217727
+    },
     // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
-    {      27,      34,      43,      54,      68,      86,
-          108,     136,     172,     216,     273,     343,
-          433,     545,     687,     865,    1090,    1374,
-         1731,    2180,    2747,    3461,    4361,    5494,
-         6922,    8721,   10988,   13844,   17442,   21976,
-        27688,   34885,   43953,   55377,   69771,   87906,
-       110755,  139543,  175813,  221511,  279087,  351627,
-       443023,  558174,  703255,  886046, 1116348, 1406511,
-      1772093, 2232697, 2813022, 3544186, 4465396, 5626046,
-      7088374, 8930791,11252092,14176748,17861583,22504184,
-     28353495,35723165,45008368,56706990 }
+    {
+               27,       34,       43,       54,       68,       86,
+              108,      136,      172,      216,      273,      343,
+              433,      545,      687,      865,     1090,     1374,
+             1731,     2180,     2747,     3461,     4361,     5494,
+             6922,     8721,    10988,    13844,    17442,    21976,
+            27688,    34885,    43953,    55377,    69771,    87906,
+           110755,   139543,   175813,   221511,   279087,   351627,
+           443023,   558174,   703255,   886046,  1116348,  1406511,
+          1772093,  2232697,  2813022,  3544186,  4465396,  5626046,
+          7088374,  8930791, 11252092, 14176748, 17861583, 22504184,
+         28353495, 35723165, 45008368, 56706990,
+         71446330, 90016736,113413980,134217727,134217727,134217727,
+        134217727,134217727,134217727,134217727,134217727,134217727,
+        134217727,134217727,134217727,134217727,134217727,134217727,
+        134217727,134217727,134217727,134217727,134217727,134217727
+    }
 };

 static const uint16_t x264_chroma_lambda2_offset_tab[] = {
@@ -247,35 +266,35 @@ static const uint8_t i_sub_mb_p_cost_table[4] = {

 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );

-static uint16_t x264_cost_ref[LAMBDA_MAX+1][3][33];
+static uint16_t x264_cost_ref[QP_MAX+1][3][33];
 static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;

 int x264_analyse_init_costs( x264_t *h, int qp )
 {
     int lambda = x264_lambda_tab[qp];
-    if( h->cost_mv[lambda] )
+    if( h->cost_mv[qp] )
         return 0;
     /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
-    CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
-    h->cost_mv[lambda] += 2*4*2048;
+    CHECKED_MALLOC( h->cost_mv[qp], (4*4*2048 + 1) * sizeof(uint16_t) );
+    h->cost_mv[qp] += 2*4*2048;
     for( int i = 0; i <= 2*4*2048; i++ )
     {
-        h->cost_mv[lambda][-i] =
-        h->cost_mv[lambda][i]  = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
+        h->cost_mv[qp][-i] =
+        h->cost_mv[qp][i]  = X264_MIN( lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f, (1<<16)-1 );
     }
     x264_pthread_mutex_lock( &cost_ref_mutex );
     for( int i = 0; i < 3; i++ )
         for( int j = 0; j < 33; j++ )
-            x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
+            x264_cost_ref[qp][i][j] = X264_MIN( i ? lambda * bs_size_te( i, j ) : 0, (1<<16)-1 );
     x264_pthread_mutex_unlock( &cost_ref_mutex );
-    if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
+    if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[qp][0] )
     {
         for( int j = 0; j < 4; j++ )
         {
-            CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
-            h->cost_mv_fpel[lambda][j] += 2*2048;
+            CHECKED_MALLOC( h->cost_mv_fpel[qp][j], (4*2048 + 1) * sizeof(uint16_t) );
+            h->cost_mv_fpel[qp][j] += 2*2048;
             for( int i = -2*2048; i < 2*2048; i++ )
-                h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
+                h->cost_mv_fpel[qp][j][i] = h->cost_mv[qp][i*4+j];
         }
     }
     return 0;
@@ -285,7 +304,7 @@ fail:

 void x264_analyse_free_costs( x264_t *h )
 {
-    for( int i = 0; i < LAMBDA_MAX+1; i++ )
+    for( int i = 0; i < QP_MAX+1; i++ )
     {
         if( h->cost_mv[i] )
             x264_free( h->cost_mv[i] - 2*4*2048 );
@@ -326,16 +345,16 @@ void x264_analyse_weight_frame( x264_t *h, int end )
 /* initialize an array of lambda*nbits for all possible mvs */
 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
 {
-    a->p_cost_mv = h->cost_mv[a->i_lambda];
-    a->p_cost_ref[0] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
-    a->p_cost_ref[1] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
+    a->p_cost_mv = h->cost_mv[a->i_qp];
+    a->p_cost_ref[0] = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
+    a->p_cost_ref[1] = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
 }

 static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int i_qp )
 {
     /* conduct the analysis using this lamda and QP */
     a->i_qp = h->mb.i_qp = i_qp;
-    h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];
+    h->mb.i_chroma_qp = h->chroma_qp_table[SPEC_QP( i_qp )];

     a->i_lambda = x264_lambda_tab[i_qp];
     a->i_lambda2 = x264_lambda2_tab[i_qp];
@@ -343,10 +362,11 @@ static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int i_qp
     h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
     if( h->param.analyse.i_trellis )
     {
+        int effective_chroma_qp = h->mb.i_chroma_qp + X264_MAX( i_qp - QP_MAX_SPEC, 0 );
         h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
         h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
-        h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
-        h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
+        h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][effective_chroma_qp];
+        h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][effective_chroma_qp];
     }
     h->mb.i_psy_rd_lambda = a->i_lambda;
     /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
@@ -366,7 +386,6 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
     x264_mb_analyse_init_qp( h, a, i_qp );

     h->mb.b_transform_8x8 = 0;
-    h->mb.b_noise_reduction = 0;

     /* I: Intra part */
     a->i_satd_i16x16 =
@@ -384,6 +403,21 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
         a->i_mbrd ? 2 :
         !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;

+    if( h->mb.i_qp > QP_MAX_SPEC )
+    {
+        h->nr_offset = h->nr_offset_emergency[h->mb.i_qp-QP_MAX_SPEC-1];
+        h->nr_residual_sum = h->nr_residual_sum_buf[1];
+        h->nr_count = h->nr_count_buf[1];
+        h->mb.b_noise_reduction = 1;
+    }
+    else
+    {
+        h->nr_offset = h->nr_offset_denoise;
+        h->nr_residual_sum = h->nr_residual_sum_buf[0];
+        h->nr_count = h->nr_count_buf[0];
+        h->mb.b_noise_reduction = 0;
+    }
+
     /* II: Inter part P/B frame */
     if( h->sh.i_type != SLICE_TYPE_I )
     {
@@ -2731,6 +2765,7 @@ void x264_macroblock_analyse( x264_t *h )
     int i_cost = COST_MAX;

     h->mb.i_qp = x264_ratecontrol_mb_qp( h );
+
     /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
      * to lower the bit cost of the qp_delta.  Don't do this if QPRD is enabled. */
     if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
@@ -3476,7 +3511,8 @@ intra_analysis:
         x264_mb_analyse_qp_rd( h, &analysis );

     h->mb.b_trellis = h->param.analyse.i_trellis;
-    h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
+    h->mb.b_noise_reduction = h->mb.b_noise_reduction || (!!h->param.analyse.i_noise_reduction && !IS_INTRA( h->mb.i_type ));
+
     if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
         x264_psy_trellis_init( h, 0 );
     if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
diff --git a/encoder/cabac.c b/encoder/cabac.c
index 5539b4b..a2d13f0 100644
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -246,7 +246,7 @@ static void x264_cabac_mb_cbp_chroma( x264_t *h, x264_cabac_t *cb )

 static void x264_cabac_mb_qp_delta( x264_t *h, x264_cabac_t *cb )
 {
-    int i_dqp = h->mb.i_qp - h->mb.i_last_qp;
+    int i_dqp = SPEC_QP( h->mb.i_qp) - SPEC_QP( h->mb.i_last_qp );
     int ctx;

     /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */
diff --git a/encoder/cavlc.c b/encoder/cavlc.c
index ca35210..7224a9d 100644
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -208,7 +208,7 @@ static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
 static void cavlc_qp_delta( x264_t *h )
 {
     bs_t *s = &h->out.bs;
-    int i_dqp = h->mb.i_qp - h->mb.i_last_qp;
+    int i_dqp = SPEC_QP( h->mb.i_qp ) - SPEC_QP( h->mb.i_last_qp );

     /* Avoid writing a delta quant if we have an empty i16x16 block, e.g. in a completely flat background area */
     if( h->mb.i_type == I_16x16 && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma)
diff --git a/encoder/encoder.c b/encoder/encoder.c
index a6b53fd..89f1f39 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -163,7 +163,7 @@ static void x264_slice_header_init( x264_t *h, x264_slice_header_t *sh,
     sh->i_cabac_init_idc = param->i_cabac_init_idc;

     sh->i_qp = i_qp;
-    sh->i_qp_delta = i_qp - pps->i_pic_init_qp;
+    sh->i_qp_delta = SPEC_QP( sh->i_qp ) - pps->i_pic_init_qp;
     sh->b_sp_for_swidth = 0;
     sh->i_qs_delta = 0;

@@ -1050,7 +1050,8 @@ x264_t *x264_encoder_open( x264_param_t *param )
         p += sprintf( p, " none!" );
     x264_log( h, X264_LOG_INFO, "%s\n", buf );

-    for( qp = h->param.rc.i_qp_min; qp <= h->param.rc.i_qp_max; qp++ )
+    int qp_max = h->param.rc.i_qp_max == QP_MAX_SPEC ? QP_MAX : h->param.rc.i_qp_max;
+    for( qp = h->param.rc.i_qp_min; qp <= qp_max; qp++ )
         if( x264_analyse_init_costs( h, qp ) )
             goto fail;
     if( x264_analyse_init_costs( h, X264_LOOKAHEAD_QP ) )
@@ -1058,7 +1059,7 @@ x264_t *x264_encoder_open( x264_param_t *param )

     static const uint16_t cost_mv_correct[7] = { 24, 47, 95, 189, 379, 757, 1515 };
     /* Checks for known miscompilation issues. */
-    if( h->cost_mv[x264_lambda_tab[X264_LOOKAHEAD_QP]][2013] != cost_mv_correct[BIT_DEPTH-8] )
+    if( h->cost_mv[X264_LOOKAHEAD_QP][2013] != cost_mv_correct[BIT_DEPTH-8] )
     {
         x264_log( h, X264_LOG_ERROR, "MV cost test failed: x264 has been miscompiled!\n" );
         goto fail;
@@ -1879,7 +1880,7 @@ static int x264_slice_write( x264_t *h )
     if( h->sh.i_first_mb != h->i_threadslice_start * h->mb.i_mb_width )
     {
         h->sh.i_qp = h->mb.i_last_qp;
-        h->sh.i_qp_delta = h->sh.i_qp - h->pps->i_pic_init_qp;
+        h->sh.i_qp_delta = SPEC_QP( h->sh.i_qp ) - h->pps->i_pic_init_qp;
     }

     x264_slice_header_write( &h->out.bs, &h->sh, h->i_nal_ref_idc );
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 39fed61..11a63d9 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -84,6 +84,8 @@ static inline void dct2x2dc( dctcoef d[4], dctcoef dct4x4[4][16] )
 static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, dctcoef dct[16], int i_qp, int ctx_block_cat, int b_intra, int idx )
 {
     int i_quant_cat = b_intra ? CQM_4IY : CQM_4PY;
+    if( h->mb.b_noise_reduction && ctx_block_cat != DCT_LUMA_AC )
+        h->quantf.denoise_dct( dct, h->nr_residual_sum[0], h->nr_offset[0], 16 );
     if( h->mb.b_trellis )
         return x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, ctx_block_cat, b_intra, 0, idx );
     else
@@ -93,6 +95,8 @@ static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, dctcoef dct[16], int i_qp, i
 static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, dctcoef dct[64], int i_qp, int b_intra, int idx )
 {
     int i_quant_cat = b_intra ? CQM_8IY : CQM_8PY;
+    if( h->mb.b_noise_reduction )
+        h->quantf.denoise_dct( dct, h->nr_residual_sum[1], h->nr_offset[1], 64 );
     if( h->mb.b_trellis )
         return x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, b_intra, idx );
     else
@@ -115,6 +119,7 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
     int nz;
     pixel *p_src = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[idx]];
     pixel *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[idx]];
+    int i_qp_spec = SPEC_QP( i_qp );
     ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] );

     if( h->mb.b_lossless )
@@ -133,7 +138,7 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
     {
         h->mb.i_cbp_luma |= 1<<(idx>>2);
         h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4 );
-        h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qp );
+        h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qp_spec );
         h->dctf.add4x4_idct( p_dst, dct4x4 );
     }
 }
@@ -161,6 +166,7 @@ void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
     int nz;
     pixel *p_src = &h->mb.pic.p_fenc[0][8*x + 8*y*FENC_STRIDE];
     pixel *p_dst = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
+    int i_qp_spec = SPEC_QP( i_qp );
     ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] );

     if( h->mb.b_lossless )
@@ -178,7 +184,7 @@ void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
     {
         h->mb.i_cbp_luma |= 1<<idx;
         h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8 );
-        h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qp );
+        h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qp_spec );
         h->dctf.add8x8_idct8( p_dst, dct8x8 );
         STORE_8x8_NNZ( s8, 1 );
     }
@@ -196,6 +202,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )

     int nz;
     int decimate_score = h->mb.b_dct_decimate ? 0 : 9;
+    int i_qp_spec = SPEC_QP( i_qp );

     if( h->mb.b_lossless )
     {
@@ -218,6 +225,8 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
     for( int i = 0; i < 16; i++ )
     {
         /* copy dc coeff */
+        if( h->mb.b_noise_reduction )
+            h->quantf.denoise_dct( dct4x4[i], h->nr_residual_sum[0], h->nr_offset[0], 16 );
         dct_dc4x4[block_idx_xy_1d[i]] = dct4x4[i][0];
         dct4x4[i][0] = 0;

@@ -227,7 +236,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
         if( nz )
         {
             h->zigzagf.scan_4x4( h->dct.luma4x4[i], dct4x4[i] );
-            h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IY], i_qp );
+            h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IY], i_qp_spec );
             if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[i] );
             h->mb.i_cbp_luma = 0xf;
         }
@@ -254,7 +263,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )

         /* output samples to fdec */
         h->dctf.idct4x4dc( dct_dc4x4 );
-        h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qp );  /* XXX not inversed */
+        h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qp_spec );  /* XXX not inversed */
         if( h->mb.i_cbp_luma )
             for( int i = 0; i < 16; i++ )
                 dct4x4[i][0] = dct_dc4x4[block_idx_xy_1d[i]];
@@ -326,13 +335,15 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
 {
     int nz, nz_dc;
     int b_decimate = b_inter && h->mb.b_dct_decimate;
+    int i_qp_spec = SPEC_QP( i_qp );
     ALIGNED_ARRAY_16( dctcoef, dct2x2,[4] );
     h->mb.i_cbp_chroma = 0;
+    h->nr_count[2] += h->mb.b_noise_reduction * 4;

     /* Early termination: check variance of chroma residual before encoding.
      * Don't bother trying early termination at low QPs.
      * Values are experimentally derived. */
-    if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) )
+    if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) && !h->mb.b_noise_reduction )
     {
         int thresh = (x264_lambda2_tab[i_qp] + 32) >> 6;
         int ssd[2];
@@ -363,11 +374,11 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )

                     if( nz_dc )
                     {
-                        if( !x264_mb_optimize_chroma_dc( h, b_inter, i_qp, dct2x2 ) )
+                        if( !x264_mb_optimize_chroma_dc( h, b_inter, i_qp_spec, dct2x2 ) )
                             continue;
                         h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 1;
                         zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
-                        idct_dequant_2x2_dconly( dct2x2, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+                        idct_dequant_2x2_dconly( dct2x2, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp_spec );
                         h->dctf.add8x8_idct_dc( h->mb.pic.p_fdec[1+ch], dct2x2 );
                         h->mb.i_cbp_chroma = 1;
                     }
@@ -401,6 +412,9 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
         }

         h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
+        if( h->mb.b_noise_reduction )
+            for( int i = 0; i < 4; i++ )
+                h->quantf.denoise_dct( dct4x4[i], h->nr_residual_sum[2], h->nr_offset[2], 16 );
         dct2x2dc( dct2x2, dct4x4 );
         /* calculate dct coeffs */
         for( int i = 0; i < 4; i++ )
@@ -414,7 +428,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
             {
                 nz_ac = 1;
                 h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+ch*4], dct4x4[i] );
-                h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+                h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qp_spec );
                 if( b_decimate )
                     i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+ch*4] );
             }
@@ -443,7 +457,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
             }
             /* DC-only */
             zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
-            idct_dequant_2x2_dconly( dct2x2, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+            idct_dequant_2x2_dconly( dct2x2, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp_spec );
             h->dctf.add8x8_idct_dc( p_dst, dct2x2 );
         }
         else
@@ -452,7 +466,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
             if( nz_dc )
             {
                 zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
-                idct_dequant_2x2_dc( dct2x2, dct4x4, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+                idct_dequant_2x2_dc( dct2x2, dct4x4, h->dequant4_mf[CQM_4IC + b_inter], i_qp_spec );
             }
             h->dctf.add8x8_idct( p_dst, dct4x4 );
         }
@@ -587,6 +601,7 @@ void x264_macroblock_encode( x264_t *h )
     int b_decimate = h->mb.b_dct_decimate;
     int b_force_no_skip = 0;
     int nz;
+    int i_qp_spec = SPEC_QP( i_qp );
     h->mb.i_cbp_luma = 0;
     h->mb.cache.non_zero_count[x264_scan8[24]] = 0;

@@ -748,8 +763,6 @@ void x264_macroblock_encode( x264_t *h )

             for( int idx = 0; idx < 4; idx++ )
             {
-                if( h->mb.b_noise_reduction )
-                    h->quantf.denoise_dct( dct8x8[idx], h->nr_residual_sum[1], h->nr_offset[1], 64 );
                 nz = x264_quant_8x8( h, dct8x8[idx], i_qp, 0, idx );

                 if( nz )
@@ -782,7 +795,7 @@ void x264_macroblock_encode( x264_t *h )

                     if( h->mb.i_cbp_luma&(1<<idx) )
                     {
-                        h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp );
+                        h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp_spec );
                         h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE], dct8x8[idx] );
                         STORE_8x8_NNZ( s8, 1 );
                     }
@@ -807,15 +820,13 @@ void x264_macroblock_encode( x264_t *h )
                 {
                     int idx = i8x8 * 4 + i4x4;

-                    if( h->mb.b_noise_reduction )
-                        h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );
                     nz = x264_quant_4x4( h, dct4x4[idx], i_qp, DCT_LUMA_4x4, 0, idx );
                     h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;

                     if( nz )
                     {
                         h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] );
-                        h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[CQM_4PY], i_qp );
+                        h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[CQM_4PY], i_qp_spec );
                         if( b_decimate && i_decimate_8x8 < 6 )
                             i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[idx] );
                         cbp = 1;
@@ -1019,12 +1030,16 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir )

 void x264_noise_reduction_update( x264_t *h )
 {
-    for( int cat = 0; cat < 2; cat++ )
+    h->nr_offset = h->nr_offset_denoise;
+    h->nr_residual_sum = h->nr_residual_sum_buf[0];
+    h->nr_count = h->nr_count_buf[0];
+    for( int cat = 0; cat < 3; cat++ )
     {
-        int size = cat ? 64 : 16;
-        const uint16_t *weight = cat ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
+        int dct8x8 = cat == 1;
+        int size = dct8x8 ? 64 : 16;
+        const uint16_t *weight = dct8x8 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;

-        if( h->nr_count[cat] > (cat ? (1<<16) : (1<<18)) )
+        if( h->nr_count[cat] > (dct8x8 ? (1<<16) : (1<<18)) )
         {
             for( int i = 0; i < size; i++ )
                 h->nr_residual_sum[cat][i] >>= 1;
@@ -1036,6 +1051,9 @@ void x264_noise_reduction_update( x264_t *h )
                 ((uint64_t)h->param.analyse.i_noise_reduction * h->nr_count[cat]
                  + h->nr_residual_sum[cat][i]/2)
               / ((uint64_t)h->nr_residual_sum[cat][i] * weight[i]/256 + 1);
+
+        /* Don't denoise DC coefficients */
+        h->nr_offset[cat][0] = 0;
     }
 }

@@ -1054,6 +1072,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
     int b_decimate = h->mb.b_dct_decimate;
     int nnz8x8 = 0;
     int nz;
+    int i_qp_spec = SPEC_QP( i_qp );

     if( !h->mb.b_skip_mc )
         x264_mb_mc_8x8( h, i8 );
@@ -1101,7 +1120,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )

                 if( nnz8x8 )
                 {
-                    h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp );
+                    h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp_spec );
                     h->dctf.add8x8_idct8( p_fdec, dct8x8 );
                     STORE_8x8_NNZ( s8, 1 );
                 }
@@ -1123,7 +1142,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
                 if( nz )
                 {
                     h->zigzagf.scan_4x4( h->dct.luma4x4[i8*4+i4], dct4x4[i4] );
-                    h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[CQM_4PY], i_qp );
+                    h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[CQM_4PY], i_qp_spec );
                     if( b_decimate )
                         i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[i8*4+i4] );
                     nnz8x8 = 1;
@@ -1146,8 +1165,9 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
             ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] );
             p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
             p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;
-
             h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
+            if( h->mb.b_noise_reduction )
+                h->quantf.denoise_dct( dct4x4, h->nr_residual_sum[2], h->nr_offset[2], 16 );
             dct4x4[0] = 0;
 
             if( h->mb.b_trellis )
@@ -1159,7 +1179,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
             if( nz )
             {
                 h->zigzagf.scan_4x4( h->dct.luma4x4[16+i8+ch*4], dct4x4 );
-                h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp );
+                h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp_spec );
                 h->dctf.add4x4_idct( p_fdec, dct4x4 );
             }
         }
diff --git a/encoder/me.c b/encoder/me.c
index 90f7dfd..9ca04d0 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -597,7 +597,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
             int delta = x264_pixel_size[sad_size].w;
             int16_t *xs = h->scratch_buffer;
             int xn;
-            uint16_t *cost_fpel_mvx = h->cost_mv_fpel[x264_lambda_tab[h->mb.i_qp]][-m->mvp[0]&3] + (-m->mvp[0]>>2);
+            uint16_t *cost_fpel_mvx = h->cost_mv_fpel[h->mb.i_qp][-m->mvp[0]&3] + (-m->mvp[0]>>2);

             h->pixf.sad_x4[sad_size]( zero, p_fenc, p_fenc+delta,
                 p_fenc+delta*FENC_STRIDE, p_fenc+delta+delta*FENC_STRIDE,
diff --git a/encoder/me.h b/encoder/me.h
index fd99a5d..58cd3e0 100644
--- a/encoder/me.h
+++ b/encoder/me.h
@@ -71,7 +71,7 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei
 void x264_me_refine_bidir_satd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight );
 uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel );

-extern uint16_t *x264_cost_mv_fpel[LAMBDA_MAX+1][4];
+extern uint16_t *x264_cost_mv_fpel[QP_MAX+1][4];

 #define COPY1_IF_LT(x,y)\
 if((y)<(x))\
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index 8f7ef9a..e727ab0 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -1441,8 +1441,14 @@ int x264_ratecontrol_mb_qp( x264_t *h )
     x264_emms();
     float qp = h->rc->qpm;
     if( h->param.rc.i_aq_mode )
-        /* MB-tree currently doesn't adjust quantizers in unreferenced frames. */
-        qp += h->fdec->b_kept_as_ref ? h->fenc->f_qp_offset[h->mb.i_mb_xy] : h->fenc->f_qp_offset_aq[h->mb.i_mb_xy];
+    {
+         /* MB-tree currently doesn't adjust quantizers in unreferenced frames. */
+        float qp_offset = h->fdec->b_kept_as_ref ? h->fenc->f_qp_offset[h->mb.i_mb_xy] : h->fenc->f_qp_offset_aq[h->mb.i_mb_xy];
+        /* Scale AQ's effect linearly towards zero in emergency mode: the factor approaches 1 at QP_MAX_SPEC and is 0 at QP_MAX. */
+        if( qp > QP_MAX_SPEC )
+            qp_offset *= (QP_MAX - qp) / (QP_MAX - QP_MAX_SPEC);
+        qp += qp_offset;
+    }
     return x264_clip3( qp + .5, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
 }