URL
https://opencores.org/ocsvn/oc-h264-encoder/oc-h264-encoder/trunk
Subversion Repositories oc-h264-encoder
Compare Revisions
- This comparison shows the changes necessary to convert path
/
- from Rev 50 to Rev 51
- ↔ Reverse comparison
Rev 50 → Rev 51
/oc-h264-encoder/trunk/x264/patches/x264-e381f6d-or32-or1ksim-with-fp-1.2.patch
0,0 → 1,6158
diff --exclude=.git --exclude=/gitignore -Naur x264/common/common.c x264-or/common/common.c |
--- x264/common/common.c 2009-10-25 17:41:22.000000000 +0100 |
+++ x264-or/common/common.c 2009-10-28 15:05:29.000000000 +0100 |
@@ -764,8 +764,10 @@ |
align_buf = malloc( i_size ); |
#elif defined( HAVE_MALLOC_H ) |
align_buf = memalign( 16, i_size ); |
+ //fprintf(stderr, "memalign, result addr: 0x%.8x\n", (unsigned int) align_buf); |
#else |
uint8_t *buf = malloc( i_size + 15 + sizeof(void **) + sizeof(int) ); |
+ //fprintf(stderr, "malloc, result addr: 0x%.8x\n", (unsigned int) buf); |
if( buf ) |
{ |
align_buf = buf + 15 + sizeof(void **) + sizeof(int); |
diff --exclude=.git --exclude=/gitignore -Naur x264/common/common.h x264-or/common/common.h |
--- x264/common/common.h 2009-10-25 17:41:22.000000000 +0100 |
+++ x264-or/common/common.h 2009-11-10 22:42:24.000000000 +0100 |
@@ -51,7 +51,8 @@ |
#define X264_BFRAME_MAX 16 |
#define X264_THREAD_MAX 128 |
#define X264_PCM_COST (386*8) |
-#define X264_LOOKAHEAD_MAX 250 |
+//#define X264_LOOKAHEAD_MAX 250 |
+#define X264_LOOKAHEAD_MAX 140 |
// arbitrary, but low because SATD scores are 1/4 normal |
#define X264_LOOKAHEAD_QP 12 |
|
@@ -260,7 +261,7 @@ |
*/ |
#define X264_SCAN8_SIZE (6*8) |
#define X264_SCAN8_0 (4+1*8) |
- |
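+/* this array indicates the position of each block in the h->mb.cache arrays (e.g. cache.ref and cache.mv, which are indexed relative to X264_SCAN8_0) */ |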
static const int x264_scan8[16+2*4+3] = |
{ |
/* Luma */ |
@@ -518,7 +519,9 @@ |
/* space for p_fenc and p_fdec */ |
#define FENC_STRIDE 16 |
#define FDEC_STRIDE 32 |
- ALIGNED_16( uint8_t fenc_buf[24*FENC_STRIDE] ); |
+ /* comment regarding FDEC_STRIDE: the catch with fdec is that the chroma blocks must be aligned to 8 and the luma blocks must be aligned to 16 but we need to have pixels on the left side for prediction so we have a lot of padding on the left side. so for luma: X X X X X X X X X X X X X X X L _ _ _ _ _ ... where X are junk, L is the left pixel, _ are normal pixels, so 32 wide total */ |
+ |
+ ALIGNED_16( uint8_t fenc_buf[24*FENC_STRIDE] ); /* FENC stores both Y, U, and V, 16x16 + 8x8 + 8x8 = 24 * 16*/ |
ALIGNED_16( uint8_t fdec_buf[27*FDEC_STRIDE] ); |
|
/* i4x4 and i8x8 backup data, for skipping the encode stage when possible */ |
@@ -551,9 +554,29 @@ |
|
/* pointer over mb of the references */ |
int i_fref[2]; |
- uint8_t *p_fref[2][32][4+2]; /* last: lN, lH, lV, lHV, cU, cV */ |
- uint16_t *p_integral[2][16]; |
- |
+ uint8_t *p_fref[2][32][4+2]; /* last: 4hpel pointers, 2 chroma pointers: lN (normal), lH (horizontally interpolated), lV (vertically interpolated), lHV (center position), cU (chroma U), cV (chroma V) |
+ Important!: This is our post-motion compensation frame data, stored in half-pixel (hpel) resolution. |
+ During motion compensation, hpel data is calculated via 6-tap FIR filter (very similar to Lanczos). It is an even filter (even function) so every input pixel corresponds to 4 output pixels |
+ one of those 4 is equal to the input, so the data contains one fullpel pixel, one H, one V, one C (aka HV) |
+ From this data, qpel is calculated via interpolation, which is easy and quick to do. |
+ F 0 H 1 F |
+ 2 3 4 5 6 |
+ V 7 C 8 V |
+ 9 A B C D |
+ F _ H _ F |
+ Here 0 is the average of F and H |
+ 2 is the average of F and V |
+ 3 is the average of H and V |
+ 4 is the average of H and C |
+ etc. |
+ So each qpel position is the result of averaging two known hpel positions.*/ |
+ uint16_t *p_integral[2][16]; /* pointers to "integral": for each position X,Y, it's the sum of values in an 8x8 block with that point X,Y as the upper left of that block |
+ Discussion on this value: |
+ < pengvado> Dark_Shikari: "sum of values in an 8x8 block" is what I would call "dc". the variable name comes because it used to be an indefinite integral, i.e. the sum of everything above and to the left of that point. |
+ < pengvado> but if you're only interested in 16x16, 16x8, 8x16, and 8x8, then caching 8x8 dcs is strictly faster |
+ */ |
+ |
+ |
/* fref stride */ |
int i_stride[3]; |
} pic; |
diff --exclude=.git --exclude=/gitignore -Naur x264/common/frame.c x264-or/common/frame.c |
--- x264/common/frame.c 2009-10-25 17:41:10.000000000 +0100 |
+++ x264-or/common/frame.c 2009-10-28 15:05:29.000000000 +0100 |
@@ -31,6 +31,8 @@ |
x264_frame_t *frame; |
int i, j; |
|
+ //V(fprintf(stderr, "x264_frame_new\n")); |
+ |
int i_mb_count = h->mb.i_mb_count; |
int i_stride, i_width, i_lines; |
int i_padv = PADV << h->param.b_interlaced; |
@@ -981,11 +983,14 @@ |
|
x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec ) |
{ |
+ //V(fprintf(stderr, "x264_frame_pop_unused\n")); |
x264_frame_t *frame; |
if( h->frames.unused[b_fdec][0] ) |
frame = x264_frame_pop( h->frames.unused[b_fdec] ); |
else |
frame = x264_frame_new( h, b_fdec ); |
+ |
+ //V(fprintf(stderr, "x264_frame_pop_unused: frame ptr = 0x%.8x\n", (unsigned long) frame)); |
if( !frame ) |
return NULL; |
frame->b_last_minigop_bframe = 0; |
diff --exclude=.git --exclude=/gitignore -Naur x264/common/macroblock.c x264-or/common/macroblock.c |
--- x264/common/macroblock.c 2009-10-25 17:41:22.000000000 +0100 |
+++ x264-or/common/macroblock.c 2009-11-09 13:02:31.000000000 +0100 |
@@ -118,7 +118,7 @@ |
|
int i_count = 0; |
|
- if( i_refc == -2 ) |
+ if( i_refc == -2 )/* -2 = unavailable */ |
{ |
i_refc = h->mb.cache.ref[i_list][X264_SCAN8_0 - 8 - 1]; |
mv_c = h->mb.cache.mv[i_list][X264_SCAN8_0 - 8 - 1]; |
diff --exclude=.git --exclude=/gitignore -Naur x264/common/mdate.c x264-or/common/mdate.c |
--- x264/common/mdate.c 2009-10-25 17:41:10.000000000 +0100 |
+++ x264-or/common/mdate.c 2009-10-28 15:05:29.000000000 +0100 |
@@ -32,9 +32,12 @@ |
int64_t x264_mdate( void ) |
{ |
#ifndef __MINGW32__ |
+ /* --jb |
struct timeval tv_date; |
gettimeofday( &tv_date, NULL ); |
return( (int64_t) tv_date.tv_sec * 1000000 + (int64_t) tv_date.tv_usec ); |
+ */ |
+ return 0; |
#else |
struct _timeb tb; |
_ftime(&tb); |
diff --exclude=.git --exclude=/gitignore -Naur x264/common/or32/or32.h x264-or/common/or32/or32.h |
--- x264/common/or32/or32.h 1970-01-01 01:00:00.000000000 +0100 |
+++ x264-or/common/or32/or32.h 2009-11-17 10:20:27.000000000 +0100 |
@@ -0,0 +1,22 @@ |
+/* Struct for SAD/SSD hardware module: layout of its memory-mapped registers, in order, starting at OR32_SADSSDMOD_REG_BASE */ |
+typedef struct { |
+ uint32_t control; /* mode select and busy flag, see OR32_SADSSDMOD_CONTROL_REG_* below */ |
+ uint32_t result; /* read by PIXEL_SAD_C / PIXEL_SSD_C once the busy flag has cleared */ |
+ uint32_t pix_ptr1; /* address of the first pixel block */ |
+ uint32_t pix_ptr2; /* address of the second pixel block */ |
+ uint32_t stride1; /* row stride of the first pixel block */ |
+ uint32_t stride2; /* row stride of the second pixel block */ |
+ uint32_t x; /* block width (the lx macro argument in pixel.c) */ |
+ uint32_t y; /* block height (the ly macro argument in pixel.c) */ |
+} x264_or32_sadssdmod_regs_t; |
+ |
+#define OR32_SADSSDMOD_REG_BASE 0x26400000 /* address the register struct above is mapped at */ |
+ |
+#define OR32_SADSSDMOD_CONTROL_REG_BASE (OR32_SADSSDMOD_REG_BASE+0) /* control is the first register */ |
+#define OR32_SADSSDMOD_CONTROL_REG_BUSY 0x1 /* bit 0: written as 1 together with the mode, then polled until it reads 0 */ |
+#define OR32_SADSSDMOD_CONTROL_REG_MODE_MASK 0x2 /* bit 1: operation mode */ |
+#define OR32_SADSSDMOD_CONTROL_REG_MODE_SAD (OR32_SADSSDMOD_CONTROL_REG_MODE_MASK & 0x0) /* evaluates to 0x0: mode bit clear */ |
+#define OR32_SADSSDMOD_CONTROL_REG_MODE_SSD (OR32_SADSSDMOD_CONTROL_REG_MODE_MASK & 0xff) /* evaluates to 0x2: mode bit set */ |
+ |
+ |
+#define OR32_SADSSDMOD 1 /* non-zero makes pixel.c build PIXEL_SAD_C / PIXEL_SSD_C on top of the hardware module */ |
diff --exclude=.git --exclude=/gitignore -Naur x264/common/pixel.c x264-or/common/pixel.c |
--- x264/common/pixel.c 2009-10-25 17:41:10.000000000 +0100 |
+++ x264-or/common/pixel.c 2009-11-17 10:26:39.000000000 +0100 |
@@ -22,6 +22,7 @@ |
*****************************************************************************/ |
|
#include "common.h" |
+#include "x264.h" |
|
#ifdef HAVE_MMX |
# include "x86/pixel.h" |
@@ -35,29 +36,51 @@ |
#ifdef ARCH_UltraSparc |
# include "sparc/pixel.h" |
#endif |
+#ifdef ARCH_OR32 |
+# include "or32/or32.h" |
+#endif |
|
|
/**************************************************************************** |
* pixel_sad_WxH |
****************************************************************************/ |
-#define PIXEL_SAD_C( name, lx, ly ) \ |
-static int name( uint8_t *pix1, int i_stride_pix1, \ |
- uint8_t *pix2, int i_stride_pix2 ) \ |
-{ \ |
- int i_sum = 0; \ |
- int x, y; \ |
- for( y = 0; y < ly; y++ ) \ |
- { \ |
- for( x = 0; x < lx; x++ ) \ |
- { \ |
- i_sum += abs( pix1[x] - pix2[x] ); \ |
- } \ |
- pix1 += i_stride_pix1; \ |
- pix2 += i_stride_pix2; \ |
- } \ |
- return i_sum; \ |
-} |
- |
+#if OR32_SADSSDMOD /* non-zero only when or32/or32.h has been included (ARCH_OR32 builds) */ |
+#define PIXEL_SAD_C( name, lx, ly ) /* hardware variant: defines "static int name(...)" for an lx x ly block */ \ |
+ static int name( uint8_t *pix1, int i_stride_pix1, \ |
+ uint8_t *pix2, int i_stride_pix2 ) \ |
+ { /* Configure and set the hardware SAD/SSD module running */ \ |
+ volatile x264_or32_sadssdmod_regs_t* sadssdmod_regs; /* volatile so every register access below is actually performed */ \ |
+ sadssdmod_regs = (x264_or32_sadssdmod_regs_t*) OR32_SADSSDMOD_REG_BASE; \ |
+ sadssdmod_regs->pix_ptr1 = (uint32_t)pix1; /* NOTE(review): cast assumes pointers fit in 32 bits -- confirm for the target */ \ |
+ sadssdmod_regs->pix_ptr2 = (uint32_t)pix2; \ |
+ sadssdmod_regs->stride1 = (uint32_t)i_stride_pix1; \ |
+ sadssdmod_regs->stride2 = (uint32_t)i_stride_pix2; \ |
+ sadssdmod_regs->x = (uint32_t)lx; \ |
+ sadssdmod_regs->y = (uint32_t)ly; \ |
+ sadssdmod_regs->control = (uint32_t)(OR32_SADSSDMOD_CONTROL_REG_MODE_SAD | /* select SAD mode ... */ \ |
+ OR32_SADSSDMOD_CONTROL_REG_BUSY) ; /* ... and set BUSY in the same write, after all operands are in place */ \ |
+ while (sadssdmod_regs->control & OR32_SADSSDMOD_CONTROL_REG_BUSY); /* spin until BUSY reads back as 0 (no timeout) */ \ |
+ return (int) sadssdmod_regs->result; /* value left in the result register */ \ |
+ } |
+#else /* software fallback: same loop as the macro this patch replaces */ |
+#define PIXEL_SAD_C( name, lx, ly ) \ |
+ static int name( uint8_t *pix1, int i_stride_pix1, \ |
+ uint8_t *pix2, int i_stride_pix2 ) \ |
+ { \ |
+int i_sum = 0; /* running sum of absolute differences */ \ |
+int x, y; \ |
+for( y = 0; y < ly; y++ ) \ |
+ { \ |
+ for( x = 0; x < lx; x++ ) \ |
+ { \ |
+ i_sum += abs( pix1[x] - pix2[x] ); \ |
+ } \ |
+ pix1 += i_stride_pix1; /* step both pointers to the next row */ \ |
+ pix2 += i_stride_pix2; \ |
+ } \ |
+return i_sum; \ |
+} |
+#endif |
|
PIXEL_SAD_C( x264_pixel_sad_16x16, 16, 16 ) |
PIXEL_SAD_C( x264_pixel_sad_16x8, 16, 8 ) |
@@ -71,10 +94,29 @@ |
/**************************************************************************** |
* pixel_ssd_WxH |
****************************************************************************/ |
+#if OR32_SADSSDMOD |
+#define PIXEL_SSD_C( name, lx, ly ) /* hardware variant: defines "static int name(...)" for an lx x ly block */ \ |
+static int name( uint8_t *pix1, int i_stride_pix1, \ |
+ uint8_t *pix2, int i_stride_pix2 ) \ |
+{ /* Configure and set the hardware SAD/SSD module running */ \ |
+ volatile x264_or32_sadssdmod_regs_t* sadssdmod_regs; /* volatile so every register access below is actually performed */ \ |
+ sadssdmod_regs = (x264_or32_sadssdmod_regs_t*) OR32_SADSSDMOD_REG_BASE; \ |
+ sadssdmod_regs->pix_ptr1 = (uint32_t)pix1; /* NOTE(review): cast assumes pointers fit in 32 bits -- confirm for the target */ \ |
+ sadssdmod_regs->pix_ptr2 = (uint32_t)pix2; \ |
+ sadssdmod_regs->stride1 = (uint32_t)i_stride_pix1; \ |
+ sadssdmod_regs->stride2 = (uint32_t)i_stride_pix2; \ |
+ sadssdmod_regs->x = (uint32_t)lx; \ |
+ sadssdmod_regs->y = (uint32_t)ly; \ |
+ sadssdmod_regs->control = (uint32_t)(OR32_SADSSDMOD_CONTROL_REG_MODE_SSD | /* select SSD mode ... */ \ |
+ OR32_SADSSDMOD_CONTROL_REG_BUSY) ; /* ... and set BUSY in the same write, after all operands are in place */ \ |
+ while (sadssdmod_regs->control & OR32_SADSSDMOD_CONTROL_REG_BUSY); /* spin until BUSY reads back as 0 (no timeout) */ \ |
+ return (int) sadssdmod_regs->result; /* value left in the result register */ \ |
+} |
+#else |
#define PIXEL_SSD_C( name, lx, ly ) \ |
static int name( uint8_t *pix1, int i_stride_pix1, \ |
uint8_t *pix2, int i_stride_pix2 ) \ |
-{ \ |
+{ \ |
int i_sum = 0; \ |
int x, y; \ |
for( y = 0; y < ly; y++ ) \ |
@@ -89,7 +131,7 @@ |
} \ |
return i_sum; \ |
} |
- |
+#endif |
PIXEL_SSD_C( x264_pixel_ssd_16x16, 16, 16 ) |
PIXEL_SSD_C( x264_pixel_ssd_16x8, 16, 8 ) |
PIXEL_SSD_C( x264_pixel_ssd_8x16, 8, 16 ) |
@@ -387,6 +429,7 @@ |
|
/**************************************************************************** |
* pixel_sad_x4 |
+ * Debug printf : V(fprintf(stderr, "x264_pixel_sadx4 size : *fenc: 0x%.8x *pix0: 0x%.8x *pix1: 0x%.8x *pix2: 0x%.8x *pix3: 0x%.8x i_stride: %d *scores: 0x%.8x\n", (unsigned long) fenc, (unsigned long) pix0, (unsigned long) pix1, (unsigned long)pix2, (unsigned long)pix3, i_stride, (unsigned long) scores)); \ |
****************************************************************************/ |
#define SAD_X( size ) \ |
static void x264_pixel_sad_x3_##size( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )\ |
diff --exclude=.git --exclude=/gitignore -Naur x264/configure x264-or/configure |
--- x264/configure 2009-10-25 17:41:10.000000000 +0100 |
+++ x264-or/configure 2009-10-28 15:05:29.000000000 +0100 |
@@ -29,7 +29,7 @@ |
rm -f conftest.c |
[ -n "$1" ] && echo "#include <$1>" > conftest.c |
echo "int main () { $3 return 0; }" >> conftest.c |
- $CC conftest.c $CFLAGS $LDFLAGS $2 -o conftest 2>$DEVNULL |
+ $CC conftest.c $CFLAGS $LDFLAGS $2 -o conftest #2>$DEVNULL |
} |
|
as_check() { |
diff --exclude=.git --exclude=/gitignore -Naur x264/encoder/analyse.c x264-or/encoder/analyse.c |
--- x264/encoder/analyse.c 2009-10-25 17:41:10.000000000 +0100 |
+++ x264-or/encoder/analyse.c 2009-11-09 13:11:23.000000000 +0100 |
@@ -27,6 +27,7 @@ |
#include <unistd.h> |
|
#include "common/common.h" |
+#include "x264.h" |
#include "common/cpu.h" |
#include "macroblock.h" |
#include "me.h" |
@@ -34,6 +35,7 @@ |
#include "analyse.h" |
#include "rdo.c" |
|
+ |
typedef struct |
{ |
/* 16x16 */ |
@@ -233,40 +235,75 @@ |
5, 3, 3, 1 |
}; |
|
+extern const float x264_analyse_init_log2_array[]; //jb |
+ |
static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a ); |
|
static uint16_t x264_cost_ref[92][3][33]; |
static x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER; |
|
+/* |
+float log2f( float n ) |
+{ |
+// log(n)/log(2) is log2. |
+//return log10( n ) / log10( 2 ); |
+ return (float) (log( n ) / _M_LOG2_E); |
+} |
+*/ |
int x264_analyse_init_costs( x264_t *h, int qp ) |
{ |
int i, j; |
int lambda = x264_lambda_tab[qp]; |
+ float tmp_f,lambda_f = lambda; |
+ |
if( h->cost_mv[lambda] ) |
return 0; |
+ |
+ fprintf( stderr, |
+ "x264_analyse_init_costs: lambda (= x264_lambda_tab[%d]) = %d\n", |
+ qp, lambda); |
+ |
/* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */ |
CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) ); |
+ |
h->cost_mv[lambda] += 2*4*2048; |
+ |
+ /* Computationally expensive thing: logs, floating point math, much of -- jb*/ |
for( i = 0; i <= 2*4*2048; i++ ) |
- { |
- h->cost_mv[lambda][-i] = |
- h->cost_mv[lambda][i] = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f; |
- } |
+ { |
+ |
+ //h->cost_mv[lambda][i] = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f; |
+ //h->cost_mv[lambda][i] = lambda * (log2(i+1)*2 + 0.718f + !!i) + .5f; |
+ |
+ // precalculated array, doing the above few calculations for us |
+ tmp_f = lambda_f * x264_analyse_init_log2_array[i]; |
+ tmp_f += .5f; |
+ h->cost_mv[lambda][-i] = |
+ h->cost_mv[lambda][i] = tmp_f; |
+ |
+ |
+ } |
x264_pthread_mutex_lock( &cost_ref_mutex ); |
+ |
for( i = 0; i < 3; i++ ) |
- for( j = 0; j < 33; j++ ) |
- x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0; |
+ for( j = 0; j < 33; j++ ) |
+ x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0; |
+ |
x264_pthread_mutex_unlock( &cost_ref_mutex ); |
+ |
+ /* for now, disable -- jb |
if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] ) |
- { |
+ { |
+ printf("analyse_init_costs: h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0]\n"); |
for( j=0; j<4; j++ ) |
- { |
+ { |
CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) ); |
h->cost_mv_fpel[lambda][j] += 2*2048; |
for( i = -2*2048; i < 2*2048; i++ ) |
- h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j]; |
+ h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j]; |
} |
} |
+ */ |
return 0; |
fail: |
return -1; |
@@ -288,6 +325,7 @@ |
/* initialize an array of lambda*nbits for all possible mvs */ |
static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a ) |
{ |
+ //V(fprintf(stderr, "x264_mb_analyse_load_costs() called\n")); |
a->p_cost_mv = h->cost_mv[a->i_lambda]; |
a->p_cost_ref0 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)]; |
a->p_cost_ref1 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)]; |
@@ -1161,6 +1199,7 @@ |
(m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \ |
(m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; |
|
+ |
#define LOAD_HPELS(m, src, list, ref, xoff, yoff) \ |
(m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \ |
(m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \ |
@@ -1183,20 +1222,35 @@ |
|
/* 16x16 Search on all ref frame */ |
m.i_pixel = PIXEL_16x16; |
- m.p_cost_mv = a->p_cost_mv; |
- LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 ); |
+ m.p_cost_mv = a->p_cost_mv; /* where a->p_cost_mv = h->cost_mv[a->i_lambda], set in x264_mb_analyse_load_costs(). a->i_lambda is related to the qp of the MB, and h->cost_mv[] is precomputed per lambda up front by x264_analyse_init_costs() */ |
+ /* LOAD_FENC(m, src, xoff, yoff) : |
+ h->mb.pic.p_fenc: pointer over mb of the frame to be compressed, i think array of 3 pointers, one each to Y, Cb, Cr data. |
+ x264_me_t m is loaded with appropriate stride variables, and pointers to the appropriate pixels |
+ Essentially loads the me struct with stride and offsets, this is done in a macro because it's convenient |
+ because the many partitions can use the same macro to setup x264_me_t struct |
+ */ |
+ LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 ); |
+ //printf("x264_mb_analyse_inter_p16x16: m.i_stride[0]: %d\n",m.i_stride[0]); |
+ //printf("x264_mb_analyse_inter_p16x16: m.i_stride[1]: %d\n",m.i_stride[1]); |
+ //usually m.i_stride[0] is 416 and m.i_stride[1] is 208 -- jb |
|
a->l0.me16x16.cost = INT_MAX; |
+ |
+ //printf("x264_mb_analyse_inter_p16x16: h->mb.pic.i_fref[0] = %d\n",h->mb.pic.i_fref[0]); |
+ |
+ // Usually just the single reference frame here |
for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ ) |
{ |
- const int i_ref_cost = REF_COST( 0, i_ref ); |
- i_halfpel_thresh -= i_ref_cost; |
+ const int i_ref_cost = REF_COST( 0, i_ref ); /* looks in a->p_cost_ref0/1[], which is set in x264_mb_analyse_load_costs() */ |
+ i_halfpel_thresh -= i_ref_cost; |
m.i_ref_cost = i_ref_cost; |
m.i_ref = i_ref; |
|
/* search with ref */ |
- LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 ); |
- x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp ); |
+ /* m, src, list, ref, xoff, yoff */ |
+ LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0 , 0 ); |
+ /* h,i_list, i_ref, int16_t mvp[2] */ |
+ x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp ); |
x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc ); |
x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh ); |
|
diff --exclude=.git --exclude=/gitignore -Naur x264/encoder/analyse_gen_init_array.sh x264-or/encoder/analyse_gen_init_array.sh |
--- x264/encoder/analyse_gen_init_array.sh 1970-01-01 01:00:00.000000000 +0100 |
+++ x264-or/encoder/analyse_gen_init_array.sh 2009-10-28 15:05:29.000000000 +0100 |
@@ -0,0 +1,32 @@ |
+#!/bin/bash |
+# Pre-generate an array of numbers according to the algorithm from x264_analyse_init_costs() |
+# to speed up calculation. Here we calculate the stuff in brackets, for i = num-1: |
+# lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f; |
+outfile=analyse_init_log2.c |
+# NOTE(review): the one-decimal truncation below makes the costs differ slightly from the log2f() formula |
+echo; echo "Generating analyse_init_log2_array[] to $outfile" |
+echo "This may take a minute or two..." |
+ |
+#Size of the array: 4*2*2048 + 1, because x264_analyse_init_costs() reads indices 0..4*2*2048 inclusive |
+numvals=16385 |
+ |
+rm -f $outfile |
+ |
+echo "const float x264_analyse_init_log2_array[] = {" >> $outfile |
+ |
+n=0 |
+ |
+ for num in `seq 1 $numvals`; |
+ do |
+# Use bc to do the calculation, truncating the output to one decimal place; $(( num > 1 )) is the !!i term (0 for i == 0, else 1) |
+ newnum=`echo "a=((l($num)/l(2))*2)+0.718+$(( num > 1 )); scale=1; a=(a*2)/2; a" | bc -l` |
+ echo -n $newnum >> $outfile |
+ if [ $num -lt $numvals ]; then echo -n ", " >> $outfile; fi; |
+ # Newline every 16 values |
+ let "n = $num % 16" |
+ if [ $n -eq 0 ]; then echo >> $outfile; fi; |
+ done |
+echo "};" >> $outfile |
+ |
+echo "finished" |
+echo |
diff --exclude=.git --exclude=/gitignore -Naur x264/encoder/encoder.c x264-or/encoder/encoder.c |
--- x264/encoder/encoder.c 2009-10-25 17:41:22.000000000 +0100 |
+++ x264-or/encoder/encoder.c 2009-10-28 15:05:29.000000000 +0100 |
@@ -62,6 +62,9 @@ |
|
static void x264_frame_dump( x264_t *h ) |
{ |
+ //printf("frame_dump()\n"); |
+ return; |
+ |
FILE *f = fopen( h->param.psz_dump_yuv, "r+b" ); |
int i, y; |
if( !f ) |
@@ -301,6 +304,7 @@ |
if( ( h->param.b_cabac && (h->cabac.p_end - h->cabac.p < 2500) ) |
|| ( h->out.bs.p_end - h->out.bs.p < 2500 ) ) |
{ |
+ //V(fprintf(stderr, "x264_bitstream_check_buffer: h->out.bs.p = 0x%.8x, h->out.bs.p_end = 0x%.8x\n",h->out.bs.p_end, h->out.bs.p )); |
intptr_t delta; |
int i; |
|
@@ -695,6 +699,8 @@ |
static void mbcmp_init( x264_t *h ) |
{ |
int satd = !h->mb.b_lossless && h->param.analyse.i_subpel_refine > 1; |
+ //V(if (satd)fprintf(stderr, "macroblock compare functions. satd enabled\n")); |
+ //V(if (!satd)fprintf(stderr, "macroblock compare functions. satd not enabled\n")); |
memcpy( h->pixf.mbcmp, satd ? h->pixf.satd : h->pixf.sad_aligned, sizeof(h->pixf.mbcmp) ); |
memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) ); |
h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16; |
@@ -750,15 +756,14 @@ |
x264_t *h; |
char buf[1000], *p; |
int i, qp, i_slicetype_length; |
- |
+ |
CHECKED_MALLOCZERO( h, sizeof(x264_t) ); |
- |
+ |
/* Create a copy of param */ |
memcpy( &h->param, param, sizeof(x264_param_t) ); |
- |
if( param->param_free ) |
param->param_free( param ); |
- |
+ |
if( x264_validate_parameters( h ) < 0 ) |
goto fail; |
|
@@ -772,7 +777,7 @@ |
h->param.rc.psz_stat_in = strdup( h->param.rc.psz_stat_in ); |
|
x264_set_aspect_ratio( h, param, 1 ); |
- |
+ |
x264_reduce_fraction( &h->param.i_fps_num, &h->param.i_fps_den ); |
|
/* Init x264_t */ |
@@ -831,8 +836,8 @@ |
h->i_ref0 = 0; |
h->i_ref1 = 0; |
|
- x264_rdo_init(); |
- |
+ x264_rdo_init(); |
+ |
/* init CPU functions */ |
x264_predict_16x16_init( h->param.cpu, h->predict_16x16 ); |
x264_predict_8x8c_init( h->param.cpu, h->predict_8x8c ); |
@@ -870,17 +875,25 @@ |
p += sprintf( p, " none!" ); |
x264_log( h, X264_LOG_INFO, "%s\n", buf ); |
|
+ /* // should skip during cycle accurate -- jb */ |
+ // Check the different quantisation points and their costs |
for( qp = h->param.rc.i_qp_min; qp <= h->param.rc.i_qp_max; qp++ ) |
+ { |
+ //printf("%d\n", qp); |
if( x264_analyse_init_costs( h, qp ) ) |
- goto fail; |
+ goto fail; |
+ } |
if( x264_analyse_init_costs( h, X264_LOOKAHEAD_QP ) ) |
goto fail; |
if( h->cost_mv[1][2013] != 24 ) |
- { |
+ { |
+ //fprintf(stderr, "h->cost_mv[1][2013] = %d\n",h->cost_mv[1][2013]); |
x264_log( h, X264_LOG_ERROR, "MV cost test failed: x264 has been miscompiled!\n" ); |
goto fail; |
} |
|
+ //exit(0); // jb |
+ |
h->out.i_nal = 0; |
h->out.i_bitstream = X264_MAX( 1000000, h->param.i_width * h->param.i_height * 4 |
* ( h->param.rc.i_rc_method == X264_RC_ABR ? pow( 0.95, h->param.rc.i_qp_min ) |
@@ -908,16 +921,17 @@ |
if( x264_macroblock_cache_init( h->thread[i] ) < 0 ) |
goto fail; |
} |
- |
+ |
if( x264_lookahead_init( h, i_slicetype_length ) ) |
goto fail; |
|
if( x264_ratecontrol_new( h ) < 0 ) |
goto fail; |
- |
+ |
+ /* // skip this for now, it's a fopen() call we won't really do, plus it's an option we'll never need --jb |
if( h->param.psz_dump_yuv ) |
{ |
- /* create or truncate the reconstructed video file */ |
+ //create or truncate the reconstructed video file |
FILE *f = fopen( h->param.psz_dump_yuv, "w" ); |
if( f ) |
fclose( f ); |
@@ -927,6 +941,8 @@ |
goto fail; |
} |
} |
+ */ |
+ |
|
x264_log( h, X264_LOG_INFO, "profile %s, level %d.%d\n", |
h->sps->i_profile_idc == PROFILE_BASELINE ? "Baseline" : |
@@ -1307,6 +1323,8 @@ |
int slice_max_size = h->param.i_slice_max_size > 0 ? (h->param.i_slice_max_size-3-NALU_OVERHEAD)*8 : INT_MAX; |
int starting_bits = bs_pos(&h->out.bs); |
|
+ //V(fprintf(stderr, "x264_slice_write\n")); |
+ |
/* Slice */ |
x264_nal_start( h, h->i_nal_type, h->i_nal_ref_idc ); |
|
@@ -1320,6 +1338,7 @@ |
/* init cabac */ |
x264_cabac_context_init( &h->cabac, h->sh.i_type, h->sh.i_qp, h->sh.i_cabac_init_idc ); |
x264_cabac_encode_init ( &h->cabac, h->out.bs.p, h->out.bs.p_end ); |
+ //V(fprintf(stderr, "\tcabac inited\n")); |
} |
h->mb.i_last_qp = h->sh.i_qp; |
h->mb.i_last_dqp = 0; |
@@ -1362,7 +1381,10 @@ |
x264_macroblock_encode( h ); |
|
if( x264_bitstream_check_buffer( h ) ) |
+ { |
+ //V(fprintf(stderr, "\tx264_bitstream_check_buffer failed\n")); |
return -1; |
+ } |
|
if( h->param.b_cabac ) |
{ |
@@ -1522,6 +1544,8 @@ |
x264_fdec_filter_row( h, h->sps->i_mb_height ); |
} |
|
+ //V(fprintf(stderr, "x264_slice_write ok\n")); |
+ |
return 0; |
} |
|
@@ -1625,8 +1649,8 @@ |
int i_nal_type, i; |
int i_nal_ref_idc; |
|
- int i_global_qp; |
|
+ int i_global_qp; |
if( h->param.i_threads > 1) |
{ |
int i = ++h->i_thread_phase; |
@@ -1653,6 +1677,7 @@ |
/* no data out */ |
*pi_nal = 0; |
*pp_nal = NULL; |
+ |
|
/* ------------------- Setup new frame from picture -------------------- */ |
if( pic_in != NULL ) |
@@ -1663,7 +1688,10 @@ |
return -1; |
|
if( x264_frame_copy_picture( h, fenc, pic_in ) < 0 ) |
+ { |
+ //V(fprintf(stderr, "x264_frame_copy_picture returned >0\n")); |
return -1; |
+ } |
|
if( h->param.i_width != 16 * h->sps->i_mb_width || |
h->param.i_height != 16 * h->sps->i_mb_height ) |
@@ -1684,13 +1712,15 @@ |
|
/* 2: Place the frame into the queue for its slice type decision */ |
x264_lookahead_put_frame( h, fenc ); |
- |
+ |
+ //V(fprintf(stderr, "x264_encoder_encode setup new frame from picture\n")); |
+ |
if( h->frames.i_input <= h->frames.i_delay + 1 - h->param.i_threads ) |
{ |
/* Nothing yet to encode, waiting for filling of buffers */ |
pic_out->i_type = X264_TYPE_AUTO; |
return 0; |
- } |
+ } |
} |
else |
{ |
@@ -1873,7 +1903,10 @@ |
} |
else |
if( (intptr_t)x264_slices_write( h ) ) |
+ { |
+ //V(fprintf(stderr, "x264_encoder_encode(): x264_slices_write() failed\n")); |
return -1; |
+ } |
|
return x264_encoder_frame_end( thread_oldest, thread_current, pp_nal, pi_nal, pic_out ); |
} |
diff --exclude=.git --exclude=/gitignore -Naur x264/encoder/me.c x264-or/encoder/me.c |
--- x264/encoder/me.c 2009-10-25 17:41:10.000000000 +0100 |
+++ x264-or/encoder/me.c 2009-11-10 13:36:34.000000000 +0100 |
@@ -33,17 +33,17 @@ |
* the subme=8,9 values are much higher because any amount of satd search makes |
* up its time by reducing the number of qpel-rd iterations. */ |
static const int subpel_iterations[][4] = |
- {{0,0,0,0}, |
- {1,1,0,0}, |
- {0,1,1,0}, |
- {0,2,1,0}, |
- {0,2,1,1}, |
- {0,2,1,2}, |
- {0,0,2,2}, |
- {0,0,2,2}, |
- {0,0,4,10}, |
- {0,0,4,10}, |
- {0,0,4,10}}; |
+ {{0,0,0,0}, |
+ {1,1,0,0}, |
+ {0,1,1,0}, |
+ {0,2,1,0}, |
+ {0,2,1,1}, |
+ {0,2,1,2}, |
+ {0,0,2,2}, |
+ {0,0,2,2}, |
+ {0,0,4,10}, |
+ {0,0,4,10}, |
+ {0,0,4,10}}; |
|
/* (x-1)%6 */ |
static const int mod6m1[8] = {5,0,1,2,3,4,5,0}; |
@@ -53,1108 +53,1114 @@ |
|
static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel ); |
|
-#define BITS_MVD( mx, my )\ |
- (p_cost_mvx[(mx)<<2] + p_cost_mvy[(my)<<2]) |
+#define BITS_MVD( mx, my ) \ |
+ (p_cost_mvx[(mx)<<2] + p_cost_mvy[(my)<<2]) |
|
-#define COST_MV( mx, my )\ |
-{\ |
- int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE,\ |
- &p_fref[(my)*stride+(mx)], stride )\ |
- + BITS_MVD(mx,my);\ |
- COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my );\ |
-} |
- |
-#define COST_MV_HPEL( mx, my ) \ |
-{ \ |
- int stride2 = 16; \ |
+#define COST_MV( mx, my ) \ |
+ { \ |
+ int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, \ |
+ &p_fref[(my)*stride+(mx)], stride ) \ |
+ + BITS_MVD(mx,my); \ |
+ COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my ); \ |
+ } |
+ |
+#define COST_MV_HPEL( mx, my ) \ |
+ { \ |
+ int stride2 = 16; \ |
uint8_t *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh ); \ |
int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 ) \ |
- + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \ |
- COPY3_IF_LT( bpred_cost, cost, bpred_mx, mx, bpred_my, my ); \ |
-} |
- |
-#define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\ |
-{\ |
- uint8_t *pix_base = p_fref + bmx + bmy*stride;\ |
- h->pixf.fpelcmp_x3[i_pixel]( p_fenc,\ |
- pix_base + (m0x) + (m0y)*stride,\ |
- pix_base + (m1x) + (m1y)*stride,\ |
- pix_base + (m2x) + (m2y)*stride,\ |
- stride, costs );\ |
- (costs)[0] += BITS_MVD( bmx+(m0x), bmy+(m0y) );\ |
- (costs)[1] += BITS_MVD( bmx+(m1x), bmy+(m1y) );\ |
- (costs)[2] += BITS_MVD( bmx+(m2x), bmy+(m2y) );\ |
-} |
- |
-#define COST_MV_X4_DIR( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs )\ |
-{\ |
- uint8_t *pix_base = p_fref + bmx + bmy*stride;\ |
- h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\ |
- pix_base + (m0x) + (m0y)*stride,\ |
- pix_base + (m1x) + (m1y)*stride,\ |
- pix_base + (m2x) + (m2y)*stride,\ |
- pix_base + (m3x) + (m3y)*stride,\ |
- stride, costs );\ |
- (costs)[0] += BITS_MVD( bmx+(m0x), bmy+(m0y) );\ |
- (costs)[1] += BITS_MVD( bmx+(m1x), bmy+(m1y) );\ |
- (costs)[2] += BITS_MVD( bmx+(m2x), bmy+(m2y) );\ |
- (costs)[3] += BITS_MVD( bmx+(m3x), bmy+(m3y) );\ |
-} |
- |
-#define COST_MV_X4( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y )\ |
-{\ |
- uint8_t *pix_base = p_fref + omx + omy*stride;\ |
- h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\ |
- pix_base + (m0x) + (m0y)*stride,\ |
- pix_base + (m1x) + (m1y)*stride,\ |
- pix_base + (m2x) + (m2y)*stride,\ |
- pix_base + (m3x) + (m3y)*stride,\ |
- stride, costs );\ |
- costs[0] += BITS_MVD( omx+(m0x), omy+(m0y) );\ |
- costs[1] += BITS_MVD( omx+(m1x), omy+(m1y) );\ |
- costs[2] += BITS_MVD( omx+(m2x), omy+(m2y) );\ |
- costs[3] += BITS_MVD( omx+(m3x), omy+(m3y) );\ |
- COPY3_IF_LT( bcost, costs[0], bmx, omx+(m0x), bmy, omy+(m0y) );\ |
- COPY3_IF_LT( bcost, costs[1], bmx, omx+(m1x), bmy, omy+(m1y) );\ |
- COPY3_IF_LT( bcost, costs[2], bmx, omx+(m2x), bmy, omy+(m2y) );\ |
- COPY3_IF_LT( bcost, costs[3], bmx, omx+(m3x), bmy, omy+(m3y) );\ |
-} |
- |
-#define COST_MV_X3_ABS( m0x, m0y, m1x, m1y, m2x, m2y )\ |
-{\ |
- h->pixf.fpelcmp_x3[i_pixel]( p_fenc,\ |
- p_fref + (m0x) + (m0y)*stride,\ |
- p_fref + (m1x) + (m1y)*stride,\ |
- p_fref + (m2x) + (m2y)*stride,\ |
- stride, costs );\ |
- costs[0] += p_cost_mvx[(m0x)<<2]; /* no cost_mvy */\ |
- costs[1] += p_cost_mvx[(m1x)<<2];\ |
- costs[2] += p_cost_mvx[(m2x)<<2];\ |
- COPY3_IF_LT( bcost, costs[0], bmx, m0x, bmy, m0y );\ |
- COPY3_IF_LT( bcost, costs[1], bmx, m1x, bmy, m1y );\ |
- COPY3_IF_LT( bcost, costs[2], bmx, m2x, bmy, m2y );\ |
-} |
+ + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \ |
+ COPY3_IF_LT( bpred_cost, cost, bpred_mx, mx, bpred_my, my ); \ |
+ } |
+ |
+#define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs ) \ |
+ { \ |
+ uint8_t *pix_base = p_fref + bmx + bmy*stride; \ |
+ h->pixf.fpelcmp_x3[i_pixel]( p_fenc, \ |
+ pix_base + (m0x) + (m0y)*stride, \ |
+ pix_base + (m1x) + (m1y)*stride, \ |
+ pix_base + (m2x) + (m2y)*stride, \ |
+ stride, costs ); \ |
+ (costs)[0] += BITS_MVD( bmx+(m0x), bmy+(m0y) ); \ |
+ (costs)[1] += BITS_MVD( bmx+(m1x), bmy+(m1y) ); \ |
+ (costs)[2] += BITS_MVD( bmx+(m2x), bmy+(m2y) ); \ |
+ } |
+ |
+#define COST_MV_X4_DIR( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs ) \ |
+ { \ |
+ uint8_t *pix_base = p_fref + bmx + bmy*stride; \ |
+ h->pixf.fpelcmp_x4[i_pixel]( p_fenc, \ |
+ pix_base + (m0x) + (m0y)*stride, \ |
+ pix_base + (m1x) + (m1y)*stride, \ |
+ pix_base + (m2x) + (m2y)*stride, \ |
+ pix_base + (m3x) + (m3y)*stride, \ |
+ stride, costs ); \ |
+ (costs)[0] += BITS_MVD( bmx+(m0x), bmy+(m0y) ); \ |
+ (costs)[1] += BITS_MVD( bmx+(m1x), bmy+(m1y) ); \ |
+ (costs)[2] += BITS_MVD( bmx+(m2x), bmy+(m2y) ); \ |
+ (costs)[3] += BITS_MVD( bmx+(m3x), bmy+(m3y) ); \ |
+ } |
+ |
+#define COST_MV_X4( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y ) \ |
+ { \ |
+ uint8_t *pix_base = p_fref + omx + omy*stride; \ |
+ h->pixf.fpelcmp_x4[i_pixel]( p_fenc, \ |
+ pix_base + (m0x) + (m0y)*stride, \ |
+ pix_base + (m1x) + (m1y)*stride, \ |
+ pix_base + (m2x) + (m2y)*stride, \ |
+ pix_base + (m3x) + (m3y)*stride, \ |
+ stride, costs ); \ |
+ costs[0] += BITS_MVD( omx+(m0x), omy+(m0y) ); \ |
+ costs[1] += BITS_MVD( omx+(m1x), omy+(m1y) ); \ |
+ costs[2] += BITS_MVD( omx+(m2x), omy+(m2y) ); \ |
+ costs[3] += BITS_MVD( omx+(m3x), omy+(m3y) ); \ |
+ COPY3_IF_LT( bcost, costs[0], bmx, omx+(m0x), bmy, omy+(m0y) ); \ |
+ COPY3_IF_LT( bcost, costs[1], bmx, omx+(m1x), bmy, omy+(m1y) ); \ |
+ COPY3_IF_LT( bcost, costs[2], bmx, omx+(m2x), bmy, omy+(m2y) ); \ |
+ COPY3_IF_LT( bcost, costs[3], bmx, omx+(m3x), bmy, omy+(m3y) ); \ |
+ } |
+ |
+#define COST_MV_X3_ABS( m0x, m0y, m1x, m1y, m2x, m2y ) \ |
+ { \ |
+ h->pixf.fpelcmp_x3[i_pixel]( p_fenc, \ |
+ p_fref + (m0x) + (m0y)*stride, \ |
+ p_fref + (m1x) + (m1y)*stride, \ |
+ p_fref + (m2x) + (m2y)*stride, \ |
+ stride, costs ); \ |
+ costs[0] += p_cost_mvx[(m0x)<<2]; /* no cost_mvy */ \ |
+ costs[1] += p_cost_mvx[(m1x)<<2]; \ |
+ costs[2] += p_cost_mvx[(m2x)<<2]; \ |
+ COPY3_IF_LT( bcost, costs[0], bmx, m0x, bmy, m0y ); \ |
+ COPY3_IF_LT( bcost, costs[1], bmx, m1x, bmy, m1y ); \ |
+ COPY3_IF_LT( bcost, costs[2], bmx, m2x, bmy, m2y ); \ |
+ } |
|
/* 1 */ |
/* 101 */ |
/* 1 */ |
-#define DIA1_ITER( mx, my )\ |
-{\ |
- omx = mx; omy = my;\ |
- COST_MV_X4( 0,-1, 0,1, -1,0, 1,0 );\ |
-} |
- |
-#define CROSS( start, x_max, y_max )\ |
-{\ |
- i = start;\ |
- if( x_max <= X264_MIN(mv_x_max-omx, omx-mv_x_min) )\ |
- for( ; i < x_max-2; i+=4 )\ |
- COST_MV_X4( i,0, -i,0, i+2,0, -i-2,0 );\ |
- for( ; i < x_max; i+=2 )\ |
- {\ |
- if( omx+i <= mv_x_max )\ |
- COST_MV( omx+i, omy );\ |
- if( omx-i >= mv_x_min )\ |
- COST_MV( omx-i, omy );\ |
- }\ |
- i = start;\ |
- if( y_max <= X264_MIN(mv_y_max-omy, omy-mv_y_min) )\ |
- for( ; i < y_max-2; i+=4 )\ |
- COST_MV_X4( 0,i, 0,-i, 0,i+2, 0,-i-2 );\ |
- for( ; i < y_max; i+=2 )\ |
- {\ |
- if( omy+i <= mv_y_max )\ |
- COST_MV( omx, omy+i );\ |
- if( omy-i >= mv_y_min )\ |
- COST_MV( omx, omy-i );\ |
- }\ |
-} |
+#define DIA1_ITER( mx, my ) \ |
+ { \ |
+ omx = mx; omy = my; \ |
+ COST_MV_X4( 0,-1, 0,1, -1,0, 1,0 ); \ |
+ } |
+ |
+#define CROSS( start, x_max, y_max ) \ |
+ { \ |
+ i = start; \ |
+ if( x_max <= X264_MIN(mv_x_max-omx, omx-mv_x_min) ) \ |
+ for( ; i < x_max-2; i+=4 ) \ |
+ COST_MV_X4( i,0, -i,0, i+2,0, -i-2,0 ); \ |
+ for( ; i < x_max; i+=2 ) \ |
+ { \ |
+ if( omx+i <= mv_x_max ) \ |
+ COST_MV( omx+i, omy ); \ |
+ if( omx-i >= mv_x_min ) \ |
+ COST_MV( omx-i, omy ); \ |
+ } \ |
+ i = start; \ |
+ if( y_max <= X264_MIN(mv_y_max-omy, omy-mv_y_min) ) \ |
+ for( ; i < y_max-2; i+=4 ) \ |
+ COST_MV_X4( 0,i, 0,-i, 0,i+2, 0,-i-2 ); \ |
+ for( ; i < y_max; i+=2 ) \ |
+ { \ |
+ if( omy+i <= mv_y_max ) \ |
+ COST_MV( omx, omy+i ); \ |
+ if( omy-i >= mv_y_min ) \ |
+ COST_MV( omx, omy-i ); \ |
+ } \ |
+ } |
|
void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_halfpel_thresh ) |
{ |
- const int bw = x264_pixel_size[m->i_pixel].w; |
- const int bh = x264_pixel_size[m->i_pixel].h; |
- const int i_pixel = m->i_pixel; |
- const int stride = m->i_stride[0]; |
- int i_me_range = h->param.analyse.i_me_range; |
- int bmx, bmy, bcost; |
- int bpred_mx = 0, bpred_my = 0, bpred_cost = COST_MAX; |
- int omx, omy, pmx, pmy; |
- uint8_t *p_fenc = m->p_fenc[0]; |
- uint8_t *p_fref = m->p_fref[0]; |
- ALIGNED_ARRAY_16( uint8_t, pix,[16*16] ); |
- |
- int i, j; |
- int dir; |
- int costs[16]; |
- |
- int mv_x_min = h->mb.mv_min_fpel[0]; |
- int mv_y_min = h->mb.mv_min_fpel[1]; |
- int mv_x_max = h->mb.mv_max_fpel[0]; |
- int mv_y_max = h->mb.mv_max_fpel[1]; |
+ const int bw = x264_pixel_size[m->i_pixel].w; // x264_pixel_size[] in common/pixel.h |
+ const int bh = x264_pixel_size[m->i_pixel].h; |
+ const int i_pixel = m->i_pixel; // macroblock pixel size |
+ const int stride = m->i_stride[0]; // array stride (usually used only in Y direction) |
+ int i_me_range = h->param.analyse.i_me_range; /* integer pixel motion estimation search range (from predicted mv) */ |
+ int bmx, bmy, bcost; |
+ int bpred_mx = 0, bpred_my = 0, bpred_cost = COST_MAX; |
+ int omx, omy, pmx, pmy; |
+ uint8_t *p_fenc = m->p_fenc[0]; |
+ uint8_t *p_fref = m->p_fref[0]; |
+ //V(fprintf(stderr, "x264_me_search_ref *p_fenc = 0x%.8x *p_fref = 0x%.8x\n", (unsigned long) p_fenc, (unsigned long) p_fref)); |
+ ALIGNED_ARRAY_16( uint8_t, pix,[16*16] ); |
+ |
+ int i, j; |
+ int dir; |
+ int costs[16]; |
+ |
+ //fprintf(stderr, "x264_me_search_ref: x264_t *h = 0x%.8x\n", (unsigned int) h); |
+ /* Full-pel ranges for MVs: when the block is on or near the frame border the search can't go beyond it, and these limits express that. They're set in x264_mb_analyse_init() */ |
+ int mv_x_min = h->mb.mv_min_fpel[0]; |
+ int mv_y_min = h->mb.mv_min_fpel[1]; |
+ int mv_x_max = h->mb.mv_max_fpel[0]; |
+ int mv_y_max = h->mb.mv_max_fpel[1]; |
|
#define CHECK_MVRANGE(mx,my) ( mx >= mv_x_min && mx <= mv_x_max && my >= mv_y_min && my <= mv_y_max ) |
|
- const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0]; |
- const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1]; |
- |
- bmx = x264_clip3( m->mvp[0], mv_x_min*4, mv_x_max*4 ); |
- bmy = x264_clip3( m->mvp[1], mv_y_min*4, mv_y_max*4 ); |
- pmx = ( bmx + 2 ) >> 2; |
- pmy = ( bmy + 2 ) >> 2; |
- bcost = COST_MAX; |
+ const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0]; |
+ const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1]; |
+ //V(fprintf(stderr, "p_cost_mvx: 0x%.8x, ( m->p_cost_mv: 0x%.8x m->mvp[0]: 0x%.8x )p_cost_mvy: 0x%.8x (m->p_cost_mv: 0x%.8x m->mvp[1]: 0x%.8x)\n", (unsigned long) p_cost_mvx, (unsigned long) m->p_cost_mv, (unsigned long) m->mvp[0], (unsigned long) p_cost_mvy, (unsigned long) m->p_cost_mv, (unsigned long) m->mvp[1])); |
+ |
+ /* m->mvp has the median (predicted) MV in quarter-pel units (x in [0] and y in [1]); clip it to the allowed range and round to full-pel */ |
+ bmx = x264_clip3( m->mvp[0], mv_x_min*4, mv_x_max*4 ); |
+ bmy = x264_clip3( m->mvp[1], mv_y_min*4, mv_y_max*4 ); |
+ pmx = ( bmx + 2 ) >> 2; |
+ pmy = ( bmy + 2 ) >> 2; |
+ bcost = COST_MAX; |
|
- /* try extra predictors if provided */ |
- if( h->mb.i_subpel_refine >= 3 ) |
+ /* try extra predictors if provided */ |
+ if( h->mb.i_subpel_refine >= 3 ) |
{ |
- uint32_t bmv = pack16to32_mask(bmx,bmy); |
- COST_MV_HPEL( bmx, bmy ); |
- for( i = 0; i < i_mvc; i++ ) |
+ uint32_t bmv = pack16to32_mask(bmx,bmy); |
+ COST_MV_HPEL( bmx, bmy ); |
+ for( i = 0; i < i_mvc; i++ ) |
{ |
- if( *(uint32_t*)mvc[i] && (bmv - *(uint32_t*)mvc[i]) ) |
+ if( *(uint32_t*)mvc[i] && (bmv - *(uint32_t*)mvc[i]) ) |
{ |
- int mx = x264_clip3( mvc[i][0], mv_x_min*4, mv_x_max*4 ); |
- int my = x264_clip3( mvc[i][1], mv_y_min*4, mv_y_max*4 ); |
- COST_MV_HPEL( mx, my ); |
+ int mx = x264_clip3( mvc[i][0], mv_x_min*4, mv_x_max*4 ); |
+ int my = x264_clip3( mvc[i][1], mv_y_min*4, mv_y_max*4 ); |
+ COST_MV_HPEL( mx, my ); |
} |
} |
- bmx = ( bpred_mx + 2 ) >> 2; |
- bmy = ( bpred_my + 2 ) >> 2; |
- COST_MV( bmx, bmy ); |
+ bmx = ( bpred_mx + 2 ) >> 2; |
+ bmy = ( bpred_my + 2 ) >> 2; |
+ COST_MV( bmx, bmy ); |
} |
- else |
+ else |
{ |
- /* check the MVP */ |
- COST_MV( pmx, pmy ); |
- /* Because we are rounding the predicted motion vector to fullpel, there will be |
- * an extra MV cost in 15 out of 16 cases. However, when the predicted MV is |
- * chosen as the best predictor, it is often the case that the subpel search will |
- * result in a vector at or next to the predicted motion vector. Therefore, it is |
- * sensible to remove the cost of the MV from the rounded MVP to avoid unfairly |
- * biasing against use of the predicted motion vector. */ |
- bcost -= BITS_MVD( pmx, pmy ); |
- for( i = 0; i < i_mvc; i++ ) |
+ /* check the MVP */ |
+ COST_MV( pmx, pmy ); |
+ /* Because we are rounding the predicted motion vector to fullpel, there will be |
+ * an extra MV cost in 15 out of 16 cases. However, when the predicted MV is |
+ * chosen as the best predictor, it is often the case that the subpel search will |
+ * result in a vector at or next to the predicted motion vector. Therefore, it is |
+ * sensible to remove the cost of the MV from the rounded MVP to avoid unfairly |
+ * biasing against use of the predicted motion vector. */ |
+ bcost -= BITS_MVD( pmx, pmy ); |
+ for( i = 0; i < i_mvc; i++ ) |
{ |
- int mx = (mvc[i][0] + 2) >> 2; |
- int my = (mvc[i][1] + 2) >> 2; |
- if( (mx | my) && ((mx-bmx) | (my-bmy)) ) |
+ int mx = (mvc[i][0] + 2) >> 2; |
+ int my = (mvc[i][1] + 2) >> 2; |
+ if( (mx | my) && ((mx-bmx) | (my-bmy)) ) |
{ |
- mx = x264_clip3( mx, mv_x_min, mv_x_max ); |
- my = x264_clip3( my, mv_y_min, mv_y_max ); |
- COST_MV( mx, my ); |
+ mx = x264_clip3( mx, mv_x_min, mv_x_max ); |
+ my = x264_clip3( my, mv_y_min, mv_y_max ); |
+ COST_MV( mx, my ); |
} |
} |
} |
- COST_MV( 0, 0 ); |
+ COST_MV( 0, 0 ); |
|
- switch( h->mb.i_me_method ) |
+ switch( h->mb.i_me_method ) |
{ |
case X264_ME_DIA: |
- /* diamond search, radius 1 */ |
- i = 0; |
- bcost <<= 4; |
- do |
+ /* diamond search, radius 1 */ |
+ //V(fprintf(stderr, "x264_me_search_ref; diamond search. *p_fenc = 0x%.8x *p_fref = 0x%.8x\n", (unsigned long) p_fenc, (unsigned long) p_fref)); |
+ i = 0; |
+ bcost <<= 4; |
+ do |
{ |
- COST_MV_X4_DIR( 0,-1, 0,1, -1,0, 1,0, costs ); |
- COPY1_IF_LT( bcost, (costs[0]<<4)+1 ); |
- COPY1_IF_LT( bcost, (costs[1]<<4)+3 ); |
- COPY1_IF_LT( bcost, (costs[2]<<4)+4 ); |
- COPY1_IF_LT( bcost, (costs[3]<<4)+12 ); |
- if( !(bcost&15) ) |
- break; |
- bmx -= (bcost<<28)>>30; |
- bmy -= (bcost<<30)>>30; |
- bcost &= ~15; |
- if( !CHECK_MVRANGE(bmx, bmy) ) |
- break; |
+ COST_MV_X4_DIR( 0,-1, 0,1, -1,0, 1,0, costs ); |
+ COPY1_IF_LT( bcost, (costs[0]<<4)+1 ); |
+ COPY1_IF_LT( bcost, (costs[1]<<4)+3 ); |
+ COPY1_IF_LT( bcost, (costs[2]<<4)+4 ); |
+ COPY1_IF_LT( bcost, (costs[3]<<4)+12 ); |
+ if( !(bcost&15) ) |
+ break; |
+ bmx -= (bcost<<28)>>30; |
+ bmy -= (bcost<<30)>>30; |
+ bcost &= ~15; |
+ if( !CHECK_MVRANGE(bmx, bmy) ) |
+ break; |
} while( ++i < i_me_range ); |
- bcost >>= 4; |
- break; |
+ bcost >>= 4; |
+ break; |
|
case X264_ME_HEX: |
-me_hex2: |
- /* hexagon search, radius 2 */ |
+ me_hex2: |
+ /* hexagon search, radius 2 */ |
#if 0 |
- for( i = 0; i < i_me_range/2; i++ ) |
+ for( i = 0; i < i_me_range/2; i++ ) |
{ |
- omx = bmx; omy = bmy; |
- COST_MV( omx-2, omy ); |
- COST_MV( omx-1, omy+2 ); |
- COST_MV( omx+1, omy+2 ); |
- COST_MV( omx+2, omy ); |
- COST_MV( omx+1, omy-2 ); |
- COST_MV( omx-1, omy-2 ); |
- if( bmx == omx && bmy == omy ) |
- break; |
- if( !CHECK_MVRANGE(bmx, bmy) ) |
- break; |
+ omx = bmx; omy = bmy; |
+ COST_MV( omx-2, omy ); |
+ COST_MV( omx-1, omy+2 ); |
+ COST_MV( omx+1, omy+2 ); |
+ COST_MV( omx+2, omy ); |
+ COST_MV( omx+1, omy-2 ); |
+ COST_MV( omx-1, omy-2 ); |
+ if( bmx == omx && bmy == omy ) |
+ break; |
+ if( !CHECK_MVRANGE(bmx, bmy) ) |
+ break; |
} |
#else |
- /* equivalent to the above, but eliminates duplicate candidates */ |
+ /* equivalent to the above, but eliminates duplicate candidates */ |
|
- /* hexagon */ |
- COST_MV_X3_DIR( -2,0, -1, 2, 1, 2, costs ); |
- COST_MV_X3_DIR( 2,0, 1,-2, -1,-2, costs+3 ); |
- bcost <<= 3; |
- COPY1_IF_LT( bcost, (costs[0]<<3)+2 ); |
- COPY1_IF_LT( bcost, (costs[1]<<3)+3 ); |
- COPY1_IF_LT( bcost, (costs[2]<<3)+4 ); |
- COPY1_IF_LT( bcost, (costs[3]<<3)+5 ); |
- COPY1_IF_LT( bcost, (costs[4]<<3)+6 ); |
- COPY1_IF_LT( bcost, (costs[5]<<3)+7 ); |
+ /* hexagon */ |
+ COST_MV_X3_DIR( -2,0, -1, 2, 1, 2, costs ); |
+ COST_MV_X3_DIR( 2,0, 1,-2, -1,-2, costs+3 ); |
+ bcost <<= 3; |
+ COPY1_IF_LT( bcost, (costs[0]<<3)+2 ); |
+ COPY1_IF_LT( bcost, (costs[1]<<3)+3 ); |
+ COPY1_IF_LT( bcost, (costs[2]<<3)+4 ); |
+ COPY1_IF_LT( bcost, (costs[3]<<3)+5 ); |
+ COPY1_IF_LT( bcost, (costs[4]<<3)+6 ); |
+ COPY1_IF_LT( bcost, (costs[5]<<3)+7 ); |
|
- if( bcost&7 ) |
+ if( bcost&7 ) |
{ |
- dir = (bcost&7)-2; |
- bmx += hex2[dir+1][0]; |
- bmy += hex2[dir+1][1]; |
- /* half hexagon, not overlapping the previous iteration */ |
- for( i = 1; i < i_me_range/2 && CHECK_MVRANGE(bmx, bmy); i++ ) |
+ dir = (bcost&7)-2; |
+ bmx += hex2[dir+1][0]; |
+ bmy += hex2[dir+1][1]; |
+ /* half hexagon, not overlapping the previous iteration */ |
+ for( i = 1; i < i_me_range/2 && CHECK_MVRANGE(bmx, bmy); i++ ) |
{ |
- COST_MV_X3_DIR( hex2[dir+0][0], hex2[dir+0][1], |
- hex2[dir+1][0], hex2[dir+1][1], |
- hex2[dir+2][0], hex2[dir+2][1], |
- costs ); |
- bcost &= ~7; |
- COPY1_IF_LT( bcost, (costs[0]<<3)+1 ); |
- COPY1_IF_LT( bcost, (costs[1]<<3)+2 ); |
- COPY1_IF_LT( bcost, (costs[2]<<3)+3 ); |
- if( !(bcost&7) ) |
- break; |
- dir += (bcost&7)-2; |
- dir = mod6m1[dir+1]; |
- bmx += hex2[dir+1][0]; |
- bmy += hex2[dir+1][1]; |
+ COST_MV_X3_DIR( hex2[dir+0][0], hex2[dir+0][1], |
+ hex2[dir+1][0], hex2[dir+1][1], |
+ hex2[dir+2][0], hex2[dir+2][1], |
+ costs ); |
+ bcost &= ~7; |
+ COPY1_IF_LT( bcost, (costs[0]<<3)+1 ); |
+ COPY1_IF_LT( bcost, (costs[1]<<3)+2 ); |
+ COPY1_IF_LT( bcost, (costs[2]<<3)+3 ); |
+ if( !(bcost&7) ) |
+ break; |
+ dir += (bcost&7)-2; |
+ dir = mod6m1[dir+1]; |
+ bmx += hex2[dir+1][0]; |
+ bmy += hex2[dir+1][1]; |
} |
} |
- bcost >>= 3; |
+ bcost >>= 3; |
#endif |
- /* square refine */ |
- dir = 0; |
- COST_MV_X4_DIR( 0,-1, 0,1, -1,0, 1,0, costs ); |
- COPY2_IF_LT( bcost, costs[0], dir, 1 ); |
- COPY2_IF_LT( bcost, costs[1], dir, 2 ); |
- COPY2_IF_LT( bcost, costs[2], dir, 3 ); |
- COPY2_IF_LT( bcost, costs[3], dir, 4 ); |
- COST_MV_X4_DIR( -1,-1, -1,1, 1,-1, 1,1, costs ); |
- COPY2_IF_LT( bcost, costs[0], dir, 5 ); |
- COPY2_IF_LT( bcost, costs[1], dir, 6 ); |
- COPY2_IF_LT( bcost, costs[2], dir, 7 ); |
- COPY2_IF_LT( bcost, costs[3], dir, 8 ); |
- bmx += square1[dir][0]; |
- bmy += square1[dir][1]; |
- break; |
+ /* square refine */ |
+ dir = 0; |
+ COST_MV_X4_DIR( 0,-1, 0,1, -1,0, 1,0, costs ); |
+ COPY2_IF_LT( bcost, costs[0], dir, 1 ); |
+ COPY2_IF_LT( bcost, costs[1], dir, 2 ); |
+ COPY2_IF_LT( bcost, costs[2], dir, 3 ); |
+ COPY2_IF_LT( bcost, costs[3], dir, 4 ); |
+ COST_MV_X4_DIR( -1,-1, -1,1, 1,-1, 1,1, costs ); |
+ COPY2_IF_LT( bcost, costs[0], dir, 5 ); |
+ COPY2_IF_LT( bcost, costs[1], dir, 6 ); |
+ COPY2_IF_LT( bcost, costs[2], dir, 7 ); |
+ COPY2_IF_LT( bcost, costs[3], dir, 8 ); |
+ bmx += square1[dir][0]; |
+ bmy += square1[dir][1]; |
+ break; |
|
case X264_ME_UMH: |
- { |
- /* Uneven-cross Multi-Hexagon-grid Search |
- * as in JM, except with different early termination */ |
- |
- static const int x264_pixel_size_shift[7] = { 0, 1, 1, 2, 3, 3, 4 }; |
- |
- int ucost1, ucost2; |
- int cross_start = 1; |
+ { |
+ /* Uneven-cross Multi-Hexagon-grid Search |
+ * as in JM, except with different early termination */ |
+ |
+ static const int x264_pixel_size_shift[7] = { 0, 1, 1, 2, 3, 3, 4 }; |
+ |
+ int ucost1, ucost2; |
+ int cross_start = 1; |
+ |
+ /* refine predictors */ |
+ ucost1 = bcost; |
+ DIA1_ITER( pmx, pmy ); |
+ if( pmx | pmy ) |
+ DIA1_ITER( 0, 0 ); |
+ |
+ if(i_pixel == PIXEL_4x4) |
+ goto me_hex2; |
+ |
+ ucost2 = bcost; |
+ if( (bmx | bmy) && ((bmx-pmx) | (bmy-pmy)) ) |
+ DIA1_ITER( bmx, bmy ); |
+ if( bcost == ucost2 ) |
+ cross_start = 3; |
+ omx = bmx; omy = bmy; |
|
- /* refine predictors */ |
- ucost1 = bcost; |
- DIA1_ITER( pmx, pmy ); |
- if( pmx | pmy ) |
- DIA1_ITER( 0, 0 ); |
- |
- if(i_pixel == PIXEL_4x4) |
- goto me_hex2; |
- |
- ucost2 = bcost; |
- if( (bmx | bmy) && ((bmx-pmx) | (bmy-pmy)) ) |
- DIA1_ITER( bmx, bmy ); |
- if( bcost == ucost2 ) |
- cross_start = 3; |
- omx = bmx; omy = bmy; |
- |
- /* early termination */ |
+ /* early termination */ |
#define SAD_THRESH(v) ( bcost < ( v >> x264_pixel_size_shift[i_pixel] ) ) |
- if( bcost == ucost2 && SAD_THRESH(2000) ) |
- { |
- COST_MV_X4( 0,-2, -1,-1, 1,-1, -2,0 ); |
- COST_MV_X4( 2, 0, -1, 1, 1, 1, 0,2 ); |
- if( bcost == ucost1 && SAD_THRESH(500) ) |
- break; |
- if( bcost == ucost2 ) |
- { |
- int range = (i_me_range>>1) | 1; |
- CROSS( 3, range, range ); |
- COST_MV_X4( -1,-2, 1,-2, -2,-1, 2,-1 ); |
- COST_MV_X4( -2, 1, 2, 1, -1, 2, 1, 2 ); |
- if( bcost == ucost2 ) |
- break; |
- cross_start = range + 2; |
- } |
- } |
- |
- /* adaptive search range */ |
- if( i_mvc ) |
- { |
- /* range multipliers based on casual inspection of some statistics of |
- * average distance between current predictor and final mv found by ESA. |
- * these have not been tuned much by actual encoding. */ |
- static const int range_mul[4][4] = |
- { |
- { 3, 3, 4, 4 }, |
- { 3, 4, 4, 4 }, |
- { 4, 4, 4, 5 }, |
- { 4, 4, 5, 6 }, |
- }; |
- int mvd; |
- int sad_ctx, mvd_ctx; |
- int denom = 1; |
- |
- if( i_mvc == 1 ) |
- { |
- if( i_pixel == PIXEL_16x16 ) |
- /* mvc is probably the same as mvp, so the difference isn't meaningful. |
- * but prediction usually isn't too bad, so just use medium range */ |
- mvd = 25; |
- else |
- mvd = abs( m->mvp[0] - mvc[0][0] ) |
- + abs( m->mvp[1] - mvc[0][1] ); |
- } |
- else |
- { |
- /* calculate the degree of agreement between predictors. */ |
- /* in 16x16, mvc includes all the neighbors used to make mvp, |
- * so don't count mvp separately. */ |
- denom = i_mvc - 1; |
- mvd = 0; |
- if( i_pixel != PIXEL_16x16 ) |
- { |
- mvd = abs( m->mvp[0] - mvc[0][0] ) |
- + abs( m->mvp[1] - mvc[0][1] ); |
- denom++; |
- } |
- mvd += x264_predictor_difference( mvc, i_mvc ); |
- } |
- |
- sad_ctx = SAD_THRESH(1000) ? 0 |
- : SAD_THRESH(2000) ? 1 |
- : SAD_THRESH(4000) ? 2 : 3; |
- mvd_ctx = mvd < 10*denom ? 0 |
- : mvd < 20*denom ? 1 |
- : mvd < 40*denom ? 2 : 3; |
- |
- i_me_range = i_me_range * range_mul[mvd_ctx][sad_ctx] / 4; |
- } |
- |
- /* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy. |
- * we are still centered on the same place as the DIA2. is this desirable? */ |
- CROSS( cross_start, i_me_range, i_me_range/2 ); |
- |
- COST_MV_X4( -2,-2, -2,2, 2,-2, 2,2 ); |
- |
- /* hexagon grid */ |
- omx = bmx; omy = bmy; |
- const uint16_t *p_cost_omvx = p_cost_mvx + omx*4; |
- const uint16_t *p_cost_omvy = p_cost_mvy + omy*4; |
- i = 1; |
- do |
- { |
- static const int hex4[16][2] = { |
- { 0,-4}, { 0, 4}, {-2,-3}, { 2,-3}, |
- {-4,-2}, { 4,-2}, {-4,-1}, { 4,-1}, |
- {-4, 0}, { 4, 0}, {-4, 1}, { 4, 1}, |
- {-4, 2}, { 4, 2}, {-2, 3}, { 2, 3}, |
- }; |
- |
- if( 4*i > X264_MIN4( mv_x_max-omx, omx-mv_x_min, |
- mv_y_max-omy, omy-mv_y_min ) ) |
- { |
- for( j = 0; j < 16; j++ ) |
- { |
- int mx = omx + hex4[j][0]*i; |
- int my = omy + hex4[j][1]*i; |
- if( CHECK_MVRANGE(mx, my) ) |
- COST_MV( mx, my ); |
- } |
- } |
- else |
- { |
- int dir = 0; |
- uint8_t *pix_base = p_fref + omx + (omy-4*i)*stride; |
- int dy = i*stride; |
-#define SADS(k,x0,y0,x1,y1,x2,y2,x3,y3)\ |
- h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\ |
- pix_base x0*i+(y0-2*k+4)*dy,\ |
- pix_base x1*i+(y1-2*k+4)*dy,\ |
- pix_base x2*i+(y2-2*k+4)*dy,\ |
- pix_base x3*i+(y3-2*k+4)*dy,\ |
- stride, costs+4*k );\ |
- pix_base += 2*dy; |
+ if( bcost == ucost2 && SAD_THRESH(2000) ) |
+ { |
+ COST_MV_X4( 0,-2, -1,-1, 1,-1, -2,0 ); |
+ COST_MV_X4( 2, 0, -1, 1, 1, 1, 0,2 ); |
+ if( bcost == ucost1 && SAD_THRESH(500) ) |
+ break; |
+ if( bcost == ucost2 ) |
+ { |
+ int range = (i_me_range>>1) | 1; |
+ CROSS( 3, range, range ); |
+ COST_MV_X4( -1,-2, 1,-2, -2,-1, 2,-1 ); |
+ COST_MV_X4( -2, 1, 2, 1, -1, 2, 1, 2 ); |
+ if( bcost == ucost2 ) |
+ break; |
+ cross_start = range + 2; |
+ } |
+ } |
+ |
+ /* adaptive search range */ |
+ if( i_mvc ) |
+ { |
+ /* range multipliers based on casual inspection of some statistics of |
+ * average distance between current predictor and final mv found by ESA. |
+ * these have not been tuned much by actual encoding. */ |
+ static const int range_mul[4][4] = |
+ { |
+ { 3, 3, 4, 4 }, |
+ { 3, 4, 4, 4 }, |
+ { 4, 4, 4, 5 }, |
+ { 4, 4, 5, 6 }, |
+ }; |
+ int mvd; |
+ int sad_ctx, mvd_ctx; |
+ int denom = 1; |
+ |
+ if( i_mvc == 1 ) |
+ { |
+ if( i_pixel == PIXEL_16x16 ) |
+ /* mvc is probably the same as mvp, so the difference isn't meaningful. |
+ * but prediction usually isn't too bad, so just use medium range */ |
+ mvd = 25; |
+ else |
+ mvd = abs( m->mvp[0] - mvc[0][0] ) |
+ + abs( m->mvp[1] - mvc[0][1] ); |
+ } |
+ else |
+ { |
+ /* calculate the degree of agreement between predictors. */ |
+ /* in 16x16, mvc includes all the neighbors used to make mvp, |
+ * so don't count mvp separately. */ |
+ denom = i_mvc - 1; |
+ mvd = 0; |
+ if( i_pixel != PIXEL_16x16 ) |
+ { |
+ mvd = abs( m->mvp[0] - mvc[0][0] ) |
+ + abs( m->mvp[1] - mvc[0][1] ); |
+ denom++; |
+ } |
+ mvd += x264_predictor_difference( mvc, i_mvc ); |
+ } |
+ |
+ sad_ctx = SAD_THRESH(1000) ? 0 |
+ : SAD_THRESH(2000) ? 1 |
+ : SAD_THRESH(4000) ? 2 : 3; |
+ mvd_ctx = mvd < 10*denom ? 0 |
+ : mvd < 20*denom ? 1 |
+ : mvd < 40*denom ? 2 : 3; |
+ |
+ i_me_range = i_me_range * range_mul[mvd_ctx][sad_ctx] / 4; |
+ } |
+ |
+ /* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy. |
+ * we are still centered on the same place as the DIA2. is this desirable? */ |
+ CROSS( cross_start, i_me_range, i_me_range/2 ); |
+ |
+ COST_MV_X4( -2,-2, -2,2, 2,-2, 2,2 ); |
+ |
+ /* hexagon grid */ |
+ omx = bmx; omy = bmy; |
+ const uint16_t *p_cost_omvx = p_cost_mvx + omx*4; |
+ const uint16_t *p_cost_omvy = p_cost_mvy + omy*4; |
+ i = 1; |
+ do |
+ { |
+ static const int hex4[16][2] = { |
+ { 0,-4}, { 0, 4}, {-2,-3}, { 2,-3}, |
+ {-4,-2}, { 4,-2}, {-4,-1}, { 4,-1}, |
+ {-4, 0}, { 4, 0}, {-4, 1}, { 4, 1}, |
+ {-4, 2}, { 4, 2}, {-2, 3}, { 2, 3}, |
+ }; |
+ |
+ if( 4*i > X264_MIN4( mv_x_max-omx, omx-mv_x_min, |
+ mv_y_max-omy, omy-mv_y_min ) ) |
+ { |
+ for( j = 0; j < 16; j++ ) |
+ { |
+ int mx = omx + hex4[j][0]*i; |
+ int my = omy + hex4[j][1]*i; |
+ if( CHECK_MVRANGE(mx, my) ) |
+ COST_MV( mx, my ); |
+ } |
+ } |
+ else |
+ { |
+ int dir = 0; |
+ uint8_t *pix_base = p_fref + omx + (omy-4*i)*stride; |
+ int dy = i*stride; |
+#define SADS(k,x0,y0,x1,y1,x2,y2,x3,y3) \ |
+ h->pixf.fpelcmp_x4[i_pixel]( p_fenc, \ |
+ pix_base x0*i+(y0-2*k+4)*dy, \ |
+ pix_base x1*i+(y1-2*k+4)*dy, \ |
+ pix_base x2*i+(y2-2*k+4)*dy, \ |
+ pix_base x3*i+(y3-2*k+4)*dy, \ |
+ stride, costs+4*k ); \ |
+ pix_base += 2*dy; |
#define ADD_MVCOST(k,x,y) costs[k] += p_cost_omvx[x*4*i] + p_cost_omvy[y*4*i] |
#define MIN_MV(k,x,y) COPY2_IF_LT( bcost, costs[k], dir, x*16+(y&15) ) |
- SADS( 0, +0,-4, +0,+4, -2,-3, +2,-3 ); |
- SADS( 1, -4,-2, +4,-2, -4,-1, +4,-1 ); |
- SADS( 2, -4,+0, +4,+0, -4,+1, +4,+1 ); |
- SADS( 3, -4,+2, +4,+2, -2,+3, +2,+3 ); |
- ADD_MVCOST( 0, 0,-4 ); |
- ADD_MVCOST( 1, 0, 4 ); |
- ADD_MVCOST( 2,-2,-3 ); |
- ADD_MVCOST( 3, 2,-3 ); |
- ADD_MVCOST( 4,-4,-2 ); |
- ADD_MVCOST( 5, 4,-2 ); |
- ADD_MVCOST( 6,-4,-1 ); |
- ADD_MVCOST( 7, 4,-1 ); |
- ADD_MVCOST( 8,-4, 0 ); |
- ADD_MVCOST( 9, 4, 0 ); |
- ADD_MVCOST( 10,-4, 1 ); |
- ADD_MVCOST( 11, 4, 1 ); |
- ADD_MVCOST( 12,-4, 2 ); |
- ADD_MVCOST( 13, 4, 2 ); |
- ADD_MVCOST( 14,-2, 3 ); |
- ADD_MVCOST( 15, 2, 3 ); |
- MIN_MV( 0, 0,-4 ); |
- MIN_MV( 1, 0, 4 ); |
- MIN_MV( 2,-2,-3 ); |
- MIN_MV( 3, 2,-3 ); |
- MIN_MV( 4,-4,-2 ); |
- MIN_MV( 5, 4,-2 ); |
- MIN_MV( 6,-4,-1 ); |
- MIN_MV( 7, 4,-1 ); |
- MIN_MV( 8,-4, 0 ); |
- MIN_MV( 9, 4, 0 ); |
- MIN_MV( 10,-4, 1 ); |
- MIN_MV( 11, 4, 1 ); |
- MIN_MV( 12,-4, 2 ); |
- MIN_MV( 13, 4, 2 ); |
- MIN_MV( 14,-2, 3 ); |
- MIN_MV( 15, 2, 3 ); |
+ SADS( 0, +0,-4, +0,+4, -2,-3, +2,-3 ); |
+ SADS( 1, -4,-2, +4,-2, -4,-1, +4,-1 ); |
+ SADS( 2, -4,+0, +4,+0, -4,+1, +4,+1 ); |
+ SADS( 3, -4,+2, +4,+2, -2,+3, +2,+3 ); |
+ ADD_MVCOST( 0, 0,-4 ); |
+ ADD_MVCOST( 1, 0, 4 ); |
+ ADD_MVCOST( 2,-2,-3 ); |
+ ADD_MVCOST( 3, 2,-3 ); |
+ ADD_MVCOST( 4,-4,-2 ); |
+ ADD_MVCOST( 5, 4,-2 ); |
+ ADD_MVCOST( 6,-4,-1 ); |
+ ADD_MVCOST( 7, 4,-1 ); |
+ ADD_MVCOST( 8,-4, 0 ); |
+ ADD_MVCOST( 9, 4, 0 ); |
+ ADD_MVCOST( 10,-4, 1 ); |
+ ADD_MVCOST( 11, 4, 1 ); |
+ ADD_MVCOST( 12,-4, 2 ); |
+ ADD_MVCOST( 13, 4, 2 ); |
+ ADD_MVCOST( 14,-2, 3 ); |
+ ADD_MVCOST( 15, 2, 3 ); |
+ MIN_MV( 0, 0,-4 ); |
+ MIN_MV( 1, 0, 4 ); |
+ MIN_MV( 2,-2,-3 ); |
+ MIN_MV( 3, 2,-3 ); |
+ MIN_MV( 4,-4,-2 ); |
+ MIN_MV( 5, 4,-2 ); |
+ MIN_MV( 6,-4,-1 ); |
+ MIN_MV( 7, 4,-1 ); |
+ MIN_MV( 8,-4, 0 ); |
+ MIN_MV( 9, 4, 0 ); |
+ MIN_MV( 10,-4, 1 ); |
+ MIN_MV( 11, 4, 1 ); |
+ MIN_MV( 12,-4, 2 ); |
+ MIN_MV( 13, 4, 2 ); |
+ MIN_MV( 14,-2, 3 ); |
+ MIN_MV( 15, 2, 3 ); |
#undef SADS |
#undef ADD_MVCOST |
#undef MIN_MV |
- if(dir) |
- { |
- bmx = omx + i*(dir>>4); |
- bmy = omy + i*((dir<<28)>>28); |
- } |
- } |
- } while( ++i <= i_me_range/4 ); |
- if( bmy <= mv_y_max && bmy >= mv_y_min ) |
- goto me_hex2; |
- break; |
- } |
+ if(dir) |
+ { |
+ bmx = omx + i*(dir>>4); |
+ bmy = omy + i*((dir<<28)>>28); |
+ } |
+ } |
+ } while( ++i <= i_me_range/4 ); |
+ if( bmy <= mv_y_max && bmy >= mv_y_min ) |
+ goto me_hex2; |
+ break; |
+ } |
|
case X264_ME_ESA: |
case X264_ME_TESA: |
- { |
- const int min_x = X264_MAX( bmx - i_me_range, mv_x_min ); |
- const int min_y = X264_MAX( bmy - i_me_range, mv_y_min ); |
- const int max_x = X264_MIN( bmx + i_me_range, mv_x_max ); |
- const int max_y = X264_MIN( bmy + i_me_range, mv_y_max ); |
- /* SEA is fastest in multiples of 4 */ |
- const int width = (max_x - min_x + 3) & ~3; |
- int my; |
+ { |
+ const int min_x = X264_MAX( bmx - i_me_range, mv_x_min ); |
+ const int min_y = X264_MAX( bmy - i_me_range, mv_y_min ); |
+ const int max_x = X264_MIN( bmx + i_me_range, mv_x_max ); |
+ const int max_y = X264_MIN( bmy + i_me_range, mv_y_max ); |
+ /* SEA is fastest in multiples of 4 */ |
+ const int width = (max_x - min_x + 3) & ~3; |
+ int my; |
#if 0 |
- /* plain old exhaustive search */ |
- int mx; |
- for( my = min_y; my <= max_y; my++ ) |
- for( mx = min_x; mx <= max_x; mx++ ) |
- COST_MV( mx, my ); |
+ /* plain old exhaustive search */ |
+ int mx; |
+ for( my = min_y; my <= max_y; my++ ) |
+ for( mx = min_x; mx <= max_x; mx++ ) |
+ COST_MV( mx, my ); |
#else |
- /* successive elimination by comparing DC before a full SAD, |
- * because sum(abs(diff)) >= abs(diff(sum)). */ |
- uint16_t *sums_base = m->integral; |
- /* due to a GCC bug on some platforms (win32?), zero[] may not actually be aligned. |
- * this is not a problem because it is not used for any SSE instructions. */ |
- ALIGNED_16( static uint8_t zero[8*FENC_STRIDE] ); |
- ALIGNED_ARRAY_16( int, enc_dc,[4] ); |
- int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4; |
- int delta = x264_pixel_size[sad_size].w; |
- int16_t *xs = h->scratch_buffer; |
- int xn; |
- uint16_t *cost_fpel_mvx = h->cost_mv_fpel[x264_lambda_tab[h->mb.i_qp]][-m->mvp[0]&3] + (-m->mvp[0]>>2); |
- |
- h->pixf.sad_x4[sad_size]( zero, p_fenc, p_fenc+delta, |
- p_fenc+delta*FENC_STRIDE, p_fenc+delta+delta*FENC_STRIDE, |
- FENC_STRIDE, enc_dc ); |
- if( delta == 4 ) |
- sums_base += stride * (h->fenc->i_lines[0] + PADV*2); |
- if( i_pixel == PIXEL_16x16 || i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 ) |
- delta *= stride; |
- if( i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 ) |
- enc_dc[1] = enc_dc[2]; |
- |
- if( h->mb.i_me_method == X264_ME_TESA ) |
- { |
- // ADS threshold, then SAD threshold, then keep the best few SADs, then SATD |
- mvsad_t *mvsads = (mvsad_t *)(xs + ((width+15)&~15)); |
- int nmvsad = 0, limit; |
- int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 11 : 12; |
- int bsad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref+bmy*stride+bmx, stride ) |
- + BITS_MVD( bmx, bmy ); |
- for( my = min_y; my <= max_y; my++ ) |
- { |
- int ycost = p_cost_mvy[my<<2]; |
- if( bsad <= ycost ) |
- continue; |
- bsad -= ycost; |
- xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta, |
- cost_fpel_mvx+min_x, xs, width, bsad*17/16 ); |
- for( i=0; i<xn-2; i+=3 ) |
- { |
- uint8_t *ref = p_fref+min_x+my*stride; |
- int sads[3]; |
- h->pixf.sad_x3[i_pixel]( p_fenc, ref+xs[i], ref+xs[i+1], ref+xs[i+2], stride, sads ); |
- for( j=0; j<3; j++ ) |
- { |
- int sad = sads[j] + cost_fpel_mvx[xs[i+j]]; |
- if( sad < bsad*sad_thresh>>3 ) |
- { |
- COPY1_IF_LT( bsad, sad ); |
- mvsads[nmvsad].sad = sad + ycost; |
- mvsads[nmvsad].mx = min_x+xs[i+j]; |
- mvsads[nmvsad].my = my; |
- nmvsad++; |
- } |
- } |
- } |
- for( ; i<xn; i++ ) |
- { |
- int mx = min_x+xs[i]; |
- int sad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref+mx+my*stride, stride ) |
- + cost_fpel_mvx[xs[i]]; |
- if( sad < bsad*sad_thresh>>3 ) |
- { |
- COPY1_IF_LT( bsad, sad ); |
- mvsads[nmvsad].sad = sad + ycost; |
- mvsads[nmvsad].mx = mx; |
- mvsads[nmvsad].my = my; |
- nmvsad++; |
- } |
- } |
- bsad += ycost; |
- } |
- |
- limit = i_me_range / 2; |
- sad_thresh = bsad*sad_thresh>>3; |
- while( nmvsad > limit*2 && sad_thresh > bsad ) |
- { |
- // halve the range if the domain is too large... eh, close enough |
- sad_thresh = (sad_thresh + bsad) >> 1; |
- for( i=0; i<nmvsad && mvsads[i].sad <= sad_thresh; i++ ); |
- for( j=i; j<nmvsad; j++ ) |
- { |
- /* mvsad_t is not guaranteed to be 8 bytes on all archs, so check before using explicit write-combining */ |
- if( sizeof( mvsad_t ) == sizeof( uint64_t ) ) |
- *(uint64_t*)&mvsads[i] = *(uint64_t*)&mvsads[j]; |
- else |
- mvsads[i] = mvsads[j]; |
- i += mvsads[j].sad <= sad_thresh; |
- } |
- nmvsad = i; |
- } |
- while( nmvsad > limit ) |
- { |
- int bsad = mvsads[0].sad; |
- int bi = 0; |
- for( i=1; i<nmvsad; i++ ) |
- COPY2_IF_GT( bsad, mvsads[i].sad, bi, i ); |
- nmvsad--; |
- mvsads[bi] = mvsads[nmvsad]; |
- if( sizeof( mvsad_t ) == sizeof( uint64_t ) ) |
- *(uint64_t*)&mvsads[bi] = *(uint64_t*)&mvsads[nmvsad]; |
- else |
- mvsads[bi] = mvsads[nmvsad]; |
- } |
- for( i=0; i<nmvsad; i++ ) |
- COST_MV( mvsads[i].mx, mvsads[i].my ); |
- } |
- else |
- { |
- // just ADS and SAD |
- for( my = min_y; my <= max_y; my++ ) |
- { |
- int ycost = p_cost_mvy[my<<2]; |
- if( bcost <= ycost ) |
- continue; |
- bcost -= ycost; |
- xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta, |
- cost_fpel_mvx+min_x, xs, width, bcost ); |
- for( i=0; i<xn-2; i+=3 ) |
- COST_MV_X3_ABS( min_x+xs[i],my, min_x+xs[i+1],my, min_x+xs[i+2],my ); |
- bcost += ycost; |
- for( ; i<xn; i++ ) |
- COST_MV( min_x+xs[i], my ); |
- } |
- } |
+ /* successive elimination by comparing DC before a full SAD, |
+ * because sum(abs(diff)) >= abs(diff(sum)). */ |
+ uint16_t *sums_base = m->integral; |
+ /* due to a GCC bug on some platforms (win32?), zero[] may not actually be aligned. |
+ * this is not a problem because it is not used for any SSE instructions. */ |
+ ALIGNED_16( static uint8_t zero[8*FENC_STRIDE] ); |
+ ALIGNED_ARRAY_16( int, enc_dc,[4] ); |
+ int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4; |
+ int delta = x264_pixel_size[sad_size].w; |
+ int16_t *xs = h->scratch_buffer; |
+ int xn; |
+ uint16_t *cost_fpel_mvx = h->cost_mv_fpel[x264_lambda_tab[h->mb.i_qp]][-m->mvp[0]&3] + (-m->mvp[0]>>2); |
+ |
+ h->pixf.sad_x4[sad_size]( zero, p_fenc, p_fenc+delta, |
+ p_fenc+delta*FENC_STRIDE, p_fenc+delta+delta*FENC_STRIDE, |
+ FENC_STRIDE, enc_dc ); |
+ if( delta == 4 ) |
+ sums_base += stride * (h->fenc->i_lines[0] + PADV*2); |
+ if( i_pixel == PIXEL_16x16 || i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 ) |
+ delta *= stride; |
+ if( i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 ) |
+ enc_dc[1] = enc_dc[2]; |
+ |
+ if( h->mb.i_me_method == X264_ME_TESA ) |
+ { |
+ // ADS threshold, then SAD threshold, then keep the best few SADs, then SATD |
+ mvsad_t *mvsads = (mvsad_t *)(xs + ((width+15)&~15)); |
+ int nmvsad = 0, limit; |
+ int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 11 : 12; |
+ int bsad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref+bmy*stride+bmx, stride ) |
+ + BITS_MVD( bmx, bmy ); |
+ for( my = min_y; my <= max_y; my++ ) |
+ { |
+ int ycost = p_cost_mvy[my<<2]; |
+ if( bsad <= ycost ) |
+ continue; |
+ bsad -= ycost; |
+ xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta, |
+ cost_fpel_mvx+min_x, xs, width, bsad*17/16 ); |
+ for( i=0; i<xn-2; i+=3 ) |
+ { |
+ uint8_t *ref = p_fref+min_x+my*stride; |
+ int sads[3]; |
+ h->pixf.sad_x3[i_pixel]( p_fenc, ref+xs[i], ref+xs[i+1], ref+xs[i+2], stride, sads ); |
+ for( j=0; j<3; j++ ) |
+ { |
+ int sad = sads[j] + cost_fpel_mvx[xs[i+j]]; |
+ if( sad < bsad*sad_thresh>>3 ) |
+ { |
+ COPY1_IF_LT( bsad, sad ); |
+ mvsads[nmvsad].sad = sad + ycost; |
+ mvsads[nmvsad].mx = min_x+xs[i+j]; |
+ mvsads[nmvsad].my = my; |
+ nmvsad++; |
+ } |
+ } |
+ } |
+ for( ; i<xn; i++ ) |
+ { |
+ int mx = min_x+xs[i]; |
+ int sad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref+mx+my*stride, stride ) |
+ + cost_fpel_mvx[xs[i]]; |
+ if( sad < bsad*sad_thresh>>3 ) |
+ { |
+ COPY1_IF_LT( bsad, sad ); |
+ mvsads[nmvsad].sad = sad + ycost; |
+ mvsads[nmvsad].mx = mx; |
+ mvsads[nmvsad].my = my; |
+ nmvsad++; |
+ } |
+ } |
+ bsad += ycost; |
+ } |
+ |
+ limit = i_me_range / 2; |
+ sad_thresh = bsad*sad_thresh>>3; |
+ while( nmvsad > limit*2 && sad_thresh > bsad ) |
+ { |
+ // halve the range if the domain is too large... eh, close enough |
+ sad_thresh = (sad_thresh + bsad) >> 1; |
+ for( i=0; i<nmvsad && mvsads[i].sad <= sad_thresh; i++ ); |
+ for( j=i; j<nmvsad; j++ ) |
+ { |
+ /* mvsad_t is not guaranteed to be 8 bytes on all archs, so check before using explicit write-combining */ |
+ if( sizeof( mvsad_t ) == sizeof( uint64_t ) ) |
+ *(uint64_t*)&mvsads[i] = *(uint64_t*)&mvsads[j]; |
+ else |
+ mvsads[i] = mvsads[j]; |
+ i += mvsads[j].sad <= sad_thresh; |
+ } |
+ nmvsad = i; |
+ } |
+ while( nmvsad > limit ) |
+ { |
+ int bsad = mvsads[0].sad; |
+ int bi = 0; |
+ for( i=1; i<nmvsad; i++ ) |
+ COPY2_IF_GT( bsad, mvsads[i].sad, bi, i ); |
+ nmvsad--; |
+ mvsads[bi] = mvsads[nmvsad]; |
+ if( sizeof( mvsad_t ) == sizeof( uint64_t ) ) |
+ *(uint64_t*)&mvsads[bi] = *(uint64_t*)&mvsads[nmvsad]; |
+ else |
+ mvsads[bi] = mvsads[nmvsad]; |
+ } |
+ for( i=0; i<nmvsad; i++ ) |
+ COST_MV( mvsads[i].mx, mvsads[i].my ); |
+ } |
+ else |
+ { |
+ // just ADS and SAD |
+ for( my = min_y; my <= max_y; my++ ) |
+ { |
+ int ycost = p_cost_mvy[my<<2]; |
+ if( bcost <= ycost ) |
+ continue; |
+ bcost -= ycost; |
+ xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta, |
+ cost_fpel_mvx+min_x, xs, width, bcost ); |
+ for( i=0; i<xn-2; i+=3 ) |
+ COST_MV_X3_ABS( min_x+xs[i],my, min_x+xs[i+1],my, min_x+xs[i+2],my ); |
+ bcost += ycost; |
+ for( ; i<xn; i++ ) |
+ COST_MV( min_x+xs[i], my ); |
+ } |
+ } |
#endif |
- } |
- break; |
+ } |
+ break; |
} |
|
- /* -> qpel mv */ |
- if( bpred_cost < bcost ) |
+ /* -> qpel mv */ |
+ if( bpred_cost < bcost ) |
{ |
- m->mv[0] = bpred_mx; |
- m->mv[1] = bpred_my; |
- m->cost = bpred_cost; |
+ m->mv[0] = bpred_mx; |
+ m->mv[1] = bpred_my; |
+ m->cost = bpred_cost; |
} |
- else |
+ else |
{ |
- m->mv[0] = bmx << 2; |
- m->mv[1] = bmy << 2; |
- m->cost = bcost; |
+ m->mv[0] = bmx << 2; |
+ m->mv[1] = bmy << 2; |
+ m->cost = bcost; |
} |
|
- /* compute the real cost */ |
- m->cost_mv = p_cost_mvx[ m->mv[0] ] + p_cost_mvy[ m->mv[1] ]; |
- if( bmx == pmx && bmy == pmy && h->mb.i_subpel_refine < 3 ) |
- m->cost += m->cost_mv; |
+ /* compute the real cost */ |
+ m->cost_mv = p_cost_mvx[ m->mv[0] ] + p_cost_mvy[ m->mv[1] ]; |
+ if( bmx == pmx && bmy == pmy && h->mb.i_subpel_refine < 3 ) |
+ m->cost += m->cost_mv; |
|
- /* subpel refine */ |
- if( h->mb.i_subpel_refine >= 2 ) |
+ /* subpel refine */ |
+ if( h->mb.i_subpel_refine >= 2 ) |
{ |
- int hpel = subpel_iterations[h->mb.i_subpel_refine][2]; |
- int qpel = subpel_iterations[h->mb.i_subpel_refine][3]; |
- refine_subpel( h, m, hpel, qpel, p_halfpel_thresh, 0 ); |
+ int hpel = subpel_iterations[h->mb.i_subpel_refine][2]; |
+ int qpel = subpel_iterations[h->mb.i_subpel_refine][3]; |
+ refine_subpel( h, m, hpel, qpel, p_halfpel_thresh, 0 ); |
} |
} |
#undef COST_MV |
|
void x264_me_refine_qpel( x264_t *h, x264_me_t *m ) |
{ |
- int hpel = subpel_iterations[h->mb.i_subpel_refine][0]; |
- int qpel = subpel_iterations[h->mb.i_subpel_refine][1]; |
+ int hpel = subpel_iterations[h->mb.i_subpel_refine][0]; |
+ int qpel = subpel_iterations[h->mb.i_subpel_refine][1]; |
|
- if( m->i_pixel <= PIXEL_8x8 && h->sh.i_type == SLICE_TYPE_P ) |
- m->cost -= m->i_ref_cost; |
+ if( m->i_pixel <= PIXEL_8x8 && h->sh.i_type == SLICE_TYPE_P ) |
+ m->cost -= m->i_ref_cost; |
|
- refine_subpel( h, m, hpel, qpel, NULL, 1 ); |
+ refine_subpel( h, m, hpel, qpel, NULL, 1 ); |
} |
|
-#define COST_MV_SAD( mx, my ) \ |
-{ \ |
- int stride = 16; \ |
+#define COST_MV_SAD( mx, my ) \ |
+ { \ |
+ int stride = 16; \ |
uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \ |
int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \ |
- + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \ |
- COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my ); \ |
-} |
- |
-#define COST_MV_SATD( mx, my, dir ) \ |
-if( b_refine_qpel || (dir^1) != odir ) \ |
-{ \ |
- int stride = 16; \ |
- uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \ |
- int cost = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \ |
- + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \ |
- if( b_chroma_me && cost < bcost ) \ |
- { \ |
- h->mc.mc_chroma( pix[0], 8, m->p_fref[4], m->i_stride[1], mx, my, bw/2, bh/2 ); \ |
- cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], FENC_STRIDE, pix[0], 8 ); \ |
- if( cost < bcost ) \ |
- { \ |
- h->mc.mc_chroma( pix[0], 8, m->p_fref[5], m->i_stride[1], mx, my, bw/2, bh/2 ); \ |
- cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix[0], 8 ); \ |
- } \ |
- } \ |
- if( cost < bcost ) \ |
- { \ |
- bcost = cost; \ |
- bmx = mx; \ |
- bmy = my; \ |
- bdir = dir; \ |
- } \ |
-} |
+ + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \ |
+ COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my ); \ |
+ } |
+ |
+#define COST_MV_SATD( mx, my, dir ) \ |
+ if( b_refine_qpel || (dir^1) != odir ) \ |
+ { \ |
+ int stride = 16; \ |
+ uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \ |
+ int cost = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \ |
+ + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \ |
+ if( b_chroma_me && cost < bcost ) \ |
+ { \ |
+ h->mc.mc_chroma( pix[0], 8, m->p_fref[4], m->i_stride[1], mx, my, bw/2, bh/2 ); \ |
+ cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], FENC_STRIDE, pix[0], 8 ); \ |
+ if( cost < bcost ) \ |
+ { \ |
+ h->mc.mc_chroma( pix[0], 8, m->p_fref[5], m->i_stride[1], mx, my, bw/2, bh/2 ); \ |
+ cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix[0], 8 ); \ |
+ } \ |
+ } \ |
+ if( cost < bcost ) \ |
+ { \ |
+ bcost = cost; \ |
+ bmx = mx; \ |
+ bmy = my; \ |
+ bdir = dir; \ |
+ } \ |
+ } |
|
static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel ) |
{ |
- const int bw = x264_pixel_size[m->i_pixel].w; |
- const int bh = x264_pixel_size[m->i_pixel].h; |
- const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0]; |
- const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1]; |
- const int i_pixel = m->i_pixel; |
- const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8; |
- |
- ALIGNED_ARRAY_16( uint8_t, pix,[2],[32*18] ); // really 17x17, but round up for alignment |
- int omx, omy; |
- int i; |
- |
- int bmx = m->mv[0]; |
- int bmy = m->mv[1]; |
- int bcost = m->cost; |
- int odir = -1, bdir; |
+ const int bw = x264_pixel_size[m->i_pixel].w; |
+ const int bh = x264_pixel_size[m->i_pixel].h; |
+ const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0]; |
+ const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1]; |
+ const int i_pixel = m->i_pixel; |
+ const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8; |
+ |
+ ALIGNED_ARRAY_16( uint8_t, pix,[2],[32*18] ); // really 17x17, but round up for alignment |
+ int omx, omy; |
+ int i; |
+ |
+ int bmx = m->mv[0]; |
+ int bmy = m->mv[1]; |
+ int bcost = m->cost; |
+ int odir = -1, bdir; |
|
- /* try the subpel component of the predicted mv */ |
- if( hpel_iters && h->mb.i_subpel_refine < 3 ) |
+ /* try the subpel component of the predicted mv */ |
+ if( hpel_iters && h->mb.i_subpel_refine < 3 ) |
{ |
- int mx = x264_clip3( m->mvp[0], h->mb.mv_min_spel[0]+2, h->mb.mv_max_spel[0]-2 ); |
- int my = x264_clip3( m->mvp[1], h->mb.mv_min_spel[1]+2, h->mb.mv_max_spel[1]-2 ); |
- if( (mx-bmx)|(my-bmy) ) |
- COST_MV_SAD( mx, my ); |
+ int mx = x264_clip3( m->mvp[0], h->mb.mv_min_spel[0]+2, h->mb.mv_max_spel[0]-2 ); |
+ int my = x264_clip3( m->mvp[1], h->mb.mv_min_spel[1]+2, h->mb.mv_max_spel[1]-2 ); |
+ if( (mx-bmx)|(my-bmy) ) |
+ COST_MV_SAD( mx, my ); |
} |
|
- /* halfpel diamond search */ |
- for( i = hpel_iters; i > 0; i-- ) |
+ /* halfpel diamond search */ |
+ for( i = hpel_iters; i > 0; i-- ) |
{ |
- int omx = bmx, omy = bmy; |
- int costs[4]; |
- int stride = 32; // candidates are either all hpel or all qpel, so one stride is enough |
- uint8_t *src0, *src1, *src2, *src3; |
- src0 = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], omx, omy-2, bw, bh+1 ); |
- src2 = h->mc.get_ref( pix[1], &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh ); |
- src1 = src0 + stride; |
- src3 = src2 + 1; |
- h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs ); |
- COPY2_IF_LT( bcost, costs[0] + p_cost_mvx[omx ] + p_cost_mvy[omy-2], bmy, omy-2 ); |
- COPY2_IF_LT( bcost, costs[1] + p_cost_mvx[omx ] + p_cost_mvy[omy+2], bmy, omy+2 ); |
- COPY3_IF_LT( bcost, costs[2] + p_cost_mvx[omx-2] + p_cost_mvy[omy ], bmx, omx-2, bmy, omy ); |
- COPY3_IF_LT( bcost, costs[3] + p_cost_mvx[omx+2] + p_cost_mvy[omy ], bmx, omx+2, bmy, omy ); |
- if( (bmx == omx) & (bmy == omy) ) |
- break; |
+ int omx = bmx, omy = bmy; |
+ int costs[4]; |
+ int stride = 32; // candidates are either all hpel or all qpel, so one stride is enough |
+ uint8_t *src0, *src1, *src2, *src3; |
+ src0 = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], omx, omy-2, bw, bh+1 ); |
+ src2 = h->mc.get_ref( pix[1], &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh ); |
+ src1 = src0 + stride; |
+ src3 = src2 + 1; |
+ h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs ); |
+ COPY2_IF_LT( bcost, costs[0] + p_cost_mvx[omx ] + p_cost_mvy[omy-2], bmy, omy-2 ); |
+ COPY2_IF_LT( bcost, costs[1] + p_cost_mvx[omx ] + p_cost_mvy[omy+2], bmy, omy+2 ); |
+ COPY3_IF_LT( bcost, costs[2] + p_cost_mvx[omx-2] + p_cost_mvy[omy ], bmx, omx-2, bmy, omy ); |
+ COPY3_IF_LT( bcost, costs[3] + p_cost_mvx[omx+2] + p_cost_mvy[omy ], bmx, omx+2, bmy, omy ); |
+ if( (bmx == omx) & (bmy == omy) ) |
+ break; |
} |
|
- if( !b_refine_qpel ) |
+ if( !b_refine_qpel ) |
{ |
- bcost = COST_MAX; |
- COST_MV_SATD( bmx, bmy, -1 ); |
+ bcost = COST_MAX; |
+ COST_MV_SATD( bmx, bmy, -1 ); |
} |
|
- /* early termination when examining multiple reference frames */ |
- if( p_halfpel_thresh ) |
+ /* early termination when examining multiple reference frames */ |
+ if( p_halfpel_thresh ) |
{ |
- if( (bcost*7)>>3 > *p_halfpel_thresh ) |
+ if( (bcost*7)>>3 > *p_halfpel_thresh ) |
{ |
- m->cost = bcost; |
- m->mv[0] = bmx; |
- m->mv[1] = bmy; |
- // don't need cost_mv |
- return; |
+ m->cost = bcost; |
+ m->mv[0] = bmx; |
+ m->mv[1] = bmy; |
+ // don't need cost_mv |
+ return; |
} |
- else if( bcost < *p_halfpel_thresh ) |
- *p_halfpel_thresh = bcost; |
+ else if( bcost < *p_halfpel_thresh ) |
+ *p_halfpel_thresh = bcost; |
} |
|
- /* quarterpel diamond search */ |
- bdir = -1; |
- for( i = qpel_iters; i > 0; i-- ) |
+ /* quarterpel diamond search */ |
+ bdir = -1; |
+ for( i = qpel_iters; i > 0; i-- ) |
{ |
- if( bmy <= h->mb.mv_min_spel[1] || bmy >= h->mb.mv_max_spel[1] ) |
- break; |
- odir = bdir; |
- omx = bmx; |
- omy = bmy; |
- COST_MV_SATD( omx, omy - 1, 0 ); |
- COST_MV_SATD( omx, omy + 1, 1 ); |
- COST_MV_SATD( omx - 1, omy, 2 ); |
- COST_MV_SATD( omx + 1, omy, 3 ); |
- if( bmx == omx && bmy == omy ) |
- break; |
+ if( bmy <= h->mb.mv_min_spel[1] || bmy >= h->mb.mv_max_spel[1] ) |
+ break; |
+ odir = bdir; |
+ omx = bmx; |
+ omy = bmy; |
+ COST_MV_SATD( omx, omy - 1, 0 ); |
+ COST_MV_SATD( omx, omy + 1, 1 ); |
+ COST_MV_SATD( omx - 1, omy, 2 ); |
+ COST_MV_SATD( omx + 1, omy, 3 ); |
+ if( bmx == omx && bmy == omy ) |
+ break; |
} |
|
- m->cost = bcost; |
- m->mv[0] = bmx; |
- m->mv[1] = bmy; |
- m->cost_mv = p_cost_mvx[ bmx ] + p_cost_mvy[ bmy ]; |
+ m->cost = bcost; |
+ m->mv[0] = bmx; |
+ m->mv[1] = bmy; |
+ m->cost_mv = p_cost_mvx[ bmx ] + p_cost_mvy[ bmy ]; |
} |
|
-#define BIME_CACHE( dx, dy, list ) \ |
-{ \ |
- x264_me_t *m = m##list;\ |
- int i = 4 + 3*dx + dy; \ |
- int mvx = om##list##x+dx;\ |
- int mvy = om##list##y+dy;\ |
- stride##list[i] = bw;\ |
+#define BIME_CACHE( dx, dy, list ) \ |
+ { \ |
+ x264_me_t *m = m##list; \ |
+ int i = 4 + 3*dx + dy; \ |
+ int mvx = om##list##x+dx; \ |
+ int mvy = om##list##y+dy; \ |
+ stride##list[i] = bw; \ |
src##list[i] = h->mc.get_ref( pixy_buf[list][i], &stride##list[i], m->p_fref, m->i_stride[0], mvx, mvy, bw, bh ); \ |
- if( rd )\ |
- {\ |
- if( h->mb.b_interlaced & ref##list )\ |
- mvy += (h->mb.i_mb_y & 1)*4 - 2;\ |
- h->mc.mc_chroma( pixu_buf[list][i], 8, m->p_fref[4], m->i_stride[1], mvx, mvy, bw>>1, bh>>1 );\ |
- h->mc.mc_chroma( pixv_buf[list][i], 8, m->p_fref[5], m->i_stride[1], mvx, mvy, bw>>1, bh>>1 );\ |
- }\ |
-} |
- |
-#define BIME_CACHE2(a,b,list) \ |
- BIME_CACHE(a,b,list) \ |
- BIME_CACHE(-(a),-(b),list) |
- |
-#define BIME_CACHE8(list) \ |
-{\ |
- BIME_CACHE2( 1, 0, list );\ |
- BIME_CACHE2( 0, 1, list );\ |
- BIME_CACHE2( 1, 1, list );\ |
- BIME_CACHE2( 1,-1, list );\ |
-} |
+ if( rd ) \ |
+ { \ |
+ if( h->mb.b_interlaced & ref##list ) \ |
+ mvy += (h->mb.i_mb_y & 1)*4 - 2; \ |
+ h->mc.mc_chroma( pixu_buf[list][i], 8, m->p_fref[4], m->i_stride[1], mvx, mvy, bw>>1, bh>>1 ); \ |
+ h->mc.mc_chroma( pixv_buf[list][i], 8, m->p_fref[5], m->i_stride[1], mvx, mvy, bw>>1, bh>>1 ); \ |
+ } \ |
+ } |
+ |
+#define BIME_CACHE2(a,b,list) \ |
+ BIME_CACHE(a,b,list) \ |
+ BIME_CACHE(-(a),-(b),list) |
+ |
+#define BIME_CACHE8(list) \ |
+ { \ |
+ BIME_CACHE2( 1, 0, list ); \ |
+ BIME_CACHE2( 0, 1, list ); \ |
+ BIME_CACHE2( 1, 1, list ); \ |
+ BIME_CACHE2( 1,-1, list ); \ |
+ } |
|
#define SATD_THRESH 17/16 |
|
-#define COST_BIMV_SATD( m0x, m0y, m1x, m1y ) \ |
-if( pass == 0 || !((visited[(m0x)&7][(m0y)&7][(m1x)&7] & (1<<((m1y)&7)))) ) \ |
-{ \ |
- int cost; \ |
- int i0 = 4 + 3*(m0x-om0x) + (m0y-om0y); \ |
- int i1 = 4 + 3*(m1x-om1x) + (m1y-om1y); \ |
- visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7));\ |
- h->mc.avg[i_pixel]( pix, FDEC_STRIDE, src0[i0], stride0[i0], src1[i1], stride1[i1], i_weight ); \ |
- cost = h->pixf.mbcmp[i_pixel]( m0->p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE ) \ |
- + p_cost_m0x[ m0x ] + p_cost_m0y[ m0y ] \ |
- + p_cost_m1x[ m1x ] + p_cost_m1y[ m1y ]; \ |
- if( rd ) \ |
- { \ |
- if( cost < bcost * SATD_THRESH ) \ |
- { \ |
- uint64_t costrd; \ |
- if( cost < bcost ) \ |
- bcost = cost; \ |
- *(uint32_t*)cache0_mv = *(uint32_t*)cache0_mv2 = pack16to32_mask(m0x,m0y); \ |
- *(uint32_t*)cache1_mv = *(uint32_t*)cache1_mv2 = pack16to32_mask(m1x,m1y); \ |
- h->mc.avg[i_pixel+3]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight );\ |
- h->mc.avg[i_pixel+3]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight );\ |
- costrd = x264_rd_cost_part( h, i_lambda2, i8*4, m0->i_pixel ); \ |
- if( costrd < bcostrd ) \ |
- {\ |
- bcostrd = costrd;\ |
- bm0x = m0x; \ |
- bm0y = m0y; \ |
- bm1x = m1x; \ |
- bm1y = m1y; \ |
- }\ |
- } \ |
- } \ |
- else if( cost < bcost ) \ |
- { \ |
- bcost = cost; \ |
- bm0x = m0x; \ |
- bm0y = m0y; \ |
- bm1x = m1x; \ |
- bm1y = m1y; \ |
- } \ |
-} |
+#define COST_BIMV_SATD( m0x, m0y, m1x, m1y ) \ |
+ if( pass == 0 || !((visited[(m0x)&7][(m0y)&7][(m1x)&7] & (1<<((m1y)&7)))) ) \ |
+ { \ |
+ int cost; \ |
+ int i0 = 4 + 3*(m0x-om0x) + (m0y-om0y); \ |
+ int i1 = 4 + 3*(m1x-om1x) + (m1y-om1y); \ |
+ visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7)); \ |
+ h->mc.avg[i_pixel]( pix, FDEC_STRIDE, src0[i0], stride0[i0], src1[i1], stride1[i1], i_weight ); \ |
+ cost = h->pixf.mbcmp[i_pixel]( m0->p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE ) \ |
+ + p_cost_m0x[ m0x ] + p_cost_m0y[ m0y ] \ |
+ + p_cost_m1x[ m1x ] + p_cost_m1y[ m1y ]; \ |
+ if( rd ) \ |
+ { \ |
+ if( cost < bcost * SATD_THRESH ) \ |
+ { \ |
+ uint64_t costrd; \ |
+ if( cost < bcost ) \ |
+ bcost = cost; \ |
+ *(uint32_t*)cache0_mv = *(uint32_t*)cache0_mv2 = pack16to32_mask(m0x,m0y); \ |
+ *(uint32_t*)cache1_mv = *(uint32_t*)cache1_mv2 = pack16to32_mask(m1x,m1y); \ |
+ h->mc.avg[i_pixel+3]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight ); \ |
+ h->mc.avg[i_pixel+3]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight ); \ |
+ costrd = x264_rd_cost_part( h, i_lambda2, i8*4, m0->i_pixel ); \ |
+ if( costrd < bcostrd ) \ |
+ { \ |
+ bcostrd = costrd; \ |
+ bm0x = m0x; \ |
+ bm0y = m0y; \ |
+ bm1x = m1x; \ |
+ bm1y = m1y; \ |
+ } \ |
+ } \ |
+ } \ |
+ else if( cost < bcost ) \ |
+ { \ |
+ bcost = cost; \ |
+ bm0x = m0x; \ |
+ bm0y = m0y; \ |
+ bm1x = m1x; \ |
+ bm1y = m1y; \ |
+ } \ |
+ } |
|
-#define CHECK_BIDIR(a,b,c,d) \ |
- COST_BIMV_SATD(om0x+a, om0y+b, om1x+c, om1y+d) |
+#define CHECK_BIDIR(a,b,c,d) \ |
+ COST_BIMV_SATD(om0x+a, om0y+b, om1x+c, om1y+d) |
|
static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2, int rd ) |
{ |
- static const int pixel_mv_offs[] = { 0, 4, 4*8, 0 }; |
- int16_t *cache0_mv = h->mb.cache.mv[0][x264_scan8[i8*4]]; |
- int16_t *cache0_mv2 = cache0_mv + pixel_mv_offs[m0->i_pixel]; |
- int16_t *cache1_mv = h->mb.cache.mv[1][x264_scan8[i8*4]]; |
- int16_t *cache1_mv2 = cache1_mv + pixel_mv_offs[m0->i_pixel]; |
- const int i_pixel = m0->i_pixel; |
- const int bw = x264_pixel_size[i_pixel].w; |
- const int bh = x264_pixel_size[i_pixel].h; |
- const uint16_t *p_cost_m0x = m0->p_cost_mv - m0->mvp[0]; |
- const uint16_t *p_cost_m0y = m0->p_cost_mv - m0->mvp[1]; |
- const uint16_t *p_cost_m1x = m1->p_cost_mv - m1->mvp[0]; |
- const uint16_t *p_cost_m1y = m1->p_cost_mv - m1->mvp[1]; |
- ALIGNED_ARRAY_16( uint8_t, pixy_buf,[2],[9][16*16] ); |
- ALIGNED_8( uint8_t pixu_buf[2][9][8*8] ); |
- ALIGNED_8( uint8_t pixv_buf[2][9][8*8] ); |
- uint8_t *src0[9]; |
- uint8_t *src1[9]; |
- uint8_t *pix = &h->mb.pic.p_fdec[0][(i8>>1)*8*FDEC_STRIDE+(i8&1)*8]; |
- uint8_t *pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4]; |
- uint8_t *pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4]; |
- int ref0 = h->mb.cache.ref[0][x264_scan8[i8*4]]; |
- int ref1 = h->mb.cache.ref[1][x264_scan8[i8*4]]; |
- int stride0[9]; |
- int stride1[9]; |
- int bm0x = m0->mv[0], om0x = bm0x; |
- int bm0y = m0->mv[1], om0y = bm0y; |
- int bm1x = m1->mv[0], om1x = bm1x; |
- int bm1y = m1->mv[1], om1y = bm1y; |
- int bcost = COST_MAX; |
- int pass = 0; |
- int j; |
- int mc_list0 = 1, mc_list1 = 1; |
- uint64_t bcostrd = COST_MAX64; |
- /* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */ |
- ALIGNED_ARRAY_16( uint8_t, visited,[8],[8][8] ); |
- /* all permutations of an offset in up to 2 of the dimensions */ |
- static const int8_t dia4d[32][4] = { |
- {0,0,0,1}, {0,0,0,-1}, {0,0,1,0}, {0,0,-1,0}, |
- {0,1,0,0}, {0,-1,0,0}, {1,0,0,0}, {-1,0,0,0}, |
- {0,0,1,1}, {0,0,-1,-1},{0,1,1,0}, {0,-1,-1,0}, |
- {1,1,0,0}, {-1,-1,0,0},{1,0,0,1}, {-1,0,0,-1}, |
- {0,1,0,1}, {0,-1,0,-1},{1,0,1,0}, {-1,0,-1,0}, |
- {0,0,-1,1},{0,0,1,-1}, {0,-1,1,0},{0,1,-1,0}, |
- {-1,1,0,0},{1,-1,0,0}, {1,0,0,-1},{-1,0,0,1}, |
- {0,-1,0,1},{0,1,0,-1}, {-1,0,1,0},{1,0,-1,0}, |
- }; |
- |
- if( bm0y < h->mb.mv_min_spel[1] + 8 || bm1y < h->mb.mv_min_spel[1] + 8 || |
- bm0y > h->mb.mv_max_spel[1] - 8 || bm1y > h->mb.mv_max_spel[1] - 8 ) |
- return; |
- |
- h->mc.memzero_aligned( visited, sizeof(uint8_t[8][8][8]) ); |
- |
- BIME_CACHE( 0, 0, 0 ); |
- BIME_CACHE( 0, 0, 1 ); |
- CHECK_BIDIR( 0, 0, 0, 0 ); |
+ static const int pixel_mv_offs[] = { 0, 4, 4*8, 0 }; |
+ int16_t *cache0_mv = h->mb.cache.mv[0][x264_scan8[i8*4]]; |
+ int16_t *cache0_mv2 = cache0_mv + pixel_mv_offs[m0->i_pixel]; |
+ int16_t *cache1_mv = h->mb.cache.mv[1][x264_scan8[i8*4]]; |
+ int16_t *cache1_mv2 = cache1_mv + pixel_mv_offs[m0->i_pixel]; |
+ const int i_pixel = m0->i_pixel; |
+ const int bw = x264_pixel_size[i_pixel].w; |
+ const int bh = x264_pixel_size[i_pixel].h; |
+ const uint16_t *p_cost_m0x = m0->p_cost_mv - m0->mvp[0]; |
+ const uint16_t *p_cost_m0y = m0->p_cost_mv - m0->mvp[1]; |
+ const uint16_t *p_cost_m1x = m1->p_cost_mv - m1->mvp[0]; |
+ const uint16_t *p_cost_m1y = m1->p_cost_mv - m1->mvp[1]; |
+ ALIGNED_ARRAY_16( uint8_t, pixy_buf,[2],[9][16*16] ); |
+ ALIGNED_8( uint8_t pixu_buf[2][9][8*8] ); |
+ ALIGNED_8( uint8_t pixv_buf[2][9][8*8] ); |
+ uint8_t *src0[9]; |
+ uint8_t *src1[9]; |
+ uint8_t *pix = &h->mb.pic.p_fdec[0][(i8>>1)*8*FDEC_STRIDE+(i8&1)*8]; |
+ uint8_t *pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4]; |
+ uint8_t *pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4]; |
+ int ref0 = h->mb.cache.ref[0][x264_scan8[i8*4]]; |
+ int ref1 = h->mb.cache.ref[1][x264_scan8[i8*4]]; |
+ int stride0[9]; |
+ int stride1[9]; |
+ int bm0x = m0->mv[0], om0x = bm0x; |
+ int bm0y = m0->mv[1], om0y = bm0y; |
+ int bm1x = m1->mv[0], om1x = bm1x; |
+ int bm1y = m1->mv[1], om1y = bm1y; |
+ int bcost = COST_MAX; |
+ int pass = 0; |
+ int j; |
+ int mc_list0 = 1, mc_list1 = 1; |
+ uint64_t bcostrd = COST_MAX64; |
+ /* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */ |
+ ALIGNED_ARRAY_16( uint8_t, visited,[8],[8][8] ); |
+ /* all permutations of an offset in up to 2 of the dimensions */ |
+ static const int8_t dia4d[32][4] = { |
+ {0,0,0,1}, {0,0,0,-1}, {0,0,1,0}, {0,0,-1,0}, |
+ {0,1,0,0}, {0,-1,0,0}, {1,0,0,0}, {-1,0,0,0}, |
+ {0,0,1,1}, {0,0,-1,-1},{0,1,1,0}, {0,-1,-1,0}, |
+ {1,1,0,0}, {-1,-1,0,0},{1,0,0,1}, {-1,0,0,-1}, |
+ {0,1,0,1}, {0,-1,0,-1},{1,0,1,0}, {-1,0,-1,0}, |
+ {0,0,-1,1},{0,0,1,-1}, {0,-1,1,0},{0,1,-1,0}, |
+ {-1,1,0,0},{1,-1,0,0}, {1,0,0,-1},{-1,0,0,1}, |
+ {0,-1,0,1},{0,1,0,-1}, {-1,0,1,0},{1,0,-1,0}, |
+ }; |
+ |
+ if( bm0y < h->mb.mv_min_spel[1] + 8 || bm1y < h->mb.mv_min_spel[1] + 8 || |
+ bm0y > h->mb.mv_max_spel[1] - 8 || bm1y > h->mb.mv_max_spel[1] - 8 ) |
+ return; |
+ |
+ h->mc.memzero_aligned( visited, sizeof(uint8_t[8][8][8]) ); |
+ |
+ BIME_CACHE( 0, 0, 0 ); |
+ BIME_CACHE( 0, 0, 1 ); |
+ CHECK_BIDIR( 0, 0, 0, 0 ); |
|
- for( pass = 0; pass < 8; pass++ ) |
+ for( pass = 0; pass < 8; pass++ ) |
{ |
- /* check all mv pairs that differ in at most 2 components from the current mvs. */ |
- /* doesn't do chroma ME. this probably doesn't matter, as the gains |
- * from bidir ME are the same with and without chroma ME. */ |
- |
- if( mc_list0 ) |
- BIME_CACHE8( 0 ); |
- if( mc_list1 ) |
- BIME_CACHE8( 1 ); |
- |
- for( j=0; j<32; j++ ) |
- CHECK_BIDIR( dia4d[j][0], dia4d[j][1], dia4d[j][2], dia4d[j][3] ); |
- |
- mc_list0 = (om0x-bm0x)|(om0y-bm0y); |
- mc_list1 = (om1x-bm1x)|(om1y-bm1y); |
- if( !mc_list0 && !mc_list1 ) |
- break; |
- |
- om0x = bm0x; |
- om0y = bm0y; |
- om1x = bm1x; |
- om1y = bm1y; |
- |
- if( mc_list0 ) |
- BIME_CACHE( 0, 0, 0 ); |
- if( mc_list1 ) |
- BIME_CACHE( 0, 0, 1 ); |
+ /* check all mv pairs that differ in at most 2 components from the current mvs. */ |
+ /* doesn't do chroma ME. this probably doesn't matter, as the gains |
+ * from bidir ME are the same with and without chroma ME. */ |
+ |
+ if( mc_list0 ) |
+ BIME_CACHE8( 0 ); |
+ if( mc_list1 ) |
+ BIME_CACHE8( 1 ); |
+ |
+ for( j=0; j<32; j++ ) |
+ CHECK_BIDIR( dia4d[j][0], dia4d[j][1], dia4d[j][2], dia4d[j][3] ); |
+ |
+ mc_list0 = (om0x-bm0x)|(om0y-bm0y); |
+ mc_list1 = (om1x-bm1x)|(om1y-bm1y); |
+ if( !mc_list0 && !mc_list1 ) |
+ break; |
+ |
+ om0x = bm0x; |
+ om0y = bm0y; |
+ om1x = bm1x; |
+ om1y = bm1y; |
+ |
+ if( mc_list0 ) |
+ BIME_CACHE( 0, 0, 0 ); |
+ if( mc_list1 ) |
+ BIME_CACHE( 0, 0, 1 ); |
} |
|
- m0->mv[0] = bm0x; |
- m0->mv[1] = bm0y; |
- m1->mv[0] = bm1x; |
- m1->mv[1] = bm1y; |
+ m0->mv[0] = bm0x; |
+ m0->mv[1] = bm0y; |
+ m1->mv[0] = bm1x; |
+ m1->mv[1] = bm1y; |
} |
|
void x264_me_refine_bidir_satd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight ) |
{ |
- x264_me_refine_bidir( h, m0, m1, i_weight, 0, 0, 0 ); |
+ x264_me_refine_bidir( h, m0, m1, i_weight, 0, 0, 0 ); |
} |
|
void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2 ) |
{ |
- /* Motion compensation is done as part of bidir_rd; don't repeat |
- * it in encoding. */ |
- h->mb.b_skip_mc = 1; |
- x264_me_refine_bidir( h, m0, m1, i_weight, i8, i_lambda2, 1 ); |
- h->mb.b_skip_mc = 0; |
+ /* Motion compensation is done as part of bidir_rd; don't repeat |
+ * it in encoding. */ |
+ h->mb.b_skip_mc = 1; |
+ x264_me_refine_bidir( h, m0, m1, i_weight, i8, i_lambda2, 1 ); |
+ h->mb.b_skip_mc = 0; |
} |
|
#undef COST_MV_SATD |
-#define COST_MV_SATD( mx, my, dst, avoid_mvp ) \ |
-{ \ |
- if( !avoid_mvp || !(mx == pmx && my == pmy) ) \ |
- { \ |
- int stride = 16; \ |
+#define COST_MV_SATD( mx, my, dst, avoid_mvp ) \ |
+ { \ |
+ if( !avoid_mvp || !(mx == pmx && my == pmy) ) \ |
+ { \ |
+ int stride = 16; \ |
uint8_t *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw*4, bh*4 ); \ |
dst = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \ |
- + p_cost_mvx[mx] + p_cost_mvy[my]; \ |
- COPY1_IF_LT( bsatd, dst ); \ |
- } \ |
- else \ |
- dst = COST_MAX; \ |
-} |
- |
-#define COST_MV_RD( mx, my, satd, do_dir, mdir ) \ |
-{ \ |
- if( satd <= bsatd * SATD_THRESH ) \ |
- { \ |
- uint64_t cost; \ |
+ + p_cost_mvx[mx] + p_cost_mvy[my]; \ |
+ COPY1_IF_LT( bsatd, dst ); \ |
+ } \ |
+ else \ |
+ dst = COST_MAX; \ |
+ } |
+ |
+#define COST_MV_RD( mx, my, satd, do_dir, mdir ) \ |
+ { \ |
+ if( satd <= bsatd * SATD_THRESH ) \ |
+ { \ |
+ uint64_t cost; \ |
*(uint32_t*)cache_mv = *(uint32_t*)cache_mv2 = pack16to32_mask(mx,my); \ |
- cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \ |
+ cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \ |
COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \ |
- } \ |
-} |
+ } \ |
+ } |
|
void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int i_list ) |
{ |
- // don't have to fill the whole mv cache rectangle |
- static const int pixel_mv_offs[] = { 0, 4, 4*8, 0, 2, 2*8, 0 }; |
- int16_t *cache_mv = h->mb.cache.mv[i_list][x264_scan8[i4]]; |
- int16_t *cache_mv2 = cache_mv + pixel_mv_offs[m->i_pixel]; |
- const uint16_t *p_cost_mvx, *p_cost_mvy; |
- const int bw = x264_pixel_size[m->i_pixel].w>>2; |
- const int bh = x264_pixel_size[m->i_pixel].h>>2; |
- const int i_pixel = m->i_pixel; |
- |
- ALIGNED_ARRAY_16( uint8_t, pix,[16*16] ); |
- uint64_t bcost = COST_MAX64; |
- int bmx = m->mv[0]; |
- int bmy = m->mv[1]; |
- int omx, omy, pmx, pmy, i, j; |
- unsigned bsatd; |
- int satd = 0; |
- int dir = -2; |
- int satds[8]; |
- |
- if( m->i_pixel != PIXEL_16x16 && i4 != 0 ) |
- x264_mb_predict_mv( h, i_list, i4, bw, m->mvp ); |
- pmx = m->mvp[0]; |
- pmy = m->mvp[1]; |
- p_cost_mvx = m->p_cost_mv - pmx; |
- p_cost_mvy = m->p_cost_mv - pmy; |
- COST_MV_SATD( bmx, bmy, bsatd, 0 ); |
- if( m->i_pixel != PIXEL_16x16 ) |
- COST_MV_RD( bmx, bmy, 0, 0, 0 ) |
+ // don't have to fill the whole mv cache rectangle |
+ static const int pixel_mv_offs[] = { 0, 4, 4*8, 0, 2, 2*8, 0 }; |
+ int16_t *cache_mv = h->mb.cache.mv[i_list][x264_scan8[i4]]; |
+ int16_t *cache_mv2 = cache_mv + pixel_mv_offs[m->i_pixel]; |
+ const uint16_t *p_cost_mvx, *p_cost_mvy; |
+ const int bw = x264_pixel_size[m->i_pixel].w>>2; |
+ const int bh = x264_pixel_size[m->i_pixel].h>>2; |
+ const int i_pixel = m->i_pixel; |
+ |
+ ALIGNED_ARRAY_16( uint8_t, pix,[16*16] ); |
+ uint64_t bcost = COST_MAX64; |
+ int bmx = m->mv[0]; |
+ int bmy = m->mv[1]; |
+ int omx, omy, pmx, pmy, i, j; |
+ unsigned bsatd; |
+ int satd = 0; |
+ int dir = -2; |
+ int satds[8]; |
+ |
+ if( m->i_pixel != PIXEL_16x16 && i4 != 0 ) |
+ x264_mb_predict_mv( h, i_list, i4, bw, m->mvp ); |
+ pmx = m->mvp[0]; |
+ pmy = m->mvp[1]; |
+ p_cost_mvx = m->p_cost_mv - pmx; |
+ p_cost_mvy = m->p_cost_mv - pmy; |
+ COST_MV_SATD( bmx, bmy, bsatd, 0 ); |
+ if( m->i_pixel != PIXEL_16x16 ) |
+ COST_MV_RD( bmx, bmy, 0, 0, 0 ) |
else |
- bcost = m->cost; |
+ bcost = m->cost; |
|
- /* check the predicted mv */ |
- if( (bmx != pmx || bmy != pmy) |
- && pmx >= h->mb.mv_min_spel[0] && pmx <= h->mb.mv_max_spel[0] |
- && pmy >= h->mb.mv_min_spel[1] && pmy <= h->mb.mv_max_spel[1] ) |
+ /* check the predicted mv */ |
+ if( (bmx != pmx || bmy != pmy) |
+ && pmx >= h->mb.mv_min_spel[0] && pmx <= h->mb.mv_max_spel[0] |
+ && pmy >= h->mb.mv_min_spel[1] && pmy <= h->mb.mv_max_spel[1] ) |
{ |
- COST_MV_SATD( pmx, pmy, satd, 0 ); |
- COST_MV_RD( pmx, pmy, satd, 0,0 ); |
- /* The hex motion search is guaranteed to not repeat the center candidate, |
- * so if pmv is chosen, set the "MV to avoid checking" to bmv instead. */ |
- if( bmx == pmx && bmy == pmy ) |
+ COST_MV_SATD( pmx, pmy, satd, 0 ); |
+ COST_MV_RD( pmx, pmy, satd, 0,0 ); |
+ /* The hex motion search is guaranteed to not repeat the center candidate, |
+ * so if pmv is chosen, set the "MV to avoid checking" to bmv instead. */ |
+ if( bmx == pmx && bmy == pmy ) |
{ |
- pmx = m->mv[0]; |
- pmy = m->mv[1]; |
+ pmx = m->mv[0]; |
+ pmy = m->mv[1]; |
} |
} |
|
- if( bmy < h->mb.mv_min_spel[1] + 3 || |
- bmy > h->mb.mv_max_spel[1] - 3 ) |
- return; |
- |
- /* subpel hex search, same pattern as ME HEX. */ |
- dir = -2; |
- omx = bmx; |
- omy = bmy; |
- for( j=0; j<6; j++ ) COST_MV_SATD( omx + hex2[j+1][0], omy + hex2[j+1][1], satds[j], 1 ); |
- for( j=0; j<6; j++ ) COST_MV_RD ( omx + hex2[j+1][0], omy + hex2[j+1][1], satds[j], 1,j ); |
+ if( bmy < h->mb.mv_min_spel[1] + 3 || |
+ bmy > h->mb.mv_max_spel[1] - 3 ) |
+ return; |
+ |
+ /* subpel hex search, same pattern as ME HEX. */ |
+ dir = -2; |
+ omx = bmx; |
+ omy = bmy; |
+ for( j=0; j<6; j++ ) COST_MV_SATD( omx + hex2[j+1][0], omy + hex2[j+1][1], satds[j], 1 ); |
+ for( j=0; j<6; j++ ) COST_MV_RD ( omx + hex2[j+1][0], omy + hex2[j+1][1], satds[j], 1,j ); |
|
- if( dir != -2 ) |
+ if( dir != -2 ) |
{ |
- /* half hexagon, not overlapping the previous iteration */ |
- for( i = 1; i < 10; i++ ) |
+ /* half hexagon, not overlapping the previous iteration */ |
+ for( i = 1; i < 10; i++ ) |
{ |
- const int odir = mod6m1[dir+1]; |
- if( bmy < h->mb.mv_min_spel[1] + 3 || |
- bmy > h->mb.mv_max_spel[1] - 3 ) |
- break; |
- dir = -2; |
- omx = bmx; |
- omy = bmy; |
- for( j=0; j<3; j++ ) COST_MV_SATD( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satds[j], 1 ); |
- for( j=0; j<3; j++ ) COST_MV_RD ( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satds[j], 1, odir-1+j ); |
- if( dir == -2 ) |
- break; |
+ const int odir = mod6m1[dir+1]; |
+ if( bmy < h->mb.mv_min_spel[1] + 3 || |
+ bmy > h->mb.mv_max_spel[1] - 3 ) |
+ break; |
+ dir = -2; |
+ omx = bmx; |
+ omy = bmy; |
+ for( j=0; j<3; j++ ) COST_MV_SATD( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satds[j], 1 ); |
+ for( j=0; j<3; j++ ) COST_MV_RD ( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satds[j], 1, odir-1+j ); |
+ if( dir == -2 ) |
+ break; |
} |
} |
|
- /* square refine, same pattern as ME HEX. */ |
- omx = bmx; |
- omy = bmy; |
- for( i=0; i<8; i++ ) COST_MV_SATD( omx + square1[i+1][0], omy + square1[i+1][1], satds[i], 1 ); |
- for( i=0; i<8; i++ ) COST_MV_RD ( omx + square1[i+1][0], omy + square1[i+1][1], satds[i], 0,0 ); |
- |
- m->cost = bcost; |
- m->mv[0] = bmx; |
- m->mv[1] = bmy; |
- x264_macroblock_cache_mv ( h, block_idx_x[i4], block_idx_y[i4], bw, bh, i_list, pack16to32_mask(bmx, bmy) ); |
- x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw, bh, i_list, pack16to32_mask(bmx - m->mvp[0], bmy - m->mvp[1]) ); |
+ /* square refine, same pattern as ME HEX. */ |
+ omx = bmx; |
+ omy = bmy; |
+ for( i=0; i<8; i++ ) COST_MV_SATD( omx + square1[i+1][0], omy + square1[i+1][1], satds[i], 1 ); |
+ for( i=0; i<8; i++ ) COST_MV_RD ( omx + square1[i+1][0], omy + square1[i+1][1], satds[i], 0,0 ); |
+ |
+ m->cost = bcost; |
+ m->mv[0] = bmx; |
+ m->mv[1] = bmy; |
+ x264_macroblock_cache_mv ( h, block_idx_x[i4], block_idx_y[i4], bw, bh, i_list, pack16to32_mask(bmx, bmy) ); |
+ x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw, bh, i_list, pack16to32_mask(bmx - m->mvp[0], bmy - m->mvp[1]) ); |
} |
diff --exclude=.git --exclude=/gitignore -Naur x264/encoder/slicetype.c x264-or/encoder/slicetype.c |
--- x264/encoder/slicetype.c 2009-10-25 17:41:22.000000000 +0100 |
+++ x264-or/encoder/slicetype.c 2009-10-28 15:05:29.000000000 +0100 |
@@ -909,7 +909,9 @@ |
|
/* Restore frametypes for all frames that haven't actually been decided yet. */ |
for( j = reset_start; j <= num_frames; j++ ) |
- frames[j]->i_type = X264_TYPE_AUTO; |
+ frames[j]->i_type = X264_TYPE_AUTO; |
+ |
+ return; |
} |
|
void x264_slicetype_decide( x264_t *h ) |
diff --exclude=.git --exclude=/gitignore -Naur x264/link.ld x264-or/link.ld |
--- x264/link.ld 1970-01-01 01:00:00.000000000 +0100 |
+++ x264-or/link.ld 2009-10-28 15:05:29.000000000 +0100 |
@@ -0,0 +1,100 @@ |
+/* |
+MEMORY |
+ { |
+ vectors : ORIGIN = 0x00000000, LENGTH = 0x00002000 |
+ flash : ORIGIN = 0x04000000, LENGTH = 0x00200000 |
+ ram : ORIGIN = 0x00002000, LENGTH = 0x001fe000 |
+ icm : ORIGIN = 0x00800000, LENGTH = 0x00004000 |
+ } |
+ */ |
+ |
+MEMORY |
+ { |
+/* |
+ reset : ORIGIN = 0x00000000, LENGTH = 0x00000200 |
+ vectors : ORIGIN = 0x00000200, LENGTH = 0x00001E00 |
+ text : ORIGIN = 0x00002000, LENGTH = 0x000fe000 |
+ data : ORIGIN = 0x00100000, LENGTH = 0x00fe0000 |
+ stack : ORIGIN = 0x001fe000, LENGTH = 0x00010000 |
+*/ |
+ yuv_data : ORIGIN = 25M, LENGTH = 7M |
+ } |
+ |
+/* |
+MEMORY |
+ { |
+ reset : ORIGIN = 0xc0000000, LENGTH = 0x00000200 |
+ vectors : ORIGIN = 0xc0000200, LENGTH = 0x00001000 |
+ ram : ORIGIN = 0xc0001200, LENGTH = 0x00FFED00 |
+ } |
+*/ |
+SECTIONS |
+{ |
+/* |
+ .reset : |
+ { |
+ *(.reset) |
+ } > reset |
+ |
+ |
+ |
+ .vectors : |
+ { |
+ _vec_start = .; |
+ *(.vectors) |
+ _vec_end = .; |
+ } > vectors |
+ |
+ .text : |
+ { |
+ *(.text) |
+ } > text |
+ |
+ .rodata : |
+ { |
+ *(.rodata) |
+ *(.rodata.*) |
+ } > text |
+ |
+ .icm : |
+ { |
+ _icm_start = .; |
+ *(.icm) |
+ _icm_end = .; |
+ } > data |
+ |
+ .data : |
+ { |
+ _dst_beg = .; |
+ *(.data) |
+ _dst_end = .; |
+ } > data |
+ |
+ .bss : |
+ { |
+ *(.bss) |
+ } > data |
+ |
+ .heap : |
+ { |
+ _heap_start = .; |
+ *(.heap) |
+ _heap_end = .; |
+ } > data |
+ |
+ .stack (NOLOAD) : |
+ { |
+ *(.stack) |
+ _src_addr = .; |
+ } > data |
+*/ |
+ .yuv_data : |
+ { |
+ _yuv_data_start = .; |
+ __yuv_data_start = .; |
+ *(.yuv_data) |
+ _yuv_data_end = .; |
+ __yuv_data_end = .; |
+ } > yuv_data |
+ |
+} |
diff --exclude=.git --exclude=/gitignore -Naur x264/Makefile x264-or/Makefile |
--- x264/Makefile 2009-10-25 17:41:22.000000000 +0100 |
+++ x264-or/Makefile 2009-11-15 18:48:46.000000000 +0100 |
@@ -91,7 +91,66 @@ |
$(SONAME): .depend $(OBJS) $(OBJASM) |
$(CC) -shared -o $@ $(OBJS) $(OBJASM) $(SOFLAGS) $(LDFLAGS) |
|
-x264$(EXE): $(OBJCLI) libx264.a |
+ |
+## OR32-specific build rules |
+## |
+## Essentially we create some sample YUV video data |
+## and embed it in an ELF file, in a section called yuv_data. |
+## In the linker script we then link this section to a place |
+## in ram and tell x264 to look here when it wants data. |
+## |
+## You can download sample h264 encoded CIF sequences |
+## here: http://www.tkn.tu-berlin.de/research/evalvid/cif.html |
+## |
+## |
+GEN_FILES=encoder/analyse_init_log2.c |
+VIDEO_DATA_FILE=yuv_data.elf |
+INPUT_VIDEO_FILE ?= ../test-sequences/football_cif.264 |
+INPUT_VIDEO_SIZE ?= cif |
+YUV_VIDEO_FILE ?= video_data.yuv |
+ |
+# We will use a limit of about 5MB of video data to encode |
+ifeq ($(INPUT_VIDEO_SIZE), 4cif) |
+NUM_FRAMES ?=10 |
+endif |
+ifeq ($(INPUT_VIDEO_SIZE), cif) |
+NUM_FRAMES ?=30 |
+endif |
+ifeq ($(INPUT_VIDEO_SIZE), qcif) |
+NUM_FRAMES ?=90 |
+endif |
+ |
+$(INPUT_VIDEO_FILE): # nothing can build the sample video: explain how to supply one, then fail |
+ @printf "\n\n" |
+ @printf "\tNo sample video file to embed! Please edit the Makefile\n" |
+ @printf "\tand set the variable INPUT_VIDEO_FILE to the location of\n" |
+ @printf "\tsome sample video material.\n" |
+ @printf "\tOr, instead of editing the makefile, specify it on the\n" |
+ @printf "\tcommand line like so:\n" |
+ @printf "\t\tINPUT_VIDEO_FILE=../coastguard_cif.yuv\n\n" |
+ @printf "\tYou can also specify the size of the video (cif, qcif etc.)\n" |
+ @printf "\tby specifying INPUT_VIDEO_SIZE - the default is cif, however\n" |
+ @printf "\tyou will need to change the program's hardcoded resolution\n" |
+ @printf "\n" |
+ exit 1 |
+$(YUV_VIDEO_FILE): $(INPUT_VIDEO_FILE) |
+# First convert it to raw YUV, only about 5MB large, so for cif take 30 frames, qcif is 90 frames |
+ ffmpeg -s $(INPUT_VIDEO_SIZE) -i $(INPUT_VIDEO_FILE) -vframes $(NUM_FRAMES) $(YUV_VIDEO_FILE) |
+$(VIDEO_DATA_FILE): $(YUV_VIDEO_FILE) |
+ or32-elf-ld -r -b binary -o yuv_data.o $(YUV_VIDEO_FILE) |
+ or32-elf-objcopy --rename-section .data=.yuv_data yuv_data.o |
+ mv yuv_data.o $(VIDEO_DATA_FILE) |
+encoder/analyse_init_log2.c: encoder/analyse_gen_init_array.sh |
+ cd encoder && chmod a+x analyse_gen_init_array.sh && ./analyse_gen_init_array.sh |
+#OR32_DEPS= reset.o except.o uart.o syscalls.o $(VIDEO_DATA_FILE) |
+ |
+OR32_DEPS= $(VIDEO_DATA_FILE) $(GEN_FILES) |
+ |
+sim: x264$(EXE) |
+ or32-elf-sim -f or1ksim_x264.cfg $< |
+ |
+ |
+x264$(EXE): $(OR32_DEPS) $(OBJCLI) libx264.a |
$(CC) -o $@ $+ $(LDFLAGS) |
|
checkasm: tools/checkasm.o libx264.a |
@@ -147,12 +206,13 @@ |
endif |
|
clean: |
- rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(SONAME) *.a x264 x264.exe .depend TAGS |
+ rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(SONAME) *.a x264 x264.exe .depend TAGS $(YUV_VIDEO_FILE) |
rm -f checkasm checkasm.exe tools/checkasm.o tools/checkasm-a.o |
rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) |
- sed -e 's/ *-fprofile-\(generate\|use\)//g' config.mak > config.mak2 && mv config.mak2 config.mak |
|
distclean: clean |
+ rm -f $(OR32_DEPS) uart0* $(GEN_FILES) |
rm -f config.mak config.h x264.pc |
rm -rf test/ |
|
diff --exclude=.git --exclude=/gitignore -Naur x264/muxers.c x264-or/muxers.c |
--- x264/muxers.c 2009-10-25 17:41:22.000000000 +0100 |
+++ x264-or/muxers.c 2009-11-15 18:45:09.000000000 +0100 |
@@ -52,15 +52,32 @@ |
|
typedef struct |
{ |
- FILE *fh; |
- int width, height; |
- int next_frame; |
+ //FILE *fh; |
+ char *fh; |
+ |
+ int width, height; |
+ int next_frame; |
+ |
} yuv_input_t; |
|
+void init_yuv_dataspace(char* yuv_dat_addr, hnd_t *p_handle, x264_param_t *p_param) /* build a yuv_input_t whose data pointer is yuv_dat_addr (memory, not a FILE) and return it through *p_handle */ |
+{ |
+ yuv_input_t *h = malloc( sizeof(yuv_input_t) ); /* NOTE(review): result is not checked for NULL before the stores below */ |
+ V(fprintf( stderr, "init_yuv_dataspace: h addr = 0x%.8x\n", h)); |
+ h->fh = yuv_dat_addr; /* fh is a plain char pointer in this port */ |
+ V(fprintf( stderr, "init_yuv_dataspace: h->fh 0x%.8x\n",h->fh)); |
+ h->width = p_param->i_width; /* frame dimensions are copied from the encoder parameters */ |
+ V(fprintf( stderr, "init_yuv_dataspace: h->width %d\n",h->width)); |
+ h->height = p_param->i_height; |
+ V(fprintf( stderr, "init_yuv_dataspace: h->height %d\n",h->height)); |
+ h->next_frame = 0; /* reading starts at frame 0 */ |
+ *p_handle = (hnd_t *)h; /* hand the new descriptor back to the caller */ |
+} |
+ |
/* raw 420 yuv file operation */ |
int open_file_yuv( char *psz_filename, hnd_t *p_handle, x264_param_t *p_param ) |
{ |
- yuv_input_t *h = malloc( sizeof(yuv_input_t) ); |
+ yuv_input_t *h = malloc( sizeof(yuv_input_t) ); |
if( !h ) |
return -1; |
h->width = p_param->i_width; |
@@ -80,7 +97,10 @@ |
|
int get_frame_total_yuv( hnd_t handle ) |
{ |
- yuv_input_t *h = handle; |
+ yuv_input_t *h = handle; |
+ |
+ V(fprintf(stderr, "get_frame_total_yuv args: handle: 0x%.8x, h: 0x%.8x\n", handle, h)); |
+ /* |
int i_frame_total = 0; |
|
if( !fseek( h->fh, 0, SEEK_END ) ) |
@@ -88,24 +108,45 @@ |
uint64_t i_size = ftell( h->fh ); |
fseek( h->fh, 0, SEEK_SET ); |
i_frame_total = (int)(i_size / ( h->width * h->height * 3 / 2 )); |
- } |
+ }*/ |
+ |
+#ifdef USE_HARDCODED_FRAME_NUM |
+ return (int)HARDCODED_FRAME_NUM; |
+#else |
+ fprintf( stderr, "get_frame_total_yuv: %d %d %d %d\n", |
+ h->width, h->height, |
+ YUV_DATA_SIZE, |
+ (int)(YUV_DATA_SIZE / ( h->width * h->height * 3 / 2 ))); |
+ return (int)(YUV_DATA_SIZE / ( h->width * h->height * 3 / 2 ));//i_frame_total; |
+ |
+#endif |
|
- return i_frame_total; |
} |
|
int read_frame_yuv( x264_picture_t *p_pic, hnd_t handle, int i_frame ) |
{ |
yuv_input_t *h = handle; |
- |
+ int i; |
+ |
if( i_frame != h->next_frame ) |
- if( fseek( h->fh, (uint64_t)i_frame * h->width * h->height * 3 / 2, SEEK_SET ) ) |
- return -1; |
- |
- if( fread( p_pic->img.plane[0], 1, h->width * h->height, h->fh ) <= 0 |
- || fread( p_pic->img.plane[1], 1, h->width * h->height / 4, h->fh ) <= 0 |
- || fread( p_pic->img.plane[2], 1, h->width * h->height / 4, h->fh ) <= 0 ) |
- return -1; |
+ //if( fseek( h->fh, (uint64_t)i_frame * h->width * h->height * 3 / 2, SEEK_SET ) ) |
+ // Progress pointer to beginning of this frame |
+ h->fh = (void*) YUV_DATA_ADDR + i_frame * h->width * h->height * 3 / 2; |
+ //return -1; |
+ |
+ //if( fread( p_pic->img.plane[0], 1, h->width * h->height, h->fh ) <= 0 |
+ for (i=0;i<h->width * h->height;i++) p_pic->img.plane[0][i] = h->fh[i]; |
+ h->fh += h->width * h->height; |
+ //|| fread( p_pic->img.plane[1], 1, h->width * h->height / 4, h->fh ) <= 0 |
+ for (i=0;i<(h->width * h->height) / 4;i++) p_pic->img.plane[1][i] = h->fh[i]; |
+ h->fh += (h->width * h->height) / 4; |
+ |
+ //|| fread( p_pic->img.plane[2], 1, h->width * h->height / 4, h->fh ) <= 0 ) |
+ for (i=0;i<(h->width * h->height) / 4;i++) p_pic->img.plane[2][i] = h->fh[i]; |
+ h->fh += (h->width * h->height) / 4; |
|
+ //return -1; |
+ |
h->next_frame = i_frame+1; |
|
return 0; |
@@ -113,12 +154,15 @@ |
|
int close_file_yuv(hnd_t handle) |
{ |
+ return 0; |
+ /* // -- jb |
yuv_input_t *h = handle; |
if( !h || !h->fh ) |
return 0; |
fclose( h->fh ); |
free( h ); |
return 0; |
+ */ |
} |
|
/* YUV4MPEG2 raw 420 yuv file operation */ |
@@ -530,11 +574,21 @@ |
} |
#endif |
|
+#ifdef ENC_OUT_ADDR |
+char * enc_out_pos; |
+#endif |
|
int open_file_bsf( char *psz_filename, hnd_t *p_handle ) |
{ |
+#ifdef ENC_OUT_ADDR |
+ V(fprintf(stderr, "open_file_bsf\n")); |
+ // Encoded data goes to a hardcoded memory region: store its base in the caller's handle (assigning to p_handle itself only changed the local parameter) |
+ *p_handle = (void *) ENC_OUT_ADDR; |
+ enc_out_pos = (char *) ENC_OUT_ADDR; |
+#else |
if( !(*p_handle = fopen(psz_filename, "w+b")) ) |
return -1; |
+#endif |
|
return 0; |
} |
@@ -546,9 +600,16 @@ |
|
int write_nalu_bsf( hnd_t handle, uint8_t *p_nalu, int i_size ) |
{ |
+#ifdef ENC_OUT_ADDR |
+ V(fprintf(stderr, "write_nalu_bsf: writing %d bytes from 0x%.8lx\n", i_size, (unsigned long) enc_out_pos)); |
+ // Append the NAL unit at the current output position and advance it; handle is not used in this mode |
+ int i; for (i=0;i<i_size;i++) *enc_out_pos++ = p_nalu[i]; |
+ return i_size; |
+#else |
if( fwrite( p_nalu, i_size, 1, (FILE*)handle ) > 0 ) |
return i_size; |
return -1; |
+#endif |
} |
|
int set_eop_bsf( hnd_t handle, x264_picture_t *p_picture ) |
@@ -558,6 +619,15 @@ |
|
int close_file_bsf( hnd_t handle ) |
{ |
+#ifdef ENC_OUT_ADDR |
+ // Checksum every byte written to the output region: cksum starts at 0, bytes are summed as unsigned values, and the index advances once per byte |
+ unsigned int i; unsigned char* d = (unsigned char*)ENC_OUT_ADDR; unsigned int cksum = 0; |
+ unsigned int size = (unsigned int)enc_out_pos - (unsigned int)ENC_OUT_ADDR; |
+ for(i=0;i<size;i++)cksum+=d[i]; |
+ fprintf(stderr, "close_file_bsf: wrote %d bytes from 0x%.8x to 0x%.8x\n", size, (unsigned int) ENC_OUT_ADDR, (unsigned int) enc_out_pos); |
+ fprintf(stderr, "close_file_bsf: cksum 0x%.8x\n", cksum); |
+ return 0; |
+#endif |
if( !handle || handle == stdout ) |
return 0; |
|
diff --exclude=.git --exclude=/gitignore -Naur x264/muxers.h x264-or/muxers.h |
--- x264/muxers.h 2009-10-25 17:41:22.000000000 +0100 |
+++ x264-or/muxers.h 2009-10-28 15:05:29.000000000 +0100 |
@@ -27,6 +27,7 @@ |
typedef void *hnd_t; |
|
int open_file_yuv( char *psz_filename, hnd_t *p_handle, x264_param_t *p_param ); |
+void init_yuv_dataspace(char* yuv_dat_addr, hnd_t *p_handle, x264_param_t *p_param); |
int get_frame_total_yuv( hnd_t handle ); |
int read_frame_yuv( x264_picture_t *p_pic, hnd_t handle, int i_frame ); |
int close_file_yuv( hnd_t handle ); |
diff --exclude=.git --exclude=/gitignore -Naur x264/or1ksim_x264.cfg x264-or/or1ksim_x264.cfg |
--- x264/or1ksim_x264.cfg 1970-01-01 01:00:00.000000000 +0100 |
+++ x264-or/or1ksim_x264.cfg 2009-11-15 19:57:09.000000000 +0100 |
@@ -0,0 +1,886 @@ |
+/* sim.cfg -- Simulator configuration script file |
+ Copyright (C) 2001-2002, Marko Mlinar, markom@opencores.org |
+ |
+This file is part of OpenRISC 1000 Architectural Simulator. |
+It contains the default configuration and help about configuring |
+the simulator. |
+ |
+This program is free software; you can redistribute it and/or modify |
+it under the terms of the GNU General Public License as published by |
+the Free Software Foundation; either version 2 of the License, or |
+(at your option) any later version. |
+ |
+This program is distributed in the hope that it will be useful, |
+but WITHOUT ANY WARRANTY; without even the implied warranty of |
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
+GNU General Public License for more details. |
+ |
+You should have received a copy of the GNU General Public License |
+along with this program; if not, write to the Free Software |
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ |
+ |
+ |
+/* INTRODUCTION |
+ |
+ The or1ksim has various parameters that are set in configuration files |
+ like this one. The user can switch between configurations at startup by |
+ specifying the required configuration file with the -f <filename.cfg> option. |
+ If no configuration file is specified or1ksim searches for the default |
+ configuration file sim.cfg. First it searches for './sim.cfg'. If this |
+ file is not found, it searches for '~/or1k/sim.cfg'. If this file is |
+ not found too, it reverts to the built-in default configuration. |
+ |
+ NOTE: Users should not rely on the built-in configuration, since the |
+ default configuration may differ between version. |
+ Rather create a configuration file that sets all critical values. |
+ |
+ This file may contain (standard C) comments only - no // support. |
+ |
+ Configuration files may be included, using: |
+ include "file_name_to_include" |
+ |
+ Like normal configuration files, the included file is divided into |
+ sections. Each section is described in detail also. |
+ |
+ Some section have subsections. One example of such a subsection is: |
+ |
+ device <index> |
+ instance specific parameters... |
+ enddevice |
+ |
+ which creates a device instance. |
+*/ |
+ |
+ |
+/* MEMORY SECTION |
+ |
+ This section specifies how the memory is generated and the blocks |
+ it consists of. |
+ |
+ type = random/unknown/pattern |
+ Specifies the initial memory values. |
+ 'random' generates random memory using seed 'random_seed'. |
+ 'pattern' fills memory with 'pattern'. |
+ 'unknown' does not specify how memory should be generated, |
+ leaving the memory in a undefined state. This is the fastest |
+ option. |
+ |
+ random_seed = <value> |
+ random seed for randomizer, used if type = 'random'. |
+ |
+ pattern = <value> |
+ pattern to fill memory, used if type = 'pattern'. |
+ |
+ nmemories = <value> |
+ number of memory instances connected |
+ |
+ baseaddr = <hex_value> |
+ memory start address |
+ |
+ size = <hex_value> |
+ memory size |
+ |
+ name = "<string>" |
+ memory block name |
+ |
+ ce = <value> |
+ chip enable index of the memory instance |
+ |
+ mc = <value> |
+ memory controller this memory is connected to |
+ |
+ delayr = <value> |
+ cycles, required for read access, -1 if instance does not support reading |
+ |
+ delayw = <value> |
+ cycles, required for write access, -1 if instance does not support writing |
+ |
+ log = "<filename>" |
+ filename, where to log memory accesses to, no log, if log command is not specified |
+*/ |
+ |
+ |
+section memory |
+ /*random_seed = 12345 |
+ type = random*/ |
+ pattern = 0x00 |
+ type = unknown /* Fastest */ |
+ |
+ name = "FLASH" |
+ ce = 0 |
+ mc = 0 |
+ baseaddr = 0xf0000000 |
+ size = 0x01000000 |
+ delayr = 10 |
+ delayw = -1 |
+end |
+ |
+section memory |
+ /*random_seed = 12345 |
+ type = random*/ |
+ pattern = 0x00 |
+ type = unknown /* Fastest */ |
+ |
+ name = "RAM" |
+ ce = 1 |
+ mc = 0 |
+ baseaddr = 0x00000000 |
+ size = 0x02000000 |
+ delayr = 20 |
+ delayw = 25 |
+end |
+ |
+section memory |
+ /*random_seed = 12345 |
+ type = random*/ |
+ pattern = 0x00 |
+ type = unknown /* Fastest */ |
+ |
+ name = "SRAM" |
+ mc = 0 |
+ ce = 2 |
+ baseaddr = 0xa4000000 |
+ size = 0x00100000 |
+ delayr = 1 |
+ delayw = 2 |
+end |
+ |
+ |
+/* IMMU SECTION |
+ |
+ This section configures the Instruction Memory Management Unit |
+ |
+ enabled = 0/1 |
+ '0': disabled |
+ '1': enabled |
+ (NOTE: UPR bit is set) |
+ |
+ nsets = <value> |
+ number of ITLB sets; must be power of two |
+ |
+ nways = <value> |
+ number of ITLB ways |
+ |
+ pagesize = <value> |
+ instruction page size; must be power of two |
+ |
+ entrysize = <value> |
+ instruction entry size in bytes |
+ |
+ ustates = <value> |
+ number of ITLB usage states (2, 3, 4 etc., max is 4) |
+ |
+ hitdelay = <value> |
+ number of cycles immu hit costs |
+ |
+ missdelay = <value> |
+ number of cycles immu miss costs |
+*/ |
+ |
+section immu |
+ enabled = 1 |
+ nsets = 64 |
+ nways = 1 |
+ pagesize = 8192 |
+ hitdelay = 0 |
+ missdelay = 0 |
+end |
+ |
+ |
+/* DMMU SECTION |
+ |
+ This section configures the Data Memory Management Unit |
+ |
+ enabled = 0/1 |
+ '0': disabled |
+ '1': enabled |
+ (NOTE: UPR bit is set) |
+ |
+ nsets = <value> |
+ number of DTLB sets; must be power of two |
+ |
+ nways = <value> |
+ number of DTLB ways |
+ |
+ pagesize = <value> |
+ data page size; must be power of two |
+ |
+ entrysize = <value> |
+ data entry size in bytes |
+ |
+ ustates = <value> |
+ number of DTLB usage states (2, 3, 4 etc., max is 4) |
+ |
+ hitdelay = <value> |
+ number of cycles dmmu hit costs |
+ |
+ missdelay = <value> |
+ number of cycles dmmu miss costs |
+*/ |
+ |
+section dmmu |
+ enabled = 1 |
+ nsets = 64 |
+ nways = 1 |
+ pagesize = 8192 |
+ hitdelay = 0 |
+ missdelay = 0 |
+end |
+ |
+ |
+/* IC SECTION |
+ |
+ This section configures the Instruction Cache |
+ |
+ enabled = 0/1 |
+ '0': disabled |
+ '1': enabled |
+ (NOTE: UPR bit is set) |
+ |
+ nsets = <value> |
+ number of IC sets; must be power of two |
+ |
+ nways = <value> |
+ number of IC ways |
+ |
+ blocksize = <value> |
+ IC block size in bytes; must be power of two |
+ |
+ ustates = <value> |
+ number of IC usage states (2, 3, 4 etc., max is 4) |
+ |
+ hitdelay = <value> |
+ number of cycles ic hit costs |
+ |
+ missdelay = <value> |
+ number of cycles ic miss costs |
+*/ |
+ |
+section ic |
+ enabled = 0 |
+ nsets = 512 |
+ nways = 1 |
+ blocksize = 16 |
+ hitdelay = 20 |
+ missdelay = 20 |
+end |
+ |
+ |
+/* DC SECTION |
+ |
+ This section configures the Data Cache |
+ |
+ enabled = 0/1 |
+ '0': disabled |
+ '1': enabled |
+ (NOTE: UPR bit is set) |
+ |
+ nsets = <value> |
+ number of DC sets; must be power of two |
+ |
+ nways = <value> |
+ number of DC ways |
+ |
+ blocksize = <value> |
+ DC block size in bytes; must be power of two |
+ |
+ ustates = <value> |
+ number of DC usage states (2, 3, 4 etc., max is 4) |
+ |
+ load_hitdelay = <value> |
+ number of cycles dc load hit costs |
+ |
+ load_missdelay = <value> |
+ number of cycles dc load miss costs |
+ |
+ store_hitdelay = <value> |
+ number of cycles dc store hit costs |
+ |
+ store_missdelay = <value> |
+ number of cycles dc store miss costs |
+*/ |
+ |
+section dc |
+ enabled = 0 |
+ nsets = 512 |
+ nways = 1 |
+ blocksize = 16 |
+ load_hitdelay = 20 |
+ load_missdelay = 20 |
+ store_hitdelay = 20 |
+ store_missdelay = 20 |
+end |
+ |
+ |
+/* SIM SECTION |
+ |
+ This section specifies how or1ksim should behave. |
+ |
+ verbose = 0/1 |
+ '0': don't print extra messages |
+ '1': print extra messages |
+ |
+ debug = 0-9 |
+ 0 : no debug messages |
+ 1-9: debug message level. |
+ higher numbers produce more messages |
+ |
+ profile = 0/1 |
+ '0': don't generate profiling file 'sim.profile' |
+ '1': generate profiling file 'sim.profile' |
+ |
+ prof_fn = "<filename>" |
+ optional filename for the profiling file. |
+ valid only if 'profile' is set |
+ |
+ mprofile = 0/1 |
+ '0': don't generate memory profiling file 'sim.mprofile' |
+ '1': generate memory profiling file 'sim.mprofile' |
+ |
+ mprof_fn = "<filename>" |
+ optional filename for the memory profiling file. |
+ valid only if 'mprofile' is set |
+ |
+ history = 0/1 |
+ '0': don't track execution flow |
+ '1': track execution flow |
+ Execution flow can be tracked for the simulator's |
+ 'hist' command. Useful for back-trace debugging. |
+ |
+ iprompt = 0/1 |
+ '0': start in <not interactive prompt> (so what do we start in ???) |
+ '1': start in interactive prompt. |
+ |
+ exe_log = 0/1 |
+ '0': don't generate execution log. |
+ '1': generate execution log. |
+ |
+ exe_log_type = default/hardware/simple/software |
+ type of execution log, default is used when not specified |
+ |
+ exe_log_start = <value> |
+ index of first instruction to start logging, default = 0 |
+ |
+ exe_log_end = <value> |
+ index of last instruction to end logging; not limited, if omitted |
+ |
+ exe_log_marker = <value> |
+ <value> specifies number of instructions before horizontal marker is |
+ printed; if zero, markers are disabled (default) |
+ |
+ exe_log_fn = "<filename>" |
+ filename for the execution log file. |
+ valid only if 'exe_log' is set |
+ |
+ clkcycle = <value>[ps|ns|us|ms] |
+ specifies time measurement for one cycle |
+*/ |
+ |
+section sim |
+ verbose = 1 |
+ debug = 0 |
+ profile = 0 |
+ history = 0 |
+ |
+ clkcycle = 10ns |
+end |
+ |
+ |
+/* SECTION VAPI |
+ |
+ This section configures the Verification API, used for Advanced |
+ Core Verification. |
+ |
+ enabled = 0/1 |
+ '0': disable VAPI server |
+ '1': enable/start VAPI server |
+ |
+ server_port = <value> |
+ TCP/IP port to start VAPI server on |
+ |
+ log_enabled = 0/1 |
+ '0': disable VAPI requests logging |
+ '1': enable VAPI requests logging |
+ |
+ hide_device_id = 0/1 |
+ '0': don't log device id (for compatibility with old version) |
+ '1': log device id |
+ |
+ |
+ vapi_log_fn = "<filename>" |
+ filename for the log file. |
+ valid only if log_enabled is set |
+*/ |
+ |
+section VAPI |
+ enabled = 0 |
+ server_port = 9998 |
+ log_enabled = 0 |
+ vapi_log_fn = "vapi.log" |
+end |
+ |
+ |
+/* CPU SECTION |
+ |
+ This section specifies various CPU parameters. |
+ |
+ ver = <value> |
+ rev = <value> |
+ specifies version and revision of the CPU used |
+ |
+ upr = <value> |
+ changes the upr register |
+ |
+ sr = <value> |
+ sets the initial Supervision Register value |
+ |
+ superscalar = 0/1 |
+ '0': CPU is scalar |
+ '1': CPU is superscalar |
+ (modify cpu/or32/execute.c to tune superscalar model) |
+ |
+ hazards = 0/1 |
+ '0': don't track data hazards in superscalar CPU |
+ '1': track data hazards in superscalar CPU |
+ If tracked, data hazards can be displayed using the |
+ simulator's 'r' command. |
+ |
+ dependstats = 0/1 |
+ '0': don't calculate inter-instruction dependencies. |
+ '1': calculate inter-instruction dependencies. |
+ If calculated, inter-instruction dependencies can be |
+ displayed using the simulator's 'stat' command. |
+ |
+ sbuf_len = <value> |
+ length of store buffer (<= 256), 0 = disabled |
+*/ |
+ |
+section cpu |
+ ver = 0x12 |
+ cfg = 0x00 |
+ rev = 0x01 |
+ /* upr = */ |
+ superscalar = 0 |
+ hazards = 0 |
+ dependstats = 0 |
+ sbuf_len = 0 |
+ hardfloat = 1 |
+end |
+ |
+ |
+/* PM SECTION |
+ |
+ This section specifies Power Management parameters |
+ |
+ enabled = 0/1 |
+ '0': disable power management |
+ '1': enable power management |
+*/ |
+ |
+section pm |
+ enabled = 0 |
+end |
+ |
+ |
+/* BPB SECTION |
+ |
+ This section specifies how branch prediction should behave. |
+ |
+ enabled = 0/1 |
+ '0': disable branch prediction |
+ '1': enable branch prediction |
+ |
+ btic = 0/1 |
+ '0': disable branch target instruction cache model |
+ '1': enable branch target instruction cache model |
+ |
+ sbp_bf_fwd = 0/1 |
+ Static branch prediction for 'l.bf' |
+ '0': don't use forward prediction |
+ '1': use forward prediction |
+ |
+ sbp_bnf_fwd = 0/1 |
+ Static branch prediction for 'l.bnf' |
+ '0': don't use forward prediction |
+ '1': use forward prediction |
+ |
+ hitdelay = <value> |
+ number of cycles bpb hit costs |
+ |
+ missdelay = <value> |
+ number of cycles bpb miss costs |
+*/ |
+ |
+section bpb |
+ enabled = 0 |
+ btic = 0 |
+ sbp_bf_fwd = 0 |
+ sbp_bnf_fwd = 0 |
+ hitdelay = 0 |
+ missdelay = 0 |
+end |
+ |
+ |
+/* DEBUG SECTION |
+ |
+ This section specifies how the debug unit should behave. |
+ |
+ enabled = 0/1 |
+ '0': disable debug unit |
+ '1': enable debug unit |
+ |
+ gdb_enabled = 0/1 |
+ '0': don't start gdb server |
+ '1': start gdb server at port 'server_port' |
+ |
+ server_port = <value> |
+ TCP/IP port to start gdb server on |
+ valid only if gdb_enabled is set |
+ |
+ vapi_id = <hex_value> |
+ Used to create "fake" vapi log file containing the JTAG proxy messages. |
+*/ |
+section debug |
+ enabled = 0 |
+ rsp_enabled = 0 |
+ rsp_port = 5554 |
+ /*server_port = 9999*/ |
+end |
+ |
+ |
+/* MC SECTION |
+ |
+ This section configures the memory controller |
+ |
+ enabled = 0/1 |
+ '0': disable memory controller |
+ '1': enable memory controller |
+ |
+ baseaddr = <hex_value> |
+ address of first MC register |
+ |
+ POC = <hex_value> |
+ Power On Configuration register |
+ |
+ index = <value> |
+ Index of this memory controller amongst all the memory controllers |
+*/ |
+ |
+section mc |
+ enabled = 0 |
+ baseaddr = 0x93000000 |
+ POC = 0x00000008 /* Power on configuration register */ |
+ index = 0 |
+end |
+ |
+ |
+/* UART SECTION |
+ |
+ This section configures the UARTs |
+ |
+ enabled = <0|1> |
+ Enable/disable the peripheral. By default it is enabled. |
+ |
+ baseaddr = <hex_value> |
+ address of first UART register for this device |
+ |
+ |
+ channel = <channeltype>:<args> |
+ |
+ The channel parameter indicates the source of received UART characters |
+ and the sink for transmitted UART characters. |
+ |
+ The <channeltype> can be either "file", "xterm", "tcp", "fd", or "tty" |
+ (without quotes). |
+ |
+ A) To send/receive characters from a pair of files, use a file |
+ channel: |
+ |
+ channel=file:<rxfile>,<txfile> |
+ |
+ B) To create an interactive terminal window, use an xterm channel: |
+ |
+ channel=xterm:[<xterm_arg>]* |
+ |
+ C) To create a bidirectional tcp socket which one could, for example, |
+ access via telnet, use a tcp channel: |
+ |
+ channel=tcp:<port number> |
+ |
+ D) To cause the UART to read/write from existing numeric file |
+ descriptors, use an fd channel: |
+ |
+ channel=fd:<rx file descriptor num>,<tx file descriptor num> |
+ |
+ E) To connect the UART to a physical serial port, create a tty |
+ channel: |
+ |
+ channel=tty:device=/dev/ttyS0,baud=9600 |
+ |
+ irq = <value> |
+ irq number for this device |
+ |
+ 16550 = 0/1 |
+ '0': this device is a UART16450 |
+ '1': this device is a UART16550 |
+ |
+ jitter = <value> |
+ in msecs... time to block, -1 to disable it |
+ |
+ vapi_id = <hex_value> |
+ VAPI id of this instance |
+*/ |
+ |
+section uart |
+ enabled = 1 |
+ baseaddr = 0x90000000 |
+ irq = 2 |
+ channel = "file:uart0.rx,uart0.tx" |
+ /* channel = "tcp:10084" */ |
+ /* channel = "xterm:" */ |
+ jitter = -1 /* async behaviour */ |
+ 16550 = 1 |
+end |
+ |
+ |
+/* DMA SECTION |
+ |
+ This section configures the DMAs |
+ |
+ enabled = <0|1> |
+ Enable/disable the peripheral. By default it is enabled. |
+ |
+ baseaddr = <hex_value> |
+ address of first DMA register for this device |
+ |
+ irq = <value> |
+ irq number for this device |
+ |
+ vapi_id = <hex_value> |
+ VAPI id of this instance |
+*/ |
+ |
+section dma |
+ enabled = 1 |
+ baseaddr = 0x9a000000 |
+ irq = 11 |
+end |
+ |
+ |
+/* ETHERNET SECTION |
+ |
+ This section configures the ETHERNETs |
+ |
+ enabled = <0|1> |
+ Enable/disable the peripheral. By default it is enabled. |
+ |
+ baseaddr = <hex_value> |
+ address of first ethernet register for this device |
+ |
+ dma = <value> |
+ which controller is this ethernet "connected" to |
+ |
+ irq = <value> |
+ ethernet mac IRQ level |
+ |
+ rtx_type = <value> |
+ use 0 - file interface, 1 - socket interface |
+ |
+ rx_channel = <value> |
+ DMA channel used for RX |
+ |
+ tx_channel = <value> |
+ DMA channel used for TX |
+ |
+ rxfile = "<filename>" |
+ filename, where to read data from |
+ |
+ txfile = "<filename>" |
+ filename, where to write data to |
+ |
+ sockif = "<ifacename>" |
+ interface name of ethernet socket |
+ |
+ vapi_id = <hex_value> |
+ VAPI id of this instance |
+*/ |
+ |
+section ethernet |
+ enabled = 0 |
+ baseaddr = 0x92000000 |
+ /* dma = 0 */ |
+ irq = 4 |
+ rtx_type = 0 |
+ /* tx_channel = 0 */ |
+ /* rx_channel = 1 */ |
+ /*rxfile = "eth0.rx"*/ |
+ txfile = "eth0.tx" |
+ sockif = "eth0" |
+end |
+ |
+ |
+/* GPIO SECTION |
+ |
+ This section configures the GPIOs |
+ |
+ enabled = <0|1> |
+ Enable/disable the peripheral. By default it is enabled. |
+ |
+ baseaddr = <hex_value> |
+ address of first GPIO register for this device |
+ |
+ irq = <value> |
+ irq number for this device |
+ |
+ base_vapi_id = <hex_value> |
+ first VAPI id of this instance |
+ GPIO uses 8 consecutive VAPI IDs |
+*/ |
+ |
+section gpio |
+ enabled = 0 |
+ baseaddr = 0x91000000 |
+ irq = 3 |
+ base_vapi_id = 0x0200 |
+end |
+ |
+/* VGA SECTION |
+ |
+ This section configures the VGA/LCD controller |
+ |
+ enabled = <0|1> |
+ Enable/disable the peripheral. By default it is enabled. |
+ |
+ baseaddr = <hex_value> |
+ address of first VGA register |
+ |
+ irq = <value> |
+ irq number for this device |
+ |
+ refresh_rate = <value> |
+ number of cycles between screen dumps |
+ |
+ filename = "<filename>" |
+ template name for generated names (e.g. "primary" produces "primary0023.bmp") |
+*/ |
+ |
+section vga |
+ enabled = 1 |
+ baseaddr = 0x97100000 |
+ irq = 8 |
+ refresh_rate = 100000 |
+ filename = "primary" |
+end |
+ |
+ |
+/* TICK TIMER SECTION |
+ |
+ This section configures tick timer |
+ |
+ enabled = 0/1 |
+ whether tick timer is enabled |
+*/ |
+ |
+section pic |
+ enabled = 1 |
+ edge_trigger = 1 |
+end |
+ |
+/* FB SECTION |
+ |
+ This section configures the frame buffer |
+ |
+ enabled = <0|1> |
+ Enable/disable the peripheral. By default it is enabled. |
+ |
+ baseaddr = <hex_value> |
+ base address of frame buffer |
+ |
+ paladdr = <hex_value> |
+ base address of first palette entry |
+ |
+ refresh_rate = <value> |
+ number of cycles between screen dumps |
+ |
+ filename = "<filename>" |
+ template name for generated names (e.g. "primary" produces "primary0023.bmp") |
+*/ |
+ |
+section fb |
+ enabled = 1 |
+ baseaddr = 0x97000000 |
+ refresh_rate = 1000000 |
+ filename = "primary" |
+end |
+ |
+ |
+/* KBD SECTION |
+ |
+ This section configures the PS/2 compatible keyboard |
+ |
+ baseaddr = <hex_value> |
+ base address of the keyboard device |
+ |
+ rxfile = "<filename>" |
+ filename, where to read data from |
+*/ |
+ |
+section kbd |
+ enabled = 1 |
+ irq = 5 |
+ baseaddr = 0x94000000 |
+ rxfile = "kbd.rx" |
+end |
+ |
+ |
+/* ATA SECTION |
+ |
+ This section configures the ATA/ATAPI host controller |
+ |
+ baseaddr = <hex_value> |
+ address of first ATA register |
+ |
+ enabled = <0|1> |
+ Enable/disable the peripheral. By default it is enabled. |
+ |
+ irq = <value> |
+ irq number for this device |
+ |
+ debug = <value> |
+ debug level for ata models. |
+ 0: no debug messages |
+ 1: verbose messages |
+ 3: normal messages (more messages than verbose) |
+ 5: debug messages (normal debug messages) |
+ 7: flow control messages (debug statemachine flows) |
+ 9: low priority message (display everything the code does) |
+ |
+ dev_type0/1 = <value> |
+ ata device 0 type |
+ 0: NO_CONNECT: none (not connected) |
+ 1: FILE : simulated harddisk |
+ 2: LOCAL : local system harddisk |
+ |
+ dev_file0/1 = "<filename>" |
+ filename for simulated ATA device |
+ valid only if dev_type0 == 1 |
+ |
+ dev_size0/1 = <value> |
+ size of simulated hard-disk (in MBytes) |
+ valid only if dev_type0 == 1 |
+ |
+ dev_packet0/1 = <value> |
+ 0: simulated ATA device does NOT implement PACKET command feature set |
+ 1: simulated ATA device does implement PACKET command feature set |
+ |
+ FIXME: irq number |
+*/ |
+ |
+section ata |
+ enabled = 0 |
+ baseaddr = 0x9e000000 |
+ irq = 15 |
+ |
+end |
+ |
+ |
diff --exclude=.git --exclude=/gitignore -Naur x264/or1ksim_x264_sadssdmod.cfg x264-or/or1ksim_x264_sadssdmod.cfg |
--- x264/or1ksim_x264_sadssdmod.cfg 1970-01-01 01:00:00.000000000 +0100 |
+++ x264-or/or1ksim_x264_sadssdmod.cfg 2009-11-17 00:15:03.000000000 +0100 |
@@ -0,0 +1,896 @@ |
+/* sim.cfg -- Simulator configuration script file |
+ Copyright (C) 2001-2002, Marko Mlinar, markom@opencores.org |
+ |
+This file is part of OpenRISC 1000 Architectural Simulator. |
+It contains the default configuration and help about configuring |
+the simulator. |
+ |
+This program is free software; you can redistribute it and/or modify |
+it under the terms of the GNU General Public License as published by |
+the Free Software Foundation; either version 2 of the License, or |
+(at your option) any later version. |
+ |
+This program is distributed in the hope that it will be useful, |
+but WITHOUT ANY WARRANTY; without even the implied warranty of |
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
+GNU General Public License for more details. |
+ |
+You should have received a copy of the GNU General Public License |
+along with this program; if not, write to the Free Software |
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ |
+ |
+ |
+/* INTRODUCTION |
+ |
+ The or1ksim has various parameters, that are set in configuration files |
+ like this one. The user can switch between configurations at startup by |
+ specifying the required configuration file with the -f <filename.cfg> option. |
+ If no configuration file is specified or1ksim searches for the default |
+ configuration file sim.cfg. First it searches for './sim.cfg'. If this |
+ file is not found, it searches for '~/or1k/sim.cfg'. If this file is |
+ not found too, it reverts to the built-in default configuration. |
+ |
+ NOTE: Users should not rely on the built-in configuration, since the |
+ default configuration may differ between versions. |
+ Rather create a configuration file that sets all critical values. |
+ |
+ This file may contain (standard C) comments only - no // support. |
+ |
+ Configuration files may be included, using: |
+ include "file_name_to_include" |
+ |
+ Like normal configuration files, the included file is divided into |
+ sections. Each section is described in detail also. |
+ |
+ Some sections have subsections. One example of such a subsection is: |
+ |
+ device <index> |
+ instance specific parameters... |
+ enddevice |
+ |
+ which creates a device instance. |
+*/ |
+ |
+ |
+/* MEMORY SECTION |
+ |
+ This section specifies how the memory is generated and the blocks |
+ it consists of. |
+ |
+ type = random/unknown/pattern |
+ Specifies the initial memory values. |
+ 'random' generates random memory using seed 'random_seed'. |
+ 'pattern' fills memory with 'pattern'. |
+ 'unknown' does not specify how memory should be generated, |
+ leaving the memory in an undefined state. This is the fastest |
+ option. |
+ |
+ random_seed = <value> |
+ random seed for randomizer, used if type = 'random'. |
+ |
+ pattern = <value> |
+ pattern to fill memory, used if type = 'pattern'. |
+ |
+ nmemories = <value> |
+ number of memory instances connected |
+ |
+ baseaddr = <hex_value> |
+ memory start address |
+ |
+ size = <hex_value> |
+ memory size |
+ |
+ name = "<string>" |
+ memory block name |
+ |
+ ce = <value> |
+ chip enable index of the memory instance |
+ |
+ mc = <value> |
+ memory controller this memory is connected to |
+ |
+ delayr = <value> |
+ cycles, required for read access, -1 if instance does not support reading |
+ |
+ delayw = <value> |
+ cycles, required for write access, -1 if instance does not support writing |
+ |
+ log = "<filename>" |
+ filename, where to log memory accesses to, no log, if log command is not specified |
+*/ |
+ |
+ |
+section memory |
+ /*random_seed = 12345 |
+ type = random*/ |
+ pattern = 0x00 |
+ type = unknown /* Fastest */ |
+ |
+ name = "FLASH" |
+ ce = 0 |
+ mc = 0 |
+ baseaddr = 0xf0000000 |
+ size = 0x01000000 |
+ delayr = 10 |
+ delayw = -1 |
+end |
+ |
+section memory |
+ /*random_seed = 12345 |
+ type = random*/ |
+ pattern = 0x00 |
+ type = unknown /* Fastest */ |
+ |
+ name = "RAM" |
+ ce = 1 |
+ mc = 0 |
+ baseaddr = 0x00000000 |
+ size = 0x02000000 |
+ delayr = 20 |
+ delayw = 25 |
+end |
+ |
+section memory |
+ /*random_seed = 12345 |
+ type = random*/ |
+ pattern = 0x00 |
+ type = unknown /* Fastest */ |
+ |
+ name = "SRAM" |
+ mc = 0 |
+ ce = 2 |
+ baseaddr = 0xa4000000 |
+ size = 0x00100000 |
+ delayr = 1 |
+ delayw = 2 |
+end |
+ |
+ |
+/* IMMU SECTION |
+ |
+ This section configures the Instruction Memory Management Unit |
+ |
+ enabled = 0/1 |
+ '0': disabled |
+ '1': enabled |
+ (NOTE: UPR bit is set) |
+ |
+ nsets = <value> |
+ number of ITLB sets; must be power of two |
+ |
+ nways = <value> |
+ number of ITLB ways |
+ |
+ pagesize = <value> |
+ instruction page size; must be power of two |
+ |
+ entrysize = <value> |
+ instruction entry size in bytes |
+ |
+ ustates = <value> |
+ number of ITLB usage states (2, 3, 4 etc., max is 4) |
+ |
+ hitdelay = <value> |
+ number of cycles immu hit costs |
+ |
+ missdelay = <value> |
+ number of cycles immu miss costs |
+*/ |
+ |
+section immu |
+ enabled = 0 |
+ nsets = 64 |
+ nways = 1 |
+ pagesize = 8192 |
+ hitdelay = 0 |
+ missdelay = 0 |
+end |
+ |
+ |
+/* DMMU SECTION |
+ |
+ This section configures the Data Memory Management Unit |
+ |
+ enabled = 0/1 |
+ '0': disabled |
+ '1': enabled |
+ (NOTE: UPR bit is set) |
+ |
+ nsets = <value> |
+ number of DTLB sets; must be power of two |
+ |
+ nways = <value> |
+ number of DTLB ways |
+ |
+ pagesize = <value> |
+ data page size; must be power of two |
+ |
+ entrysize = <value> |
+ data entry size in bytes |
+ |
+ ustates = <value> |
+ number of DTLB usage states (2, 3, 4 etc., max is 4) |
+ |
+ hitdelay = <value> |
+ number of cycles dmmu hit costs |
+ |
+ missdelay = <value> |
+ number of cycles dmmu miss costs |
+*/ |
+ |
+section dmmu |
+ enabled = 0 |
+ nsets = 64 |
+ nways = 1 |
+ pagesize = 8192 |
+ hitdelay = 0 |
+ missdelay = 0 |
+end |
+ |
+ |
+/* IC SECTION |
+ |
+ This section configures the Instruction Cache |
+ |
+ enabled = 0/1 |
+ '0': disabled |
+ '1': enabled |
+ (NOTE: UPR bit is set) |
+ |
+ nsets = <value> |
+ number of IC sets; must be power of two |
+ |
+ nways = <value> |
+ number of IC ways |
+ |
+ blocksize = <value> |
+ IC block size in bytes; must be power of two |
+ |
+ ustates = <value> |
+ number of IC usage states (2, 3, 4 etc., max is 4) |
+ |
+ hitdelay = <value> |
+ number of cycles ic hit costs |
+ |
+ missdelay = <value> |
+ number of cycles ic miss costs |
+*/ |
+ |
+section ic |
+ enabled = 0 |
+ nsets = 512 |
+ nways = 1 |
+ blocksize = 16 |
+ hitdelay = 20 |
+ missdelay = 20 |
+end |
+ |
+ |
+/* DC SECTION |
+ |
+ This section configures the Data Cache |
+ |
+ enabled = 0/1 |
+ '0': disabled |
+ '1': enabled |
+ (NOTE: UPR bit is set) |
+ |
+ nsets = <value> |
+ number of DC sets; must be power of two |
+ |
+ nways = <value> |
+ number of DC ways |
+ |
+ blocksize = <value> |
+ DC block size in bytes; must be power of two |
+ |
+ ustates = <value> |
+ number of DC usage states (2, 3, 4 etc., max is 4) |
+ |
+ load_hitdelay = <value> |
+ number of cycles dc load hit costs |
+ |
+ load_missdelay = <value> |
+ number of cycles dc load miss costs |
+ |
+ store_hitdelay = <value> |
+ number of cycles dc store hit costs |
+ |
+ store_missdelay = <value> |
+ number of cycles dc store miss costs |
+*/ |
+ |
+section dc |
+ enabled = 0 |
+ nsets = 512 |
+ nways = 1 |
+ blocksize = 16 |
+ load_hitdelay = 20 |
+ load_missdelay = 20 |
+ store_hitdelay = 20 |
+ store_missdelay = 20 |
+end |
+ |
+ |
+/* SIM SECTION |
+ |
+ This section specifies how or1ksim should behave. |
+ |
+ verbose = 0/1 |
+ '0': don't print extra messages |
+ '1': print extra messages |
+ |
+ debug = 0-9 |
+ 0 : no debug messages |
+ 1-9: debug message level. |
+ higher numbers produce more messages |
+ |
+ profile = 0/1 |
+ '0': don't generate profiling file 'sim.profile' |
+ '1': generate profiling file 'sim.profile' |
+ |
+ prof_fn = "<filename>" |
+ optional filename for the profiling file. |
+ valid only if 'profile' is set |
+ |
+ mprofile = 0/1 |
+ '0': don't generate memory profiling file 'sim.mprofile' |
+ '1': generate memory profiling file 'sim.mprofile' |
+ |
+ mprof_fn = "<filename>" |
+ optional filename for the memory profiling file. |
+ valid only if 'mprofile' is set |
+ |
+ history = 0/1 |
+ '0': don't track execution flow |
+ '1': track execution flow |
+ Execution flow can be tracked for the simulator's |
+ 'hist' command. Useful for back-trace debugging. |
+ |
+ iprompt = 0/1 |
+ '0': start in <not interactive prompt> (so what do we start in ???) |
+ '1': start in interactive prompt. |
+ |
+ exe_log = 0/1 |
+ '0': don't generate execution log. |
+ '1': generate execution log. |
+ |
+ exe_log_type = default/hardware/simple/software |
+ type of execution log, default is used when not specified |
+ |
+ exe_log_start = <value> |
+ index of first instruction to start logging, default = 0 |
+ |
+ exe_log_end = <value> |
+ index of last instruction to end logging; not limited, if omitted |
+ |
+ exe_log_marker = <value> |
+ <value> specifies number of instructions before horizontal marker is |
+ printed; if zero, markers are disabled (default) |
+ |
+ exe_log_fn = "<filename>" |
+ filename for the execution log file. |
+ valid only if 'exe_log' is set |
+ |
+ clkcycle = <value>[ps|ns|us|ms] |
+ specifies time measurement for one cycle |
+*/ |
+ |
+section sim |
+ verbose = 1 |
+ debug = 0 |
+ profile = 0 |
+ history = 0 |
+ |
+ clkcycle = 10ns |
+end |
+ |
+ |
+/* SECTION VAPI |
+ |
+ This section configures the Verification API, used for Advanced |
+ Core Verification. |
+ |
+ enabled = 0/1 |
+ '0': disable VAPI server |
+ '1': enable/start VAPI server |
+ |
+ server_port = <value> |
+ TCP/IP port to start VAPI server on |
+ |
+ log_enabled = 0/1 |
+ '0': disable VAPI requests logging |
+ '1': enable VAPI requests logging |
+ |
+ hide_device_id = 0/1 |
+ '0': don't log device id (for compatibility with old version) |
+ '1': log device id |
+ |
+ |
+ vapi_log_fn = "<filename>" |
+ filename for the log file. |
+ valid only if log_enabled is set |
+*/ |
+ |
+section VAPI |
+ enabled = 0 |
+ server_port = 9998 |
+ log_enabled = 0 |
+ vapi_log_fn = "vapi.log" |
+end |
+ |
+ |
+/* CPU SECTION |
+ |
+ This section specifies various CPU parameters. |
+ |
+ ver = <value> |
+ rev = <value> |
+ specifies version and revision of the CPU used |
+ |
+ upr = <value> |
+ changes the upr register |
+ |
+ sr = <value> |
+ sets the initial Supervision Register value |
+ |
+ superscalar = 0/1 |
+ '0': CPU is scalar |
+ '1': CPU is superscalar |
+ (modify cpu/or32/execute.c to tune superscalar model) |
+ |
+ hazards = 0/1 |
+ '0': don't track data hazards in superscalar CPU |
+ '1': track data hazards in superscalar CPU |
+ If tracked, data hazards can be displayed using the |
+ simulator's 'r' command. |
+ |
+ dependstats = 0/1 |
+ '0': don't calculate inter-instruction dependencies. |
+ '1': calculate inter-instruction dependencies. |
+ If calculated, inter-instruction dependencies can be |
+ displayed using the simulator's 'stat' command. |
+ |
+ sbuf_len = <value> |
+ length of store buffer (<= 256), 0 = disabled |
+*/ |
+ |
+section cpu |
+ ver = 0x12 |
+ cfg = 0x00 |
+ rev = 0x01 |
+ /* upr = */ |
+ superscalar = 0 |
+ hazards = 0 |
+ dependstats = 0 |
+ sbuf_len = 0 |
+ hardfloat = 1 |
+end |
+ |
+ |
+/* PM SECTION |
+ |
+ This section specifies Power Management parameters |
+ |
+ enabled = 0/1 |
+ '0': disable power management |
+ '1': enable power management |
+*/ |
+ |
+section pm |
+ enabled = 0 |
+end |
+ |
+ |
+/* BPB SECTION |
+ |
+ This section specifies how branch prediction should behave. |
+ |
+ enabled = 0/1 |
+ '0': disable branch prediction |
+ '1': enable branch prediction |
+ |
+ btic = 0/1 |
+ '0': disable branch target instruction cache model |
+ '1': enable branch target instruction cache model |
+ |
+ sbp_bf_fwd = 0/1 |
+ Static branch prediction for 'l.bf' |
+ '0': don't use forward prediction |
+ '1': use forward prediction |
+ |
+ sbp_bnf_fwd = 0/1 |
+ Static branch prediction for 'l.bnf' |
+ '0': don't use forward prediction |
+ '1': use forward prediction |
+ |
+ hitdelay = <value> |
+ number of cycles bpb hit costs |
+ |
+ missdelay = <value> |
+ number of cycles bpb miss costs |
+*/ |
+ |
+section bpb |
+ enabled = 0 |
+ btic = 0 |
+ sbp_bf_fwd = 0 |
+ sbp_bnf_fwd = 0 |
+ hitdelay = 0 |
+ missdelay = 0 |
+end |
+ |
+ |
+/* DEBUG SECTION |
+ |
+ This section specifies how the debug unit should behave. |
+ |
+ enabled = 0/1 |
+ '0': disable debug unit |
+ '1': enable debug unit |
+ |
+ gdb_enabled = 0/1 |
+ '0': don't start gdb server |
+ '1': start gdb server at port 'server_port' |
+ |
+ server_port = <value> |
+ TCP/IP port to start gdb server on |
+ valid only if gdb_enabled is set |
+ |
+ vapi_id = <hex_value> |
+ Used to create "fake" vapi log file containing the JTAG proxy messages. |
+*/ |
+section debug |
+ enabled = 0 |
+ rsp_enabled = 0 |
+ rsp_port = 5554 |
+ /*server_port = 9999*/ |
+end |
+ |
+ |
+/* MC SECTION |
+ |
+ This section configures the memory controller |
+ |
+ enabled = 0/1 |
+ '0': disable memory controller |
+ '1': enable memory controller |
+ |
+ baseaddr = <hex_value> |
+ address of first MC register |
+ |
+ POC = <hex_value> |
+ Power On Configuration register |
+ |
+ index = <value> |
+ Index of this memory controller amongst all the memory controllers |
+*/ |
+ |
+section mc |
+ enabled = 0 |
+ baseaddr = 0x93000000 |
+ POC = 0x00000008 /* Power on configuration register */ |
+ index = 0 |
+end |
+ |
+ |
+/* UART SECTION |
+ |
+ This section configures the UARTs |
+ |
+ enabled = <0|1> |
+ Enable/disable the peripheral. By default it is enabled. |
+ |
+ baseaddr = <hex_value> |
+ address of first UART register for this device |
+ |
+ |
+ channel = <channeltype>:<args> |
+ |
+ The channel parameter indicates the source of received UART characters |
+ and the sink for transmitted UART characters. |
+ |
+ The <channeltype> can be either "file", "xterm", "tcp", "fd", or "tty" |
+ (without quotes). |
+ |
+ A) To send/receive characters from a pair of files, use a file |
+ channel: |
+ |
+ channel=file:<rxfile>,<txfile> |
+ |
+ B) To create an interactive terminal window, use an xterm channel: |
+ |
+ channel=xterm:[<xterm_arg>]* |
+ |
+ C) To create a bidirectional tcp socket which one could, for example, |
+ access via telnet, use a tcp channel: |
+ |
+ channel=tcp:<port number> |
+ |
+ D) To cause the UART to read/write from existing numeric file |
+ descriptors, use an fd channel: |
+ |
+ channel=fd:<rx file descriptor num>,<tx file descriptor num> |
+ |
+ E) To connect the UART to a physical serial port, create a tty |
+ channel: |
+ |
+ channel=tty:device=/dev/ttyS0,baud=9600 |
+ |
+ irq = <value> |
+ irq number for this device |
+ |
+ 16550 = 0/1 |
+ '0': this device is a UART16450 |
+ '1': this device is a UART16550 |
+ |
+ jitter = <value> |
+ in msecs... time to block, -1 to disable it |
+ |
+ vapi_id = <hex_value> |
+ VAPI id of this instance |
+*/ |
+ |
+section uart |
+ enabled = 1 |
+ baseaddr = 0x90000000 |
+ irq = 2 |
+ channel = "file:uart0.rx,uart0.tx" |
+ /* channel = "tcp:10084" */ |
+ /* channel = "xterm:" */ |
+ jitter = -1 /* async behaviour */ |
+ 16550 = 1 |
+end |
+ |
+ |
+/* DMA SECTION |
+ |
+ This section configures the DMAs |
+ |
+ enabled = <0|1> |
+ Enable/disable the peripheral. By default it is enabled. |
+ |
+ baseaddr = <hex_value> |
+ address of first DMA register for this device |
+ |
+ irq = <value> |
+ irq number for this device |
+ |
+ vapi_id = <hex_value> |
+ VAPI id of this instance |
+*/ |
+ |
+section dma |
+ enabled = 0 |
+ baseaddr = 0x9a000000 |
+ irq = 11 |
+end |
+ |
+ |
+/* ETHERNET SECTION |
+ |
+ This section configures the ETHERNETs |
+ |
+ enabled = <0|1> |
+ Enable/disable the peripheral. By default it is enabled. |
+ |
+ baseaddr = <hex_value> |
+ address of first ethernet register for this device |
+ |
+ dma = <value> |
+ which controller is this ethernet "connected" to |
+ |
+ irq = <value> |
+ ethernet mac IRQ level |
+ |
+ rtx_type = <value> |
+ use 0 - file interface, 1 - socket interface |
+ |
+ rx_channel = <value> |
+ DMA channel used for RX |
+ |
+ tx_channel = <value> |
+ DMA channel used for TX |
+ |
+ rxfile = "<filename>" |
+ filename, where to read data from |
+ |
+ txfile = "<filename>" |
+ filename, where to write data to |
+ |
+ sockif = "<ifacename>" |
+ interface name of ethernet socket |
+ |
+ vapi_id = <hex_value> |
+ VAPI id of this instance |
+*/ |
+ |
+section ethernet |
+ enabled = 0 |
+ baseaddr = 0x92000000 |
+ /* dma = 0 */ |
+ irq = 4 |
+ rtx_type = 0 |
+ /* tx_channel = 0 */ |
+ /* rx_channel = 1 */ |
+ /*rxfile = "eth0.rx"*/ |
+ txfile = "eth0.tx" |
+ sockif = "eth0" |
+end |
+ |
+ |
+/* GPIO SECTION |
+ |
+ This section configures the GPIOs |
+ |
+ enabled = <0|1> |
+ Enable/disable the peripheral. By default it is enabled. |
+ |
+ baseaddr = <hex_value> |
+ address of first GPIO register for this device |
+ |
+ irq = <value> |
+ irq number for this device |
+ |
+ base_vapi_id = <hex_value> |
+ first VAPI id of this instance |
+ GPIO uses 8 consecutive VAPI IDs |
+*/ |
+ |
+section gpio |
+ enabled = 0 |
+ baseaddr = 0x91000000 |
+ irq = 3 |
+ base_vapi_id = 0x0200 |
+end |
+ |
+/* VGA SECTION |
+ |
+ This section configures the VGA/LCD controller |
+ |
+ enabled = <0|1> |
+ Enable/disable the peripheral. By default it is enabled. |
+ |
+ baseaddr = <hex_value> |
+ address of first VGA register |
+ |
+ irq = <value> |
+ irq number for this device |
+ |
+ refresh_rate = <value> |
+ number of cycles between screen dumps |
+ |
+ filename = "<filename>" |
+ template name for generated names (e.g. "primary" produces "primary0023.bmp") |
+*/ |
+ |
+section vga |
+ enabled = 0 |
+ baseaddr = 0x97100000 |
+ irq = 8 |
+ refresh_rate = 100000 |
+ filename = "primary" |
+end |
+ |
+ |
+/* PIC SECTION |
+ |
+ This section configures the programmable interrupt controller |
+ |
+ enabled = 0/1 |
+ whether the programmable interrupt controller is enabled |
+*/ |
+ |
+section pic |
+ enabled = 1 |
+ edge_trigger = 1 |
+end |
+ |
+/* FB SECTION |
+ |
+ This section configures the frame buffer |
+ |
+ enabled = <0|1> |
+ Enable/disable the peripheral. By default it is enabled. |
+ |
+ baseaddr = <hex_value> |
+ base address of frame buffer |
+ |
+ paladdr = <hex_value> |
+ base address of first palette entry |
+ |
+ refresh_rate = <value> |
+ number of cycles between screen dumps |
+ |
+ filename = "<filename>" |
+ template name for generated names (e.g. "primary" produces "primary0023.bmp") |
+*/ |
+ |
+section fb |
+ enabled = 0 |
+ baseaddr = 0x97000000 |
+ refresh_rate = 1000000 |
+ filename = "primary" |
+end |
+ |
+ |
+/* KBD SECTION |
+ |
+ This section configures the PS/2 compatible keyboard |
+ |
+ baseaddr = <hex_value> |
+ base address of the keyboard device |
+ |
+ rxfile = "<filename>" |
+ filename, where to read data from |
+*/ |
+ |
+section kbd |
+ enabled = 0 |
+ irq = 5 |
+ baseaddr = 0x94000000 |
+ rxfile = "kbd.rx" |
+end |
+ |
+ |
+/* ATA SECTION |
+ |
+ This section configures the ATA/ATAPI host controller |
+ |
+ baseaddr = <hex_value> |
+ address of first ATA register |
+ |
+ enabled = <0|1> |
+ Enable/disable the peripheral. By default it is enabled. |
+ |
+ irq = <value> |
+ irq number for this device |
+ |
+ debug = <value> |
+ debug level for ata models. |
+ 0: no debug messages |
+ 1: verbose messages |
+ 3: normal messages (more messages than verbose) |
+ 5: debug messages (normal debug messages) |
+ 7: flow control messages (debug statemachine flows) |
+ 9: low priority message (display everything the code does) |
+ |
+ dev_type0/1 = <value> |
+ ata device 0 type |
+ 0: NO_CONNECT: none (not connected) |
+ 1: FILE : simulated harddisk |
+ 2: LOCAL : local system harddisk |
+ |
+ dev_file0/1 = "<filename>" |
+ filename for simulated ATA device |
+ valid only if dev_type0 == 1 |
+ |
+ dev_size0/1 = <value> |
+ size of simulated hard-disk (in MBytes) |
+ valid only if dev_type0 == 1 |
+ |
+ dev_packet0/1 = <value> |
+ 0: simulated ATA device does NOT implement PACKET command feature set |
+ 1: simulated ATA device does implement PACKET command feature set |
+ |
+ FIXME: irq number |
+*/ |
+ |
+section ata |
+ enabled = 0 |
+ baseaddr = 0x9e000000 |
+ irq = 15 |
+ |
+end |
+ |
+ |
+ |
+/* SAD/SSD MODULE SECTION |
+ This module calculates SAD/SSD on a set of data, specifically |
+ for use with the x264 H.264 encoding software. |
+*/ |
+section x264_sadssdmod |
+ enabled = 1 |
+ baseaddr = 0x26400000 |
+ name = "x264_sadssdmod0" |
+end |
diff --exclude=.git --exclude=/gitignore -Naur x264/rsp_or1ksim_x264.cfg x264-or/rsp_or1ksim_x264.cfg |
--- x264/rsp_or1ksim_x264.cfg 1970-01-01 01:00:00.000000000 +0100 |
+++ x264-or/rsp_or1ksim_x264.cfg 2009-11-15 19:56:58.000000000 +0100 |
@@ -0,0 +1,886 @@ |
+/* sim.cfg -- Simulator configuration script file |
+ Copyright (C) 2001-2002, Marko Mlinar, markom@opencores.org |
+ |
+This file is part of OpenRISC 1000 Architectural Simulator. |
+It contains the default configuration and help about configuring |
+the simulator. |
+ |
+This program is free software; you can redistribute it and/or modify |
+it under the terms of the GNU General Public License as published by |
+the Free Software Foundation; either version 2 of the License, or |
+(at your option) any later version. |
+ |
+This program is distributed in the hope that it will be useful, |
+but WITHOUT ANY WARRANTY; without even the implied warranty of |
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
+GNU General Public License for more details. |
+ |
+You should have received a copy of the GNU General Public License |
+along with this program; if not, write to the Free Software |
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ |
+ |
+ |
+/* INTRODUCTION |
+ |
+ The or1ksim has various parameters, that are set in configuration files |
+ like this one. The user can switch between configurations at startup by |
+ specifying the required configuration file with the -f <filename.cfg> option. |
+ If no configuration file is specified or1ksim searches for the default |
+ configuration file sim.cfg. First it searches for './sim.cfg'. If this |
+ file is not found, it searches for '~/or1k/sim.cfg'. If this file is |
+ not found too, it reverts to the built-in default configuration. |
+ |
+ NOTE: Users should not rely on the built-in configuration, since the |
+ default configuration may differ between versions. |
+ Rather create a configuration file that sets all critical values. |
+ |
+ This file may contain (standard C) comments only - no // support. |
+ |
+ Configuration files may be included, using: |
+ include "file_name_to_include" |
+ |
+ Like normal configuration files, the included file is divided into |
+ sections. Each section is described in detail also. |
+ |
+ Some sections have subsections. One example of such a subsection is: |
+ |
+ device <index> |
+ instance specific parameters... |
+ enddevice |
+ |
+ which creates a device instance. |
+*/ |
+ |
+ |
+/* MEMORY SECTION |
+ |
+ This section specifies how the memory is generated and the blocks |
+ it consists of. |
+ |
+ type = random/unknown/pattern |
+ Specifies the initial memory values. |
+ 'random' generates random memory using seed 'random_seed'. |
+ 'pattern' fills memory with 'pattern'. |
+ 'unknown' does not specify how memory should be generated, |
+ leaving the memory in an undefined state. This is the fastest |
+ option. |
+ |
+ random_seed = <value> |
+ random seed for randomizer, used if type = 'random'. |
+ |
+ pattern = <value> |
+ pattern to fill memory, used if type = 'pattern'. |
+ |
+ nmemories = <value> |
+ number of memory instances connected |
+ |
+ baseaddr = <hex_value> |
+ memory start address |
+ |
+ size = <hex_value> |
+ memory size |
+ |
+ name = "<string>" |
+ memory block name |
+ |
+ ce = <value> |
+ chip enable index of the memory instance |
+ |
+ mc = <value> |
+ memory controller this memory is connected to |
+ |
+ delayr = <value> |
+ cycles, required for read access, -1 if instance does not support reading |
+ |
+ delayw = <value> |
+ cycles, required for write access, -1 if instance does not support writing |
+ |
+ log = "<filename>" |
+ filename, where to log memory accesses to, no log, if log command is not specified |
+*/ |
+ |
+ |
+section memory |
+ /*random_seed = 12345 |
+ type = random*/ |
+ pattern = 0x00 |
+ type = unknown /* Fastest */ |
+ |
+ name = "FLASH" |
+ ce = 0 |
+ mc = 0 |
+ baseaddr = 0xf0000000 |
+ size = 0x01000000 |
+ delayr = 10 |
+ delayw = -1 |
+end |
+ |
+section memory |
+ /*random_seed = 12345 |
+ type = random*/ |
+ pattern = 0x00 |
+ type = unknown /* Fastest */ |
+ |
+ name = "RAM" |
+ ce = 1 |
+ mc = 0 |
+ baseaddr = 0x00000000 |
+ size = 0x02000000 |
+ delayr = 20 |
+ delayw = 25 |
+end |
+ |
+section memory |
+ /*random_seed = 12345 |
+ type = random*/ |
+ pattern = 0x00 |
+ type = unknown /* Fastest */ |
+ |
+ name = "SRAM" |
+ mc = 0 |
+ ce = 2 |
+ baseaddr = 0xa4000000 |
+ size = 0x00100000 |
+ delayr = 1 |
+ delayw = 2 |
+end |
+ |
+ |
+/* IMMU SECTION |
+ |
+ This section configures the Instruction Memory Management Unit |
+ |
+ enabled = 0/1 |
+ '0': disabled |
+ '1': enabled |
+ (NOTE: UPR bit is set) |
+ |
+ nsets = <value> |
+ number of ITLB sets; must be power of two |
+ |
+ nways = <value> |
+ number of ITLB ways |
+ |
+ pagesize = <value> |
+ instruction page size; must be power of two |
+ |
+ entrysize = <value> |
+ instruction entry size in bytes |
+ |
+ ustates = <value> |
+ number of ITLB usage states (2, 3, 4 etc., max is 4) |
+ |
+ hitdelay = <value> |
+ number of cycles immu hit costs |
+ |
+ missdelay = <value> |
+ number of cycles immu miss costs |
+*/ |
+ |
+section immu |
+ enabled = 1 |
+ nsets = 64 |
+ nways = 1 |
+ pagesize = 8192 |
+ hitdelay = 0 |
+ missdelay = 0 |
+end |
+ |
+ |
+/* DMMU SECTION |
+ |
+ This section configures the Data Memory Management Unit |
+ |
+ enabled = 0/1 |
+ '0': disabled |
+ '1': enabled |
+ (NOTE: UPR bit is set) |
+ |
+ nsets = <value> |
+ number of DTLB sets; must be power of two |
+ |
+ nways = <value> |
+ number of DTLB ways |
+ |
+ pagesize = <value> |
+ data page size; must be power of two |
+ |
+ entrysize = <value> |
+ data entry size in bytes |
+ |
+ ustates = <value> |
+ number of DTLB usage states (2, 3, 4 etc., max is 4) |
+ |
+ hitdelay = <value> |
+ number of cycles dmmu hit costs |
+ |
+ missdelay = <value> |
+ number of cycles dmmu miss costs |
+*/ |
+ |
+section dmmu |
+ enabled = 1 |
+ nsets = 64 |
+ nways = 1 |
+ pagesize = 8192 |
+ hitdelay = 0 |
+ missdelay = 0 |
+end |
+ |
+ |
+/* IC SECTION |
+ |
+ This section configures the Instruction Cache |
+ |
+ enabled = 0/1 |
+ '0': disabled |
+ '1': enabled |
+ (NOTE: UPR bit is set) |
+ |
+ nsets = <value> |
+ number of IC sets; must be power of two |
+ |
+ nways = <value> |
+ number of IC ways |
+ |
+ blocksize = <value> |
+ IC block size in bytes; must be power of two |
+ |
+ ustates = <value> |
+ number of IC usage states (2, 3, 4 etc., max is 4) |
+ |
+ hitdelay = <value> |
+ number of cycles ic hit costs |
+ |
+ missdelay = <value> |
+ number of cycles ic miss costs |
+*/ |
+ |
+section ic |
+ enabled = 0 |
+ nsets = 512 |
+ nways = 1 |
+ blocksize = 16 |
+ hitdelay = 20 |
+ missdelay = 20 |
+end |
+ |
+ |
+/* DC SECTION |
+ |
+ This section configures the Data Cache |
+ |
+ enabled = 0/1 |
+ '0': disabled |
+ '1': enabled |
+ (NOTE: UPR bit is set) |
+ |
+ nsets = <value> |
+ number of DC sets; must be power of two |
+ |
+ nways = <value> |
+ number of DC ways |
+ |
+ blocksize = <value> |
+ DC block size in bytes; must be power of two |
+ |
+ ustates = <value> |
+ number of DC usage states (2, 3, 4 etc., max is 4) |
+ |
+ load_hitdelay = <value> |
+ number of cycles dc load hit costs |
+ |
+ load_missdelay = <value> |
+ number of cycles dc load miss costs |
+ |
+ store_hitdelay = <value> |
+ number of cycles dc store hit costs |
+ |
+ store_missdelay = <value> |
+ number of cycles dc store miss costs |
+*/ |
+ |
+section dc |
+ enabled = 0 |
+ nsets = 512 |
+ nways = 1 |
+ blocksize = 16 |
+ load_hitdelay = 20 |
+ load_missdelay = 20 |
+ store_hitdelay = 20 |
+ store_missdelay = 20 |
+end |
+ |
+ |
+/* SIM SECTION |
+ |
+ This section specifies how or1ksim should behave. |
+ |
+ verbose = 0/1 |
+ '0': don't print extra messages |
+ '1': print extra messages |
+ |
+ debug = 0-9 |
+ 0 : no debug messages |
+ 1-9: debug message level. |
+ higher numbers produce more messages |
+ |
+ profile = 0/1 |
+ '0': don't generate profiling file 'sim.profile' |
+ '1': generate profiling file 'sim.profile' |
+ |
+ prof_fn = "<filename>" |
+ optional filename for the profiling file. |
+ valid only if 'profile' is set |
+ |
+ mprofile = 0/1 |
+ '0': don't generate memory profiling file 'sim.mprofile' |
+ '1': generate memory profiling file 'sim.mprofile' |
+ |
+ mprof_fn = "<filename>" |
+ optional filename for the memory profiling file. |
+ valid only if 'mprofile' is set |
+ |
+ history = 0/1 |
+ '0': don't track execution flow |
+ '1': track execution flow |
+ Execution flow can be tracked for the simulator's |
+ 'hist' command. Useful for back-trace debugging. |
+ |
+ iprompt = 0/1 |
+ '0': start in <not interactive prompt> (so what do we start in ???) |
+ '1': start in interactive prompt. |
+ |
+ exe_log = 0/1 |
+ '0': don't generate execution log. |
+ '1': generate execution log. |
+ |
+ exe_log_type = default/hardware/simple/software |
+ type of execution log, default is used when not specified |
+ |
+ exe_log_start = <value> |
+ index of first instruction to start logging, default = 0 |
+ |
+ exe_log_end = <value> |
+ index of last instruction to end logging; not limited, if omitted |
+ |
+ exe_log_marker = <value> |
+ <value> specifies number of instructions before horizontal marker is |
+ printed; if zero, markers are disabled (default) |
+ |
+ exe_log_fn = "<filename>" |
+ filename for the execution log file. |
+ valid only if 'exe_log' is set |
+ |
+ clkcycle = <value>[ps|ns|us|ms] |
+ specifies time measurement for one cycle |
+*/ |
+ |
+section sim |
+ verbose = 1 |
+ debug = 0 |
+ profile = 0 |
+ history = 0 |
+ |
+ clkcycle = 10ns |
+end |
+ |
+ |
+/* SECTION VAPI |
+ |
+ This section configures the Verification API, used for Advanced |
+ Core Verification. |
+ |
+ enabled = 0/1 |
+ '0': disable VAPI server |
+ '1': enable/start VAPI server |
+ |
+ server_port = <value> |
+ TCP/IP port to start VAPI server on |
+ |
+ log_enabled = 0/1 |
+ '0': disable VAPI requests logging |
+ '1': enable VAPI requests logging |
+ |
+ hide_device_id = 0/1 |
+ '0': don't log device id (for compatibility with old version) |
+ '1': log device id |
+ |
+ |
+ vapi_log_fn = "<filename>" |
+ filename for the log file. |
+ valid only if log_enabled is set |
+*/ |
+ |
+section VAPI |
+ enabled = 0 |
+ server_port = 9998 |
+ log_enabled = 0 |
+ vapi_log_fn = "vapi.log" |
+end |
+ |
+ |
+/* CPU SECTION |
+ |
+ This section specifies various CPU parameters. |
+ |
+ ver = <value> |
+ rev = <value> |
+ specifies version and revision of the CPU used |
+ |
+ upr = <value> |
+ changes the upr register |
+ |
+ sr = <value> |
+ sets the initial Supervision Register value |
+ |
+ superscalar = 0/1 |
+ '0': CPU is scalar |
+ '1': CPU is superscalar |
+ (modify cpu/or32/execute.c to tune superscalar model) |
+ |
+ hazards = 0/1 |
+ '0': don't track data hazards in superscalar CPU |
+ '1': track data hazards in superscalar CPU |
+ If tracked, data hazards can be displayed using the |
+ simulator's 'r' command. |
+ |
+ dependstats = 0/1 |
+ '0': don't calculate inter-instruction dependencies. |
+ '1': calculate inter-instruction dependencies. |
+ If calculated, inter-instruction dependencies can be |
+ displayed using the simulator's 'stat' command. |
+ |
+ sbuf_len = <value> |
+ length of store buffer (<= 256), 0 = disabled |
+*/ |
+ |
+section cpu |
+ ver = 0x12 |
+ cfg = 0x00 |
+ rev = 0x01 |
+ /* upr = */ |
+ superscalar = 0 |
+ hazards = 0 |
+ dependstats = 0 |
+ sbuf_len = 0 |
+ hardfloat = 1 |
+end |
+ |
+ |
+/* PM SECTION |
+ |
+ This section specifies Power Management parameters |
+ |
+ enabled = 0/1 |
+ '0': disable power management |
+ '1': enable power management |
+*/ |
+ |
+section pm |
+ enabled = 0 |
+end |
+ |
+ |
+/* BPB SECTION |
+ |
+ This section specifies how branch prediction should behave. |
+ |
+ enabled = 0/1 |
+ '0': disable branch prediction |
+ '1': enable branch prediction |
+ |
+ btic = 0/1 |
+ '0': disable branch target instruction cache model |
+ '1': enable branch target instruction cache model |
+ |
+ sbp_bf_fwd = 0/1 |
+ Static branch prediction for 'l.bf' |
+ '0': don't use forward prediction |
+ '1': use forward prediction |
+ |
+ sbp_bnf_fwd = 0/1 |
+ Static branch prediction for 'l.bnf' |
+ '0': don't use forward prediction |
+ '1': use forward prediction |
+ |
+ hitdelay = <value> |
+ number of cycles bpb hit costs |
+ |
+ missdelay = <value> |
+ number of cycles bpb miss costs |
+*/ |
+ |
+section bpb |
+ enabled = 0 |
+ btic = 0 |
+ sbp_bf_fwd = 0 |
+ sbp_bnf_fwd = 0 |
+ hitdelay = 0 |
+ missdelay = 0 |
+end |
+ |
+ |
+/* DEBUG SECTION |
+ |
+ This section specifies how the debug unit should behave. |
+ |
+ enabled = 0/1 |
+ '0': disable debug unit |
+ '1': enable debug unit |
+ |
+ gdb_enabled = 0/1 |
+ '0': don't start gdb server |
+ '1': start gdb server at port 'server_port' |
+ |
+ server_port = <value> |
+ TCP/IP port to start gdb server on |
+ valid only if gdb_enabled is set |
+ |
+ vapi_id = <hex_value> |
+ Used to create "fake" vapi log file containing the JTAG proxy messages. |
+*/ |
+section debug |
+ enabled = 1 |
+ rsp_enabled = 1 |
+ rsp_port = 5554 |
+ /*server_port = 9999*/ |
+end |
+ |
+ |
+/* MC SECTION |
+ |
+ This section configures the memory controller |
+ |
+ enabled = 0/1 |
+ '0': disable memory controller |
+ '1': enable memory controller |
+ |
+ baseaddr = <hex_value> |
+ address of first MC register |
+ |
+ POC = <hex_value> |
+ Power On Configuration register |
+ |
+ index = <value> |
+ Index of this memory controller amongst all the memory controllers |
+*/ |
+ |
+section mc |
+ enabled = 0 |
+ baseaddr = 0x93000000 |
+ POC = 0x00000008 /* Power on configuration register */ |
+ index = 0 |
+end |
+ |
+ |
+/* UART SECTION |
+ |
+ This section configures the UARTs |
+ |
+ enabled = <0|1> |
+ Enable/disable the peripheral. By default it is enabled. |
+ |
+ baseaddr = <hex_value> |
+ address of first UART register for this device |
+ |
+ |
+ channel = <channeltype>:<args> |
+ |
+ The channel parameter indicates the source of received UART characters |
+ and the sink for transmitted UART characters. |
+ |
+ The <channeltype> can be either "file", "xterm", "tcp", "fd", or "tty" |
+ (without quotes). |
+ |
+ A) To send/receive characters from a pair of files, use a file |
+ channel: |
+ |
+ channel=file:<rxfile>,<txfile> |
+ |
+ B) To create an interactive terminal window, use an xterm channel: |
+ |
+ channel=xterm:[<xterm_arg>]* |
+ |
+ C) To create a bidirectional tcp socket which one could, for example, |
+ access via telnet, use a tcp channel: |
+ |
+ channel=tcp:<port number> |
+ |
+ D) To cause the UART to read/write from existing numeric file |
+ descriptors, use an fd channel: |
+ |
+ channel=fd:<rx file descriptor num>,<tx file descriptor num> |
+ |
+ E) To connect the UART to a physical serial port, create a tty |
+ channel: |
+ |
+ channel=tty:device=/dev/ttyS0,baud=9600 |
+ |
+ irq = <value> |
+ irq number for this device |
+ |
+ 16550 = 0/1 |
+ '0': this device is a UART16450 |
+ '1': this device is a UART16550 |
+ |
+ jitter = <value> |
+ in msecs... time to block, -1 to disable it |
+ |
+ vapi_id = <hex_value> |
+ VAPI id of this instance |
+*/ |
+ |
+section uart |
+ enabled = 1 |
+ baseaddr = 0x90000000 |
+ irq = 2 |
+ channel = "file:uart0.rx,uart0.tx" |
+ /* channel = "tcp:10084" */ |
+ /* channel = "xterm:" */ |
+ jitter = -1 /* async behaviour */ |
+ 16550 = 1 |
+end |
+ |
+ |
+/* DMA SECTION |
+ |
+ This section configures the DMAs |
+ |
+ enabled = <0|1> |
+ Enable/disable the peripheral. By default it is enabled. |
+ |
+ baseaddr = <hex_value> |
+ address of first DMA register for this device |
+ |
+ irq = <value> |
+ irq number for this device |
+ |
+ vapi_id = <hex_value> |
+ VAPI id of this instance |
+*/ |
+ |
+section dma |
+ enabled = 1 |
+ baseaddr = 0x9a000000 |
+ irq = 11 |
+end |
+ |
+ |
+/* ETHERNET SECTION |
+ |
+ This section configures the ETHERNETs |
+ |
+ enabled = <0|1> |
+ Enable/disable the peripheral. By default it is enabled. |
+ |
+ baseaddr = <hex_value> |
+ address of first ethernet register for this device |
+ |
+ dma = <value> |
+ which controller is this ethernet "connected" to |
+ |
+ irq = <value> |
+ ethernet mac IRQ level |
+ |
+ rtx_type = <value> |
+ use 0 - file interface, 1 - socket interface |
+ |
+ rx_channel = <value> |
+ DMA channel used for RX |
+ |
+ tx_channel = <value> |
+ DMA channel used for TX |
+ |
+ rxfile = "<filename>" |
+ filename, where to read data from |
+ |
+ txfile = "<filename>" |
+ filename, where to write data to |
+ |
+ sockif = "<ifacename>" |
+ interface name of ethernet socket |
+ |
+ vapi_id = <hex_value> |
+ VAPI id of this instance |
+*/ |
+ |
+section ethernet |
+ enabled = 0 |
+ baseaddr = 0x92000000 |
+ /* dma = 0 */ |
+ irq = 4 |
+ rtx_type = 0 |
+ /* tx_channel = 0 */ |
+ /* rx_channel = 1 */ |
+ /*rxfile = "eth0.rx"*/ |
+ txfile = "eth0.tx" |
+ sockif = "eth0" |
+end |
+ |
+ |
+/* GPIO SECTION |
+ |
+ This section configures the GPIOs |
+ |
+ enabled = <0|1> |
+ Enable/disable the peripheral. By default it is enabled. |
+ |
+ baseaddr = <hex_value> |
+ address of first GPIO register for this device |
+ |
+ irq = <value> |
+ irq number for this device |
+ |
+ base_vapi_id = <hex_value> |
+ first VAPI id of this instance |
+ GPIO uses 8 consecutive VAPI IDs |
+*/ |
+ |
+section gpio |
+ enabled = 0 |
+ baseaddr = 0x91000000 |
+ irq = 3 |
+ base_vapi_id = 0x0200 |
+end |
+ |
+/* VGA SECTION |
+ |
+ This section configures the VGA/LCD controller |
+ |
+ enabled = <0|1> |
+ Enable/disable the peripheral. By default it is enabled. |
+ |
+ baseaddr = <hex_value> |
+ address of first VGA register |
+ |
+ irq = <value> |
+ irq number for this device |
+ |
+ refresh_rate = <value> |
+ number of cycles between screen dumps |
+ |
+ filename = "<filename>" |
+ template name for generated names (e.g. "primary" produces "primary0023.bmp") |
+*/ |
+ |
+section vga |
+ enabled = 1 |
+ baseaddr = 0x97100000 |
+ irq = 8 |
+ refresh_rate = 100000 |
+ filename = "primary" |
+end |
+ |
+ |
+/* PIC SECTION |
+ |
+ This section configures the programmable interrupt controller (PIC) |
+ |
+ enabled = 0/1 |
+ whether the PIC is enabled |
+*/ |
+ |
+section pic |
+ enabled = 1 |
+ edge_trigger = 1 |
+end |
+ |
+/* FB SECTION |
+ |
+ This section configures the frame buffer |
+ |
+ enabled = <0|1> |
+ Enable/disable the peripheral. By default it is enabled. |
+ |
+ baseaddr = <hex_value> |
+ base address of frame buffer |
+ |
+ paladdr = <hex_value> |
+ base address of first palette entry |
+ |
+ refresh_rate = <value> |
+ number of cycles between screen dumps |
+ |
+ filename = "<filename>" |
+ template name for generated names (e.g. "primary" produces "primary0023.bmp") |
+*/ |
+ |
+section fb |
+ enabled = 1 |
+ baseaddr = 0x97000000 |
+ refresh_rate = 1000000 |
+ filename = "primary" |
+end |
+ |
+ |
+/* KBD SECTION |
+ |
+ This section configures the PS/2 compatible keyboard |
+ |
+ baseaddr = <hex_value> |
+ base address of the keyboard device |
+ |
+ rxfile = "<filename>" |
+ filename, where to read data from |
+*/ |
+ |
+section kbd |
+ enabled = 1 |
+ irq = 5 |
+ baseaddr = 0x94000000 |
+ rxfile = "kbd.rx" |
+end |
+ |
+ |
+/* ATA SECTION |
+ |
+ This section configures the ATA/ATAPI host controller |
+ |
+ baseaddr = <hex_value> |
+ address of first ATA register |
+ |
+ enabled = <0|1> |
+ Enable/disable the peripheral. By default it is enabled. |
+ |
+ irq = <value> |
+ irq number for this device |
+ |
+ debug = <value> |
+ debug level for ata models. |
+ 0: no debug messages |
+ 1: verbose messages |
+ 3: normal messages (more messages than verbose) |
+ 5: debug messages (normal debug messages) |
+ 7: flow control messages (debug statemachine flows) |
+ 9: low priority message (display everything the code does) |
+ |
+ dev_type0/1 = <value> |
+ ata device 0 type |
+ 0: NO_CONNECT: none (not connected) |
+ 1: FILE : simulated harddisk |
+ 2: LOCAL : local system harddisk |
+ |
+ dev_file0/1 = "<filename>" |
+ filename for simulated ATA device |
+ valid only if dev_type0 == 1 |
+ |
+ dev_size0/1 = <value> |
+ size of simulated hard-disk (in MBytes) |
+ valid only if dev_type0 == 1 |
+ |
+ dev_packet0/1 = <value> |
+ 0: simulated ATA device does NOT implement PACKET command feature set |
+ 1: simulated ATA device does implement PACKET command feature set |
+ |
+ FIXME: irq number |
+*/ |
+ |
+section ata |
+ enabled = 0 |
+ baseaddr = 0x9e000000 |
+ irq = 15 |
+ |
+end |
+ |
+ |
diff --exclude=.git --exclude=/gitignore -Naur x264/x264.c x264-or/x264.c |
--- x264/x264.c 2009-10-25 17:41:22.000000000 +0100 |
+++ x264-or/x264.c 2009-11-15 17:35:30.000000000 +0100 |
@@ -40,6 +40,8 @@ |
#define SetConsoleTitle(t) |
#endif |
|
+//#include "uart.h" |
+ |
uint8_t *mux_buffer = NULL; |
int mux_buffer_size = 0; |
|
@@ -54,11 +56,11 @@ |
} |
|
typedef struct { |
- int b_progress; |
- int i_seek; |
- hnd_t hin; |
- hnd_t hout; |
- FILE *qpfile; |
+ int b_progress; |
+ int i_seek; |
+ hnd_t hin; /* hnd_t is a void* */ |
+ hnd_t hout; |
+ FILE *qpfile; |
} cli_opt_t; |
|
/* input file operation function pointers */ |
@@ -78,15 +80,28 @@ |
static int Parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt ); |
static int Encode( x264_param_t *param, cli_opt_t *opt ); |
|
- |
+int yuv_data_size; |
/**************************************************************************** |
* main: |
****************************************************************************/ |
int main( int argc, char **argv ) |
{ |
- x264_param_t param; |
- cli_opt_t opt; |
- int ret; |
+ char* new_argv ="./x264 --profile=baseline"; |
+ argc = 2; |
+ |
+ x264_param_t param; |
+ cli_opt_t opt; |
+ int ret; |
+ |
+ //uart_init() ; |
+ |
+ extern int _end; // start of free memory |
+ extern int _stack; // end of free memory |
+ |
+ V(fprintf( stderr, "start of heap at 0x%.8x, size of heap: %d bytes\n", |
+ (unsigned int) &_end, (unsigned int) ((unsigned int) &_stack) - ((unsigned int) &_end))); |
+ |
+ //fprintf(stderr,"start\n"); |
|
#ifdef PTW32_STATIC_LIB |
pthread_win32_process_attach_np(); |
@@ -100,14 +115,90 @@ |
|
x264_param_default( ¶m ); |
|
- /* Parse command line */ |
- if( Parse( argc, argv, ¶m, &opt ) < 0 ) |
- return -1; |
+ // Baseline parameters |
+ param.analyse.b_transform_8x8 = 0; |
+ param.b_cabac = 0; |
+ param.i_cqm_preset = X264_CQM_FLAT; |
+ param.i_bframe = 0; |
+ /* |
+ // Ultrafast preset |
+ param.i_frame_reference = 1; |
+ param.i_scenecut_threshold = 0; |
+ param.b_deblocking_filter = 0; |
+ param.b_cabac = 0; |
+ param.i_bframe = 0; |
+ param.analyse.intra = 0; |
+ param.analyse.inter = 0; |
+ param.analyse.b_transform_8x8 = 0; |
+ param.analyse.i_me_method = X264_ME_DIA; |
+ param.analyse.i_subpel_refine = 0; |
+ param.rc.i_aq_mode = 0; |
+ param.analyse.b_mixed_references = 0; |
+ param.analyse.i_trellis = 0; |
+ param.i_bframe_adaptive = X264_B_ADAPT_NONE; |
+ param.rc.b_mb_tree = 0; |
+ */ |
+ // Fast preset, standard |
+ /* |
+ param.i_frame_reference = 2; |
+ param.analyse.i_subpel_refine = 6; |
+ param.rc.i_lookahead = 30; |
+ */ |
+ // Fast preset, but with 0 lookahead |
+ param.i_frame_reference = 2; |
+ param.analyse.i_subpel_refine = 6; |
+ param.rc.i_lookahead = 0; |
+ |
+ // Verbose (info per frame encoded) |
+ param.i_log_level = X264_LOG_DEBUG; |
+ |
+ // Set a bitrate |
+ param.rc.i_bitrate = 500; // kbps |
+ param.rc.i_rc_method = X264_RC_ABR; |
+ |
+ // A smaller range of QP (qp_max - qp_min) means less startup time for computing motion vectors costs (not sure exactly how this is done) --jb |
+ // Defaults here were i_qp_min = 10, i_qp_max = 51 |
+ param.rc.i_qp_min = 10; |
+ param.rc.i_qp_max = 51; |
+ param.rc.i_qp_step = 4; |
+ |
+ |
+ // Picture height and width |
+ // CIF = 352x288 |
+ param.i_width = 352; |
+ param.i_height = 288; |
+ V(fprintf( stderr, "x264 [info]: %dx%d @ %.2f fps\n", |
+ param.i_width, param.i_height, |
+ (float)param.i_fps_num / (float)param.i_fps_den)); |
+ |
+ // VBV buffer size in kbits |
+ param.rc.i_vbv_buffer_size = (352*288*12*4)/1024; // 4 frames of VBV (?!) |
+ |
+ |
+ |
+ V(printf("video file in memory from 0x%.8x, size %d bytes\n", YUV_DATA_ADDR, YUV_DATA_SIZE)); |
+ /* Must define a few things for use of static CIF data |
+ YUV_DATA_ADDR - address where the data starts |
+ YUV_DATA_SIZE - size of data in bytes |
+ ENC_OUT_ADDR - where we'll store the encoded data |
+ */ |
+ |
+ /* Parse command line */ |
+ Parse( argc, &new_argv, ¶m, &opt ); |
+ //if( Parse( argc, &new_argv, ¶m, &opt ) < 0 ) |
+ //return -1; |
+ |
+ // We specify a place in memory of some YUV CIF data instead of opening a file |
+ init_yuv_dataspace((char *)YUV_DATA_ADDR, (hnd_t *)&opt.hin, ¶m); |
+#ifdef ENC_OUT_ADDR |
+ p_open_outfile( 0, &opt.hout ); // Setup the out "file" which is really just a spot in memory |
+#endif |
+ //opt.hout = (void*) ENC_OUT_ADDR; // We specify a place in memory of where we'll dump the encoded data |
|
- /* Control-C handler */ |
- signal( SIGINT, SigIntHandler ); |
+ /* Control-C handler */ |
+ //signal( SIGINT, SigIntHandler ); |
|
- ret = Encode( ¶m, &opt ); |
+ ret = Encode( ¶m, &opt ); |
|
#ifdef PTW32_STATIC_LIB |
pthread_win32_thread_detach_np(); |
@@ -555,10 +646,12 @@ |
/* Default output file driver */ |
p_open_outfile = open_file_bsf; |
p_set_outfile_param = set_param_bsf; |
- p_write_nalu = write_nalu_bsf; |
+ p_write_nalu = write_nalu_bsf; //--jb |
p_set_eop = set_eop_bsf; |
p_close_outfile = close_file_bsf; |
|
+ return 0; // -- added jb |
+ |
/* Presets are applied before all other options. */ |
for( optind = 0;; ) |
{ |
@@ -978,9 +1071,11 @@ |
} |
else |
{ |
- sscanf( argv[optind++], "%ux%u", ¶m->i_width, ¶m->i_height ); |
- if( param->i_log_level >= X264_LOG_INFO ) |
- fprintf( stderr, "x264 [info]: %dx%d @ %.2f fps\n", param->i_width, param->i_height, (double)param->i_fps_num / (double)param->i_fps_den); |
+ sscanf( argv[optind++], "%ux%u", ¶m->i_width, ¶m->i_height ); |
+ if( param->i_log_level >= X264_LOG_INFO ) |
+ fprintf( stderr, "x264 [info]: %dx%d @ %.2f fps\n", |
+ param->i_width, param->i_height, |
+ (double)param->i_fps_num / (double)param->i_fps_den); |
} |
} |
|
@@ -1127,7 +1222,10 @@ |
{ |
i_nalu_size = p_write_nalu( hout, nal[i].p_payload, nal[i].i_payload ); |
if( i_nalu_size < 0 ) |
+ { |
+ fprintf(stderr, "Encode_frame: p_write_nalu() returned i_nalu_size < 0: %d\n", i_nalu_size); |
return -1; |
+ } |
i_file += i_nalu_size; |
} |
if (i_nal) |
@@ -1172,12 +1270,15 @@ |
opt->b_progress &= param->i_log_level < X264_LOG_DEBUG; |
i_frame_total = p_get_frame_total( opt->hin ); |
i_frame_total -= opt->i_seek; |
+ |
if( ( i_frame_total == 0 || param->i_frame_total < i_frame_total ) |
&& param->i_frame_total > 0 ) |
i_frame_total = param->i_frame_total; |
+ |
param->i_frame_total = i_frame_total; |
+ V(fprintf(stderr, "Encode: i_frame_total:%d\n", param->i_frame_total)); |
i_update_interval = i_frame_total ? x264_clip3( i_frame_total / 1000, 1, 10 ) : 10; |
- |
+ V(fprintf(stderr, "Encode: i_update_interval:%d\n", i_update_interval)); |
if( ( h = x264_encoder_open( param ) ) == NULL ) |
{ |
fprintf( stderr, "x264 [error]: x264_encoder_open failed\n" ); |
@@ -1202,6 +1303,7 @@ |
|
i_start = x264_mdate(); |
|
+ V(fprintf(stderr, "Starting Encoder loop: i_frame_total %d\n", i_frame_total)); |
/* Encode frames */ |
for( i_frame = 0, i_file = 0, i_frame_output = 0; b_ctrl_c == 0 && (i_frame < i_frame_total || i_frame_total == 0); ) |
{ |
diff --exclude=.git --exclude=/gitignore -Naur x264/x264.h x264-or/x264.h |
--- x264/x264.h 2009-10-25 17:41:22.000000000 +0100 |
+++ x264-or/x264.h 2009-10-28 15:05:29.000000000 +0100 |
@@ -24,6 +24,29 @@ |
#ifndef X264_X264_H |
#define X264_X264_H |
|
+extern int yuv_data_start; |
+extern int yuv_data_end; |
+//#define YUV_DATA_ADDR 0x01000000 |
+#define YUV_DATA_ADDR &yuv_data_start |
+#define YUV_DATA_SIZE ((int)(&yuv_data_end) - (int)(YUV_DATA_ADDR)) |
+#define ENC_OUT_ADDR &yuv_data_end |
+ |
+//#define USE_HARDCODED_FRAME_NUM |
+#define HARDCODED_FRAME_NUM 10 |
+ |
+ |
+ |
+/* Enable for verbose output during runtime */ |
+//#define X264_VERBOSE |
+#ifdef X264_VERBOSE |
+#define V(x) x |
+#else |
+#define V(x) |
+#endif |
+ |
+ |
+extern int yuv_data_size; |
+ |
#if !defined(_STDINT_H) && !defined(_STDINT_H_) && \ |
!defined(_INTTYPES_H) && !defined(_INTTYPES_H_) |
# ifdef _MSC_VER |
/oc-h264-encoder/trunk/x264/patches/README
0,0 → 1,25
These are patches for x264, a software H.264/AVC encoder. |
They should be applied to the unmodified source revision contained in their filename, not in a combined fashion, ie. apply 1.0 or 1.1, not 1.0 and then 1.1. |
|
See the x264 project's site for more information on it: http://www.videolan.org/developers/x264.html |
|
The revision these patches apply to is in the filename. At present the revision we're working with, and thus creating patches for, is e381f6d. |
|
For a full guide to setting up x264 to run in the OpenRISC architectural simulator, see the forum post here: http://opencores.org/forum,OC%20H.264%20project,0,3557 |
|
The following is a quick guide to getting the x264 source, reverting to the correct revision, and then applying a patch (patches are unified, not progressive.) |
|
1. Get x264 sources from git repository |
|
# git clone git://git.videolan.org/x264.git |
|
2. Revert to the appropriate revision |
|
# cd x264 |
# git checkout e381f6d |
|
3. Patch the x264 source (while within the x264 directory) |
|
# patch -p1 < ../oc-h264-encoder/trunk/x264/patches/x264-e381f6d-or32-or1ksim-with-fp-1.0.patch |
|
The x264 source code is now patched, and can be configured and compiled. Check the OpenCores h.264 project forum for more details: http://opencores.org/forum,OC%20H.264%20project |
/oc-h264-encoder/trunk/or1ksim/patches/README
0,0 → 1,26
These are patches for or1ksim, aimed at customising it for the development of an x264 port/hardware adaption. |
|
|
For details on applying the patches, see the entire or1ksim/x264 setup guide in the OpenCores H.264 project forum: http://opencores.org/forum,OC%20H.264%20project,0,3557 |
|
An outline of the basic steps follows: |
# wget ftp://ocuser:oc@orsoc.se/toolchain/or1ksim-0.3.0.tar.bz2 |
# tar xjf or1ksim-0.3.0.tar.bz2 |
# wget ftp://ocuser:oc@orsoc.se/toolchain/or1ksim-0.3.0-fp-patch.bz2 |
# cd or1ksim-0.3.0 |
# bzcat -dc ../or1ksim-0.3.0-fp-patch.bz2 | patch -p1 |
# ./configure --target=or32-elf --prefix=/opt/or32-newlib |
# make all install |
|
File description: |
|
or1ksim-0.3.0-fp.patch.bz2 |
|
Patch for or1ksim-0.3.0 implementing floating point (single precision) support to the simulator, as well as tweaking its boot address to start from 0x100 instead of 0xf0000100. |
|
or1ksim-0.3.0-fp-sadssdmod.patch.bz2 |
|
Patch to be applied on top of or1ksim-0.3.0-fp.patch.bz2, implementing an example SAD/SSD calculation module in the video_enc/ subdirectory. It is a little large because each time a subdirectory is added to or1ksim each automake and autoconf file must be updated. |
|
|
|
/oc-h264-encoder/trunk/or1ksim/patches/or1ksim-0.3.0-fp-sadssdmod.patch.bz2
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
oc-h264-encoder/trunk/or1ksim/patches/or1ksim-0.3.0-fp-sadssdmod.patch.bz2
Property changes :
Added: svn:mime-type
## -0,0 +1 ##
+application/octet-stream
\ No newline at end of property
Index: oc-h264-encoder/trunk/or1ksim/patches/or1ksim-0.3.0-fp.patch.bz2
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: oc-h264-encoder/trunk/or1ksim/patches/or1ksim-0.3.0-fp.patch.bz2
===================================================================
--- oc-h264-encoder/trunk/or1ksim/patches/or1ksim-0.3.0-fp.patch.bz2 (nonexistent)
+++ oc-h264-encoder/trunk/or1ksim/patches/or1ksim-0.3.0-fp.patch.bz2 (revision 51)
oc-h264-encoder/trunk/or1ksim/patches/or1ksim-0.3.0-fp.patch.bz2
Property changes :
Added: svn:mime-type
## -0,0 +1 ##
+application/octet-stream
\ No newline at end of property