r41: Moved from a plugin to a statically linked component of Cinelerra
authorHerman Robak <herman@skolelinux.no>
Fri, 3 Oct 2003 23:42:11 +0000 (23:42 +0000)
committerHerman Robak <herman@skolelinux.no>
Fri, 3 Oct 2003 23:42:11 +0000 (23:42 +0000)
26 files changed:
mpeg2enc/conform.c [new file with mode: 0644]
mpeg2enc/cpu_accel.c [new file with mode: 0644]
mpeg2enc/create_mtable.c [new file with mode: 0644]
mpeg2enc/fdctdata.c [new file with mode: 0644]
mpeg2enc/fdctref.c [new file with mode: 0644]
mpeg2enc/idct.c [new file with mode: 0644]
mpeg2enc/idctdata.c [new file with mode: 0644]
mpeg2enc/mblock_sub44_sads.c [new file with mode: 0644]
mpeg2enc/mblock_sub44_sads_x86_h.c [new file with mode: 0644]
mpeg2enc/motion.c [new file with mode: 0644]
mpeg2enc/mpeg2enc.c [new file with mode: 0644]
mpeg2enc/predict.c [new file with mode: 0644]
mpeg2enc/putbits.c [new file with mode: 0644]
mpeg2enc/puthdr.c [new file with mode: 0644]
mpeg2enc/putmpg.c [new file with mode: 0644]
mpeg2enc/putpic.c [new file with mode: 0644]
mpeg2enc/putseq.c [new file with mode: 0644]
mpeg2enc/putvlc.c [new file with mode: 0644]
mpeg2enc/quantize.c [new file with mode: 0644]
mpeg2enc/quantize_x86.c [new file with mode: 0644]
mpeg2enc/ratectl.c [new file with mode: 0644]
mpeg2enc/readpic.c [new file with mode: 0644]
mpeg2enc/stats.c [new file with mode: 0644]
mpeg2enc/transfrm.c [new file with mode: 0644]
mpeg2enc/worksheet.c [new file with mode: 0644]
mpeg2enc/writepic.c [new file with mode: 0644]

diff --git a/mpeg2enc/conform.c b/mpeg2enc/conform.c
new file mode 100644 (file)
index 0000000..b672425
--- /dev/null
@@ -0,0 +1,81 @@
+/* conform.c, conformance checks                                            */
+
+/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
+
+/*
+ * Disclaimer of Warranty
+ *
+ * These software programs are available to the user without any license fee or
+ * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
+ * any and all warranties, whether express, implied, or statuary, including any
+ * implied warranties or merchantability or of fitness for a particular
+ * purpose.  In no event shall the copyright-holder be liable for any
+ * incidental, punitive, or consequential damages of any kind whatsoever
+ * arising from the use of these programs.
+ *
+ * This disclaimer of warranty extends to the user of these programs and user's
+ * customers, employees, agents, transferees, successors, and assigns.
+ *
+ * The MPEG Software Simulation Group does not represent or warrant that the
+ * programs furnished hereunder are free of infringement of any third-party
+ * patents.
+ *
+ * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
+ * are subject to royalty fees to patent holders.  Many of these patents are
+ * general enough such that they are unavoidable regardless of implementation
+ * design.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "config.h"
+#include "global.h"
+
+/* check for (level independent) parameter limits */
+void range_checks()
+{
+}
+
+/* identifies valid profile / level combinations */
+static char profile_level_defined[5][4] =
+{
+/* HL   H-14 ML   LL  */
+  {1,   1,   1,   0},  /* HP   */
+  {0,   1,   0,   0},  /* Spat */
+  {0,   0,   1,   1},  /* SNR  */
+  {1,   1,   1,   1},  /* MP   */
+  {0,   0,   1,   0}   /* SP   */
+};
+
+static struct level_limits {
+  int hor_f_code;
+  int vert_f_code;
+  int hor_size;
+  int vert_size;
+  int sample_rate;
+  int bit_rate; /* Mbit/s */
+  int vbv_buffer_size; /* 16384 bit steps */
+} maxval_tab[4] =
+{
+  {9, 5, 1920, 1152, 62668800, 80, 597}, /* HL */
+  {9, 5, 1440, 1152, 47001600, 60, 448}, /* H-14 */
+  {8, 5,  720,  576, 10368000, 15, 112}, /* ML */
+  {7, 4,  352,  288,  3041280,  4,  29}  /* LL */
+};
+
+#define SP   5
+#define MP   4
+#define SNR  3
+#define SPAT 2
+#define HP   1
+
+#define LL  10
+#define ML   8
+#define H14  6
+#define HL   4
+
+void profile_and_level_checks()
+{
+}
diff --git a/mpeg2enc/cpu_accel.c b/mpeg2enc/cpu_accel.c
new file mode 100644 (file)
index 0000000..f898d25
--- /dev/null
@@ -0,0 +1,106 @@
+/*
+ * cpu_accel.c
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ *
+ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ *
+ * mpeg2dec is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "cpu_accel.h"
+
+#ifdef X86_CPU
+static int x86_accel (void)
+{
+    int eax, ebx, ecx, edx;
+    int AMD;
+    int caps;
+
+#define cpuid(op,eax,ebx,ecx,edx)      \
+    asm ("cpuid"                       \
+        : "=a" (eax),                  \
+          "=b" (ebx),                  \
+          "=c" (ecx),                  \
+          "=d" (edx)                   \
+        : "a" (op)                     \
+        : "cc")
+
+    asm ("pushfl\n\t"
+        "popl %0\n\t"
+        "movl %0,%1\n\t"
+        "xorl $0x200000,%0\n\t"
+        "pushl %0\n\t"
+        "popfl\n\t"
+        "pushfl\n\t"
+        "popl %0"
+         : "=a" (eax),
+          "=b" (ebx)
+        :
+        : "cc");
+
+    if (eax == ebx)            // no cpuid
+       return 0;
+
+    cpuid (0x00000000, eax, ebx, ecx, edx);
+    if (!eax)                  // vendor string only
+       return 0;
+
+    AMD = (ebx == 0x68747541) && (ecx == 0x444d4163) && (edx == 0x69746e65);
+
+    cpuid (0x00000001, eax, ebx, ecx, edx);
+    if (! (edx & 0x00800000))  // no MMX
+       return 0;
+
+    caps = ACCEL_X86_MMX;
+    if (edx & 0x02000000)      // SSE - identical to AMD MMX extensions
+       caps = ACCEL_X86_MMX | ACCEL_X86_MMXEXT;
+
+    cpuid (0x80000000, eax, ebx, ecx, edx);
+    if (eax < 0x80000001)      // no extended capabilities
+       return caps;
+
+    cpuid (0x80000001, eax, ebx, ecx, edx);
+
+    if (edx & 0x80000000)
+       caps |= ACCEL_X86_3DNOW;
+
+    if (AMD && (edx & 0x00400000))     // AMD MMX extensions
+       {
+               caps |= ACCEL_X86_MMXEXT;
+       }
+
+    return caps;
+}
+#endif
+
+int cpu_accel (void)
+{
+#ifdef X86_CPU
+    static int got_accel = 0;
+    static int accel;
+
+    if (!got_accel) {
+       got_accel = 1;
+       accel = x86_accel ();
+    }
+
+    return accel;
+#else
+    return 0;
+#endif
+}
diff --git a/mpeg2enc/create_mtable.c b/mpeg2enc/create_mtable.c
new file mode 100644 (file)
index 0000000..2553519
--- /dev/null
@@ -0,0 +1,31 @@
+#include <stdio.h>
+
+int main(char**, int)
+{
+  FILE *fout;
+  int i, j, v;
+
+  fout = fopen("mtable.h", "wt");
+  fprintf(fout, "const unsigned char motion_lookup[256][256] = {\n");
+  for (i = 0; i < 256; i++)
+  {
+    fprintf(fout, "{");
+    for (j = 0; j < 256; j++)
+    {
+      v = i - j;
+      if (v < 0)
+        v = -v;
+      fprintf(fout, "%3d", v);
+      if (j != 255)
+        fprintf(fout, ", ");
+      if ((j) && (j % 16 == 0))
+        fprintf(fout, "\n");
+    }
+    if (i != 255)
+      fprintf(fout, "},\n");
+  }
+  fprintf(fout, "}};\n");
+  fclose(fout);
+
+  return 0;
+}
diff --git a/mpeg2enc/fdctdata.c b/mpeg2enc/fdctdata.c
new file mode 100644 (file)
index 0000000..e095d0f
--- /dev/null
@@ -0,0 +1,143 @@
+//////////////////////////////////////////////////////////////////////////////
+//
+//  fdctam32.c - AP922 MMX(3D-Now) forward-DCT
+//  ----------
+//  Intel Application Note AP-922 - fast, precise implementation of DCT
+//        http://developer.intel.com/vtune/cbts/appnotes.htm
+//  ----------
+//  
+//       This routine uses a 3D-Now/MMX enhancement to increase the
+//  accuracy of the fdct_col_4 macro.  The dct_col function uses 3D-Now's
+//  PMHULHRW instead of MMX's PMHULHW(and POR).  The substitution improves
+//  accuracy very slightly with performance penalty.  If the target CPU
+//  does not support 3D-Now, then this function cannot be executed.  
+//  fdctmm32.c contains the standard MMX implementation of AP-922.
+//  
+//  For a fast, precise MMX implementation of inverse-DCT 
+//              visit http://www.elecard.com/peter
+//
+//  v1.0 07/22/2000 (initial release)
+//       Initial release of AP922 MMX(3D-Now) forward_DCT.
+//       This code was tested with Visual C++ 6.0Pro + service_pack4 + 
+//       processor_pack_beta!  If you have the processor_pack_beta, you can
+//       remove the #include for amd3dx.h, and substitute the 'normal'
+//       assembly lines for the macro'd versions.  Otherwise, this
+//       code should compile 'as is', under Visual C++ 6.0 Pro.
+//     
+//  liaor@iname.com  http://members.tripod.com/~liaor  
+//////////////////////////////////////////////////////////////////////////////
+
+#include <inttypes.h>
+
+//////////////////////////////////////////////////////////////////////
+//
+// constants for the forward DCT
+// -----------------------------
+//
+// Be sure to check that your compiler is aligning all constants to QWORD
+// (8-byte) memory boundaries!  Otherwise the unaligned memory access will
+// severely stall MMX execution.
+//
+//////////////////////////////////////////////////////////////////////
+
+#define BITS_FRW_ACC   3 //; 2 or 3 for accuracy
+#define SHIFT_FRW_COL  BITS_FRW_ACC
+#define SHIFT_FRW_ROW  (BITS_FRW_ACC + 17)
+//#define RND_FRW_ROW          (262144 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_ROW-1)
+#define RND_FRW_ROW            (1 << (SHIFT_FRW_ROW-1))
+//#define RND_FRW_COL          (2 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_COL-1)
+#define RND_FRW_COL            (1 << (SHIFT_FRW_COL-1))
+
+//concatenated table, for forward DCT transformation
+const int16_t fdct_tg_all_16[] = {
+       13036, 13036, 13036, 13036,             // tg * (2<<16) + 0.5
+       27146, 27146, 27146, 27146,             // tg * (2<<16) + 0.5
+       -21746, -21746, -21746, -21746, // tg * (2<<16) + 0.5
+       -19195, -19195, -19195, -19195, //cos * (2<<16) + 0.5
+       23170, 23170, 23170, 23170 };   //cos * (2<<15) + 0.5
+const long long  fdct_one_corr = 0x0001000100010001LL;
+const long fdct_r_row[2] = {RND_FRW_ROW, RND_FRW_ROW };
+
+const int16_t tab_frw_01234567[] = {  // forward_dct coeff table
+    //row0
+    16384, 16384, 21407, -8867,     //    w09 w01 w08 w00
+    16384, 16384, 8867, -21407,     //    w13 w05 w12 w04
+    16384, -16384, 8867, 21407,     //    w11 w03 w10 w02
+    -16384, 16384, -21407, -8867,   //    w15 w07 w14 w06
+    22725, 12873, 19266, -22725,    //    w22 w20 w18 w16
+    19266, 4520, -4520, -12873,     //    w23 w21 w19 w17
+    12873, 4520, 4520, 19266,       //    w30 w28 w26 w24
+    -22725, 19266, -12873, -22725,  //    w31 w29 w27 w25
+
+    //row1
+    22725, 22725, 29692, -12299,    //    w09 w01 w08 w00
+    22725, 22725, 12299, -29692,    //    w13 w05 w12 w04
+    22725, -22725, 12299, 29692,    //    w11 w03 w10 w02
+    -22725, 22725, -29692, -12299,  //    w15 w07 w14 w06
+    31521, 17855, 26722, -31521,    //    w22 w20 w18 w16
+    26722, 6270, -6270, -17855,     //    w23 w21 w19 w17
+    17855, 6270, 6270, 26722,       //    w30 w28 w26 w24
+    -31521, 26722, -17855, -31521,  //    w31 w29 w27 w25
+
+    //row2
+    21407, 21407, 27969, -11585,    //    w09 w01 w08 w00
+    21407, 21407, 11585, -27969,    //    w13 w05 w12 w04
+    21407, -21407, 11585, 27969,    //    w11 w03 w10 w02
+    -21407, 21407, -27969, -11585,  //    w15 w07 w14 w06
+    29692, 16819, 25172, -29692,    //    w22 w20 w18 w16
+    25172, 5906, -5906, -16819,     //    w23 w21 w19 w17
+    16819, 5906, 5906, 25172,       //    w30 w28 w26 w24
+    -29692, 25172, -16819, -29692,  //    w31 w29 w27 w25
+
+    //row3
+    19266, 19266, 25172, -10426,    //    w09 w01 w08 w00
+    19266, 19266, 10426, -25172,    //    w13 w05 w12 w04
+    19266, -19266, 10426, 25172,    //    w11 w03 w10 w02
+    -19266, 19266, -25172, -10426,  //    w15 w07 w14 w06, 
+    26722, 15137, 22654, -26722,    //    w22 w20 w18 w16
+    22654, 5315, -5315, -15137,     //    w23 w21 w19 w17
+    15137, 5315, 5315, 22654,       //    w30 w28 w26 w24
+    -26722, 22654, -15137, -26722,  //    w31 w29 w27 w25, 
+
+    //row4
+    16384, 16384, 21407, -8867,     //    w09 w01 w08 w00
+    16384, 16384, 8867, -21407,     //    w13 w05 w12 w04
+    16384, -16384, 8867, 21407,     //    w11 w03 w10 w02
+    -16384, 16384, -21407, -8867,   //    w15 w07 w14 w06
+    22725, 12873, 19266, -22725,    //    w22 w20 w18 w16
+    19266, 4520, -4520, -12873,     //    w23 w21 w19 w17
+    12873, 4520, 4520, 19266,       //    w30 w28 w26 w24
+    -22725, 19266, -12873, -22725,  //    w31 w29 w27 w25 
+
+    //row5
+    19266, 19266, 25172, -10426,    //    w09 w01 w08 w00
+    19266, 19266, 10426, -25172,    //    w13 w05 w12 w04
+    19266, -19266, 10426, 25172,    //    w11 w03 w10 w02
+    -19266, 19266, -25172, -10426,  //    w15 w07 w14 w06
+    26722, 15137, 22654, -26722,    //    w22 w20 w18 w16
+    22654, 5315, -5315, -15137,     //    w23 w21 w19 w17
+    15137, 5315, 5315, 22654,       //    w30 w28 w26 w24
+    -26722, 22654, -15137, -26722,  //    w31 w29 w27 w25
+
+    //row6
+    21407, 21407, 27969, -11585,    //    w09 w01 w08 w00
+    21407, 21407, 11585, -27969,    //    w13 w05 w12 w04
+    21407, -21407, 11585, 27969,    //    w11 w03 w10 w02
+    -21407, 21407, -27969, -11585,  //    w15 w07 w14 w06, 
+    29692, 16819, 25172, -29692,    //    w22 w20 w18 w16
+    25172, 5906, -5906, -16819,     //    w23 w21 w19 w17
+    16819, 5906, 5906, 25172,       //    w30 w28 w26 w24
+    -29692, 25172, -16819, -29692,  //    w31 w29 w27 w25, 
+
+    //row7
+    22725, 22725, 29692, -12299,    //    w09 w01 w08 w00
+    22725, 22725, 12299, -29692,    //    w13 w05 w12 w04
+    22725, -22725, 12299, 29692,    //    w11 w03 w10 w02
+    -22725, 22725, -29692, -12299,  //    w15 w07 w14 w06, 
+    31521, 17855, 26722, -31521,    //    w22 w20 w18 w16
+    26722, 6270, -6270, -17855,     //    w23 w21 w19 w17
+    17855, 6270, 6270, 26722,       //    w30 w28 w26 w24
+    -31521, 26722, -17855, -31521   //    w31 w29 w27 w25
+};
+
+
diff --git a/mpeg2enc/fdctref.c b/mpeg2enc/fdctref.c
new file mode 100644 (file)
index 0000000..61996db
--- /dev/null
@@ -0,0 +1,120 @@
+/* fdctref.c, forward discrete cosine transform, double precision           */
+
+/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
+
+/*
+ * Disclaimer of Warranty
+ *
+ * These software programs are available to the user without any license fee or
+ * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
+ * any and all warranties, whether express, implied, or statuary, including any
+ * implied warranties or merchantability or of fitness for a particular
+ * purpose.  In no event shall the copyright-holder be liable for any
+ * incidental, punitive, or consequential damages of any kind whatsoever
+ * arising from the use of these programs.
+ *
+ * This disclaimer of warranty extends to the user of these programs and user's
+ * customers, employees, agents, transferees, successors, and assigns.
+ *
+ * The MPEG Software Simulation Group does not represent or warrant that the
+ * programs furnished hereunder are free of infringement of any third-party
+ * patents.
+ *
+ * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
+ * are subject to royalty fees to patent holders.  Many of these patents are
+ * general enough such that they are unavoidable regardless of implementation
+ * design.
+ *
+ */
+
+#include <math.h>
+
+#include "config.h"
+
+#ifndef PI
+# ifdef M_PI
+#  define PI M_PI
+# else
+#  define PI 3.14159265358979323846
+# endif
+#endif
+
+/* global declarations */
+void init_fdct _ANSI_ARGS_((void));
+void fdct _ANSI_ARGS_((short *block));
+
+/* private data */
+static double c[8][8]; /* transform coefficients */
+
+void init_fdct()
+{
+  int i, j;
+  double s;
+
+  for (i=0; i<8; i++)
+  {
+    s = (i==0) ? sqrt(0.125) : 0.5;
+
+    for (j=0; j<8; j++)
+      c[i][j] = s * cos((PI/8.0)*i*(j+0.5));
+  }
+}
+
+void fdct(block)
+short *block;
+{
+       register int i, j;
+       double s;
+       double tmp[64];
+
+       for(i = 0; i < 8; i++)
+       for(j = 0; j < 8; j++)
+       {
+               s = 0.0;
+
+/*
+ *                     for(k = 0; k < 8; k++)
+ *                     s += c[j][k] * block[8 * i + k];
+ */
+               s += c[j][0] * block[8 * i + 0];
+               s += c[j][1] * block[8 * i + 1];
+               s += c[j][2] * block[8 * i + 2];
+               s += c[j][3] * block[8 * i + 3];
+               s += c[j][4] * block[8 * i + 4];
+               s += c[j][5] * block[8 * i + 5];
+               s += c[j][6] * block[8 * i + 6];
+               s += c[j][7] * block[8 * i + 7];
+
+               tmp[8 * i + j] = s;
+       }
+
+       for(j = 0; j < 8; j++)
+       for(i = 0; i < 8; i++)
+       {
+               s = 0.0;
+
+/*
+ *                     for(k = 0; k < 8; k++)
+ *                 s += c[i][k] * tmp[8 * k + j];
+ */
+               s += c[i][0] * tmp[8 * 0 + j];
+               s += c[i][1] * tmp[8 * 1 + j];
+               s += c[i][2] * tmp[8 * 2 + j];
+               s += c[i][3] * tmp[8 * 3 + j];
+               s += c[i][4] * tmp[8 * 4 + j];
+               s += c[i][5] * tmp[8 * 5 + j];
+               s += c[i][6] * tmp[8 * 6 + j];
+               s += c[i][7] * tmp[8 * 7 + j];
+
+               block[8 * i + j] = (int)floor(s + 0.499999);
+/*
+ * reason for adding 0.499999 instead of 0.5:
+ * s is quite often x.5 (at least for i and/or j = 0 or 4)
+ * and setting the rounding threshold exactly to 0.5 leads to an
+ * extremely high arithmetic implementation dependency of the result;
+ * s being between x.5 and x.500001 (which is now incorrectly rounded
+ * downwards instead of upwards) is assumed to occur less often
+ * (if at all)
+ */
+      }
+}
diff --git a/mpeg2enc/idct.c b/mpeg2enc/idct.c
new file mode 100644 (file)
index 0000000..e0bab11
--- /dev/null
@@ -0,0 +1,210 @@
+/* idct.c, inverse fast discrete cosine transform                           */
+
+/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
+
+/*
+ * Disclaimer of Warranty
+ *
+ * These software programs are available to the user without any license fee or
+ * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
+ * any and all warranties, whether express, implied, or statuary, including any
+ * implied warranties or merchantability or of fitness for a particular
+ * purpose.  In no event shall the copyright-holder be liable for any
+ * incidental, punitive, or consequential damages of any kind whatsoever
+ * arising from the use of these programs.
+ *
+ * This disclaimer of warranty extends to the user of these programs and user's
+ * customers, employees, agents, transferees, successors, and assigns.
+ *
+ * The MPEG Software Simulation Group does not represent or warrant that the
+ * programs furnished hereunder are free of infringement of any third-party
+ * patents.
+ *
+ * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
+ * are subject to royalty fees to patent holders.  Many of these patents are
+ * general enough such that they are unavoidable regardless of implementation
+ * design.
+ *
+ */
+
+/**********************************************************/
+/* inverse two dimensional DCT, Chen-Wang algorithm       */
+/* (cf. IEEE ASSP-32, pp. 803-816, Aug. 1984)             */
+/* 32-bit integer arithmetic (8 bit coefficients)         */
+/* 11 mults, 29 adds per DCT                              */
+/*                                      sE, 18.8.91       */
+/**********************************************************/
+/* coefficients extended to 12 bit for IEEE1180-1990      */
+/* compliance                           sE,  2.1.94       */
+/**********************************************************/
+
+/* this code assumes >> to be a two's-complement arithmetic */
+/* right shift: (-2)>>1 == -1 , (-3)>>1 == -2               */
+
+#include "config.h"
+
+#define W1 2841 /* 2048*sqrt(2)*cos(1*pi/16) */
+#define W2 2676 /* 2048*sqrt(2)*cos(2*pi/16) */
+#define W3 2408 /* 2048*sqrt(2)*cos(3*pi/16) */
+#define W5 1609 /* 2048*sqrt(2)*cos(5*pi/16) */
+#define W6 1108 /* 2048*sqrt(2)*cos(6*pi/16) */
+#define W7 565  /* 2048*sqrt(2)*cos(7*pi/16) */
+
+/* global declarations */
+void init_idct _ANSI_ARGS_((void));
+void idct _ANSI_ARGS_((short *block, unsigned char *temp));
+
+/* private data */
+static short iclip[1024]; /* clipping table */
+static short *iclp;
+
+/* private prototypes */
+static void idctrow _ANSI_ARGS_((short *blk));
+static void idctcol _ANSI_ARGS_((short *blk));
+
+/* row (horizontal) IDCT
+ *
+ *           7                       pi         1
+ * dst[k] = sum c[l] * src[l] * cos( -- * ( k + - ) * l )
+ *          l=0                      8          2
+ *
+ * where: c[0]    = 128
+ *        c[1..7] = 128*sqrt(2)
+ */
+
+static void idctrow(blk)
+short *blk;
+{
+  int x0, x1, x2, x3, x4, x5, x6, x7, x8;
+
+  /* shortcut */
+  if (!((x1 = blk[4]<<11) | (x2 = blk[6]) | (x3 = blk[2]) |
+        (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3])))
+  {
+    blk[0]=blk[1]=blk[2]=blk[3]=blk[4]=blk[5]=blk[6]=blk[7]=blk[0]<<3;
+    return;
+  }
+
+  x0 = (blk[0]<<11) + 128; /* for proper rounding in the fourth stage */
+
+  /* first stage */
+  x8 = W7*(x4+x5);
+  x4 = x8 + (W1-W7)*x4;
+  x5 = x8 - (W1+W7)*x5;
+  x8 = W3*(x6+x7);
+  x6 = x8 - (W3-W5)*x6;
+  x7 = x8 - (W3+W5)*x7;
+  
+  /* second stage */
+  x8 = x0 + x1;
+  x0 -= x1;
+  x1 = W6*(x3+x2);
+  x2 = x1 - (W2+W6)*x2;
+  x3 = x1 + (W2-W6)*x3;
+  x1 = x4 + x6;
+  x4 -= x6;
+  x6 = x5 + x7;
+  x5 -= x7;
+  
+  /* third stage */
+  x7 = x8 + x3;
+  x8 -= x3;
+  x3 = x0 + x2;
+  x0 -= x2;
+  x2 = (181*(x4+x5)+128)>>8;
+  x4 = (181*(x4-x5)+128)>>8;
+  
+  /* fourth stage */
+  blk[0] = (x7+x1)>>8;
+  blk[1] = (x3+x2)>>8;
+  blk[2] = (x0+x4)>>8;
+  blk[3] = (x8+x6)>>8;
+  blk[4] = (x8-x6)>>8;
+  blk[5] = (x0-x4)>>8;
+  blk[6] = (x3-x2)>>8;
+  blk[7] = (x7-x1)>>8;
+}
+
+/* column (vertical) IDCT
+ *
+ *             7                         pi         1
+ * dst[8*k] = sum c[l] * src[8*l] * cos( -- * ( k + - ) * l )
+ *            l=0                        8          2
+ *
+ * where: c[0]    = 1/1024
+ *        c[1..7] = (1/1024)*sqrt(2)
+ */
+static void idctcol(blk)
+short *blk;
+{
+  int x0, x1, x2, x3, x4, x5, x6, x7, x8;
+
+  /* shortcut */
+  if (!((x1 = (blk[8*4]<<8)) | (x2 = blk[8*6]) | (x3 = blk[8*2]) |
+        (x4 = blk[8*1]) | (x5 = blk[8*7]) | (x6 = blk[8*5]) | (x7 = blk[8*3])))
+  {
+    blk[8*0]=blk[8*1]=blk[8*2]=blk[8*3]=blk[8*4]=blk[8*5]=blk[8*6]=blk[8*7]=
+      iclp[(blk[8*0]+32)>>6];
+    return;
+  }
+
+  x0 = (blk[8*0]<<8) + 8192;
+
+  /* first stage */
+  x8 = W7*(x4+x5) + 4;
+  x4 = (x8+(W1-W7)*x4)>>3;
+  x5 = (x8-(W1+W7)*x5)>>3;
+  x8 = W3*(x6+x7) + 4;
+  x6 = (x8-(W3-W5)*x6)>>3;
+  x7 = (x8-(W3+W5)*x7)>>3;
+  
+  /* second stage */
+  x8 = x0 + x1;
+  x0 -= x1;
+  x1 = W6*(x3+x2) + 4;
+  x2 = (x1-(W2+W6)*x2)>>3;
+  x3 = (x1+(W2-W6)*x3)>>3;
+  x1 = x4 + x6;
+  x4 -= x6;
+  x6 = x5 + x7;
+  x5 -= x7;
+  
+  /* third stage */
+  x7 = x8 + x3;
+  x8 -= x3;
+  x3 = x0 + x2;
+  x0 -= x2;
+  x2 = (181*(x4+x5)+128)>>8;
+  x4 = (181*(x4-x5)+128)>>8;
+  
+  /* fourth stage */
+  blk[8*0] = iclp[(x7+x1)>>14];
+  blk[8*1] = iclp[(x3+x2)>>14];
+  blk[8*2] = iclp[(x0+x4)>>14];
+  blk[8*3] = iclp[(x8+x6)>>14];
+  blk[8*4] = iclp[(x8-x6)>>14];
+  blk[8*5] = iclp[(x0-x4)>>14];
+  blk[8*6] = iclp[(x3-x2)>>14];
+  blk[8*7] = iclp[(x7-x1)>>14];
+}
+
+/* two dimensional inverse discrete cosine transform */
+void idct(short *block, unsigned char *temp)
+{
+  int i;
+
+  for (i=0; i<8; i++)
+    idctrow(block+8*i);
+
+  for (i=0; i<8; i++)
+    idctcol(block+i);
+}
+
+void init_idct()
+{
+  int i;
+
+  iclp = iclip+512;
+  for (i= -512; i<512; i++)
+    iclp[i] = (i<-256) ? -256 : ((i>255) ? 255 : i);
+}
diff --git a/mpeg2enc/idctdata.c b/mpeg2enc/idctdata.c
new file mode 100644 (file)
index 0000000..a6a48cf
--- /dev/null
@@ -0,0 +1,260 @@
+//
+// MMX32 iDCT algorithm  (IEEE-1180 compliant) :: idct_mmx32()
+//
+// MPEG2AVI
+// --------
+//  v0.16B33 initial release
+//
+// This was one of the harder pieces of work to code.
+// Intel's app-note focuses on the numerical issues of the algorithm, but
+// assumes the programmer is familiar with IDCT mathematics, leaving the
+// form of the complete function up to the programmer's imagination.
+
+#include "config.h"
+#include "mpeg2enc.h"
+#include "global.h"
+#include "simd.h"
+
+//  ALGORITHM OVERVIEW
+//  ------------------
+// I played around with the code for quite a few hours.  I came up
+// with *A* working IDCT algorithm, however I'm not sure whether my routine
+// is "the correct one."  But rest assured, my code passes all six IEEE 
+// accuracy tests with plenty of margin.
+//
+//   My IDCT algorithm consists of 4 steps:
+//
+//   1) IDCT-row transformation (using the IDCT-row function) on all 8 rows
+//      This yields an intermediate 8x8 matrix.
+//
+//   2) intermediate matrix transpose (mandatory)
+//
+//   3) IDCT-row transformation (2nd time) on all 8 rows of the intermediate
+//      matrix.  The output is the final-result, in transposed form.
+//
+//   4) post-transformation matrix transpose 
+//      (not necessary if the input-data is already transposed, this could
+//       be done during the MPEG "zig-zag" scan, but since my algorithm
+//       requires at least one transpose operation, why not re-use the
+//       transpose-code.)
+//
+//   Although the (1st) and (3rd) steps use the SAME row-transform operation,
+//   the (3rd) step uses different shift&round constants (explained later.)
+//
+//   Also note that the intermediate transpose (2) would not be neccessary,
+//   if the subsequent operation were a iDCT-column transformation.  Since
+//   we only have the iDCT-row transform, we transpose the intermediate
+//   matrix and use the iDCT-row transform a 2nd time.
+//
+//   I had to change some constants/variables for my method to work :
+//
+//      As given by Intel, the #defines for SHIFT_INV_COL and RND_INV_COL are
+//      wrong.  Not surprising since I'm not using a true column-transform 
+//      operation, but the row-transform operation (as mentioned earlier.)
+//      round_inv_col[], which is given as "4 int16_t" values, should have the
+//      same dimensions as round_inv_row[].  The corrected variables are 
+//      shown.
+//
+//      Intel's code defines a different table for each each row operation.
+//      The tables given are 0/4, 1/7, 2/6, and 5/3.  My code only uses row#0.
+//      Using the other rows messes up the overall transform.
+//
+//   IMPLEMENTATION DETAILs
+//   ----------------------
+// 
+//   I divided the algorithm's work into two subroutines,
+//    1) idct_mmx32_rows() - transforms 8 rows, then transpose
+//    2) idct_mmx32_cols() - transforms 8 rows, then transpose
+//       yields final result ("drop-in" direct replacement for INT32 IDCT)
+//
+//   The 2nd function is a clone of the 1st, with changes made only to the
+//   shift&rounding instructions.
+//
+//      In the 1st function (rows), the shift & round instructions use 
+//       SHIFT_INV_ROW & round_inv_row[] (renamed to r_inv_row[])
+//
+//      In the 2nd function (cols)-> r_inv_col[], and
+//       SHIFT_INV_COL & round_inv_col[] (renamed to r_inv_col[])
+//
+//   Each function contains an integrated transpose-operator, which comes
+//   AFTER the primary transformation operation.  In the future, I'll optimize
+//   the code to do more of the transpose-work "in-place".  Right now, I've
+//   left the code as two subroutines and a main calling function, so other
+//   people can read the code more easily.
+//
+//   liaor@umcc.ais.org  http://members.tripod.com/~liaor
+//  
+
+//;=============================================================================
+//;
+//;  AP-922   http://developer.intel.com/vtune/cbts/strmsimd
+//; These examples contain code fragments for first stage iDCT 8x8
+//; (for rows) and first stage DCT 8x8 (for columns)
+//;
+//;=============================================================================
+
+#define BITS_INV_ACC   4       //; 4 or 5 for IEEE
+       // 5 yields higher accuracy, but lessens dynamic range on the input matrix
+#define SHIFT_INV_ROW  (16 - BITS_INV_ACC)
+#define SHIFT_INV_COL  (1 + BITS_INV_ACC +14 )  // changed from Intel's val)
+
+
+#define RND_INV_ROW            (1 << (SHIFT_INV_ROW-1))
+#define RND_INV_COL            (1 << (SHIFT_INV_COL-1)) 
+#define RND_INV_CORR   (RND_INV_COL - 1)               //; correction -1.0 and round
+
+/* TODO: This should *really* be aligned on 16-byte boundaries... */
+
+const int idct_r_inv_row[2] = { RND_INV_ROW, RND_INV_ROW};
+const int idct_r_inv_col[2] = {RND_INV_COL, RND_INV_COL};
+const int idct_r_inv_corr[2] = {RND_INV_CORR, RND_INV_CORR };
+
+/* Unused and thus redundant...
+const long long dct_one_corr = 0x0001000100010001;
+*/
+
+/*
+;=============================================================================
+;
+; The first stage iDCT 8x8 - inverse DCTs of rows
+;
+;-----------------------------------------------------------------------------
+; The 8-point inverse DCT direct algorithm
+;-----------------------------------------------------------------------------
+;
+; static const int16_t w[32] = {
+; FIX(cos_4_16), FIX(cos_2_16), FIX(cos_4_16), FIX(cos_6_16),
+; FIX(cos_4_16), FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16),
+; FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16), FIX(cos_2_16),
+; FIX(cos_4_16), -FIX(cos_2_16), FIX(cos_4_16), -FIX(cos_6_16),
+; FIX(cos_1_16), FIX(cos_3_16), FIX(cos_5_16), FIX(cos_7_16),
+; FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16),
+; FIX(cos_5_16), -FIX(cos_1_16), FIX(cos_7_16), FIX(cos_3_16),
+; FIX(cos_7_16), -FIX(cos_5_16), FIX(cos_3_16), -FIX(cos_1_16) };
+;
+; #define DCT_8_INV_ROW(x, y)
+
+;{
+; int a0, a1, a2, a3, b0, b1, b2, b3;
+;
+; a0 =x[0]*w[0]+x[2]*w[1]+x[4]*w[2]+x[6]*w[3];
+; a1 =x[0]*w[4]+x[2]*w[5]+x[4]*w[6]+x[6]*w[7];
+; a2 = x[0] * w[ 8] + x[2] * w[ 9] + x[4] * w[10] + x[6] * w[11];
+; a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15];
+; b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19];
+; b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23];
+; b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27];
+; b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31];
+;
+; y[0] = SHIFT_ROUND ( a0 + b0 );
+; y[1] = SHIFT_ROUND ( a1 + b1 );
+; y[2] = SHIFT_ROUND ( a2 + b2 );
+; y[3] = SHIFT_ROUND ( a3 + b3 );
+; y[4] = SHIFT_ROUND ( a3 - b3 );
+; y[5] = SHIFT_ROUND ( a2 - b2 );
+; y[6] = SHIFT_ROUND ( a1 - b1 );
+; y[7] = SHIFT_ROUND ( a0 - b0 );
+;}
+;
+;-----------------------------------------------------------------------------
+;
+; In this implementation the outputs of the iDCT-1D are multiplied
+; for rows 0,4 - by cos_4_16,
+; for rows 1,7 - by cos_1_16,
+; for rows 2,6 - by cos_2_16,
+; for rows 3,5 - by cos_3_16
+; and are shifted to the left for better accuracy
+;
+; For the constants used,
+; FIX(float_const) = (int16_t) (float_const * (1<<15) + 0.5)
+;
+;=============================================================================
+*/
+
+/* CONCATENATED TABLE, rows 0,1,2,3,4,5,6,7 (in order )
+
+   In our implementation, however, we only use row0 !
+*/
+
+const int16_t idct_tab_01234567[] = {
+       //row0, this row is required
+       16384, 16384, 16384, -16384,    // ; movq-> w06 w04 w02 w00
+       21407, 8867, 8867, -21407,              // w07 w05 w03 w01
+       16384, -16384, 16384, 16384,    //; w14 w12 w10 w08
+       -8867, 21407, -21407, -8867,    //; w15 w13 w11 w09
+       22725, 12873, 19266, -22725,    //; w22 w20 w18 w16
+       19266, 4520, -4520, -12873,             //; w23 w21 w19 w17
+       12873, 4520, 4520, 19266,               //; w30 w28 w26 w24
+       -22725, 19266, -12873, -22725,  //w31 w29 w27 w25
+
+       // the rest of these rows (1-7), aren't used !
+
+       //row1
+       22725, 22725, 22725, -22725,    // ; movq-> w06 w04 w02 w00
+       29692, 12299, 12299, -29692,    //      ; w07 w05 w03 w01
+       22725, -22725, 22725, 22725,    //; w14 w12 w10 w08
+       -12299, 29692, -29692, -12299,  //; w15 w13 w11 w09
+       31521, 17855, 26722, -31521,    //; w22 w20 w18 w16
+       26722, 6270, -6270, -17855,             //; w23 w21 w19 w17
+       17855, 6270, 6270, 26722,               //; w30 w28 w26 w24
+       -31521, 26722, -17855, -31521,  // w31 w29 w27 w25
+
+       //row2
+       21407, 21407, 21407, -21407,    // ; movq-> w06 w04 w02 w00
+       27969, 11585, 11585, -27969,    // ; w07 w05 w03 w01
+       21407, -21407, 21407, 21407,    // ; w14 w12 w10 w08
+       -11585, 27969, -27969, -11585,  //  ;w15 w13 w11 w09
+       29692, 16819, 25172, -29692,    // ;w22 w20 w18 w16
+       25172, 5906, -5906, -16819,     // ;w23 w21 w19 w17
+       16819, 5906, 5906, 25172,               // ;w30 w28 w26 w24
+       -29692, 25172, -16819, -29692,  //  ;w31 w29 w27 w25
+
+       //row3
+       19266, 19266, 19266, -19266,    //; movq-> w06 w04 w02 w00
+       25172, 10426, 10426, -25172,    //; w07 w05 w03 w01
+       19266, -19266, 19266, 19266,    //; w14 w12 w10 w08
+       -10426, 25172, -25172, -10426,  //; w15 w13 w11 w09
+       26722, 15137, 22654, -26722,    //; w22 w20 w18 w16
+       22654, 5315, -5315, -15137,             //; w23 w21 w19 w17
+       15137, 5315, 5315, 22654,               //; w30 w28 w26 w24
+       -26722, 22654, -15137, -26722,  //; w31 w29 w27 w25
+
+       //row4
+       16384, 16384, 16384, -16384,    // ; movq-> w06 w04 w02 w00
+       21407, 8867, 8867, -21407,              // w07 w05 w03 w01
+       16384, -16384, 16384, 16384,    //; w14 w12 w10 w08
+       -8867, 21407, -21407, -8867,    //; w15 w13 w11 w09
+       22725, 12873, 19266, -22725,    //; w22 w20 w18 w16
+       19266, 4520, -4520, -12873,             //; w23 w21 w19 w17
+       12873, 4520, 4520, 19266,               //; w30 w28 w26 w24
+       -22725, 19266, -12873, -22725,  //w31 w29 w27 w25
+
+       //row5
+       19266, 19266, 19266, -19266,    //; movq-> w06 w04 w02 w00
+       25172, 10426, 10426, -25172,    //; w07 w05 w03 w01
+       19266, -19266, 19266, 19266,    //; w14 w12 w10 w08
+       -10426, 25172, -25172, -10426,  //; w15 w13 w11 w09
+       26722, 15137, 22654, -26722,    //; w22 w20 w18 w16
+       22654, 5315, -5315, -15137,             //; w23 w21 w19 w17
+       15137, 5315, 5315, 22654,               //; w30 w28 w26 w24
+       -26722, 22654, -15137, -26722,  //; w31 w29 w27 w25
+
+       //row6
+       21407, 21407, 21407, -21407,    // ; movq-> w06 w04 w02 w00
+       27969, 11585, 11585, -27969,    // ; w07 w05 w03 w01
+       21407, -21407, 21407, 21407,    // ; w14 w12 w10 w08
+       -11585, 27969, -27969, -11585,  //  ;w15 w13 w11 w09
+       29692, 16819, 25172, -29692,    // ;w22 w20 w18 w16
+       25172, 5906, -5906, -16819,     // ;w23 w21 w19 w17
+       16819, 5906, 5906, 25172,               // ;w30 w28 w26 w24
+       -29692, 25172, -16819, -29692,  //  ;w31 w29 w27 w25
+
+       //row7
+       22725, 22725, 22725, -22725,    // ; movq-> w06 w04 w02 w00
+       29692, 12299, 12299, -29692,    //      ; w07 w05 w03 w01
+       22725, -22725, 22725, 22725,    //; w14 w12 w10 w08
+       -12299, 29692, -29692, -12299,  //; w15 w13 w11 w09
+       31521, 17855, 26722, -31521,    //; w22 w20 w18 w16
+       26722, 6270, -6270, -17855,             //; w23 w21 w19 w17
+       17855, 6270, 6270, 26722,               //; w30 w28 w26 w24
+       -31521, 26722, -17855, -31521}; // w31 w29 w27 w25
diff --git a/mpeg2enc/mblock_sub44_sads.c b/mpeg2enc/mblock_sub44_sads.c
new file mode 100644 (file)
index 0000000..6e50036
--- /dev/null
@@ -0,0 +1,246 @@
+/*
+ *
+ * mblock_sub44_sads.c
+ * Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
+ *
+ * Fast block sum-absolute difference computation for a rectangular area 4*x
+ * by y where y > h against a 4 by h block.
+ *
+ * Used for 4*4 sub-sampled motion compensation calculations.
+ *
+ * This is actually just a shell that uses templates from the included
+ * file "mblock_sub44_sads_x86_h.c".  I didn't trust the compiler to do a good
+ * job on nested inlining.  One day I'll experiment.
+ * 
+ *
+ * This file is part of mpeg2enc, a free MPEG-2 video stream encoder
+ * based on the original MSSG reference design
+ *
+ * mpeg2enc is free software; you can redistribute new parts
+ * and/or modify under the terms of the GNU General Public License 
+ * as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2dec is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * See the files for those sections (c) MSSG
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include "config.h"
+#include "global.h"
+#include "mpeg2enc.h"
+#include "simd.h"
+#include "attributes.h"
+#include "mmx.h"
+#include "fastintfns.h"
+
+/*
+  Register usage:
+  mm0-mm3  Hold the current row 
+  mm4      Used for accumulating partial SAD
+  mm7      Holds zero
+ */
+
+static inline void mmx_zero_reg (void)
+{
+       /*  load 0 into mm7      */
+       pxor_r2r (mm7, mm7);
+}
+
+/*
+ * Load a 4*4 block of 4*4 sub-sampled pels (qpels) into the MMX
+ * registers
+ *
+ */
+
+static __inline__ void load_blk(uint8_t *blk,uint32_t rowstride,int h)
+{
+       movq_m2r( *blk, mm0);
+       blk += rowstride;
+       movq_m2r( *blk, mm1);
+       if( h == 2 )
+               return;
+       blk += rowstride;
+       movq_m2r( *blk, mm2);
+       blk += rowstride;
+       movq_m2r( *blk, mm3);
+}
+
+/*
+ * Do a shift right on the 4*4 block in the MMX registers
+ *
+ */
+static __inline__ void shift_blk(const uint32_t shift)
+{
+       psrlq_i2r( shift,mm0);
+       psrlq_i2r( shift,mm1);
+       psrlq_i2r( shift,mm2);
+       psrlq_i2r( shift,mm3);
+}
+
+/*
+ * Compute the Sum absolute differences between the 4*h block in
+ * the MMX registers 
+ *
+ * and the 4*h block pointed to by refblk
+ *
+ * h == 2 || h == 4
+ *
+ * TODO: Currently always loads and shifts 4*4 even if 4*2 is required.
+ *
+ */
+
+static __inline__ int qblock_sad_mmxe(uint8_t *refblk, 
+                                                                 uint32_t h,
+                                                                 uint32_t rowstride)
+{
+       int res;
+       pxor_r2r        (mm4,mm4);
+                       
+       movq_r2r        (mm0,mm5);              /* First row */
+       movd_m2r        (*refblk, mm6);
+       pxor_r2r    ( mm7, mm7);
+       refblk += rowstride;
+       punpcklbw_r2r   ( mm7, mm5);
+       punpcklbw_r2r   ( mm7, mm6);
+       psadbw_r2r      ( mm5, mm6);
+       paddw_r2r     ( mm6, mm4 );
+       
+
+
+       movq_r2r        (mm1,mm5);              /* Second row */
+       movd_m2r        (*refblk, mm6);
+       refblk += rowstride;
+       punpcklbw_r2r   ( mm7, mm5);
+       punpcklbw_r2r   ( mm7, mm6);
+       psadbw_r2r      ( mm5, mm6);
+       paddw_r2r     ( mm6, mm4 );
+
+       if( h == 4 )
+       {
+
+               movq_r2r        (mm2,mm5);              /* Third row */
+               movd_m2r        (*refblk, mm6);
+               refblk += rowstride;
+               punpcklbw_r2r   ( mm7, mm5);
+               punpcklbw_r2r   ( mm7, mm6);
+               psadbw_r2r      ( mm5, mm6);
+               paddw_r2r     ( mm6, mm4 );
+
+               
+               movq_r2r        (mm3,mm5);              /* Fourth row */
+               movd_m2r        (*refblk, mm6);
+               punpcklbw_r2r   ( mm7, mm5);
+               punpcklbw_r2r   ( mm7, mm6);
+               psadbw_r2r      ( mm5, mm6);
+               paddw_r2r     ( mm6, mm4 );
+               
+       }
+       movd_r2m      ( mm4, res );
+
+       return res;
+}
+
+
+
+static __inline__ int qblock_sad_mmx(uint8_t *refblk, 
+                                                                 uint32_t h,
+                                                                 uint32_t rowstride)
+{
+       int res;
+       pxor_r2r        (mm4,mm4);
+                       
+       movq_r2r        (mm0,mm5);              /* First row */
+       movd_m2r        (*refblk, mm6);
+       pxor_r2r    ( mm7, mm7);
+       refblk += rowstride;
+       punpcklbw_r2r   ( mm7, mm5);
+
+       punpcklbw_r2r   ( mm7, mm6);
+
+       movq_r2r                ( mm5, mm7);
+       psubusw_r2r     ( mm6, mm5);
+
+       psubusw_r2r   ( mm7, mm6);
+
+       paddw_r2r     ( mm5, mm4);
+       paddw_r2r     ( mm6, mm4 );
+       
+
+
+       movq_r2r        (mm1,mm5);              /* Second row */
+       movd_m2r        (*refblk, mm6);
+       pxor_r2r    ( mm7, mm7);
+       refblk += rowstride;
+       punpcklbw_r2r   ( mm7, mm5);
+       punpcklbw_r2r   ( mm7, mm6);
+       movq_r2r                ( mm5, mm7);
+       psubusw_r2r     ( mm6, mm5);
+       psubusw_r2r   ( mm7, mm6);
+       paddw_r2r     ( mm5, mm4);
+       paddw_r2r     ( mm6, mm4 );
+
+       if( h == 4 )
+       {
+
+               movq_r2r        (mm2,mm5);              /* Third row */
+               movd_m2r        (*refblk, mm6);
+               pxor_r2r    ( mm7, mm7);
+               refblk += rowstride;
+               punpcklbw_r2r   ( mm7, mm5);
+               punpcklbw_r2r   ( mm7, mm6);
+               movq_r2r                ( mm5, mm7);
+               psubusw_r2r     ( mm6, mm5);
+               psubusw_r2r   ( mm7, mm6);
+               paddw_r2r     ( mm5, mm4);
+               paddw_r2r     ( mm6, mm4 );
+               
+               movq_r2r        (mm3,mm5);              /* Fourth row */
+               movd_m2r        (*refblk, mm6);
+               pxor_r2r    ( mm7, mm7);
+               punpcklbw_r2r   ( mm7, mm5);
+               punpcklbw_r2r   ( mm7, mm6);
+               movq_r2r                ( mm5, mm7);
+               psubusw_r2r     ( mm6, mm5);
+               psubusw_r2r   ( mm7, mm6);
+               paddw_r2r     ( mm5, mm4);
+               paddw_r2r     ( mm6, mm4 );
+       }
+
+
+       movq_r2r      ( mm4, mm5 );
+    psrlq_i2r     ( 32, mm5 );
+    paddw_r2r     ( mm5, mm4 );
+       movq_r2r      ( mm4, mm6 );
+    psrlq_i2r     ( 16, mm6 );
+    paddw_r2r     ( mm6, mm4 );
+       movd_r2m      ( mm4, res );
+
+       return res & 0xffff;
+}
+
+
+/*
+ * Do the Extended MMX versions
+ */
+#define SIMD_SUFFIX(x) x##_mmxe
+#include "mblock_sub44_sads_x86_h.c"
+#undef SIMD_SUFFIX
+/*
+ * Do the original MMX versions
+ */
+#define SIMD_SUFFIX(x) x##_mmx
+#include "mblock_sub44_sads_x86_h.c"
+#undef SIMD_SUFFIX
+
+
+
+
diff --git a/mpeg2enc/mblock_sub44_sads_x86_h.c b/mpeg2enc/mblock_sub44_sads_x86_h.c
new file mode 100644 (file)
index 0000000..f3cd0b1
--- /dev/null
@@ -0,0 +1,98 @@
+/*
+ *
+ * mblock_sub44_sads_x86_h.c
+ * Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
+ *
+ * Fast block sum-absolute difference computation for a rectangular area 4*x
+ * by y where y > h against a 4 by h block.
+ *
+ * Used for 4*4 sub-sampled motion compensation calculations.
+ * 
+ *
+ * This file is part of mpeg2enc, a free MPEG-2 video stream encoder
+ * based on the original MSSG reference design
+ *
+ * mpeg2enc is free software; you can redistribute new parts
+ * and/or modify under the terms of the GNU General Public License 
+ * as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2enc is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * See the files for those sections (c) MSSG
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/*
+ *
+ * Generates a vector sad's for 4*4 sub-sampled pel (qpel) data (with
+ * co-ordinates and top-left qpel address) from specified rectangle
+ * against a specified 16*h pel (4*4 qpel) reference block.  The
+ * generated vector contains results only for those sad's that fall
+ * below twice the running best sad and are aligned on 8-pel
+ * boundaries
+ *
+ * Invariant: blk points to top-left sub-sampled pel for macroblock
+ * at (ilow,ihigh)
+ * i{low,high) j(low,high) must be multiples of 4.
+ *
+ * sad = Sum Absolute Differences
+ *
+ * NOTES: for best efficiency i{low,high) should be multiples of 16.
+ *
+ * */
+
+int SIMD_SUFFIX(mblock_sub44_dists)( uint8_t *blk,  uint8_t *ref,
+                                                                        int ilow,int jlow,
+                                                                        int ihigh, int jhigh, 
+                                                                        int h, int rowstride, 
+                                                                        int threshold,
+                                                                        mc_result_s *resvec)
+{
+       int32_t x,y;
+       uint8_t *currowblk = blk;
+       uint8_t *curblk;
+       mc_result_s *cres = resvec;
+       int      gridrowstride = (rowstride);
+
+       for( y=jlow; y <= jhigh ; y+=4)
+       {
+               curblk = currowblk;
+               for( x = ilow; x <= ihigh; x += 4)
+               {
+                       int weight;
+                       if( (x & 15) == (ilow & 15) )
+                       {
+                               load_blk( curblk, rowstride, h );
+                       }
+                       weight = SIMD_SUFFIX(qblock_sad)(ref, h, rowstride);
+                       if( weight <= threshold )
+                       {
+                               threshold = intmin(weight<<2,threshold);
+                               /* Rough and-ready absolute distance penalty */
+                               /* NOTE: This penalty is *vital* to correct operation 
+                                  as otherwise the sub-mean filtering won't work on very
+                                  uniform images.
+                                */
+                               cres->weight = (uint16_t)weight+((intabs(x)+intabs(y))>>3);
+                               cres->x = (uint8_t)x;
+                               cres->y = (uint8_t)y;
+                               ++cres;
+                       }
+                       curblk += 1;
+                       shift_blk(8);
+               }
+               currowblk += gridrowstride;
+       }
+       emms();
+       return cres - resvec;
+}
+
+#undef concat
diff --git a/mpeg2enc/motion.c b/mpeg2enc/motion.c
new file mode 100644 (file)
index 0000000..8cee7bc
--- /dev/null
@@ -0,0 +1,3098 @@
+/* motion.c, motion estimation                                              */
+
+/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
+
+/*
+ * Disclaimer of Warranty
+ *
+ * These software programs are available to the user without any license fee or
+ * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
+ * any and all warranties, whether express, implied, or statuary, including any
+ * implied warranties or merchantability or of fitness for a particular
+ * purpose.  In no event shall the copyright-holder be liable for any
+ * incidental, punitive, or consequential damages of any kind whatsoever
+ * arising from the use of these programs.
+ *
+ * This disclaimer of warranty extends to the user of these programs and user's
+ * customers, employees, agents, transferees, successors, and assigns.
+ *
+ * The MPEG Software Simulation Group does not represent or warrant that the
+ * programs furnished hereunder are free of infringement of any third-party
+ * patents.
+ *
+ * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
+ * are subject to royalty fees to patent holders.  Many of these patents are
+ * general enough such that they are unavoidable regardless of implementation
+ * design.
+ *
+ */
+
+#include <limits.h>
+#include <stdio.h>
+#include "config.h"
+#include "global.h"
+#include "cpu_accel.h"
+#include "simd.h"
+
+/* Macro-block Motion compensation results record */
+
+typedef struct _blockcrd {
+       int16_t x;
+       int16_t y;
+} blockxy;
+
+struct mb_motion
+{
+       blockxy pos;        // Half-pel co-ordinates of source block
+       int sad;                        // Sum of absolute difference
+       int var;
+       uint8_t *blk ;          // Source block data (in luminace data array)
+       int hx, hy;                     // Half-pel offsets
+       int fieldsel;           // 0 = top 1 = bottom
+       int fieldoff;       // Offset from start of frame data to first line
+                                               // of field.    (top = 0, bottom = width );
+};
+
+typedef struct mb_motion mb_motion_s;
+
+
+struct subsampled_mb
+{
+       uint8_t *mb;            // One pel
+       uint8_t *fmb;           // Two-pel subsampled
+       uint8_t *qmb;           // Four-pel subsampled
+       uint8_t *umb;           // U compoenent one-pel
+       uint8_t *vmb;       // V component  one-pel
+};
+
+typedef struct subsampled_mb subsampled_mb_s;
+
+
+static void frame_ME (motion_engine_t *engine,
+                                                                       pict_data_s *picture,
+                                                                 motion_comp_s *mc,
+                                                                 int mboffset,
+                                                                 int i, int j, struct mbinfo *mbi);
+
+static void field_ME (motion_engine_t *engine,
+                                                                       pict_data_s *picture,
+                                                                 motion_comp_s *mc,
+                                                                 int mboffset,
+                                                                 int i, int j, 
+                                                                 struct mbinfo *mbi, 
+                                                                 int secondfield, 
+                                                                 int ipflag);
+
+static void frame_estimate (motion_engine_t *engine,
+       uint8_t *org,
+        uint8_t *ref, 
+        subsampled_mb_s *ssmb,
+        int i, int j,
+        int sx, int sy, 
+         mb_motion_s *bestfr,
+         mb_motion_s *besttop,
+         mb_motion_s *bestbot,
+        int imins[2][2], int jmins[2][2]);
+
+static void field_estimate (motion_engine_t *engine,
+                                                       pict_data_s *picture,
+                                                       uint8_t *toporg,
+                                                       uint8_t *topref, 
+                                                       uint8_t *botorg, 
+                                                       uint8_t *botref,
+                                                       subsampled_mb_s *ssmb,
+                                                       int i, int j, int sx, int sy, int ipflag,
+                                                       mb_motion_s *bestfr,
+                                                       mb_motion_s *best8u,
+                                                       mb_motion_s *best8l,
+                                                       mb_motion_s *bestsp);
+
+static void dpframe_estimate (motion_engine_t *engine,
+       pict_data_s *picture,
+       uint8_t *ref,
+       subsampled_mb_s *ssmb,
+       int i, int j, int iminf[2][2], int jminf[2][2],
+       mb_motion_s *dpbest,
+       int *imindmvp, int *jmindmvp, 
+       int *vmcp);
+
+static void dpfield_estimate (motion_engine_t *engine,
+       pict_data_s *picture,
+       uint8_t *topref,
+       uint8_t *botref, 
+       uint8_t *mb,
+       int i, int j, 
+       int imins, int jmins, 
+       mb_motion_s *dpbest,
+       int *vmcp);
+
+static void fullsearch (motion_engine_t *engine, 
+       uint8_t *org, uint8_t *ref,
+       subsampled_mb_s *ssblk,
+       int lx, int i0, int j0, 
+       int sx, int sy, int h, 
+       int xmax, int ymax,
+       mb_motion_s *motion );
+
+static void find_best_one_pel( motion_engine_t *engine, 
+                                                               uint8_t *org, uint8_t *blk,
+                                                          int searched_size,
+                                                          int i0, int j0,
+                                                          int ilow, int jlow,
+                                                          int xmax, int ymax,
+                                                          int lx, int h, 
+                                                          mb_motion_s *res
+       );
+static int build_sub22_mcomps( motion_engine_t *engine, 
+                                                               int i0,  int j0, int ihigh, int jhigh, 
+                                                               int null_mc_sad,
+                                                               uint8_t *s22org,  uint8_t *s22blk, 
+                                                          int flx, int fh,  int searched_sub44_size );
+
+#ifdef X86_CPU
+static void find_best_one_pel_mmxe( motion_engine_t *engine, 
+                                                               uint8_t *org, uint8_t *blk,
+                                                          int searched_size,
+                                                          int i0, int j0,
+                                                          int ilow, int jlow,
+                                                          int xmax, int ymax,
+                                                          int lx, int h, 
+                                                          mb_motion_s *res
+       );
+
+static int build_sub22_mcomps_mmxe( motion_engine_t *engine, int i0,  int j0, int ihigh, int jhigh, 
+                                                        int null_mc_sad,
+                                                        uint8_t *s22org,  uint8_t *s22blk, 
+                                                        int flx, int fh,  int searched_sub44_size );
+static int (*pmblock_sub44_dists)( uint8_t *blk,  uint8_t *ref,
+                                                       int ilow, int jlow,
+                                                       int ihigh, int jhigh, 
+                                                       int h, int rowstride, 
+                                                       int threshold,
+                                                       mc_result_s *resvec);
+#endif
+
+static int unidir_pred_var( const mb_motion_s *motion, 
+                                                       uint8_t *mb,  int lx, int h);
+static int bidir_pred_var( const mb_motion_s *motion_f,  
+                                                  const mb_motion_s *motion_b, 
+                                                  uint8_t *mb,  int lx, int h);
+static int bidir_pred_sad( const mb_motion_s *motion_f,  
+                                                  const mb_motion_s *motion_b, 
+                                                  uint8_t *mb,  int lx, int h);
+
+static int variance(  uint8_t *mb, int size, int lx);
+
+static int dist22 ( uint8_t *blk1, uint8_t *blk2,  int qlx, int qh);
+
+static int dist44 ( uint8_t *blk1, uint8_t *blk2,  int flx, int fh);
+static int dist2_22( uint8_t *blk1, uint8_t *blk2,
+                                        int lx, int h);
+static int bdist2_22( uint8_t *blk1f, uint8_t *blk1b, 
+                                         uint8_t *blk2,
+                                         int lx, int h);
+
+
+static void (*pfind_best_one_pel)( motion_engine_t *engine,
+                                                               uint8_t *org, uint8_t *blk,
+                                                          int searched_size,
+                                                          int i0, int j0,
+                                                          int ilow, int jlow,
+                                                          int xmax, int ymax,
+                                                          int lx, int h, 
+                                                          mb_motion_s *res
+       );
+static int (*pbuild_sub22_mcomps)( motion_engine_t *engine,
+                                                                       int i0,  int j0, int ihigh, int jhigh, 
+                                                                  int null_mc_sad,
+                                                                  uint8_t *s22org,  uint8_t *s22blk, 
+                                                                  int flx, int fh,  int searched_sub44_size );
+
+static int (*pdist2_22)( uint8_t *blk1, uint8_t *blk2,
+                                                int lx, int h);
+static int (*pbdist2_22)( uint8_t *blk1f, uint8_t *blk1b, 
+                                                 uint8_t *blk2,
+                                                 int lx, int h);
+
+static int dist1_00( uint8_t *blk1, uint8_t *blk2,  int lx, int h, int distlim);
+static int dist1_01(uint8_t *blk1, uint8_t *blk2, int lx, int h);
+static int dist1_10(uint8_t *blk1, uint8_t *blk2, int lx, int h);
+static int dist1_11(uint8_t *blk1, uint8_t *blk2, int lx, int h);
+static int dist2 (uint8_t *blk1, uint8_t *blk2,
+                                                         int lx, int hx, int hy, int h);
+static int bdist2 (uint8_t *pf, uint8_t *pb,
+       uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
+static int bdist1 (uint8_t *pf, uint8_t *pb,
+                                  uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
+
+
+static int (*pdist22) ( uint8_t *blk1, uint8_t *blk2,  int flx, int fh);
+static int (*pdist44) ( uint8_t *blk1, uint8_t *blk2,  int qlx, int qh);
+static int (*pdist1_00) ( uint8_t *blk1, uint8_t *blk2,  int lx, int h, int distlim);
+static int (*pdist1_01) ( uint8_t *blk1, uint8_t *blk2, int lx, int h);
+static int (*pdist1_10) ( uint8_t *blk1, uint8_t *blk2, int lx, int h);
+static int (*pdist1_11) ( uint8_t *blk1, uint8_t *blk2, int lx, int h);
+
+static int (*pdist2) (uint8_t *blk1, uint8_t *blk2,
+                                         int lx, int hx, int hy, int h);
+  
+  
+static int (*pbdist2) (uint8_t *pf, uint8_t *pb,
+                                          uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
+
+static int (*pbdist1) (uint8_t *pf, uint8_t *pb,
+                                          uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
+
+
+/* DEBUGGER...
+static void check_mc( const mb_motion_s *motion, int rx, int ry, int i, int j, char *str )
+{
+       rx *= 2; ry *= 2;
+       if( motion->sad < 0 || motion->sad > 0x10000 )
+       {
+               printf( "SAD ooops %s\n", str );
+               exit(1);
+       }
+       if( motion->pos.x-i*2 < -rx || motion->pos.x-i*2 >= rx )
+       {
+               printf( "X MC ooops %s = %d [%d]\n", str, motion->pos.x-i*2,rx );
+               exit(1);
+       }
+       if( motion->pos.y-j*2 < -ry || motion->pos.y-j*2 >= ry )
+       {
+               printf( "Y MC ooops %s %d [%d]\n", str, motion->pos.y-j*2, ry );
+               exit(1);
+       }
+}
+
+static void init_mc( mb_motion_s *motion )
+{
+       motion->sad = -123;
+       motion->pos.x = -1000;
+       motion->pos.y = -1000;
+}
+*/
+
+/* unidir_NI_var_sum
+   Compute the combined variance of luminance and chrominance information
+   for a particular non-intra macro block after unidirectional
+   motion compensation...  
+
+   Note: results are scaled to give chrominance equal weight to
+   chrominance.  The variance of the luminance portion is computed
+   at the time the motion compensation is computed.
+
+   TODO: Perhaps we should compute the whole thing in fullsearch not
+   seperate it.  However, that would involve a lot of fiddling with
+   field_* and until its thoroughly debugged and tested I think I'll
+   leave that alone. Furthermore, it is unclear if its really worth
+   doing these computations for B *and* P frames.
+
+   TODO: BUG: ONLY works for 420 video...
+
+*/
+
+static int unidir_chrom_var_sum( mb_motion_s *lum_mc, 
+                                                         uint8_t **ref, 
+                                                         subsampled_mb_s *ssblk,
+                                                         int lx, int h )
+{
+       int uvlx = (lx>>1);
+       int uvh = (h>>1);
+       /* N.b. MC co-ordinates are computed in half-pel units! */
+       int cblkoffset = (lum_mc->fieldoff>>1) +
+               (lum_mc->pos.x>>2) + (lum_mc->pos.y>>2)*uvlx;
+       
+       return  ((*pdist2_22)( ref[1] + cblkoffset, ssblk->umb, uvlx, uvh) +
+                        (*pdist2_22)( ref[2] + cblkoffset, ssblk->vmb, uvlx, uvh))*2;
+}
+
+/*
+  bidir_NI_var_sum
+   Compute the combined variance of luminance and chrominance information
+   for a particular non-intra macro block after unidirectional
+   motion compensation...  
+
+   Note: results are scaled to give chrominance equal weight to
+   chrominance.  The variance of the luminance portion is computed
+   at the time the motion compensation is computed.
+
+   Note: results scaled to give chrominance equal weight to chrominance.
+  
+  TODO: BUG: ONLY works for 420 video...
+
+  NOTE: Currently unused but may be required if it turns out that taking
+  chrominance into account in B frames is needed.
+
+ */
+
+int bidir_chrom_var_sum( mb_motion_s *lum_mc_f, 
+                                          mb_motion_s *lum_mc_b, 
+                                          uint8_t **ref_f, 
+                                          uint8_t **ref_b,
+                                          subsampled_mb_s *ssblk,
+                                          int lx, int h )
+{
+       int uvlx = (lx>>1);
+       int uvh = (h>>1);
+       /* N.b. MC co-ordinates are computed in half-pel units! */
+       int cblkoffset_f = (lum_mc_f->fieldoff>>1) + 
+               (lum_mc_f->pos.x>>2) + (lum_mc_f->pos.y>>2)*uvlx;
+       int cblkoffset_b = (lum_mc_b->fieldoff>>1) + 
+               (lum_mc_b->pos.x>>2) + (lum_mc_f->pos.y>>2)*uvlx;
+       
+       return  (
+               (*pbdist2_22)( ref_f[1] + cblkoffset_f, ref_b[1] + cblkoffset_b,
+                                          ssblk->umb, uvlx, uvh ) +
+               (*pbdist2_22)( ref_f[2] + cblkoffset_f, ref_b[2] + cblkoffset_b,
+                                          ssblk->vmb, uvlx, uvh ))*2;
+
+}
+
+static int chrom_var_sum( subsampled_mb_s *ssblk, int h, int lx )
+{
+       return (variance(ssblk->umb,(h>>1),(lx>>1)) + 
+                       variance(ssblk->vmb,(h>>1),(lx>>1))) * 2;
+}
+
+/*
+ * frame picture motion estimation
+ *
+ * org: top left pel of source reference frame
+ * ref: top left pel of reconstructed reference frame
+ * ssmb:  macroblock to be matched
+ * i,j: location of mb relative to ref (=center of search window)
+ * sx,sy: half widths of search window
+ * besfr: location and SAD of best frame prediction
+ * besttop: location of best field pred. for top field of mb
+ * bestbo : location of best field pred. for bottom field of mb
+ */
+
+static void frame_estimate(motion_engine_t *engine, 
+       uint8_t *org,
+       uint8_t *ref,
+       subsampled_mb_s *ssmb,
+       int i, int j, int sx, int sy,
+       mb_motion_s *bestfr,
+       mb_motion_s *besttop,
+       mb_motion_s *bestbot,
+       int imins[2][2],
+       int jmins[2][2]
+       )
+{
+       subsampled_mb_s  botssmb;
+       mb_motion_s topfld_mc;
+       mb_motion_s botfld_mc;
+
+       botssmb.mb = ssmb->mb+width;
+       botssmb.fmb = ssmb->mb+(width>>1);
+       botssmb.qmb = ssmb->qmb+(width>>2);
+       botssmb.umb = ssmb->umb+(width>>1);
+       botssmb.vmb = ssmb->vmb+(width>>1);
+
+       /* frame prediction */
+       fullsearch(engine, org,ref,ssmb,width,i,j,sx,sy,16,width,height,
+                                                 bestfr );
+       bestfr->fieldsel = 0;
+       bestfr->fieldoff = 0;
+
+       /* predict top field from top field */
+       fullsearch(engine, org,ref,ssmb,width<<1,i,j>>1,sx,sy>>1,8,width,height>>1,
+                          &topfld_mc);
+
+       /* predict top field from bottom field */
+       fullsearch(engine, org+width,ref+width,ssmb, width<<1,i,j>>1,sx,sy>>1,8,
+                          width,height>>1, &botfld_mc);
+
+       /* set correct field selectors... */
+       topfld_mc.fieldsel = 0;
+       botfld_mc.fieldsel = 1;
+       topfld_mc.fieldoff = 0;
+       botfld_mc.fieldoff = width;
+
+       imins[0][0] = topfld_mc.pos.x;
+       jmins[0][0] = topfld_mc.pos.y;
+       imins[1][0] = botfld_mc.pos.x;
+       jmins[1][0] = botfld_mc.pos.y;
+
+       /* select prediction for top field */
+       if (topfld_mc.sad<=botfld_mc.sad)
+       {
+               *besttop = topfld_mc;
+       }
+       else
+       {
+               *besttop = botfld_mc;
+       }
+
+       /* predict bottom field from top field */
+       fullsearch(engine, org,ref,&botssmb,
+                                       width<<1,i,j>>1,sx,sy>>1,8,width,height>>1,
+                                       &topfld_mc);
+
+       /* predict bottom field from bottom field */
+       fullsearch(engine, org+width,ref+width,&botssmb,
+                                       width<<1,i,j>>1,sx,sy>>1,8,width,height>>1,
+                                       &botfld_mc);
+
+       /* set correct field selectors... */
+       topfld_mc.fieldsel = 0;
+       botfld_mc.fieldsel = 1;
+       topfld_mc.fieldoff = 0;
+       botfld_mc.fieldoff = width;
+
+       imins[0][1] = topfld_mc.pos.x;
+       jmins[0][1] = topfld_mc.pos.y;
+       imins[1][1] = botfld_mc.pos.x;
+       jmins[1][1] = botfld_mc.pos.y;
+
+       /* select prediction for bottom field */
+       if (botfld_mc.sad<=topfld_mc.sad)
+       {
+               *bestbot = botfld_mc;
+       }
+       else
+       {
+               *bestbot = topfld_mc;
+       }
+}
+
+/*
+ * field picture motion estimation subroutine
+ *
+ * toporg: address of original top reference field
+ * topref: address of reconstructed top reference field
+ * botorg: address of original bottom reference field
+ * botref: address of reconstructed bottom reference field
+ * ssmmb:  macroblock to be matched
+ * i,j: location of mb (=center of search window)
+ * sx,sy: half width/height of search window
+ *
+ * bestfld: location and distance of best field prediction
+ * best8u: location of best 16x8 pred. for upper half of mb
+ * best8lp: location of best 16x8 pred. for lower half of mb
+ * bdestsp: location and distance of best same parity field
+ *                    prediction (needed for dual prime, only valid if
+ *                    ipflag==0)
+ */
+
+static void field_estimate (motion_engine_t *engine,
+       pict_data_s *picture,
+       uint8_t *toporg,
+       uint8_t *topref, 
+       uint8_t *botorg, 
+       uint8_t *botref,
+       subsampled_mb_s *ssmb,
+       int i, int j, int sx, int sy, int ipflag,
+       mb_motion_s *bestfld,
+       mb_motion_s *best8u,
+       mb_motion_s *best8l,
+       mb_motion_s *bestsp)
+
+{
+       mb_motion_s topfld_mc;
+       mb_motion_s botfld_mc;
+       int dt, db;
+       int notop, nobot;
+       subsampled_mb_s botssmb;
+
+       botssmb.mb = ssmb->mb+width;
+       botssmb.umb = ssmb->umb+(width>>1);
+       botssmb.vmb = ssmb->vmb+(width>>1);
+       botssmb.fmb = ssmb->fmb+(width>>1);
+       botssmb.qmb = ssmb->qmb+(width>>2);
+
+       /* if ipflag is set, predict from field of opposite parity only */
+       notop = ipflag && (picture->pict_struct==TOP_FIELD);
+       nobot = ipflag && (picture->pict_struct==BOTTOM_FIELD);
+
+       /* field prediction */
+
+       /* predict current field from top field */
+       if (notop)
+               topfld_mc.sad = dt = 65536; /* infinity */
+       else
+               fullsearch(engine, toporg,topref,ssmb,width<<1,
+                                  i,j,sx,sy>>1,16,width,height>>1,
+                                  &topfld_mc);
+       dt = topfld_mc.sad;
+       /* predict current field from bottom field */
+       if (nobot)
+               botfld_mc.sad = db = 65536; /* infinity */
+       else
+               fullsearch(engine, botorg,botref,ssmb,width<<1,
+                                  i,j,sx,sy>>1,16,width,height>>1,
+                                  &botfld_mc);
+       db = botfld_mc.sad;
+       /* Set correct field selectors */
+       topfld_mc.fieldsel = 0;
+       botfld_mc.fieldsel = 1;
+       topfld_mc.fieldoff = 0;
+       botfld_mc.fieldoff = width;
+
+       /* same parity prediction (only valid if ipflag==0) */
+       if (picture->pict_struct==TOP_FIELD)
+       {
+               *bestsp = topfld_mc;
+       }
+       else
+       {
+               *bestsp = botfld_mc;
+       }
+
+       /* select field prediction */
+       if (dt<=db)
+       {
+               *bestfld = topfld_mc;
+       }
+       else
+       {
+               *bestfld = botfld_mc;
+       }
+
+
+       /* 16x8 motion compensation */
+
+       /* predict upper half field from top field */
+       if (notop)
+               topfld_mc.sad = dt = 65536;
+       else
+               fullsearch(engine, toporg,topref,ssmb,width<<1,
+                                  i,j,sx,sy>>1,8,width,height>>1,
+                                   &topfld_mc);
+       dt = topfld_mc.sad;
+       /* predict upper half field from bottom field */
+       if (nobot)
+               botfld_mc.sad = db = 65536;
+       else
+               fullsearch(engine, botorg,botref,ssmb,width<<1,
+                                  i,j,sx,sy>>1,8,width,height>>1,
+                                   &botfld_mc);
+       db = botfld_mc.sad;
+
+       /* Set correct field selectors */
+       topfld_mc.fieldsel = 0;
+       botfld_mc.fieldsel = 1;
+       topfld_mc.fieldoff = 0;
+       botfld_mc.fieldoff = width;
+
+       /* select prediction for upper half field */
+       if (dt<=db)
+       {
+               *best8u = topfld_mc;
+       }
+       else
+       {
+               *best8u = botfld_mc;
+       }
+
+       /* predict lower half field from top field */
+       /*
+         N.b. For interlaced data width<<4 (width*16) takes us 8 rows
+         down in the same field.  
+         Thus for the fast motion data (2*2
+         sub-sampled) we need to go 4 rows down in the same field.
+         This requires adding width*4 = (width<<2).
+         For the 4*4 sub-sampled motion data we need to go down 2 rows.
+         This requires adding width = width
+        
+       */
+       if (notop)
+               topfld_mc.sad = dt = 65536;
+       else
+               fullsearch(engine, toporg,topref,&botssmb,
+                                               width<<1,
+                                               i,j+8,sx,sy>>1,8,width,height>>1,
+                                  /* &imint,&jmint, &dt,*/ &topfld_mc);
+       dt = topfld_mc.sad;
+       /* predict lower half field from bottom field */
+       if (nobot)
+               botfld_mc.sad = db = 65536;
+       else
+               fullsearch(engine, botorg,botref,&botssmb,width<<1,
+                                               i,j+8,sx,sy>>1,8,width,height>>1,
+                                  /* &iminb,&jminb, &db,*/ &botfld_mc);
+       db = botfld_mc.sad;
+       /* Set correct field selectors */
+       topfld_mc.fieldsel = 0;
+       botfld_mc.fieldsel = 1;
+       topfld_mc.fieldoff = 0;
+       botfld_mc.fieldoff = width;
+
+       /* select prediction for lower half field */
+       if (dt<=db)
+       {
+               *best8l = topfld_mc;
+       }
+       else
+       {
+               *best8l = botfld_mc;
+       }
+}
+
+static void dpframe_estimate (motion_engine_t *engine,
+       pict_data_s *picture,
+       uint8_t *ref,
+       subsampled_mb_s *ssmb,
+       
+       int i, int j, int iminf[2][2], int jminf[2][2],
+       mb_motion_s *best_mc,
+       int *imindmvp, int *jmindmvp,
+       int *vmcp)
+{
+       int pref,ppred,delta_x,delta_y;
+       int is,js,it,jt,ib,jb,it0,jt0,ib0,jb0;
+       int imins,jmins,imint,jmint,iminb,jminb,imindmv,jmindmv;
+       int vmc,local_dist;
+
+       /* Calculate Dual Prime distortions for 9 delta candidates
+        * for each of the four minimum field vectors
+        * Note: only for P pictures!
+        */
+
+       /* initialize minimum dual prime distortion to large value */
+       vmc = INT_MAX;
+
+       for (pref=0; pref<2; pref++)
+       {
+               for (ppred=0; ppred<2; ppred++)
+               {
+                       /* convert Cartesian absolute to relative motion vector
+                        * values (wrt current macroblock address (i,j)
+                        */
+                       is = iminf[pref][ppred] - (i<<1);
+                       js = jminf[pref][ppred] - (j<<1);
+
+                       if (pref!=ppred)
+                       {
+                               /* vertical field shift adjustment */
+                               if (ppred==0)
+                                       js++;
+                               else
+                                       js--;
+
+                               /* mvxs and mvys scaling*/
+                               is<<=1;
+                               js<<=1;
+                               if (picture->topfirst == ppred)
+                               {
+                                       /* second field: scale by 1/3 */
+                                       is = (is>=0) ? (is+1)/3 : -((-is+1)/3);
+                                       js = (js>=0) ? (js+1)/3 : -((-js+1)/3);
+                               }
+                               else
+                                       continue;
+                       }
+
+                       /* vector for prediction from field of opposite 'parity' */
+                       if (picture->topfirst)
+                       {
+                               /* vector for prediction of top field from bottom field */
+                               it0 = ((is+(is>0))>>1);
+                               jt0 = ((js+(js>0))>>1) - 1;
+
+                               /* vector for prediction of bottom field from top field */
+                               ib0 = ((3*is+(is>0))>>1);
+                               jb0 = ((3*js+(js>0))>>1) + 1;
+                       }
+                       else
+                       {
+                               /* vector for prediction of top field from bottom field */
+                               it0 = ((3*is+(is>0))>>1);
+                               jt0 = ((3*js+(js>0))>>1) - 1;
+
+                               /* vector for prediction of bottom field from top field */
+                               ib0 = ((is+(is>0))>>1);
+                               jb0 = ((js+(js>0))>>1) + 1;
+                       }
+
+                       /* convert back to absolute half-pel field picture coordinates */
+                       is += i<<1;
+                       js += j<<1;
+                       it0 += i<<1;
+                       jt0 += j<<1;
+                       ib0 += i<<1;
+                       jb0 += j<<1;
+
+                       if (is >= 0 && is <= (width-16)<<1 &&
+                               js >= 0 && js <= (height-16))
+                       {
+                               for (delta_y=-1; delta_y<=1; delta_y++)
+                               {
+                                       for (delta_x=-1; delta_x<=1; delta_x++)
+                                       {
+                                               /* opposite field coordinates */
+                                               it = it0 + delta_x;
+                                               jt = jt0 + delta_y;
+                                               ib = ib0 + delta_x;
+                                               jb = jb0 + delta_y;
+
+                                               if (it >= 0 && it <= (width-16)<<1 &&
+                                                       jt >= 0 && jt <= (height-16) &&
+                                                       ib >= 0 && ib <= (width-16)<<1 &&
+                                                       jb >= 0 && jb <= (height-16))
+                                               {
+                                                       /* compute prediction error */
+                                                       local_dist = (*pbdist2)(
+                                                               ref + (is>>1) + (width<<1)*(js>>1),
+                                                               ref + width + (it>>1) + (width<<1)*(jt>>1),
+                                                               ssmb->mb,             /* current mb location */
+                                                               width<<1,       /* adjacent line distance */
+                                                               is&1, js&1, it&1, jt&1, /* half-pel flags */
+                                                               8);             /* block height */
+                                                       local_dist += (*pbdist2)(
+                                                               ref + width + (is>>1) + (width<<1)*(js>>1),
+                                                               ref + (ib>>1) + (width<<1)*(jb>>1),
+                                                               ssmb->mb + width,     /* current mb location */
+                                                               width<<1,       /* adjacent line distance */
+                                                               is&1, js&1, ib&1, jb&1, /* half-pel flags */
+                                                               8);             /* block height */
+
+                                                       /* update delta with least distortion vector */
+                                                       if (local_dist < vmc)
+                                                       {
+                                                               imins = is;
+                                                               jmins = js;
+                                                               imint = it;
+                                                               jmint = jt;
+                                                               iminb = ib;
+                                                               jminb = jb;
+                                                               imindmv = delta_x;
+                                                               jmindmv = delta_y;
+                                                               vmc = local_dist;
+                                                       }
+                                               }
+                                       }  /* end delta x loop */
+                               } /* end delta y loop */
+                       }
+               }
+       }
+
+       /* TODO: This is now likely to be obsolete... */
+       /* Compute L1 error for decision purposes */
+       local_dist = (*pbdist1)(
+               ref + (imins>>1) + (width<<1)*(jmins>>1),
+               ref + width + (imint>>1) + (width<<1)*(jmint>>1),
+               ssmb->mb,
+               width<<1,
+               imins&1, jmins&1, imint&1, jmint&1,
+               8);
+//printf("motion 1 %p\n", pbdist1);
+       local_dist += (*pbdist1)(
+               ref + width + (imins>>1) + (width<<1)*(jmins>>1),
+               ref + (iminb>>1) + (width<<1)*(jminb>>1),
+               ssmb->mb + width,
+               width<<1,
+               imins&1, jmins&1, iminb&1, jminb&1,
+               8);
+//printf("motion 2\n");
+
+       best_mc->sad = local_dist;
+       best_mc->pos.x = imins;
+       best_mc->pos.y = jmins;
+       *vmcp = vmc;
+       *imindmvp = imindmv;
+       *jmindmvp = jmindmv;
+//printf("motion 2\n");
+}
+
+static void dpfield_estimate(motion_engine_t *engine,
+       pict_data_s *picture,
+       uint8_t *topref,
+       uint8_t *botref, 
+       uint8_t *mb,
+       int i, int j, int imins, int jmins, 
+       mb_motion_s *bestdp_mc,
+       int *vmcp
+       )
+
+{
+       uint8_t *sameref, *oppref;
+       int io0,jo0,io,jo,delta_x,delta_y,mvxs,mvys,mvxo0,mvyo0;
+       int imino,jmino,imindmv,jmindmv,vmc_dp,local_dist;
+
+       /* Calculate Dual Prime distortions for 9 delta candidates */
+       /* Note: only for P pictures! */
+
+       /* Assign opposite and same reference pointer */
+       if (picture->pict_struct==TOP_FIELD)
+       {
+               sameref = topref;    
+               oppref = botref;
+       }
+       else 
+       {
+               sameref = botref;
+               oppref = topref;
+       }
+
+       /* convert Cartesian absolute to relative motion vector
+        * values (wrt current macroblock address (i,j)
+        */
+       mvxs = imins - (i<<1);
+       mvys = jmins - (j<<1);
+
+       /* vector for prediction from field of opposite 'parity' */
+       mvxo0 = (mvxs+(mvxs>0)) >> 1;  /* mvxs / / */
+       mvyo0 = (mvys+(mvys>0)) >> 1;  /* mvys / / 2*/
+
+                       /* vertical field shift correction */
+       if (picture->pict_struct==TOP_FIELD)
+               mvyo0--;
+       else
+               mvyo0++;
+
+                       /* convert back to absolute coordinates */
+       io0 = mvxo0 + (i<<1);
+       jo0 = mvyo0 + (j<<1);
+
+                       /* initialize minimum dual prime distortion to large value */
+       vmc_dp = 1 << 30;
+
+       for (delta_y = -1; delta_y <= 1; delta_y++)
+       {
+               for (delta_x = -1; delta_x <=1; delta_x++)
+               {
+                       /* opposite field coordinates */
+                       io = io0 + delta_x;
+                       jo = jo0 + delta_y;
+
+                       if (io >= 0 && io <= (width-16)<<1 &&
+                               jo >= 0 && jo <= (height2-16)<<1)
+                       {
+                               /* compute prediction error */
+                               local_dist = (*pbdist2)(
+                                       sameref + (imins>>1) + width2*(jmins>>1),
+                                       oppref  + (io>>1)    + width2*(jo>>1),
+                                       mb,             /* current mb location */
+                                       width2,         /* adjacent line distance */
+                                       imins&1, jmins&1, io&1, jo&1, /* half-pel flags */
+                                       16);            /* block height */
+
+                               /* update delta with least distortion vector */
+                               if (local_dist < vmc_dp)
+                               {
+                                       imino = io;
+                                       jmino = jo;
+                                       imindmv = delta_x;
+                                       jmindmv = delta_y;
+                                       vmc_dp = local_dist;
+                               }
+                       }
+               }  /* end delta x loop */
+       } /* end delta y loop */
+
+       /* Compute L1 error for decision purposes */
+       bestdp_mc->sad =
+               (*pbdist1)(
+                       sameref + (imins>>1) + width2*(jmins>>1),
+                       oppref  + (imino>>1) + width2*(jmino>>1),
+                       mb,             /* current mb location */
+                       width2,         /* adjacent line distance */
+                       imins&1, jmins&1, imino&1, jmino&1, /* half-pel flags */
+                       16);            /* block height */
+
+       bestdp_mc->pos.x = imindmv;
+       bestdp_mc->pos.x = imindmv;
+       *vmcp = vmc_dp;
+}
+
+
+/* 
+ *   Vectors of motion compensations 
+ */
+
+/*
+       Take a vector of motion compensations and repeatedly make passes
+       discarding all elements whose sad "weight" is above the current mean weight.
+*/
+
+static void sub_mean_reduction( mc_result_s *matches, int len, 
+                                                               int times,
+                                                           int *newlen_res, int *minweight_res)
+{
+       int i,j;
+       int weight_sum;
+       int mean_weight;
+       int min_weight = 100000;
+       if( len == 0 )
+       {
+               *minweight_res = 100000;
+               *newlen_res = 0;
+               return;
+       }
+
+       for(;;)
+       {
+               weight_sum = 0;
+               for( i = 0; i < len ; ++i )
+                       weight_sum += matches[i].weight;
+               mean_weight = weight_sum / len;
+               
+               if( times <= 0)
+                       break;
+                       
+               j = 0;
+               for( i =0; i < len; ++i )
+               {
+                       if( matches[i].weight <= mean_weight )
+                       {
+                               if( times == 1)
+                               {
+                                       min_weight = matches[i].weight ;
+                               }
+                               matches[j] = matches[i];
+                               ++j;
+                       }
+               }
+               len = j;
+               --times;
+       }
+       *newlen_res = len;
+       *minweight_res = mean_weight;
+}
+
+/*
+  Build a vector of the top   4*4 sub-sampled motion compensations in the box
+  (ilow,jlow) to (ihigh,jhigh).
+
+       The algorithm is as follows:
+       1. coarse matches on an 8*8 grid of positions are collected that fall below
+       a (very conservative) sad threshold (basically 50% more than moving average of
+       the mean sad of such matches).
+       
+       2. The worse than-average matches are discarded.
+       
+       3. The remaining coarse matches are expanded with the left/lower neighbouring
+       4*4 grid matches. Again only those better than a threshold (this time the mean
+       of the 8*8 grid matches are retained.
+       
+       4. Multiple passes are made discarding worse than-average matches.  The number of
+       passes is specified by the user.  The default it 2 (leaving roughly 1/4 of the matches).
+       
+       The net result is very fast and find good matches if they're to be found.  I.e. the
+       penalty over exhaustive search is pretty low.
+       
+       NOTE: The "discard below average" trick depends critically on having some variation in
+       the matches.  The slight penalty imposed for distant matches (reasonably since the 
+       motion vectors have to be encoded) is *vital* as otherwise pathologically bad
+       performance results on highly uniform images.
+       
+       TODO: We should probably allow the user to eliminate the initial thinning of 8*8
+       grid matches if ultimate quality is demanded (e.g. for low bit-rate applications).
+
+*/
+
+
+static int build_sub44_mcomps(motion_engine_t *engine,
+        int ilow, int jlow, int ihigh, int jhigh, 
+                                                       int i0, int j0,
+                                                               int null_mc_sad,
+                                                       uint8_t *s44org, uint8_t *s44blk, int qlx, int qh )
+{
+       uint8_t *s44orgblk;
+       int istrt = ilow-i0;
+       int jstrt = jlow-j0;
+       int iend = ihigh-i0;
+       int jend = jhigh-j0;
+       int mean_weight;
+       int threshold;
+
+#ifdef X86_CPU
+
+       /*int rangex, rangey;
+       static int rough_num_mcomps;
+       static mc_result_s rough_mcomps[MAX_44_MATCHES];
+       int k;
+       */
+#else
+       int i,j;
+       int s1;
+       uint8_t *old_s44orgblk;
+#endif
+       /* N.b. we may ignore the right hand block of the pair going over the
+          right edge as we have carefully allocated the buffer oversize to ensure
+          no memory faults.  The later motion compensation calculations
+          performed on the results of this pass will filter out
+          out-of-range blocks...
+       */
+
+
+       engine->sub44_num_mcomps = 0;
+       
+       threshold = 6*null_mc_sad / (4*4*mc_44_red);
+       s44orgblk = s44org+(ilow>>2)+qlx*(jlow>>2);
+       
+       /* Exhaustive search on 4*4 sub-sampled data.  This is affordable because
+               (a)     it is only 16th of the size of the real 1-pel data 
+               (b) we ignore those matches with an sad above our threshold.    
+       */
+#ifndef X86_CPU
+
+               /* Invariant:  s44orgblk = s44org+(i>>2)+qlx*(j>>2) */
+               s44orgblk = s44org+(ilow>>2)+qlx*(jlow>>2);
+               for( j = jstrt; j <= jend; j += 4 )
+               {
+                       old_s44orgblk = s44orgblk;
+                       for( i = istrt; i <= iend; i += 4 )
+                       {
+                               s1 = ((*pdist44)( s44orgblk,s44blk,qlx,qh) & 0xffff) + abs(i-i0) + abs(j-j0);
+                               if( s1 < threshold )
+                               {
+                                       engine->sub44_mcomps[engine->sub44_num_mcomps].x = i;
+                                       engine->sub44_mcomps[engine->sub44_num_mcomps].y = j;
+                                       engine->sub44_mcomps[engine->sub44_num_mcomps].weight = s1 ;
+                                       ++engine->sub44_num_mcomps;
+                               }
+                               s44orgblk += 1;
+                       }
+                       s44orgblk = old_s44orgblk + qlx;
+               }
+                       
+#else
+
+               engine->sub44_num_mcomps
+                       = (*pmblock_sub44_dists)( s44orgblk, s44blk,
+                                                                 istrt, jstrt,
+                                                                 iend, jend, 
+                                                                 qh, qlx, 
+                                                                 threshold,
+                                                                 engine->sub44_mcomps);
+
+#endif 
+               /* If we're really pushing quality we reduce once otherwise twice. */
+                       
+               sub_mean_reduction( engine->sub44_mcomps, engine->sub44_num_mcomps, 1+(mc_44_red>1),
+                                                   &engine->sub44_num_mcomps, &mean_weight);
+
+
+       return engine->sub44_num_mcomps;
+}
+
+
+/* Build a vector of the best 2*2 sub-sampled motion
+  compensations using the best 4*4 matches as starting points.  As
+  with with the 4*4 matches We don't collect them densely as they're
+  just search starting points for 1-pel search and ones that are 1 out
+  should still give better than average matches...
+
+*/
+
+
+static int build_sub22_mcomps(motion_engine_t *engine,
+               int i0,  int j0, int ihigh, int jhigh, 
+               int null_mc_sad,
+               uint8_t *s22org,  uint8_t *s22blk, 
+               int flx, int fh,  int searched_sub44_size )
+{
+       int i,k,s;
+       int threshold = 6*null_mc_sad / (2 * 2*mc_22_red);
+
+       int min_weight;
+       int ilim = ihigh-i0;
+       int jlim = jhigh-j0;
+       blockxy matchrec;
+       uint8_t *s22orgblk;
+
+       engine->sub22_num_mcomps = 0;
+       for( k = 0; k < searched_sub44_size; ++k )
+       {
+
+               matchrec.x = engine->sub44_mcomps[k].x;
+               matchrec.y = engine->sub44_mcomps[k].y;
+
+               s22orgblk =  s22org +((matchrec.y+j0)>>1)*flx +((matchrec.x+i0)>>1);
+               for( i = 0; i < 4; ++i )
+               {
+                       if( matchrec.x <= ilim && matchrec.y <= jlim )
+                       {       
+                               s = (*pdist22)( s22orgblk,s22blk,flx,fh);
+                               if( s < threshold )
+                               {
+                                       engine->sub22_mcomps[engine->sub22_num_mcomps].x = (int8_t)matchrec.x;
+                                       engine->sub22_mcomps[engine->sub22_num_mcomps].y = (int8_t)matchrec.y;
+                                       engine->sub22_mcomps[engine->sub22_num_mcomps].weight = s;
+                                       ++engine->sub22_num_mcomps;
+                               }
+                       }
+
+                       if( i == 1 )
+                       {
+                               s22orgblk += flx-1;
+                               matchrec.x -= 2;
+                               matchrec.y += 2;
+                       }
+                       else
+                       {
+                               s22orgblk += 1;
+                               matchrec.x += 2;
+                               
+                       }
+               }
+
+       }
+
+       
+       sub_mean_reduction( engine->sub22_mcomps, engine->sub22_num_mcomps, 2,
+                                               &engine->sub22_num_mcomps, &min_weight );
+       return engine->sub22_num_mcomps;
+}
+
+#ifdef X86_CPU
+int build_sub22_mcomps_mmxe(motion_engine_t *engine,
+       int i0,  int j0, int ihigh, int jhigh, 
+       int null_mc_sad,
+       uint8_t *s22org,  uint8_t *s22blk, 
+       int flx, int fh,  int searched_sub44_size )
+{
+       int i,k,s;
+       int threshold = 6*null_mc_sad / (2 * 2*mc_22_red);
+
+       int min_weight;
+       int ilim = ihigh-i0;
+       int jlim = jhigh-j0;
+       blockxy matchrec;
+       uint8_t *s22orgblk;
+       int resvec[4];
+
+       /* TODO: The calculation of the lstrow offset really belongs in
+       asm code... */
+       int lstrow=(fh-1)*flx;
+
+       engine->sub22_num_mcomps = 0;
+       for( k = 0; k < searched_sub44_size; ++k )
+       {
+
+               matchrec.x = engine->sub44_mcomps[k].x;
+               matchrec.y = engine->sub44_mcomps[k].y;
+
+               s22orgblk =  s22org +((matchrec.y+j0)>>1)*flx +((matchrec.x+i0)>>1);
+               mblockq_dist22_mmxe(s22orgblk+lstrow, s22blk+lstrow, flx, fh, resvec);
+               for( i = 0; i < 4; ++i )
+               {
+                       if( matchrec.x <= ilim && matchrec.y <= jlim )
+                       {       
+                               s =resvec[i];
+                               if( s < threshold )
+                               {
+                                       engine->sub22_mcomps[engine->sub22_num_mcomps].x = (int8_t)matchrec.x;
+                                       engine->sub22_mcomps[engine->sub22_num_mcomps].y = (int8_t)matchrec.y;
+                                       engine->sub22_mcomps[engine->sub22_num_mcomps].weight = s;
+                                       ++engine->sub22_num_mcomps;
+                               }
+                       }
+
+                       if( i == 1 )
+                       {
+                               matchrec.x -= 2;
+                               matchrec.y += 2;
+                       }
+                       else
+                       {
+                               matchrec.x += 2;
+                       }
+               }
+
+       }
+
+       
+       sub_mean_reduction( engine->sub22_mcomps, engine->sub22_num_mcomps, 2,
+                                               &engine->sub22_num_mcomps, &min_weight );
+       return engine->sub22_num_mcomps;
+}
+
+#endif
+
+/*
+  Search for the best 1-pel match within 1-pel of a good 2*2-pel match.
+       TODO: Its a bit silly to cart around absolute M/C co-ordinates that
+       eventually get turned into relative ones anyway...
+*/
+
+
+static void find_best_one_pel(motion_engine_t *engine,
+        uint8_t *org, uint8_t *blk,
+        int searched_size,
+        int i0, int j0,
+        int ilow, int jlow,
+        int xmax, int ymax,
+        int lx, int h, 
+        mb_motion_s *res
+       )
+
+{
+       int i,k;
+       int d;
+       blockxy minpos = res->pos;
+       int dmin = INT_MAX;
+       uint8_t *orgblk;
+       int penalty;
+       int init_search;
+       int init_size;
+       blockxy matchrec;
+       init_search = searched_size;
+       init_size = engine->sub22_num_mcomps;
+       for( k = 0; k < searched_size; ++k )
+       {       
+               matchrec.x = i0 + engine->sub22_mcomps[k].x;
+               matchrec.y = j0 + engine->sub22_mcomps[k].y;
+               orgblk = org + matchrec.x+lx*matchrec.y;
+               penalty = abs(matchrec.x)+abs(matchrec.y);
+
+               for( i = 0; i < 4; ++i )
+               {
+                       if( matchrec.x <= xmax && matchrec.y <= ymax )
+                       {
+               
+                               d = penalty+(*pdist1_00)(orgblk,blk,lx,h, dmin);
+                               if (d<dmin)
+                               {
+                                       dmin = d;
+                                       minpos = matchrec;
+                               }
+                       }
+                       if( i == 1 )
+                       {
+                               orgblk += lx-1;
+                               matchrec.x -= 1;
+                               matchrec.y += 1;
+                       }
+                       else
+                       {
+                               orgblk += 1;
+                               matchrec.x += 1;
+                       }
+               }
+       }
+
+       res->pos = minpos;
+       res->blk = org + minpos.x+lx*minpos.y;
+       res->sad = dmin;
+
+}
+
+#ifdef X86_CPU
+void find_best_one_pel_mmxe(motion_engine_t *engine,
+       uint8_t *org, uint8_t *blk,
+       int searched_size,
+       int i0, int j0,
+       int ilow, int jlow,
+       int xmax, int ymax,
+       int lx, int h, 
+       mb_motion_s *res
+       )
+
+{
+       int i,k;
+       int d;
+       blockxy minpos = res->pos;
+       int dmin = INT_MAX;
+       uint8_t *orgblk;
+       int penalty;
+       int init_search;
+       int init_size;
+       blockxy matchrec;
+       int resvec[4];
+
+       init_search = searched_size;
+       init_size = engine->sub22_num_mcomps;
+       for( k = 0; k < searched_size; ++k )
+       {       
+               matchrec.x = i0 + engine->sub22_mcomps[k].x;
+               matchrec.y = j0 + engine->sub22_mcomps[k].y;
+               orgblk = org + matchrec.x+lx*matchrec.y;
+               penalty = abs(matchrec.x)+abs(matchrec.y);
+               
+
+               mblockq_dist1_mmxe(orgblk,blk,lx,h, resvec);
+
+               for( i = 0; i < 4; ++i )
+               {
+                       if( matchrec.x <= xmax && matchrec.y <= ymax )
+                       {
+               
+                               d = penalty+resvec[i];
+                               if (d<dmin)
+                               {
+                                       dmin = d;
+                                       minpos = matchrec;
+                               }
+                       }
+                       if( i == 1 )
+                       {
+                               orgblk += lx-1;
+                               matchrec.x -= 1;
+                               matchrec.y += 1;
+                       }
+                       else
+                       {
+                               orgblk += 1;
+                               matchrec.x += 1;
+                       }
+               }
+       }
+
+       res->pos = minpos;
+       res->blk = org + minpos.x+lx*minpos.y;
+       res->sad = dmin;
+
+}
+#endif 
+/*
+ * full search block matching
+ *
+ * A.Stevens 2000: This is now a big misnomer.  The search is now a hierarchical/sub-sampling
+ * search not a full search.  However, experiments have shown it is always close to
+ * optimal and almost always very close or optimal.
+ *
+ * blk: top left pel of (16*h) block
+ * s22blk: top element of fast motion compensation block corresponding to blk
+ * h: height of block
+ * lx: distance (in bytes) of vertically adjacent pels in ref,blk
+ * org: top left pel of source reference picture
+ * ref: top left pel of reconstructed reference picture
+ * i0,j0: center of search window
+ * sx,sy: half widths of search window
+ * xmax,ymax: right/bottom limits of search area
+ * iminp,jminp: pointers to where the result is stored
+ *              result is given as half pel offset from ref(0,0)
+ *              i.e. NOT relative to (i0,j0)
+ */
+
+
+static void fullsearch(motion_engine_t *engine,
+       uint8_t *org,
+       uint8_t *ref,
+       subsampled_mb_s *ssblk,
+       int lx, int i0, int j0, 
+       int sx, int sy, int h,
+       int xmax, int ymax,
+       /* int *iminp, int *jminp, int *sadminp, */
+       mb_motion_s *res
+       )
+{
+       mb_motion_s best;
+       /* int imin, jmin, dmin */
+       int i,j,ilow,ihigh,jlow,jhigh;
+       int d;
+
+       /* NOTE: Surprisingly, the initial motion compensation search
+          works better when the original image not the reference (reconstructed)
+          image is used. 
+       */
+       uint8_t *s22org = (uint8_t*)(org+fsubsample_offset);
+       uint8_t *s44org = (uint8_t*)(org+qsubsample_offset);
+       uint8_t *orgblk;
+       int flx = lx >> 1;
+       int qlx = lx >> 2;
+       int fh = h >> 1;
+       int qh = h >> 2;
+
+
+       /* xmax and ymax into more useful form... */
+       xmax -= 16;
+       ymax -= h;
+  
+  
+       /* The search radii are *always* multiples of 4 to avoid messiness
+          in the initial 4*4 pel search.  This is handled by the
+          parameter checking/processing code in readparmfile() */
+  
+       /* Create a distance-order mcomps of possible motion compensations
+         based on the fast estimation data - 4*4 pel sums (4*4
+         sub-sampled) rather than actual pel's.  1/16 the size...  */
+       jlow = j0-sy;
+       jlow = jlow < 0 ? 0 : jlow;
+       jhigh =  j0+(sy-1);
+       jhigh = jhigh > ymax ? ymax : jhigh;
+       ilow = i0-sx;
+       ilow = ilow < 0 ? 0 : ilow;
+       ihigh =  i0+(sx-1);
+       ihigh = ihigh > xmax ? xmax : ihigh;
+
+       /*
+          Very rarely this may fail to find matchs due to all the good
+          looking ones being over threshold. hence we make sure we
+          fall back to a 0 motion compensation in this case.
+          
+                The sad for the 0 motion compensation is also very useful as
+                a basis for setting thresholds for rejecting really dud 4*4
+                and 2*2 sub-sampled matches.
+       */
+       best.sad = (*pdist1_00)(ref + i0 + j0 * lx,
+               ssblk->mb,
+               lx,
+               h,
+               best.sad);
+       best.pos.x = i0;
+       best.pos.y = j0;
+
+       engine->sub44_num_mcomps = build_sub44_mcomps(engine,
+                                                                        ilow, jlow, ihigh, jhigh,
+                                                                         i0, j0,
+                                                                         best.sad,
+                                                                         s44org, 
+                                                                         ssblk->qmb, qlx, qh );
+
+       
+       /* Now create a distance-ordered mcomps of possible motion
+          compensations based on the fast estimation data - 2*2 pel sums
+          using the best fraction of the 4*4 estimates However we cover
+          only coarsely... on 4-pel boundaries...  */
+
+       engine->sub22_num_mcomps = (*pbuild_sub22_mcomps)( engine, i0, j0, ihigh,  jhigh, 
+                                                                                          best.sad,
+                                                                                          s22org, ssblk->fmb, flx, fh, 
+                                                                                          engine->sub44_num_mcomps );
+
+               
+    /* Now choose best 1-pel match from what approximates (not exact
+          due to the pre-processing trick with the mean) the top 1/2 of
+          the 2*2 matches 
+       */
+       
+
+       (*pfind_best_one_pel)( engine, ref, ssblk->mb, engine->sub22_num_mcomps,
+                                          i0, j0,
+                                          ilow, jlow, xmax, ymax, 
+                                          lx, h, &best );
+
+       /* Final polish: half-pel search of best candidate against
+          reconstructed image. 
+       */
+
+       best.pos.x <<= 1; 
+       best.pos.y <<= 1;
+       best.hx = 0;
+       best.hy = 0;
+
+       ilow = best.pos.x - (best.pos.x>(ilow<<1));
+       ihigh = best.pos.x + (best.pos.x<((ihigh)<<1));
+       jlow = best.pos.y - (best.pos.y>(jlow<<1));
+       jhigh =  best.pos.y+ (best.pos.y<((jhigh)<<1));
+
+       for (j=jlow; j<=jhigh; j++)
+       {
+               for (i=ilow; i<=ihigh; i++)
+               {
+                       orgblk = ref+(i>>1)+((j>>1)*lx);
+                       if( i&1 )
+                       {
+                               if( j & 1 )
+                                       d = (*pdist1_11)(orgblk,ssblk->mb,lx,h);
+                               else
+                                       d = (*pdist1_01)(orgblk,ssblk->mb,lx,h);
+                       }
+                       else
+                       {
+                               if( j & 1 )
+                                       d = (*pdist1_10)(orgblk,ssblk->mb,lx,h);
+                               else
+                                       d = (*pdist1_00)(orgblk,ssblk->mb,lx,h,best.sad);
+                       }
+                       if (d<best.sad)
+                       {
+                               best.sad = d;
+                               best.pos.x = i;
+                               best.pos.y = j;
+                               best.blk = orgblk;
+                               best.hx = i&1;
+                               best.hy = j&1;
+                       }
+               }
+       }
+       best.var = (*pdist2)(best.blk, ssblk->mb, lx, best.hx, best.hy, h);
+       *res = best;
+}
+
+/*
+ * total absolute difference between two (16*h) blocks
+ * including optional half pel interpolation of blk1 (hx,hy)
+ * blk1,blk2: addresses of top left pels of both blocks
+ * lx:        distance (in bytes) of vertically adjacent pels
+ * hx,hy:     flags for horizontal and/or vertical interpolation
+ * h:         height of block (usually 8 or 16)
+ * distlim:   bail out if sum exceeds this value
+ */
+
+/* A.Stevens 2000: New version for highly pipelined CPUs where branching is
+   costly.  Really it sucks that C doesn't define a stdlib abs that could
+   be realised as a compiler intrinsic using appropriate CPU instructions.
+   That 1970's heritage...
+*/
+
+
+static int dist1_00(uint8_t *blk1,uint8_t *blk2,
+                                       int lx, int h,int distlim)
+{
+       uint8_t *p1,*p2;
+       int j;
+       int s;
+       register int v;
+
+       s = 0;
+       p1 = blk1;
+       p2 = blk2;
+       for (j=0; j<h; j++)
+       {
+
+#define pipestep(o) v = p1[o]-p2[o]; s+= abs(v);
+               pipestep(0);  pipestep(1);  pipestep(2);  pipestep(3);
+               pipestep(4);  pipestep(5);  pipestep(6);  pipestep(7);
+               pipestep(8);  pipestep(9);  pipestep(10); pipestep(11);
+               pipestep(12); pipestep(13); pipestep(14); pipestep(15);
+#undef pipestep
+
+               if (s >= distlim)
+                       break;
+                       
+               p1+= lx;
+               p2+= lx;
+       }
+       return s;
+}
+
+static int dist1_01(uint8_t *blk1,uint8_t *blk2,
+                                       int lx, int h)
+{
+       uint8_t *p1,*p2;
+       int i,j;
+       int s;
+       register int v;
+
+       s = 0;
+       p1 = blk1;
+       p2 = blk2;
+       for (j=0; j<h; j++)
+       {
+               for (i=0; i<16; i++)
+               {
+
+                       v = ((unsigned int)(p1[i]+p1[i+1])>>1) - p2[i];
+                       /*
+                         v = ((p1[i]>>1)+(p1[i+1]>>1)>>1) - (p2[i]>>1);
+                       */
+                       s+=abs(v);
+               }
+               p1+= lx;
+               p2+= lx;
+       }
+       return s;
+}
+
+static int dist1_10(uint8_t *blk1,uint8_t *blk2,
+                                       int lx, int h)
+{
+       uint8_t *p1,*p1a,*p2;
+       int i,j;
+       int s;
+       register int v;
+
+       s = 0;
+       p1 = blk1;
+       p2 = blk2;
+       p1a = p1 + lx;
+       for (j=0; j<h; j++)
+       {
+               for (i=0; i<16; i++)
+               {
+                       v = ((unsigned int)(p1[i]+p1a[i])>>1) - p2[i];
+                       s+=abs(v);
+               }
+               p1 = p1a;
+               p1a+= lx;
+               p2+= lx;
+       }
+
+       return s;
+}
+
+static int dist1_11(uint8_t *blk1,uint8_t *blk2,
+                                       int lx, int h)
+{
+       uint8_t *p1,*p1a,*p2;
+       int i,j;
+       int s;
+       register int v;
+
+       s = 0;
+       p1 = blk1;
+       p2 = blk2;
+       p1a = p1 + lx;
+         
+       for (j=0; j<h; j++)
+       {
+               for (i=0; i<16; i++)
+               {
+                       v = ((unsigned int)(p1[i]+p1[i+1]+p1a[i]+p1a[i+1])>>2) - p2[i];
+                       s+=abs(v);
+               }
+               p1 = p1a;
+               p1a+= lx;
+               p2+= lx;
+       }
+       return s;
+}
+
+/* USED only during debugging...
+void check_fast_motion_data(uint8_t *blk, char *label )
+{
+  uint8_t *b, *nb;
+  uint8_t *pb,*p;
+  uint8_t *qb;
+  uint8_t *start_s22blk, *start_s44blk;
+  int i;
+  unsigned int mismatch;
+  int nextfieldline;
+  start_s22blk = (blk+height*width);
+  start_s44blk = (blk+height*width+(height>>1)*(width>>1));
+
+  if (pict_struct==FRAME_PICTURE)
+       {
+         nextfieldline = width;
+       }
+  else
+       {
+         nextfieldline = 2*width;
+       }
+
+       mismatch = 0;
+       pb = start_s22blk;
+       qb = start_s44blk;
+       p = blk;
+       while( pb < qb )
+       {
+               for( i = 0; i < nextfieldline/2; ++i )
+               {
+                       mismatch |= ((p[0] + p[1] + p[nextfieldline] + p[nextfieldline+1])>>2) != *pb;
+                       p += 2;
+                       ++pb;                   
+               }
+               p += nextfieldline;
+       }
+               if( mismatch )
+                       {
+                               printf( "Mismatch detected check %s for buffer %08x\n", label,  blk );
+                                       exit(1);
+                       }
+}
+*/
+
+/* 
+   Append fast motion estimation data to original luminance
+   data.  N.b. memory allocation for luminance data allows space
+   for this information...
+ */
+
+void fast_motion_data(uint8_t *blk, int picture_struct )
+{
+       uint8_t *b, *nb;
+       uint8_t *pb;
+       uint8_t *qb;
+       uint8_t *start_s22blk, *start_s44blk;
+       uint16_t *start_rowblk, *start_colblk;
+       int i;
+       int nextfieldline;
+#ifdef TEST_RCSEARCH
+       uint16_t *pc, *pr,*p;
+       int rowsum;
+       int j,s;
+       int down16 = width*16;
+       uint16_t sums[32];
+       uint16_t rowsums[2048];
+       uint16_t colsums[2048];  /* TODO: BUG: should resize with width */
+#endif 
+  
+
+       /* In an interlaced field the "next" line is 2 width's down 
+          rather than 1 width down                                 */
+
+       if (picture_struct==FRAME_PICTURE)
+       {
+               nextfieldline = width;
+       }
+       else
+       {
+               nextfieldline = 2*width;
+       }
+
+       start_s22blk   = blk+fsubsample_offset;
+       start_s44blk   = blk+qsubsample_offset;
+       start_rowblk = (uint16_t *)blk+rowsums_offset;
+       start_colblk = (uint16_t *)blk+colsums_offset;
+       b = blk;
+       nb = (blk+nextfieldline);
+       /* Sneaky stuff here... we can do lines in both fields at once */
+       pb = (uint8_t *) start_s22blk;
+
+       while( nb < start_s22blk )
+       {
+               for( i = 0; i < nextfieldline/4; ++i ) /* We're doing 4 pels horizontally at once */
+               {
+                       /* TODO: A.Stevens this has to be the most word-length dependent
+                          code in the world.  Better than MMX assembler though I guess... */
+                       pb[0] = (b[0]+b[1]+nb[0]+nb[1])>>2;
+                       pb[1] = (b[2]+b[3]+nb[2]+nb[3])>>2;     
+                       pb += 2;
+                       b += 4;
+                       nb += 4;
+               }
+               b += nextfieldline;
+               nb = b + nextfieldline;
+       }
+
+
+       /* Now create the 4*4 sub-sampled data from the 2*2 
+          N.b. the 2*2 sub-sampled motion data preserves the interlace structure of the
+          original.  Albeit half as many lines and pixels...
+       */
+
+       nextfieldline = nextfieldline >> 1;
+
+       qb = start_s44blk;
+       b  = start_s22blk;
+       nb = (start_s22blk+nextfieldline);
+
+       while( nb < start_s44blk )
+       {
+               for( i = 0; i < nextfieldline/4; ++i )
+               {
+                       /* TODO: BRITTLE: A.Stevens - this only works for uint8_t = uint8_t */
+                       qb[0] = (b[0]+b[1]+nb[0]+nb[1])>>2;
+                       qb[1] = (b[2]+b[3]+nb[2]+nb[3])>>2;
+                       qb += 2;
+                       b += 4;
+                       nb += 4;
+               }
+               b += nextfieldline;
+               nb = b + nextfieldline;
+       }
+
+#ifdef TEST_RCSEARCH
+       /* TODO: BUG: THIS CODE DOES NOT YET ALLOW FOR INTERLACED FIELDS.... */
+  
+       /*
+         Initial row sums....
+       */
+       pb = blk;
+       for(j = 0; j < height; ++j )
+       {
+               rowsum = 0;
+               for( i = 0; i < 16; ++ i )
+               {
+                       rowsum += pb[i];
+               }
+               rowsums[j] = rowsum;
+               pb += width;
+       }
+  
+       /*
+         Initial column sums
+       */
+       for( i = 0; i < width; ++i )
+       {
+               colsums[i] = 0;
+       }
+       pb = blk;
+       for( j = 0; j < 16; ++j )
+       {
+               for( i = 0; i < width; ++i )
+               {
+                       colsums[i] += *pb;
+                       ++pb;
+               }
+       }
+  
+       /* Now fill in the row/column sum tables...
+          Note: to allow efficient construction of sum/col differences for a
+          given position row sums are held in a *column major* array
+          (changing y co-ordinate makes for small index changes)
+          the col sums are held in a *row major* array
+       */
+  
+       pb = blk;
+       pc = start_colblk;
+       for(j = 0; j <32; ++j )
+       {
+               pr = start_rowblk;
+               rowsum = rowsums[j];
+               for( i = 0; i < width-16; ++i )
+               {
+                       pc[i] = colsums[i];
+                       pr[j] = rowsum;
+                       colsums[i] = (colsums[i] + pb[down16] )-pb[0];
+                       rowsum = (rowsum + pb[16]) - pb[0];
+                       ++pb;
+                       pr += height;
+               }
+               pb += 16;   /* move pb on to next row... rememember we only did width-16! */
+               pc += width;
+       }
+#endif                 
+}
+
+
+static int dist22( uint8_t *s22blk1, uint8_t *s22blk2,int flx,int fh)
+{
+       uint8_t *p1 = s22blk1;
+       uint8_t *p2 = s22blk2;
+       int s = 0;
+       int j;
+
+       for( j = 0; j < fh; ++j )
+       {
+               register int diff;
+#define pipestep(o) diff = p1[o]-p2[o]; s += abs(diff)
+               pipestep(0); pipestep(1);
+               pipestep(2); pipestep(3);
+               pipestep(4); pipestep(5);
+               pipestep(6); pipestep(7);
+               p1 += flx;
+               p2 += flx;
+#undef pipestep
+       }
+
+       return s;
+}
+
+
+
+/*
+  Sum absolute differences for 4*4 sub-sampled data.  
+
+  TODO: currently assumes  only 16*16 or 16*8 motion compensation will be used...
+  I.e. 4*4 or 4*2 sub-sampled blocks will be compared.
+ */
+
+
+static int dist44( uint8_t *s44blk1, uint8_t *s44blk2,int qlx,int qh)
+{
+       register uint8_t *p1 = s44blk1;
+       register uint8_t *p2 = s44blk2;
+       int s = 0;
+       register int diff;
+
+       /* #define pipestep(o) diff = p1[o]-p2[o]; s += abs(diff) */
+#define pipestep(o) diff = p1[o]-p2[o]; s += diff < 0 ? -diff : diff;
+       pipestep(0); pipestep(1);        pipestep(2); pipestep(3);
+       if( qh > 1 )
+       {
+               p1 += qlx; p2 += qlx;
+               pipestep(0); pipestep(1);        pipestep(2); pipestep(3);
+               if( qh > 2 )
+               {
+                       p1 += qlx; p2 += qlx;
+                       pipestep(0); pipestep(1);        pipestep(2); pipestep(3);
+                       p1 += qlx; p2 += qlx;
+                       pipestep(0); pipestep(1);        pipestep(2); pipestep(3);
+               }
+       }
+
+
+       return s;
+}
+
+/*
+ * total squared difference between two (8*h) blocks of 2*2 sub-sampled pels
+ * blk1,blk2: addresses of top left pels of both blocks
+ * lx:        distance (in bytes) of vertically adjacent pels
+ * h:         height of block (usually 8 or 16)
+ */
+static int dist2_22(uint8_t *blk1, uint8_t *blk2, int lx, int h)
+{
+       uint8_t *p1 = blk1, *p2 = blk2;
+       int i,j,v;
+       int s = 0;
+       for (j=0; j<h; j++)
+       {
+               for (i=0; i<8; i++)
+               {
+                       v = p1[i] - p2[i];
+                       s+= v*v;
+               }
+               p1+= lx;
+               p2+= lx;
+       }
+       return s;
+}
+
+/* total squared difference between bidirection prediction of (8*h)
+ * blocks of 2*2 sub-sampled pels and reference 
+ * blk1f, blk1b,blk2: addresses of top left
+ * pels of blocks 
+ * lx: distance (in bytes) of vertically adjacent
+ * pels 
+ * h: height of block (usually 4 or 8)
+ */
+static int bdist2_22(uint8_t *blk1f, uint8_t *blk1b, uint8_t *blk2, 
+                                        int lx, int h)
+{
+       uint8_t *p1f = blk1f,*p1b = blk1b,*p2 = blk2;
+       int i,j,v;
+       int s = 0;
+       for (j=0; j<h; j++)
+       {
+               for (i=0; i<8; i++)
+               {
+                       v = ((p1f[i]+p1b[i]+1)>>1) - p2[i];
+                       s+= v*v;
+               }
+               p1f+= lx;
+               p1b+= lx;
+               p2+= lx;
+       }
+       return s;
+}
+
+/*
+ * total squared difference between two (16*h) blocks
+ * including optional half pel interpolation of blk1 (hx,hy)
+ * blk1,blk2: addresses of top left pels of both blocks
+ * lx:        distance (in bytes) of vertically adjacent pels
+ * hx,hy:     flags for horizontal and/or vertical interpolation
+ * h:         height of block (usually 8 or 16)
+ */
+
+static int dist2(blk1,blk2,lx,hx,hy,h)
+       uint8_t *blk1,*blk2;
+       int lx,hx,hy,h;
+{
+       uint8_t *p1,*p1a,*p2;
+       int i,j;
+       int s,v;
+
+       s = 0;
+       p1 = blk1;
+       p2 = blk2;
+       if (!hx && !hy)
+               for (j=0; j<h; j++)
+               {
+                       for (i=0; i<16; i++)
+                       {
+                               v = p1[i] - p2[i];
+                               s+= v*v;
+                       }
+                       p1+= lx;
+                       p2+= lx;
+               }
+       else if (hx && !hy)
+               for (j=0; j<h; j++)
+               {
+                       for (i=0; i<16; i++)
+                       {
+                               v = ((unsigned int)(p1[i]+p1[i+1]+1)>>1) - p2[i];
+                               s+= v*v;
+                       }
+                       p1+= lx;
+                       p2+= lx;
+               }
+       else if (!hx && hy)
+       {
+               p1a = p1 + lx;
+               for (j=0; j<h; j++)
+               {
+                       for (i=0; i<16; i++)
+                       {
+                               v = ((unsigned int)(p1[i]+p1a[i]+1)>>1) - p2[i];
+                               s+= v*v;
+                       }
+                       p1 = p1a;
+                       p1a+= lx;
+                       p2+= lx;
+               }
+       }
+       else /* if (hx && hy) */
+       {
+               p1a = p1 + lx;
+               for (j=0; j<h; j++)
+               {
+                       for (i=0; i<16; i++)
+                       {
+                               v = ((unsigned int)(p1[i]+p1[i+1]+p1a[i]+p1a[i+1]+2)>>2) - p2[i];
+                               s+= v*v;
+                       }
+                       p1 = p1a;
+                       p1a+= lx;
+                       p2+= lx;
+               }
+       }
+       return s;
+}
+
+
+/*
+ * absolute difference error between a (16*h) block and a bidirectional
+ * prediction
+ *
+ * p2: address of top left pel of block
+ * pf,hxf,hyf: address and half pel flags of forward ref. block
+ * pb,hxb,hyb: address and half pel flags of backward ref. block
+ * h: height of block
+ * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb
+ */
+
+static int bdist1(pf,pb,p2,lx,hxf,hyf,hxb,hyb,h)
+       uint8_t *pf,*pb,*p2;
+       int lx,hxf,hyf,hxb,hyb,h;
+{
+       uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
+       int i,j;
+       int s,v;
+
+       pfa = pf + hxf;
+       pfb = pf + lx*hyf;
+       pfc = pfb + hxf;
+
+       pba = pb + hxb;
+       pbb = pb + lx*hyb;
+       pbc = pbb + hxb;
+
+       s = 0;
+
+       for (j=0; j<h; j++)
+       {
+               for (i=0; i<16; i++)
+               {
+                       v = ((((unsigned int)(*pf++ + *pfa++ + *pfb++ + *pfc++ + 2)>>2) +
+                                 ((unsigned int)(*pb++ + *pba++ + *pbb++ + *pbc++ + 2)>>2) + 1)>>1)
+                               - *p2++;
+                       s += abs(v);
+               }
+               p2+= lx-16;
+               pf+= lx-16;
+               pfa+= lx-16;
+               pfb+= lx-16;
+               pfc+= lx-16;
+               pb+= lx-16;
+               pba+= lx-16;
+               pbb+= lx-16;
+               pbc+= lx-16;
+       }
+
+       return s;
+}
+
+/*
+ * squared error between a (16*h) block and a bidirectional
+ * prediction
+ *
+ * p2: address of top left pel of block
+ * pf,hxf,hyf: address and half pel flags of forward ref. block
+ * pb,hxb,hyb: address and half pel flags of backward ref. block
+ * h: height of block
+ * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb
+ */
+
+static int bdist2(pf,pb,p2,lx,hxf,hyf,hxb,hyb,h)
+       uint8_t *pf,*pb,*p2;
+       int lx,hxf,hyf,hxb,hyb,h;
+{
+       uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
+       int i,j;
+       int s,v;
+
+       pfa = pf + hxf;
+       pfb = pf + lx*hyf;
+       pfc = pfb + hxf;
+
+       pba = pb + hxb;
+       pbb = pb + lx*hyb;
+       pbc = pbb + hxb;
+
+       s = 0;
+
+       for (j=0; j<h; j++)
+       {
+               for (i=0; i<16; i++)
+               {
+                       v = ((((unsigned int)(*pf++ + *pfa++ + *pfb++ + *pfc++ + 2)>>2) +
+                                 ((unsigned int)(*pb++ + *pba++ + *pbb++ + *pbc++ + 2)>>2) + 1)>>1)
+                               - *p2++;
+                       s+=v*v;
+               }
+               p2+= lx-16;
+               pf+= lx-16;
+               pfa+= lx-16;
+               pfb+= lx-16;
+               pfc+= lx-16;
+               pb+= lx-16;
+               pba+= lx-16;
+               pbb+= lx-16;
+               pbc+= lx-16;
+       }
+
+       return s;
+}
+
+
+/*
+ * variance of a (size*size) block, multiplied by 256
+ * p:  address of top left pel of block
+ * lx: distance (in bytes) of vertically adjacent pels
+ */
+static int variance(uint8_t *p, int size,      int lx)
+{
+       int i,j;
+       unsigned int v,s,s2;
+
+       s = s2 = 0;
+
+       for (j=0; j<size; j++)
+       {
+               for (i=0; i<size; i++)
+               {
+                       v = *p++;
+                       s+= v;
+                       s2+= v*v;
+               }
+               p+= lx-size;
+       }
+       return s2 - (s*s)/(size*size);
+}
+
+/*
+  Compute the variance of the residual of uni-directionally motion
+  compensated block.
+ */
+
+static int unidir_pred_var( const mb_motion_s *motion,
+                                                       uint8_t *mb,  
+                                                       int lx, 
+                                                       int h)
+{
+       return (*pdist2)(motion->blk, mb, lx, motion->hx, motion->hy, h);
+}
+
+
+/*
+  Compute the variance of the residual of bi-directionally motion
+  compensated block.
+ */
+
+static int bidir_pred_var( const mb_motion_s *motion_f, 
+                                                  const mb_motion_s *motion_b,
+                                                  uint8_t *mb,  
+                                                  int lx, int h)
+{
+       return (*pbdist2)( motion_f->blk, motion_b->blk,
+                                          mb, lx, 
+                                          motion_f->hx, motion_f->hy,
+                                          motion_b->hx, motion_b->hy,
+                                          h);
+}
+
+/*
+  Compute SAD for bi-directionally motion compensated blocks...
+ */
+
+static int bidir_pred_sad( const mb_motion_s *motion_f, 
+                                                  const mb_motion_s *motion_b,
+                                                  uint8_t *mb,  
+                                                  int lx, int h)
+{
+       return (*pbdist1)(motion_f->blk, motion_b->blk, 
+                                        mb, lx, 
+                                        motion_f->hx, motion_f->hy,
+                                        motion_b->hx, motion_b->hy,
+                                        h);
+}
+
+static void frame_ME(motion_engine_t *engine,
+                                       pict_data_s *picture,
+                                        motion_comp_s *mc,
+                                        int mb_row_start,
+                                        int i, int j, 
+                                        mbinfo_s *mbi)
+{
+       mb_motion_s framef_mc;
+       mb_motion_s frameb_mc;
+       mb_motion_s dualpf_mc;
+       mb_motion_s topfldf_mc;
+       mb_motion_s botfldf_mc;
+       mb_motion_s topfldb_mc;
+       mb_motion_s botfldb_mc;
+
+       int var,v0;
+       int vmc,vmcf,vmcr,vmci;
+       int vmcfieldf,vmcfieldr,vmcfieldi;
+       subsampled_mb_s ssmb;
+       int imins[2][2],jmins[2][2];
+       int imindmv,jmindmv,vmc_dp;
+//printf("frame_ME 1\n");
+
+
+       /* A.Stevens fast motion estimation data is appended to actual
+          luminance information 
+       */
+       ssmb.mb = mc->cur[0] + mb_row_start + i;
+       ssmb.umb = (uint8_t*)(mc->cur[1] + (i>>1) + (mb_row_start>>2));
+       ssmb.vmb = (uint8_t*)(mc->cur[2] + (i>>1) + (mb_row_start>>2));
+       ssmb.fmb = (uint8_t*)(mc->cur[0] + fsubsample_offset + 
+                                                 ((i>>1) + (mb_row_start>>2)));
+       ssmb.qmb = (uint8_t*)(mc->cur[0] + qsubsample_offset + 
+                                                 (i>>2) + (mb_row_start>>4));
+
+       /* Compute variance MB as a measure of Intra-coding complexity
+          We include chrominance information here, scaled to compensate
+          for sub-sampling.  Silly MPEG forcing chrom/lum to have same
+          quantisations...
+        */
+       var = variance(ssmb.mb,16,width);
+
+//printf("motion %d\n", picture->pict_type);
+       if (picture->pict_type==I_TYPE)
+       {
+               mbi->mb_type = MB_INTRA;
+       }
+       else if (picture->pict_type==P_TYPE)
+       {
+               /* For P pictures we take into account chrominance. This
+                  provides much better performance at scene changes */
+               var += chrom_var_sum(&ssmb,16,width);
+
+               if (picture->frame_pred_dct)
+               {
+                       fullsearch(engine, mc->oldorg[0],mc->oldref[0],&ssmb,
+                                          width,i,j,mc->sxf,mc->syf,16,width,height,
+                                           &framef_mc);
+                       framef_mc.fieldoff = 0;
+                       vmc = framef_mc.var +
+                               unidir_chrom_var_sum( &framef_mc, mc->oldref, &ssmb, width, 16 );
+                       mbi->motion_type = MC_FRAME;
+               }
+               else
+               {
+//printf("frame_ME 2 %p %p\n", mc->oldorg[0], mc->oldref[0]);
+                       frame_estimate(engine, mc->oldorg[0],
+                               mc->oldref[0],
+                               &ssmb,
+                               i,
+                               j,
+                               mc->sxf,
+                               mc->syf,
+                               &framef_mc,
+                               &topfldf_mc,
+                               &botfldf_mc,
+                               imins,
+                               jmins);
+//printf("frame_ME 3\n");
+                       vmcf = framef_mc.var + 
+                               unidir_chrom_var_sum( &framef_mc, mc->oldref, &ssmb, width, 16 );
+                       vmcfieldf = 
+                               topfldf_mc.var + 
+                               unidir_chrom_var_sum( &topfldf_mc, mc->oldref, &ssmb, (width<<1), 8 ) +
+                               botfldf_mc.var + 
+                               unidir_chrom_var_sum( &botfldf_mc, mc->oldref, &ssmb, (width<<1), 8 );
+                       /* DEBUG DP currently disabled... */
+//                     if ( M==1)
+//                     {
+//                             dpframe_estimate(engine, picture,mc->oldref[0],&ssmb,
+//                                                              i,j>>1,imins,jmins,
+//                                                              &dualpf_mc,
+//                                                              &imindmv,&jmindmv, &vmc_dp);
+//                     }
+
+                       /* NOTE: Typically M =3 so DP actually disabled... */
+                       /* select between dual prime, frame and field prediction */
+//                     if ( M==1 && vmc_dp<vmcf && vmc_dp<vmcfieldf)
+//                     {
+//                             mbi->motion_type = MC_DMV;
+//                             /* No chrominance squared difference  measure yet.
+//                                Assume identical to luminance */
+//                             vmc = vmc_dp + vmc_dp;
+//                     }
+//                     else 
+                       if ( vmcf < vmcfieldf)
+                       {
+                               mbi->motion_type = MC_FRAME;
+                               vmc = vmcf;
+                                       
+                       }
+                       else
+                       {
+                               mbi->motion_type = MC_FIELD;
+                               vmc = vmcfieldf;
+                       }
+               }
+
+
+               /* select between intra or non-intra coding:
+                *
+                * selection is based on intra block variance (var) vs.
+                * prediction error variance (vmc)
+                *
+                * Used to be: blocks with small prediction error are always 
+                * coded non-intra even if variance is smaller (is this reasonable?
+                *
+                * TODO: A.Stevens Jul 2000
+                * The bbmpeg guys have found this to be *unreasonable*.
+                * I'm not sure I buy their solution using vmc*2.  It is probabably
+                * the vmc>= 9*256 test that is suspect.
+                * 
+                */
+
+
+               if (vmc>var /*&& vmc>=(3*3)*16*16*2*/ )
+               {
+                       mbi->mb_type = MB_INTRA;
+                       mbi->var = var;
+               }
+               
+               else
+               {
+                       /* select between MC / No-MC
+                        *
+                        * use No-MC if var(No-MC) <= 1.25*var(MC)
+                        * (i.e slightly biased towards No-MC)
+                        *
+                        * blocks with small prediction error are always coded as No-MC
+                        * (requires no motion vectors, allows skipping)
+                        */
+                       v0 = (*pdist2)(mc->oldref[0]+i+width*j,ssmb.mb,width,0,0,16);
+
+                       if (4*v0>5*vmc )
+                       {
+                               /* use MC */
+                               var = vmc;
+                               mbi->mb_type = MB_FORWARD;
+                               if (mbi->motion_type==MC_FRAME)
+                               {
+                                       mbi->MV[0][0][0] = framef_mc.pos.x - (i<<1);
+                                       mbi->MV[0][0][1] = framef_mc.pos.y - (j<<1);
+                               }
+                               else if (mbi->motion_type==MC_DMV)
+                               {
+                                       /* these are FRAME vectors */
+                                       /* same parity vector */
+                                       mbi->MV[0][0][0] = dualpf_mc.pos.x - (i<<1);
+                                       mbi->MV[0][0][1] = (dualpf_mc.pos.y<<1) - (j<<1);
+
+                                       /* opposite parity vector */
+                                       mbi->dmvector[0] = imindmv;
+                                       mbi->dmvector[1] = jmindmv;
+                               }
+                               else
+                               {
+                                       /* these are FRAME vectors */
+                                       mbi->MV[0][0][0] = topfldf_mc.pos.x - (i<<1);
+                                       mbi->MV[0][0][1] = (topfldf_mc.pos.y<<1) - (j<<1);
+                                       mbi->MV[1][0][0] = botfldf_mc.pos.x - (i<<1);
+                                       mbi->MV[1][0][1] = (botfldf_mc.pos.y<<1) - (j<<1);
+                                       mbi->mv_field_sel[0][0] = topfldf_mc.fieldsel;
+                                       mbi->mv_field_sel[1][0] = botfldf_mc.fieldsel;
+                               }
+                       }
+                       else
+                       {
+
+                               /* No-MC */
+                               var = v0;
+                               mbi->mb_type = 0;
+                               mbi->motion_type = MC_FRAME;
+                               mbi->MV[0][0][0] = 0;
+                               mbi->MV[0][0][1] = 0;
+                       }
+               }
+       }
+       else /* if (pict_type==B_TYPE) */
+       {
+               if (picture->frame_pred_dct)
+               {
+                       var = variance(ssmb.mb,16,width);
+                       /* forward */
+                       fullsearch(engine, mc->oldorg[0],mc->oldref[0],&ssmb,
+                                          width,i,j,mc->sxf,mc->syf,
+                                          16,width,height,
+                                          &framef_mc
+                                          );
+                       framef_mc.fieldoff = 0;
+                       vmcf = framef_mc.var;
+
+                       /* backward */
+                       fullsearch(engine, mc->neworg[0],mc->newref[0],&ssmb,
+                                          width,i,j,mc->sxb,mc->syb,
+                                          16,width,height,
+                                          &frameb_mc);
+                       frameb_mc.fieldoff = 0;
+                       vmcr = frameb_mc.var;
+
+                       /* interpolated (bidirectional) */
+
+                       vmci = bidir_pred_var( &framef_mc, &frameb_mc, ssmb.mb, width, 16 );
+
+                       /* decisions */
+
+                       /* select between forward/backward/interpolated prediction:
+                        * use the one with smallest mean sqaured prediction error
+                        */
+                       if (vmcf<=vmcr && vmcf<=vmci)
+                       {
+                               vmc = vmcf;
+                               mbi->mb_type = MB_FORWARD;
+                       }
+                       else if (vmcr<=vmci)
+                       {
+                               vmc = vmcr;
+                               mbi->mb_type = MB_BACKWARD;
+                       }
+                       else
+                       {
+                               vmc = vmci;
+                               mbi->mb_type = MB_FORWARD|MB_BACKWARD;
+                       }
+
+                       mbi->motion_type = MC_FRAME;
+               }
+               else
+               {
+                       /* forward prediction */
+                       frame_estimate(engine, mc->oldorg[0],mc->oldref[0],&ssmb,
+                                                  i,j,mc->sxf,mc->syf,
+                                                  &framef_mc,
+                                                  &topfldf_mc,
+                                                  &botfldf_mc,
+                                                  imins,jmins);
+
+
+                       /* backward prediction */
+                       frame_estimate(engine, mc->neworg[0],mc->newref[0],&ssmb,
+                                                  i,j,mc->sxb,mc->syb,
+                                                  &frameb_mc,
+                                                  &topfldb_mc,
+                                                  &botfldb_mc,
+                                                  imins,jmins);
+
+                       vmcf = framef_mc.var;
+                       vmcr = frameb_mc.var;
+                       vmci = bidir_pred_var( &framef_mc, &frameb_mc, ssmb.mb, width, 16 );
+
+                       vmcfieldf = topfldf_mc.var + botfldf_mc.var;
+                       vmcfieldr = topfldb_mc.var + botfldb_mc.var;
+                       vmcfieldi = bidir_pred_var( &topfldf_mc, &topfldb_mc, ssmb.mb, 
+                                                                               width<<1, 8) +
+                                       bidir_pred_var( &botfldf_mc, &botfldb_mc, ssmb.mb, 
+                                                                               width<<1, 8);
+
+                       /* select prediction type of minimum distance from the
+                        * six candidates (field/frame * forward/backward/interpolated)
+                        */
+                       if (vmci<vmcfieldi && vmci<vmcf && vmci<vmcfieldf
+                                 && vmci<vmcr && vmci<vmcfieldr)
+                       {
+                               /* frame, interpolated */
+                               mbi->mb_type = MB_FORWARD|MB_BACKWARD;
+                               mbi->motion_type = MC_FRAME;
+                               vmc = vmci;
+                       }
+                       else if ( vmcfieldi<vmcf && vmcfieldi<vmcfieldf
+                                          && vmcfieldi<vmcr && vmcfieldi<vmcfieldr)
+                       {
+                               /* field, interpolated */
+                               mbi->mb_type = MB_FORWARD|MB_BACKWARD;
+                               mbi->motion_type = MC_FIELD;
+                               vmc = vmcfieldi;
+                       }
+                       else if (vmcf<vmcfieldf && vmcf<vmcr && vmcf<vmcfieldr)
+                       {
+                               /* frame, forward */
+                               mbi->mb_type = MB_FORWARD;
+                               mbi->motion_type = MC_FRAME;
+                               vmc = vmcf;
+
+                       }
+                       else if ( vmcfieldf<vmcr && vmcfieldf<vmcfieldr)
+                       {
+                               /* field, forward */
+                               mbi->mb_type = MB_FORWARD;
+                               mbi->motion_type = MC_FIELD;
+                               vmc = vmcfieldf;
+                       }
+                       else if (vmcr<vmcfieldr)
+                       {
+                               /* frame, backward */
+                               mbi->mb_type = MB_BACKWARD;
+                               mbi->motion_type = MC_FRAME;
+                               vmc = vmcr;
+                       }
+                       else
+                       {
+                               /* field, backward */
+                               mbi->mb_type = MB_BACKWARD;
+                               mbi->motion_type = MC_FIELD;
+                               vmc = vmcfieldr;
+
+                       }
+               }
+
+               /* select between intra or non-intra coding:
+                *
+                * selection is based on intra block variance (var) vs.
+                * prediction error variance (vmc)
+                *
+                * Used to be: blocks with small prediction error are always 
+                * coded non-intra even if variance is smaller (is this reasonable?
+                *
+                * TODO: A.Stevens Jul 2000
+                * The bbmpeg guys have found this to be *unreasonable*.
+                * I'm not sure I buy their solution using vmc*2 in the first comparison.
+                * It is probabably the vmc>= 9*256 test that is suspect.
+                *
+                */
+               if (vmc>var && vmc>=9*256)
+                       mbi->mb_type = MB_INTRA;
+               else
+               {
+                       var = vmc;
+                       if (mbi->motion_type==MC_FRAME)
+                       {
+                               /* forward */
+                               mbi->MV[0][0][0] = framef_mc.pos.x - (i<<1);
+                               mbi->MV[0][0][1] = framef_mc.pos.y - (j<<1);
+                               /* backward */
+                               mbi->MV[0][1][0] = frameb_mc.pos.x - (i<<1);
+                               mbi->MV[0][1][1] = frameb_mc.pos.y - (j<<1);
+                       }
+                       else
+                       {
+                               /* these are FRAME vectors */
+                               /* forward */
+                               mbi->MV[0][0][0] = topfldf_mc.pos.x - (i<<1);
+                               mbi->MV[0][0][1] = (topfldf_mc.pos.y<<1) - (j<<1);
+                               mbi->MV[1][0][0] = botfldf_mc.pos.x - (i<<1);
+                               mbi->MV[1][0][1] = (botfldf_mc.pos.y<<1) - (j<<1);
+                               mbi->mv_field_sel[0][0] = topfldf_mc.fieldsel;
+                               mbi->mv_field_sel[1][0] = botfldf_mc.fieldsel;
+                               /* backward */
+                               mbi->MV[0][1][0] = topfldb_mc.pos.x - (i<<1);
+                               mbi->MV[0][1][1] = (topfldb_mc.pos.y<<1) - (j<<1);
+                               mbi->MV[1][1][0] = botfldb_mc.pos.x - (i<<1);
+                               mbi->MV[1][1][1] = (botfldb_mc.pos.y<<1) - (j<<1);
+                               mbi->mv_field_sel[0][1] = topfldb_mc.fieldsel;
+                               mbi->mv_field_sel[1][1] = botfldb_mc.fieldsel;
+                       }
+               }
+       }
+}
+
+
+
+/*
+ * motion estimation for field pictures
+ *
+ * mbi:    pointer to macroblock info structure
+ * secondfield: indicates second field of a frame (in P fields this means
+ *              that reference field of opposite parity is in curref instead
+ *              of oldref)
+ * ipflag: indicates a P type field which is the second field of a frame
+ *         in which the first field is I type (this restricts predictions
+ *         to be based only on the opposite parity (=I) field)
+ *
+ * results:
+ * mbi->
+ *  mb_type: 0, MB_INTRA, MB_FORWARD, MB_BACKWARD, MB_FORWARD|MB_BACKWARD
+ *  MV[][][]: motion vectors (field format)
+ *  mv_field_sel: top/bottom field
+ *  motion_type: MC_FIELD, MC_16X8
+ *
+ * uses global vars: pict_type, pict_struct
+ */
+static void field_ME(motion_engine_t *engine,
+       pict_data_s *picture,
+       motion_comp_s *mc,
+       int mb_row_start,
+       int i, int j, 
+       mbinfo_s *mbi, 
+       int secondfield, int ipflag)
+{
+       int w2;
+       uint8_t *toporg, *topref, *botorg, *botref;
+       subsampled_mb_s ssmb;
+       mb_motion_s fields_mc, dualp_mc;
+       mb_motion_s fieldf_mc, fieldb_mc;
+       mb_motion_s field8uf_mc, field8lf_mc;
+       mb_motion_s field8ub_mc, field8lb_mc;
+       int var,vmc,v0,dmc,dmcfieldi,dmcfield,dmcfieldf,dmcfieldr,dmc8i;
+       int imins,jmins;
+       int dmc8f,dmc8r;
+       int vmc_dp,dmc_dp;
+
+       w2 = width<<1;
+
+       /* Fast motion data sits at the end of the luminance buffer */
+       ssmb.mb = mc->cur[0] + i + w2*j;
+       ssmb.umb = mc->cur[1] + ((i>>1)+(w2>>1)*(j>>1));
+       ssmb.vmb = mc->cur[2] + ((i>>1)+(w2>>1)*(j>>1));
+       ssmb.fmb = mc->cur[0] + fsubsample_offset+((i>>1)+(w2>>1)*(j>>1));
+       ssmb.qmb = mc->cur[0] + qsubsample_offset+ (i>>2)+(w2>>2)*(j>>2);
+
+       if (picture->pict_struct==BOTTOM_FIELD)
+       {
+               ssmb.mb += width;
+               ssmb.umb += (width >> 1);
+               ssmb.vmb += (width >> 1);
+               ssmb.fmb += (width >> 1);
+               ssmb.qmb += (width >> 2);
+       }
+
+       var = variance(ssmb.mb,16,w2) + 
+               ( variance(ssmb.umb,8,(width>>1)) + variance(ssmb.vmb,8,(width>>1)))*2;
+
+       if (picture->pict_type==I_TYPE)
+               mbi->mb_type = MB_INTRA;
+       else if (picture->pict_type==P_TYPE)
+       {
+               toporg = mc->oldorg[0];
+               topref = mc->oldref[0];
+               botorg = mc->oldorg[0] + width;
+               botref = mc->oldref[0] + width;
+
+               if (secondfield)
+               {
+                       /* opposite parity field is in same frame */
+                       if (picture->pict_struct==TOP_FIELD)
+                       {
+                               /* current is top field */
+                               botorg = mc->cur[0] + width;
+                               botref = mc->curref[0] + width;
+                       }
+                       else
+                       {
+                               /* current is bottom field */
+                               toporg = mc->cur[0];
+                               topref = mc->curref[0];
+                       }
+               }
+
+               field_estimate(engine,
+                                               picture,
+                                          toporg,topref,botorg,botref,&ssmb,
+                                          i,j,mc->sxf,mc->syf,ipflag,
+                                          &fieldf_mc,
+                                          &field8uf_mc,
+                                          &field8lf_mc,
+                                          &fields_mc);
+               dmcfield = fieldf_mc.sad;
+               dmc8f = field8uf_mc.sad + field8lf_mc.sad;
+
+//             if (M==1 && !ipflag)  /* generic condition which permits Dual Prime */
+//             {
+//                     dpfield_estimate(engine,
+//                                                     picture,
+//                                                      topref,botref,ssmb.mb,i,j,imins,jmins,
+//                                                      &dualp_mc,
+//                                                      &vmc_dp);
+//                     dmc_dp = dualp_mc.sad;
+//             }
+               
+//             /* select between dual prime, field and 16x8 prediction */
+//             if (M==1 && !ipflag && dmc_dp<dmc8f && dmc_dp<dmcfield)
+//             {
+//                     /* Dual Prime prediction */
+//                     mbi->motion_type = MC_DMV;
+//                     dmc = dualp_mc.sad;
+//                     vmc = vmc_dp;
+//
+//             }
+//             else 
+               if (dmc8f<dmcfield)
+               {
+                       /* 16x8 prediction */
+                       mbi->motion_type = MC_16X8;
+                       /* upper and lower half blocks */
+                       vmc =  unidir_pred_var( &field8uf_mc, ssmb.mb, w2, 8);
+                       vmc += unidir_pred_var( &field8lf_mc, ssmb.mb, w2, 8);
+               }
+               else
+               {
+                       /* field prediction */
+                       mbi->motion_type = MC_FIELD;
+                       vmc = unidir_pred_var( &fieldf_mc, ssmb.mb, w2, 16 );
+               }
+
+               /* select between intra and non-intra coding */
+
+               if ( vmc>var && vmc>=9*256)
+                       mbi->mb_type = MB_INTRA;
+               else
+               {
+                       /* zero MV field prediction from same parity ref. field
+                        * (not allowed if ipflag is set)
+                        */
+                       if (!ipflag)
+                               v0 = (*pdist2)(((picture->pict_struct==BOTTOM_FIELD)?botref:topref) + i + w2*j,
+                                                          ssmb.mb,w2,0,0,16);
+                       if (ipflag  || (4*v0>5*vmc && v0>=9*256))
+                       {
+                               var = vmc;
+                               mbi->mb_type = MB_FORWARD;
+                               if (mbi->motion_type==MC_FIELD)
+                               {
+                                       mbi->MV[0][0][0] = fieldf_mc.pos.x - (i<<1);
+                                       mbi->MV[0][0][1] = fieldf_mc.pos.y - (j<<1);
+                                       mbi->mv_field_sel[0][0] = fieldf_mc.fieldsel;
+                               }
+                               else if (mbi->motion_type==MC_DMV)
+                               {
+                                       /* same parity vector */
+                                       mbi->MV[0][0][0] = imins - (i<<1);
+                                       mbi->MV[0][0][1] = jmins - (j<<1);
+
+                                       /* opposite parity vector */
+                                       mbi->dmvector[0] = dualp_mc.pos.x;
+                                       mbi->dmvector[1] = dualp_mc.pos.y;
+                               }
+                               else
+                               {
+                                       mbi->MV[0][0][0] = field8uf_mc.pos.x - (i<<1);
+                                       mbi->MV[0][0][1] = field8uf_mc.pos.y - (j<<1);
+                                       mbi->MV[1][0][0] = field8lf_mc.pos.x - (i<<1);
+                                       mbi->MV[1][0][1] = field8lf_mc.pos.y - ((j+8)<<1);
+                                       mbi->mv_field_sel[0][0] = field8uf_mc.fieldsel;
+                                       mbi->mv_field_sel[1][0] = field8lf_mc.fieldsel;
+                               }
+                       }
+                       else
+                       {
+                               /* No MC */
+                               var = v0;
+                               mbi->mb_type = 0;
+                               mbi->motion_type = MC_FIELD;
+                               mbi->MV[0][0][0] = 0;
+                               mbi->MV[0][0][1] = 0;
+                               mbi->mv_field_sel[0][0] = (picture->pict_struct==BOTTOM_FIELD);
+                       }
+               }
+       }
+       else /* if (pict_type==B_TYPE) */
+       {
+               /* forward prediction */
+               field_estimate(engine, 
+                                               picture,
+                                          mc->oldorg[0],mc->oldref[0],
+                                          mc->oldorg[0]+width,mc->oldref[0]+width,&ssmb,
+                                          i,j,mc->sxf,mc->syf,0,
+                                          &fieldf_mc,
+                                          &field8uf_mc,
+                                          &field8lf_mc,
+                                          &fields_mc);
+               dmcfieldf = fieldf_mc.sad;
+               dmc8f = field8uf_mc.sad + field8lf_mc.sad;
+
+               /* backward prediction */
+               field_estimate(engine,
+                                               picture,
+                                          mc->neworg[0],mc->newref[0],mc->neworg[0]+width,mc->newref[0]+width,
+                                          &ssmb,
+                                          i,j,mc->sxb,mc->syb,0,
+                                          &fieldb_mc,
+                                          &field8ub_mc,
+                                          &field8lb_mc,
+                                          &fields_mc);
+               dmcfieldr = fieldb_mc.sad;
+               dmc8r = field8ub_mc.sad + field8lb_mc.sad;
+
+               /* calculate distances for bidirectional prediction */
+               /* field */
+               dmcfieldi = bidir_pred_sad( &fieldf_mc, &fieldb_mc, ssmb.mb, w2, 16);
+
+               /* 16x8 upper and lower half blocks */
+               dmc8i =  bidir_pred_sad( &field8uf_mc, &field8ub_mc, ssmb.mb, w2, 16 );
+               dmc8i += bidir_pred_sad( &field8lf_mc, &field8lb_mc, ssmb.mb, w2, 16 );
+
+               /* select prediction type of minimum distance */
+               if (dmcfieldi<dmc8i && dmcfieldi<dmcfieldf && dmcfieldi<dmc8f
+                       && dmcfieldi<dmcfieldr && dmcfieldi<dmc8r)
+               {
+                       /* field, interpolated */
+                       mbi->mb_type = MB_FORWARD|MB_BACKWARD;
+                       mbi->motion_type = MC_FIELD;
+                       vmc = bidir_pred_var( &fieldf_mc, &fieldb_mc, ssmb.mb, w2, 16);
+               }
+               else if (dmc8i<dmcfieldf && dmc8i<dmc8f
+                                && dmc8i<dmcfieldr && dmc8i<dmc8r)
+               {
+                       /* 16x8, interpolated */
+                       mbi->mb_type = MB_FORWARD|MB_BACKWARD;
+                       mbi->motion_type = MC_16X8;
+
+                       /* upper and lower half blocks */
+                       vmc =  bidir_pred_var( &field8uf_mc, &field8ub_mc, ssmb.mb, w2, 8);
+                       vmc += bidir_pred_var( &field8lf_mc, &field8lb_mc, ssmb.mb, w2, 8);
+               }
+               else if (dmcfieldf<dmc8f && dmcfieldf<dmcfieldr && dmcfieldf<dmc8r)
+               {
+                       /* field, forward */
+                       mbi->mb_type = MB_FORWARD;
+                       mbi->motion_type = MC_FIELD;
+                       vmc = unidir_pred_var( &fieldf_mc, ssmb.mb, w2, 16);
+               }
+               else if (dmc8f<dmcfieldr && dmc8f<dmc8r)
+               {
+                       /* 16x8, forward */
+                       mbi->mb_type = MB_FORWARD;
+                       mbi->motion_type = MC_16X8;
+
+                       /* upper and lower half blocks */
+                       vmc =  unidir_pred_var( &field8uf_mc, ssmb.mb, w2, 8);
+                       vmc += unidir_pred_var( &field8lf_mc, ssmb.mb, w2, 8);
+               }
+               else if (dmcfieldr<dmc8r)
+               {
+                       /* field, backward */
+                       mbi->mb_type = MB_BACKWARD;
+                       mbi->motion_type = MC_FIELD;
+                       vmc = unidir_pred_var( &fieldb_mc, ssmb.mb, w2, 16 );
+               }
+               else
+               {
+                       /* 16x8, backward */
+                       mbi->mb_type = MB_BACKWARD;
+                       mbi->motion_type = MC_16X8;
+
+                       /* upper and lower half blocks */
+                       vmc =  unidir_pred_var( &field8ub_mc, ssmb.mb, w2, 8);
+                       vmc += unidir_pred_var( &field8lb_mc, ssmb.mb, w2, 8);
+
+               }
+
+               /* select between intra and non-intra coding */
+               if (vmc>var && vmc>=9*256)
+                       mbi->mb_type = MB_INTRA;
+               else
+               {
+                       var = vmc;
+                       if (mbi->motion_type==MC_FIELD)
+                       {
+                               /* forward */
+                               mbi->MV[0][0][0] = fieldf_mc.pos.x - (i<<1);
+                               mbi->MV[0][0][1] = fieldf_mc.pos.y - (j<<1);
+                               mbi->mv_field_sel[0][0] = fieldf_mc.fieldsel;
+                               /* backward */
+                               mbi->MV[0][1][0] = fieldb_mc.pos.x - (i<<1);
+                               mbi->MV[0][1][1] = fieldb_mc.pos.y - (j<<1);
+                               mbi->mv_field_sel[0][1] = fieldb_mc.fieldsel;
+                       }
+                       else /* MC_16X8 */
+                       {
+                               /* forward */
+                               mbi->MV[0][0][0] = field8uf_mc.pos.x - (i<<1);
+                               mbi->MV[0][0][1] = field8uf_mc.pos.y - (j<<1);
+                               mbi->mv_field_sel[0][0] = field8uf_mc.fieldsel;
+                               mbi->MV[1][0][0] = field8lf_mc.pos.x - (i<<1);
+                               mbi->MV[1][0][1] = field8lf_mc.pos.y - ((j+8)<<1);
+                               mbi->mv_field_sel[1][0] = field8lf_mc.fieldsel;
+                               /* backward */
+                               mbi->MV[0][1][0] = field8ub_mc.pos.x - (i<<1);
+                               mbi->MV[0][1][1] = field8ub_mc.pos.y - (j<<1);
+                               mbi->mv_field_sel[0][1] = field8ub_mc.fieldsel;
+                               mbi->MV[1][1][0] = field8lb_mc.pos.x - (i<<1);
+                               mbi->MV[1][1][1] = field8lb_mc.pos.y - ((j+8)<<1);
+                               mbi->mv_field_sel[1][1] = field8lb_mc.fieldsel;
+                       }
+               }
+       }
+
+}
+
+
+
+/*
+  Initialise motion compensation - currently purely selection of which
+  versions of the various low level computation routines to use
+  
+  */
+
+void init_motion()
+{
+       int cpucap = cpu_accel();
+
+       if( cpucap  == 0 )      /* No MMX/SSE etc support available */
+       {
+               pdist22 = dist22;
+               pdist44 = dist44;
+               pdist1_00 = dist1_00;
+               pdist1_01 = dist1_01;
+               pdist1_10 = dist1_10;
+               pdist1_11 = dist1_11;
+               pbdist1 = bdist1;
+               pdist2 = dist2;
+               pbdist2 = bdist2;
+               pdist2_22 = dist2_22;
+               pbdist2_22 = bdist2_22;
+               pfind_best_one_pel = find_best_one_pel;
+               pbuild_sub22_mcomps     = build_sub22_mcomps;
+        }
+#ifdef X86_CPU
+       else if(cpucap & ACCEL_X86_MMXEXT ) /* AMD MMX or SSE... */
+       {
+               if(verbose) fprintf( stderr, "SETTING EXTENDED MMX for MOTION!\n");
+               pdist22 = dist22_mmxe;
+               pdist44 = dist44_mmxe;
+               pdist1_00 = dist1_00_mmxe;
+               pdist1_01 = dist1_01_mmxe;
+               pdist1_10 = dist1_10_mmxe;
+               pdist1_11 = dist1_11_mmxe;
+               pbdist1 = bdist1_mmx;
+               pdist2 = dist2_mmx;
+               pbdist2 = bdist2_mmx;
+               pdist2_22 = dist2_22_mmx;
+               pbdist2_22 = bdist2_22_mmx;
+               pfind_best_one_pel = find_best_one_pel_mmxe;
+               pbuild_sub22_mcomps     = build_sub22_mcomps_mmxe;
+               pmblock_sub44_dists = mblock_sub44_dists_mmxe;
+
+       }
+       else if(cpucap & ACCEL_X86_MMX) /* Ordinary MMX CPU */
+       {
+               if(verbose) fprintf( stderr, "SETTING MMX for MOTION!\n");
+               pdist22 = dist22_mmx;
+               pdist44 = dist44_mmx;
+               pdist1_00 = dist1_00_mmx;
+               pdist1_01 = dist1_01_mmx;
+               pdist1_10 = dist1_10_mmx;
+               pdist1_11 = dist1_11_mmx;
+               pbdist1 = bdist1_mmx;
+               pdist2 = dist2_mmx;
+               pbdist2 = bdist2_mmx;
+               pdist2_22 = dist2_22_mmx;
+               pbdist2_22 = bdist2_22_mmx;
+               pfind_best_one_pel = find_best_one_pel;
+               pbuild_sub22_mcomps     = build_sub22_mcomps;
+               pmblock_sub44_dists = mblock_sub44_dists_mmx;
+       }
+#endif
+}
+
+
+void motion_engine_loop(motion_engine_t *engine)
+{
+       while(!engine->done)
+       {
+               pthread_mutex_lock(&(engine->input_lock));
+
+               if(!engine->done)
+               {
+                       pict_data_s *picture = engine->pict_data;
+                       motion_comp_s *mc_data = engine->motion_comp;
+                       int secondfield = engine->secondfield;
+                       int ipflag = engine->ipflag;
+                       mbinfo_s *mbi = picture->mbinfo + (engine->start_row / 16) * (width / 16);
+                       int i, j;
+                       int mb_row_incr;                        /* Offset increment to go down 1 row of mb's */
+                       int mb_row_start;
+
+                       if (picture->pict_struct == FRAME_PICTURE)
+                       {                       
+                               mb_row_incr = 16*width;
+                               mb_row_start = engine->start_row / 16 * mb_row_incr;
+/* loop through all macroblocks of a frame picture */
+                               for (j=engine->start_row; j < engine->end_row; j+=16)
+                               {
+                                       for (i=0; i<width; i+=16)
+                                       {
+                                               frame_ME(engine, picture, mc_data, mb_row_start, i,j, mbi);
+                                               mbi++;
+                                       }
+                                       mb_row_start += mb_row_incr;
+                               }
+                       }
+                       else
+                       {               
+                               mb_row_incr = (16 * 2) * width;
+                               mb_row_start = engine->start_row / 16 * mb_row_incr;
+/* loop through all macroblocks of a field picture */
+                               for (j=engine->start_row; j < engine->end_row; j+=16)
+                               {
+                                       for (i=0; i<width; i+=16)
+                                       {
+                                               field_ME(engine, picture, mc_data, mb_row_start, i,j,
+                                                                mbi,secondfield,ipflag);
+                                               mbi++;
+                                       }
+                                       mb_row_start += mb_row_incr;
+                               }
+                       }
+               }
+
+               pthread_mutex_unlock(&(engine->output_lock));
+       }
+}
+
+
+void motion_estimation(pict_data_s *picture,
+       motion_comp_s *mc_data,
+       int secondfield, int ipflag)
+{
+       int i;
+/* Start loop */
+       for(i = 0; i < processors; i++)
+       {
+               motion_engines[i].motion_comp = mc_data;
+
+               motion_engines[i].pict_data = picture;
+
+               motion_engines[i].secondfield = secondfield;
+               motion_engines[i].ipflag = ipflag;
+               pthread_mutex_unlock(&(motion_engines[i].input_lock));
+       }
+
+/* Wait for completion */
+       for(i = 0; i < processors; i++)
+       {
+               pthread_mutex_lock(&(motion_engines[i].output_lock));
+       }
+}
+
+void start_motion_engines()
+{
+       int i;
+       int rows_per_processor = (int)((float)height2 / 16 / processors + 0.5);
+       int current_row = 0;
+       pthread_attr_t  attr;
+       pthread_mutexattr_t mutex_attr;
+
+       pthread_mutexattr_init(&mutex_attr);
+       pthread_attr_init(&attr);
+       motion_engines = calloc(1, sizeof(motion_engine_t) * processors);
+       for(i = 0; i < processors; i++)
+       {
+               motion_engines[i].start_row = current_row * i * 16;
+               current_row += rows_per_processor;
+               if(current_row > height2 / 16) current_row = height2 / 16;
+               motion_engines[i].end_row = current_row * 16;
+               pthread_mutex_init(&(motion_engines[i].input_lock), &mutex_attr);
+               pthread_mutex_lock(&(motion_engines[i].input_lock));
+               pthread_mutex_init(&(motion_engines[i].output_lock), &mutex_attr);
+               pthread_mutex_lock(&(motion_engines[i].output_lock));
+               motion_engines[i].done = 0;
+               pthread_create(&(motion_engines[i].tid), 
+                       &attr, 
+                       (void*)motion_engine_loop, 
+                       &motion_engines[i]);
+       }
+}
+
+void stop_motion_engines()
+{
+       int i;
+       for(i = 0; i < processors; i++)
+       {
+               motion_engines[i].done = 1;
+               pthread_mutex_unlock(&(motion_engines[i].input_lock));
+               pthread_join(motion_engines[i].tid, 0);
+               pthread_mutex_destroy(&(motion_engines[i].input_lock));
+               pthread_mutex_destroy(&(motion_engines[i].output_lock));
+       }
+       free(motion_engines);
+}
+
+
diff --git a/mpeg2enc/mpeg2enc.c b/mpeg2enc/mpeg2enc.c
new file mode 100644 (file)
index 0000000..f634597
--- /dev/null
@@ -0,0 +1,1221 @@
+/* mpeg2enc.c, main() and parameter file reading                            */
+
+/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
+
+/*
+ * Disclaimer of Warranty
+ *
+ * These software programs are available to the user without any license fee or
+ * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
+ * any and all warranties, whether express, implied, or statuary, including any
+ * implied warranties or merchantability or of fitness for a particular
+ * purpose.  In no event shall the copyright-holder be liable for any
+ * incidental, punitive, or consequential damages of any kind whatsoever
+ * arising from the use of these programs.
+ *
+ * This disclaimer of warranty extends to the user of these programs and user's
+ * customers, employees, agents, transferees, successors, and assigns.
+ *
+ * The MPEG Software Simulation Group does not represent or warrant that the
+ * programs furnished hereunder are free of infringement of any third-party
+ * patents.
+ *
+ * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
+ * are subject to royalty fees to patent holders.  Many of these patents are
+ * general enough such that they are unavoidable regardless of implementation
+ * design.
+ *
+ */
+
+#define MAX(a,b) ( (a)>(b) ? (a) : (b) )
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define GLOBAL_ /* used by global.h */
+#include "config.h"
+#include "global.h"
+
+/* private prototypes */
+static void init _ANSI_ARGS_((void));
+static void readcmdline _ANSI_ARGS_((int argc, char *argv[]));
+static void readquantmat _ANSI_ARGS_((void));
+
+
+// Hack for libdv to remove glib dependancy
+
+void
+g_log (const char    *log_domain,
+       int  log_level,
+       const char    *format,
+       ...)
+{
+}
+
+void
+g_logv (const char    *log_domain,
+       int  log_level,
+       const char    *format,
+       ...)
+{
+}
+
+
+void mpeg2enc_set_w(int width)
+{
+       horizontal_size = width;
+}
+
+void mpeg2enc_set_h(int height)
+{
+       vertical_size = height;
+}
+
+void mpeg2enc_set_rate(double rate)
+{
+       input_frame_rate = rate;
+}
+
+void mpeg2enc_set_input_buffers(int eof, char *y, char *u, char *v)
+{
+       pthread_mutex_lock(&output_lock);
+       input_buffer_end = eof;
+       input_buffer_y = y;
+       input_buffer_u = u;
+       input_buffer_v = v;
+       pthread_mutex_unlock(&input_lock);
+// Wait for buffers to get copied before returning.
+       pthread_mutex_lock(&copy_lock);
+}
+
+void mpeg2enc_init_buffers()
+{
+       pthread_mutex_init(&input_lock, 0);
+       pthread_mutex_init(&output_lock, 0);
+       pthread_mutex_init(&copy_lock, 0);
+       pthread_mutex_lock(&input_lock);
+       pthread_mutex_lock(&copy_lock);
+       input_buffer_end = 0;
+}
+
+int mpeg2enc(int argc, char *argv[])
+{
+       stdin_fd = stdin;
+       
+       verbose = 1;
+
+printf("mpeg2enc 1\n");
+
+/* Read command line */
+       readcmdline(argc, argv);
+printf("mpeg2enc 2\n");
+
+/* read quantization matrices */
+       readquantmat();
+printf("mpeg2enc 3\n");
+
+       if(!strlen(out_path))
+       {
+               fprintf(stderr, "No output file given.\n");
+       }
+printf("mpeg2enc 4\n");
+
+/* open output file */
+       if(!(outfile = fopen(out_path, "wb")))
+       {
+      sprintf(errortext,"Couldn't create output file %s", out_path);
+      error(errortext);
+       }
+printf("mpeg2enc 5\n");
+
+       init();
+//printf("mpeg2enc 6\n");
+
+       if(nframes < 0x7fffffff)
+               printf("Frame    Completion    Current bitrate     Predicted file size\n");
+       putseq();
+
+       stop_slice_engines();
+       stop_motion_engines();
+       stop_transform_engines();
+       stop_itransform_engines();
+
+       fclose(outfile);
+       fclose(statfile);
+
+       if(qt_file) quicktime_close(qt_file);
+       if(qt_output) quicktime_close(qt_output);
+       if(mpeg_file) mpeg3_close(mpeg_file);
+
+       if(do_stdin)
+       {
+               fclose(stdin_fd);
+       }
+       pthread_mutex_destroy(&input_lock);
+       pthread_mutex_destroy(&output_lock);
+       return 0;
+}
+
+int HorzMotionCode(int i)
+{
+       if (i < 8)
+      return 1;
+       if (i < 16)
+      return 2;
+       if (i < 32)
+      return 3;
+       if ((i < 64) || (constrparms))
+      return 4;
+       if (i < 128)
+      return 5;
+       if (i < 256)
+      return 6;
+       if ((i < 512) || (level == 10))
+      return 7;
+       if ((i < 1024) || (level == 8))
+      return 8;
+       if (i < 2048)
+      return 9;
+       return 1;
+}
+
+int VertMotionCode(int i)
+{
+       if (i < 8)
+      return 1;
+       if (i < 16)
+      return 2;
+       if (i < 32)
+      return 3;
+       if ((i < 64) || (level == 10) || (constrparms))
+      return 4;
+       return 5;
+}
+
+/*
+       Wrapper for malloc that allocates pbuffers aligned to the 
+       specified byte boundary and checks for failure.
+       N.b.  don't try to free the resulting pointers, eh...
+       BUG:    Of course this won't work if a char * won't fit in an int....
+*/
+static uint8_t *bufalloc( size_t size )
+{
+       char *buf = malloc( size + BUFFER_ALIGN );
+       int adjust;
+
+       if( buf == NULL )
+       {
+               error("malloc failed\n");
+       }
+       adjust = BUFFER_ALIGN-((int)buf)%BUFFER_ALIGN;
+       if( adjust == BUFFER_ALIGN )
+               adjust = 0;
+       return (uint8_t*)(buf+adjust);
+}
+
+static void init()
+{
+       int i, n, size;
+       static int block_count_tab[3] = {6,8,12};
+       int lum_buffer_size, chrom_buffer_size;
+       pthread_mutexattr_t mutex_attr;
+       pthread_mutexattr_init(&mutex_attr);
+       pthread_mutex_init(&test_lock, &mutex_attr);
+
+       bzero(&cur_picture, sizeof(pict_data_s));
+       mpeg2_initbits();
+       init_fdct();
+       init_idct();
+       init_motion();
+       init_predict();
+       init_quantizer();
+       init_transform();
+
+/* round picture dimensions to nZearest multiple of 16 or 32 */
+       mb_width = (horizontal_size+15)/16;
+       mb_height = prog_seq ? 
+               (vertical_size + 15) / 16 : 
+               2 * ((vertical_size + 31) / 32);
+       mb_height2 = fieldpic ? 
+               mb_height >> 1 : 
+               mb_height; /* for field pictures */
+       width = 16 * mb_width;
+       height = 16 * mb_height;
+
+       chrom_width = (chroma_format==CHROMA444) ? width : width>>1;
+       chrom_height = (chroma_format!=CHROMA420) ? height : height>>1;
+
+       height2 = fieldpic ? height>>1 : height;
+       width2 = fieldpic ? width<<1 : width;
+       chrom_width2 = fieldpic ? chrom_width<<1 : chrom_width;
+
+       block_count = block_count_tab[chroma_format-1];
+       lum_buffer_size = (width*height) + 
+                                        sizeof(uint8_t) *(width/2)*(height/2) +
+                                        sizeof(uint8_t) *(width/4)*(height/4+1);
+       chrom_buffer_size = chrom_width*chrom_height;
+
+       fsubsample_offset = (width)*(height) * sizeof(uint8_t);
+       qsubsample_offset =  fsubsample_offset + (width/2)*(height/2)*sizeof(uint8_t);
+
+       rowsums_offset = 0;
+       colsums_offset = 0;
+
+       mb_per_pict = mb_width*mb_height2;
+
+/* clip table */
+       if (!(clp = (unsigned char *)malloc(1024)))
+      error("malloc failed\n");
+       clp+= 384;
+       for (i=-384; i<640; i++)
+      clp[i] = (i<0) ? 0 : ((i>255) ? 255 : i);
+
+
+       
+       /* Allocate the frame buffer */
+
+
+       frame_buffers = (uint8_t ***) 
+               bufalloc(2*READ_LOOK_AHEAD*sizeof(uint8_t**));
+       
+       for(n=0;n<2*READ_LOOK_AHEAD;n++)
+       {
+         frame_buffers[n] = (uint8_t **) bufalloc(3*sizeof(uint8_t*));
+                for (i=0; i<3; i++)
+                {
+                        frame_buffers[n][i] = 
+                                bufalloc( (i==0) ? lum_buffer_size : chrom_buffer_size );
+                }
+       }
+
+
+
+       /* TODO: The ref and aux frame buffers are no redundant! */
+       for( i = 0 ; i<3; i++)
+       {
+               int size =  (i==0) ? lum_buffer_size : chrom_buffer_size;
+               newrefframe[i] = bufalloc(size);
+               oldrefframe[i] = bufalloc(size);
+               auxframe[i]    = bufalloc(size);
+               predframe[i]   = bufalloc(size);
+       }
+
+       cur_picture.qblocks =
+               (int16_t (*)[64])bufalloc(mb_per_pict*block_count*sizeof(int16_t [64]));
+
+       /* Initialise current transformed picture data tables
+          These will soon become a buffer for transformed picture data to allow
+          look-ahead for bit allocation etc.
+        */
+       cur_picture.mbinfo = (
+               struct mbinfo *)bufalloc(mb_per_pict*sizeof(struct mbinfo));
+
+       cur_picture.blocks =
+               (int16_t (*)[64])bufalloc(mb_per_pict * block_count * sizeof(int16_t [64]));
+  
+  
+/* open statistics output file */
+       if(statname[0]=='-') statfile = stdout;
+       else 
+       if(!(statfile = fopen(statname,"w")))
+       {
+      sprintf(errortext,"Couldn't create statistics output file %s",statname);
+      error(errortext);
+       }
+
+       ratectl = malloc(processors * sizeof(ratectl_t*));
+       for(i = 0; i < processors; i++)
+               ratectl[i] = calloc(1, sizeof(ratectl_t));
+       
+
+
+/* Start parallel threads */
+
+//printf("init 1\n");
+       start_motion_engines();
+//printf("init 2\n");
+       start_transform_engines();
+//printf("init 3\n");
+       start_itransform_engines();
+//printf("init 4\n");
+       start_slice_engines();
+//printf("init 5\n");
+}
+
+void error(text)
+char *text;
+{
+  fprintf(stderr,text);
+  putc('\n',stderr);
+  exit(1);
+}
+
+#define STRINGLEN 254
+
+int calculate_smp()
+{
+/* Get processor count */
+       int result = 1;
+       FILE *proc;
+       if(proc = fopen("/proc/cpuinfo", "r"))
+       {
+               char string[1024];
+               while(!feof(proc))
+               {
+                       fgets(string, 1024, proc);
+                       if(!strncasecmp(string, "processor", 9))
+                       {
+                               char *ptr = strchr(string, ':');
+                               if(ptr)
+                               {
+                                       ptr++;
+                                       result = atol(ptr) + 1;
+                               }
+                       }
+                       else
+                       if(!strncasecmp(string, "cpus detected", 13))
+                       {
+                               char *ptr = strchr(string, ':');
+                               if(ptr)
+                               {
+                                       ptr++;
+                                       result = atol(ptr);
+                               }
+                       }
+               }
+               fclose(proc);
+       }
+       return result;
+}
+
+static void readcmdline(int argc, char *argv[])
+{
+       int i, j;
+       int h,m,s,f;
+       FILE *fd;
+       char line[256];
+// Master frame rate table must match decoder
+       static double ratetab[]=
+       {24000.0/1001.0,  // Official rates
+               24.0,
+               25.0,
+               30000.0/1001.0,
+               30.0,
+               50.0,
+               60000.0/1001.0,
+               60.0,
+
+               1,           // Unofficial economy rates
+               5,
+               10,
+               12, 
+               15,
+               0,
+               0};
+// VBV buffer size limits
+       int vbvlim[4] = { 597, 448, 112, 29 };
+       long total_frame_rates = (sizeof(ratetab) / sizeof(double));
+       FILE *proc;
+// Default 16
+       int param_searchrad = 16;
+       int isnum = 1;
+
+//printf("readcmdline 1\n");
+       frame0 =                   0;  /* number of first frame */
+       start_frame = end_frame = -1;
+       use_hires_quant = 0;
+       use_denoise_quant = 0;
+       quiet = 1;
+       bit_rate = 5000000;                       /* default bit_rate (bits/s) */
+       prog_seq = 0;                        /* progressive_sequence is faster */
+       mpeg1 = 0;                                   /* ISO/IEC 11172-2 stream */
+    fixed_mquant = 0;                             /* vary the quantization */
+       quant_floor = 0;
+       act_boost = 3.0;
+       N = 15;                                      /* N (# of frames in GOP) */
+       M = 1;  /* M (I/P frame distance) */
+       processors = calculate_smp();
+       frame_rate = -1;
+       chroma_format =            1;  /* chroma_format: 1=4:2:0, 2=4:2:2, 3=4:4:4   LibMPEG3 only does 1 */
+       mpeg_file = 0;
+       qt_file = 0;
+       do_stdin = 0;
+       do_buffers = 1;
+       seq_header_every_gop = 0;
+
+
+
+
+
+//printf("readcmdline 2\n");
+
+
+       sprintf(tplorg, "");
+       sprintf(out_path, "");
+
+#define INTTOYES(x) ((x) ? "Yes" : "No")
+  if(argc < 2)
+  {
+    printf("mpeg2encode V1.3, 2000/01/10\n"
+"(C) 1996, MPEG Software Simulation Group\n"
+"(C) 2001 Heroine Virtual\n"
+"Usage: %s [options] <input file> <output file>\n\n"
+" -1                 generate an MPEG-1 stream instead of MPEG-2 (%s)\n"
+" -422                                 generate YUV 4:2:2 output\n"
+" -b bitrate              fix the bitrate, vary the quantization (%d)\n"
+" -d                                                     Denoise (%s)\n"
+" -f rate                                      Convert framerate\n"
+" -h                          High resolution quantization table (%s)\n"
+" -m frames                set number of frames between P frames (%d)\n"
+" -n frames                set number of frames between I frames (%d)\n"
+" -p                                   encode progressive frames (%s)\n"
+" -q quantization         fix the quantization, vary the bitrate\n"
+" [number]               Start encoding from frame number to end\n"
+" [number1] [number2]    Encode frame number 1 to frame number 2\n"
+" -u                                        Use only 1 processor\n\n"
+"Default settings:\n"
+"   fixed 5000000 bits/sec\n"
+"   interlaced\n"
+"   MPEG-2\n"
+"   15 frames between I frames   0 frames between P frames\n\n"
+"For the recommended encoding parameters see docs/index.html.\n",
+argv[0], 
+mpeg1 ? "MPEG-1" : "MPEG-2",
+(int)bit_rate,
+INTTOYES(use_denoise_quant),
+INTTOYES(use_hires_quant),
+M - 1,
+N,
+INTTOYES(prog_seq));
+    exit(1);
+  }
+//printf("readcmdline 3\n");
+
+       for(i = 1; i < argc; i++)
+       {
+               isnum = 1;
+
+               for(j = 0; j < strlen(argv[i]) && isnum; j++)
+               {
+                       if(isalpha(argv[i][j])) isnum = 0;
+               }
+
+
+//printf("readcmdline %s\n", argv[i]);
+               if(!strcmp(argv[i], "-1"))
+               {
+                       mpeg1 = 1;
+               }
+               else
+               if(!strcmp(argv[i], "-b"))
+               {
+                       i++;
+                       if(i < argc)
+                       {
+                               bit_rate = atol(argv[i]);
+                       }
+                       else
+                       {
+                               fprintf(stderr, "-b requires a bitrate\n");
+                               exit(1);
+                       }
+               }
+               else
+               if(!strcmp(argv[i], "-d"))
+               {
+                       use_denoise_quant = 1;
+               }
+               else
+               if(!strcmp(argv[i], "-f"))
+               {
+                       i++;
+                       if(i < argc)
+                       {
+                               frame_rate = atof(argv[i]);
+                       }
+                       else
+                       {
+                               fprintf(stderr, "-f requires a frame rate\n");
+                               exit(1);
+                       }
+               }
+               else
+               if(!strcmp(argv[i], "-h"))
+               {
+                       use_hires_quant = 1;
+               }
+               else
+               if(!strcmp(argv[i], "-m"))
+               {
+                       i++;
+                       if(i < argc)
+                       {
+                               M = atol(argv[i]) + 1;
+                       }
+                       else
+                       {
+                               fprintf(stderr, "-m requires a frame count\n");
+                               exit(1);
+                       }
+               }
+               else
+               if(!strcmp(argv[i], "-n"))
+               {
+                       i++;
+                       if(i < argc)
+                       {
+                               N = atol(argv[i]);
+                       }
+                       else
+                       {
+                               fprintf(stderr, "-n requires a frame count\n");
+                               exit(1);
+                       }
+               }
+               else
+               if(!strcmp(argv[i], "-p"))
+               {
+                       prog_seq = 1;
+               }
+               else
+               if(!strcmp(argv[i], "-q"))
+               {
+                       i++;
+                       if(i < argc)
+                       {
+                               fixed_mquant = atol(argv[i]);
+                       }
+                       else
+                       {
+                               fprintf(stderr, "-q requires a quantization value\n");
+                               exit(1);
+                       }
+               }
+               else
+               if(!strcmp(argv[i], "-u"))
+               {
+                       processors = 1;
+               }
+               else
+               if(!strcmp(argv[i], "-422"))
+               {
+                       chroma_format = 2;
+               }
+               else
+               if(!strcmp(argv[i], "-g"))
+               {
+                       seq_header_every_gop = 1;
+               }
+               else
+               if(!strcmp(argv[i], "-"))
+               {
+                       do_stdin = 1;
+               }
+               else
+/* Start or end frame if number */
+               if(isnum)
+               {
+                       if(start_frame < 0)
+                               start_frame = atol(argv[i]);
+                       else
+                       if(end_frame < 0)
+                               end_frame = atol(argv[i]);
+               }
+               else
+               if(!strlen(tplorg) && !do_stdin && !do_buffers)
+               {
+/* Input path */
+                       strncpy(tplorg, argv[i], STRINGLEN);
+               }
+               else
+               if(!strlen(out_path))
+               {
+/* Output path */
+                       strncpy(out_path, argv[i], STRINGLEN);
+               }
+       }
+//printf("readcmdline 4\n");
+
+       if(!strlen(out_path))
+       {
+// Default output path
+               strncpy(out_path, tplorg, STRINGLEN);
+               for(i = strlen(out_path) - 1; i >= 0 && out_path[i] != '.'; i--)
+                       ;
+
+               if(i < 0) i = strlen(out_path);
+               
+               if(mpeg1)
+                       sprintf(&out_path[i], ".m1v");
+               else
+                       sprintf(&out_path[i], ".m2v");
+       }
+//printf("readcmdline 5\n");
+
+/* Get info from input file */
+       if(do_stdin)
+       {
+               inputtype = T_STDIN;
+       }
+       else
+       if(do_buffers)
+       {
+               inputtype = T_BUFFERS;
+       }
+       else
+       if(mpeg3_check_sig(tplorg))
+       {
+               mpeg_file = mpeg3_open(tplorg);
+               inputtype = T_MPEG;
+       }
+       else
+       if(quicktime_check_sig(tplorg))
+       {
+               qt_file = quicktime_open(tplorg, 1, 0);
+               inputtype = T_QUICKTIME;
+       }
+//printf("readcmdline 6\n");
+
+       if(!qt_file && !mpeg_file && !do_stdin && !do_buffers)
+       {
+               fprintf(stderr, "File format not recognized.\n");
+               exit(1);
+       }
+
+//printf("readcmdline 7\n");
+       if(qt_file)
+       {
+               if(!quicktime_video_tracks(qt_file))
+               {
+                       fprintf(stderr, "No video tracks in file.\n");
+                       exit(1);
+               }
+
+               if(!quicktime_supported_video(qt_file, 0))
+               {
+                       fprintf(stderr, "Unsupported video codec.\n");
+                       exit(1);
+               }
+       }
+//printf("readcmdline 8\n");
+
+/************************************************************************
+ *                            BEGIN PARAMETER FILE
+ ************************************************************************/
+
+/* To eliminate the user hassle we replaced the parameter file with hard coded constants. */
+       strcpy(tplref,             "-");  /* name of intra quant matrix file     ("-": default matrix) */
+       strcpy(iqname,             "-");  /* name of intra quant matrix file     ("-": default matrix) */
+       strcpy(niqname,            "-");  /* name of non intra quant matrix file ("-": default matrix) */
+       strcpy(statname,           "/dev/null");  /* name of statistics file ("-": stdout ) */
+
+       if(qt_file)
+       {
+               nframes =                  quicktime_video_length(qt_file, 0);  /* number of frames */
+               horizontal_size =          quicktime_video_width(qt_file, 0);
+               vertical_size =            quicktime_video_height(qt_file, 0);
+       }
+       else
+       if(mpeg_file)
+       {
+               nframes =                  0x7fffffff;  /* Use percentage instead */
+               horizontal_size =          mpeg3_video_width(mpeg_file, 0);
+               vertical_size =            mpeg3_video_height(mpeg_file, 0);
+       }
+       else
+       if(do_stdin)
+       {
+               unsigned char data[1024];
+               nframes =                  0x7fffffff;
+               
+               fgets(data, 1024, stdin_fd);
+               horizontal_size =          atol(data);
+               fgets(data, 1024, stdin_fd);
+               vertical_size =            atol(data);
+       }
+       else
+       if(do_buffers)
+       {
+               nframes = 0x7fffffff;
+       }
+       
+       
+       h = m = s = f =            0;  /* timecode of first frame */
+       fieldpic =                 0;  /* 0: progressive, 1: bottom first, 2: top first, 3 = progressive seq, field MC and DCT in picture */
+       aspectratio =              1;  /* aspect_ratio_information 1=square pel, 2=4:3, 3=16:9, 4=2.11:1 */
+       low_delay =                0;  /* low_delay  */
+       constrparms =              0;  /* constrained_parameters_flag */
+       profile =                  4;  /* Profile ID: Simple = 5, Main = 4, SNR = 3, Spatial = 2, High = 1 */
+       level =                    4;  /* Level ID:   Low = 10, Main = 8, High 1440 = 6, High = 4                  */
+       video_format =             2;  /* video_format: 0=comp., 1=PAL, 2=NTSC, 3=SECAM, 4=MAC, 5=unspec. */
+       color_primaries =          5;  /* color_primaries */
+       dctsatlim               = mpeg1 ? 255 : 2047;
+       dctsatlim = 255;
+       transfer_characteristics = 5;  /* transfer_characteristics */
+       matrix_coefficients =      4;  /* matrix_coefficients (not used) */
+       display_horizontal_size =  horizontal_size;
+       display_vertical_size =    vertical_size;
+       cur_picture.dc_prec =      0;  /* intra_dc_precision (0: 8 bit, 1: 9 bit, 2: 10 bit, 3: 11 bit */
+       cur_picture.topfirst =     1;  /* top_field_first */
+
+       frame_pred_dct_tab[0] =    mpeg1 ? 1 : 0;  /* frame_pred_frame_dct (I P B) */
+       frame_pred_dct_tab[1] =    mpeg1 ? 1 : 0;  /* frame_pred_frame_dct (I P B) */
+       frame_pred_dct_tab[2] =    mpeg1 ? 1 : 0;  /* frame_pred_frame_dct (I P B) */
+
+       conceal_tab[0]                   = 0;  /* concealment_motion_vectors (I P B) */
+       conceal_tab[1]                   = 0;  /* concealment_motion_vectors (I P B) */
+       conceal_tab[2]                   = 0;  /* concealment_motion_vectors (I P B) */
+       qscale_tab[0]                    = mpeg1 ? 0 : 1;  /* q_scale_type  (I P B) */
+       qscale_tab[1]                    = mpeg1 ? 0 : 1;  /* q_scale_type  (I P B) */
+       qscale_tab[2]                    = mpeg1 ? 0 : 1;  /* q_scale_type  (I P B) */
+
+       intravlc_tab[0]                  = 0;  /* intra_vlc_format (I P B)*/
+       intravlc_tab[1]                  = 0;  /* intra_vlc_format (I P B)*/
+       intravlc_tab[2]                  = 0;  /* intra_vlc_format (I P B)*/
+       altscan_tab[0]                   = 0;  /* alternate_scan (I P B) */
+       altscan_tab[1]                   = 0;  /* alternate_scan (I P B) */
+       altscan_tab[2]                   = 0;  /* alternate_scan (I P B) */
+       opt_dc_prec         = 0;  /* 8 bits */
+       opt_topfirst        = (fieldpic == 2);
+       opt_repeatfirst     = 0;
+       opt_prog_frame      = prog_seq;
+       cur_picture.repeatfirst =              0;  /* repeat_first_field */
+       cur_picture.prog_frame =               prog_seq;  /* progressive_frame */
+/* P:  forw_hor_f_code forw_vert_f_code search_width/height */
+    motion_data = (struct motion_data *)malloc(3 * sizeof(struct motion_data));
+       video_buffer_size =        46 * 1024 * 8;
+
+/************************************************************************
+ *                                END PARAMETER FILE
+ ************************************************************************/
+//printf("readcmdline 10\n");
+
+       if(mpeg1)
+       {
+               opt_prog_frame = 1;
+               cur_picture.prog_frame = 1;
+               prog_seq = 1;
+       }
+
+       if(qt_file)
+       {
+               input_frame_rate = quicktime_frame_rate(qt_file, 0);
+       }
+       else
+       if(mpeg_file)
+       {
+               input_frame_rate = mpeg3_frame_rate(mpeg_file, 0);
+       }
+       else
+       if(do_stdin)
+       {
+               char data[1024];
+               
+               fgets(data, 1024, stdin_fd);
+               
+               input_frame_rate = atof(data);
+       }
+       
+       
+       if(frame_rate < 0)
+       {
+               frame_rate = input_frame_rate;
+       }
+//printf("readcmdline 11\n");
+
+//processors = 1;
+//nframes = 16;
+       if(start_frame >= 0 && end_frame >= 0)
+       {
+               nframes = end_frame - start_frame;
+               frame0 = start_frame;
+       }
+       else
+       if(start_frame >= 0)
+       {
+               end_frame = nframes;
+               nframes -= start_frame;
+               frame0 = start_frame;
+       }
+       else
+       {
+               start_frame = 0;
+               end_frame = nframes;
+       }
+//printf("readcmdline 12\n");
+
+// Show status
+       if(verbose)
+       {
+               printf("Encoding: %s frames %ld\n", out_path, nframes);
+
+       if(fixed_mquant == 0) 
+               printf("   bitrate %.0f\n", bit_rate);
+       else
+               printf("   quantization %d\n", fixed_mquant);
+       printf("   %d frames between I frames   %d frames between P frames\n", N, M - 1);
+       printf("   %s\n", (prog_seq ? "progressive" : "interlaced"));
+               printf("   %s\n", (mpeg1 ? "MPEG-1" : "MPEG-2"));
+               printf("   %s\n", (chroma_format == 1) ? "YUV-420" : "YUV-422");
+               printf("   %d processors\n", processors);
+               printf("   %.02f frames per second\n", frame_rate);
+               printf("   Denoise %s\n", INTTOYES(use_denoise_quant));
+               printf("   Hires quantization %s\n", INTTOYES(use_hires_quant));
+
+
+               if(mpeg_file)
+               {
+                       fprintf(stderr, "(MPEG to MPEG transcoding for official use only.)\n");
+                       mpeg3_set_mmx(mpeg_file, 0);
+               }
+       }
+
+
+
+       { 
+               int radius_x = ((param_searchrad + 4) / 8) * 8;
+               int radius_y = ((param_searchrad * vertical_size / horizontal_size + 4) / 8) * 8;
+               int c;
+       
+               /* TODO: These f-codes should really be adjusted for each
+                  picture type... */
+               c=5;
+               if( radius_x*M < 64) c = 4;
+               if( radius_x*M < 32) c = 3;
+               if( radius_x*M < 16) c = 2;
+               if( radius_x*M < 8) c = 1;
+
+               if (!motion_data)
+                       error("malloc failed\n");
+
+               for (i=0; i<M; i++)
+               {
+                       if(i==0)
+                       {
+                               motion_data[i].forw_hor_f_code  = c;
+                               motion_data[i].forw_vert_f_code = c;
+                               motion_data[i].sxf = MAX(1,radius_x*M);
+                               motion_data[i].syf = MAX(1,radius_y*M);
+                       }
+                       else
+                       {
+                               motion_data[i].forw_hor_f_code  = c;
+                               motion_data[i].forw_vert_f_code = c;
+                               motion_data[i].sxf = MAX(1,radius_x*i);
+                               motion_data[i].syf = MAX(1,radius_y*i);
+                               motion_data[i].back_hor_f_code  = c;
+                               motion_data[i].back_vert_f_code = c;
+                               motion_data[i].sxb = MAX(1,radius_x*(M-i));
+                               motion_data[i].syb = MAX(1,radius_y*(M-i));
+                       }
+               }
+               
+       }
+
+//    vbv_buffer_size = floor(((double)bit_rate * 0.20343) / 16384.0);
+    if(mpeg1)
+               vbv_buffer_size = 20 * 16384;
+       else
+               vbv_buffer_size = 112 * 16384;
+
+       fast_mc_frac    = 10;
+       mc_44_red               = 2;
+       mc_22_red               = 3;
+
+    if(vbv_buffer_size > vbvlim[(level - 4) >> 1])
+        vbv_buffer_size = vbvlim[(level - 4) >> 1];
+
+/* Set up frame buffers */
+       frame_buffer = malloc(horizontal_size * vertical_size * 3 + 4);
+       row_pointers = malloc(sizeof(unsigned char*) * vertical_size);
+       for(i = 0; i < vertical_size; i++) row_pointers[i] = &frame_buffer[horizontal_size * 3 * i];
+
+// Get frame rate code from input frame rate
+       for(i = 0; i < total_frame_rates; i++)
+       {
+               if(fabs(frame_rate - ratetab[i]) < 0.001) frame_rate_code = i + 1;
+       }
+
+/* make flags boolean (x!=0 -> x=1) */
+  mpeg1 = !!mpeg1;
+  fieldpic = !!fieldpic;
+  low_delay = !!low_delay;
+  constrparms = !!constrparms;
+  prog_seq = !!prog_seq;
+  cur_picture.topfirst = !!cur_picture.topfirst;
+
+  for (i = 0; i < 3; i++)
+  {
+    frame_pred_dct_tab[i] = !!frame_pred_dct_tab[i];
+    conceal_tab[i] = !!conceal_tab[i];
+    qscale_tab[i] = !!qscale_tab[i];
+    intravlc_tab[i] = !!intravlc_tab[i];
+    altscan_tab[i] = !!altscan_tab[i];
+  }
+  cur_picture.repeatfirst = !!cur_picture.repeatfirst;
+  cur_picture.prog_frame = !!cur_picture.prog_frame;
+
+  /* make sure MPEG specific parameters are valid */
+  range_checks();
+
+  /* timecode -> frame number */
+  tc0 = h;
+  tc0 = 60*tc0 + m;
+  tc0 = 60*tc0 + s;
+  tc0 = (int)(frame_rate+0.5)*tc0 + f;
+
+  qt_output = 0;
+
+
+
+
+
+
+
+
+
+
+  if (!mpeg1)
+  {
+    profile_and_level_checks();
+  }
+  else
+  {
+    /* MPEG-1 */
+    if (constrparms)
+    {
+      if (horizontal_size>768
+          || vertical_size>576
+          || ((horizontal_size+15)/16)*((vertical_size+15) / 16) > 396
+          || ((horizontal_size+15)/16)*((vertical_size+15) / 16)*frame_rate>396*25.0
+          || frame_rate>30.0)
+      {
+        if (!quiet)
+          fprintf(stderr,"*** Warning: setting constrained_parameters_flag = 0\n");
+        constrparms = 0;
+      }
+    }
+  }
+
+  /* relational checks */
+
+  if (mpeg1)
+  {
+    if (!prog_seq)
+    {
+      prog_seq = 1;
+    }
+
+    if (chroma_format!=CHROMA420)
+    {
+      chroma_format = CHROMA420;
+    }
+
+    if (cur_picture.dc_prec!=0)
+    {
+      cur_picture.dc_prec = 0;
+    }
+
+    for (i=0; i<3; i++)
+      if (qscale_tab[i])
+      {
+        qscale_tab[i] = 0;
+      }
+
+    for (i=0; i<3; i++)
+      if (intravlc_tab[i])
+      {
+        intravlc_tab[i] = 0;
+      }
+
+    for (i=0; i<3; i++)
+      if (altscan_tab[i])
+      {
+        altscan_tab[i] = 0;
+      }
+  }
+
+  if (!mpeg1 && constrparms)
+  {
+    constrparms = 0;
+  }
+
+  if (prog_seq && !cur_picture.prog_frame)
+  {
+    cur_picture.prog_frame = 1;
+  }
+
+  if (cur_picture.prog_frame && fieldpic)
+  {
+    fieldpic = 0;
+  }
+
+  if (!cur_picture.prog_frame && cur_picture.repeatfirst)
+  {
+    cur_picture.repeatfirst = 0;
+  }
+
+  if (cur_picture.prog_frame)
+  {
+    for (i=0; i<3; i++)
+      if (!frame_pred_dct_tab[i])
+      {
+        frame_pred_dct_tab[i] = 1;
+      }
+  }
+
+  if (prog_seq && !cur_picture.repeatfirst && cur_picture.topfirst)
+  {
+     if (!quiet)
+       fprintf(stderr,"Warning: setting top_field_first = 0\n");
+    cur_picture.topfirst = 0;
+  }
+
+  /* search windows */
+  for (i=0; i<M; i++)
+  {
+    if (motion_data[i].sxf > (4<<motion_data[i].forw_hor_f_code)-1)
+    {
+      if (!quiet)
+        fprintf(stderr,
+          "Warning: reducing forward horizontal search width to %d\n",
+          (4<<motion_data[i].forw_hor_f_code)-1);
+      motion_data[i].sxf = (4<<motion_data[i].forw_hor_f_code)-1;
+    }
+
+    if (motion_data[i].syf > (4<<motion_data[i].forw_vert_f_code)-1)
+    {
+      if (!quiet)
+        fprintf(stderr,
+          "Warning: reducing forward vertical search width to %d\n",
+          (4<<motion_data[i].forw_vert_f_code)-1);
+      motion_data[i].syf = (4<<motion_data[i].forw_vert_f_code)-1;
+    }
+
+    if (i!=0)
+    {
+      if (motion_data[i].sxb > (4<<motion_data[i].back_hor_f_code)-1)
+      {
+        if (!quiet)
+          fprintf(stderr,
+            "Warning: reducing backward horizontal search width to %d\n",
+            (4<<motion_data[i].back_hor_f_code)-1);
+        motion_data[i].sxb = (4<<motion_data[i].back_hor_f_code)-1;
+      }
+
+      if (motion_data[i].syb > (4<<motion_data[i].back_vert_f_code)-1)
+      {
+        if (!quiet)
+          fprintf(stderr,
+            "Warning: reducing backward vertical search width to %d\n",
+            (4<<motion_data[i].back_vert_f_code)-1);
+        motion_data[i].syb = (4<<motion_data[i].back_vert_f_code)-1;
+      }
+    }
+  }
+
+}
+
+/*
+  If the user has selected suppression of hf noise via
+  quantisation then we boost quantisation of hf components
+  EXPERIMENTAL: currently a linear ramp from 0 at 4pel to 
+  50% increased quantisation...
+*/
+
+static int quant_hfnoise_filt(int orgquant, int qmat_pos )
+{
+       int x = qmat_pos % 8;
+       int y = qmat_pos / 8;
+       int qboost = 1024;
+
+       if(!use_denoise_quant)
+       {
+               return orgquant;
+       }
+
+       /* Maximum 50% quantisation boost for HF components... */
+       if( x > 4 )
+               qboost += (256*(x-4)/3);
+       if( y > 4 )
+               qboost += (256*(y-4)/3);
+
+       return (orgquant * qboost + 512)/ 1024;
+}
+
+static void readquantmat()
+{
+  int i,v,q;
+       load_iquant = 0;
+       load_niquant = 0;
+
+  if (iqname[0]=='-')
+  {
+       if(use_hires_quant)
+       {
+               load_iquant |= 1;
+               for (i=0; i<64; i++)
+               {
+                       intra_q[i] = hires_intra_quantizer_matrix[i];
+               }       
+       }
+       else
+       {
+               load_iquant = use_denoise_quant;
+               for (i=0; i<64; i++)
+               {
+                       v = quant_hfnoise_filt(default_intra_quantizer_matrix[i], i);
+                       if (v<1 || v>255)
+                               error("value in intra quant matrix invalid (after noise filt adjust)");
+                       intra_q[i] = v;
+               } 
+       }
+  }
+
+/* TODO: Inv Quant matrix initialisation should check if the fraction fits in 16 bits! */
+  if (niqname[0]=='-')
+  {
+               if(use_hires_quant)
+               {
+                       for (i=0; i<64; i++)
+                       {
+                               inter_q[i] = hires_nonintra_quantizer_matrix[i];
+                       }       
+               }
+               else
+               {
+/* default non-intra matrix is all 16's. For *our* default we use something
+   more suitable for domestic analog sources... which is non-standard...*/
+                       load_niquant |= 1;
+                       for (i=0; i<64; i++)
+                       {
+                               v = quant_hfnoise_filt(default_nonintra_quantizer_matrix[i],i);
+                               if (v<1 || v>255)
+                                       error("value in non-intra quant matrix invalid (after noise filt adjust)");
+                               inter_q[i] = v;
+                       }
+               }
+  }
+
+       for (i=0; i<64; i++)
+       {
+               i_intra_q[i] = (int)(((double)IQUANT_SCALE) / ((double)intra_q[i]));
+               i_inter_q[i] = (int)(((double)IQUANT_SCALE) / ((double)inter_q[i]));
+       }
+       
+       for( q = 1; q <= 112; ++q )
+       {
+               for (i=0; i<64; i++)
+               {
+                       intra_q_tbl[q][i] = intra_q[i] * q;
+                       inter_q_tbl[q][i] = inter_q[i] * q;
+                       intra_q_tblf[q][i] = (float)intra_q_tbl[q][i];
+                       inter_q_tblf[q][i] = (float)inter_q_tbl[q][i];
+                       i_intra_q_tblf[q][i] = 1.0f/ ( intra_q_tblf[q][i] * 0.98);
+                       i_intra_q_tbl[q][i] = (IQUANT_SCALE/intra_q_tbl[q][i]);
+                       i_inter_q_tblf[q][i] =  1.0f/ (inter_q_tblf[q][i] * 0.98);
+                       i_inter_q_tbl[q][i] = (IQUANT_SCALE/inter_q_tbl[q][i] );
+               }
+       }
+}
diff --git a/mpeg2enc/predict.c b/mpeg2enc/predict.c
new file mode 100644 (file)
index 0000000..c220247
--- /dev/null
@@ -0,0 +1,717 @@
+/* predict.c, motion compensated prediction                                 */
+
+/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
+
+/*
+ * Disclaimer of Warranty
+ *
+ * These software programs are available to the user without any license fee or
+ * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
+ * any and all warranties, whether express, implied, or statuary, including any
+ * implied warranties or merchantability or of fitness for a particular
+ * purpose.  In no event shall the copyright-holder be liable for any
+ * incidental, punitive, or consequential damages of any kind whatsoever
+ * arising from the use of these programs.
+ *
+ * This disclaimer of warranty extends to the user of these programs and user's
+ * customers, employees, agents, transferees, successors, and assigns.
+ *
+ * The MPEG Software Simulation Group does not represent or warrant that the
+ * programs furnished hereunder are free of infringement of any third-party
+ * patents.
+ *
+ * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
+ * are subject to royalty fees to patent holders.  Many of these patents are
+ * general enough such that they are unavoidable regardless of implementation
+ * design.
+ *
+ */
+
+#include "config.h"
+#include <stdio.h>
+#include "global.h"
+#include "cpu_accel.h"
+#include "simd.h"
+
+
+/* private prototypes */
+static void predict_mb (
+       pict_data_s *picture,
+       uint8_t *oldref[], uint8_t *newref[], uint8_t *cur[],
+       int lx, int bx, int by, mbinfo_s *mbi, int secondfield);
+
+static void pred (
+       pict_data_s *picture,
+       uint8_t *src[], int sfield,
+       uint8_t *dst[], int dfield,
+       int lx, int w, int h, int x, int y, int dx, int dy, int addflag);
+
+static void pred_comp (
+       pict_data_s *picture,
+       uint8_t *src, uint8_t *dst,
+       int lx, int w, int h, int x, int y, int dx, int dy, int addflag);
+#ifdef X86_CPU
+static void pred_comp_mmxe(
+       pict_data_s *picture,
+       uint8_t *src, uint8_t *dst,
+       int lx, int w, int h, int x, int y, int dx, int dy, int addflag);
+static void pred_comp_mmx(
+       pict_data_s *picture,
+       uint8_t *src, uint8_t *dst,
+       int lx, int w, int h, int x, int y, int dx, int dy, int addflag);
+#endif
+static void calc_DMV 
+       (       pict_data_s *picture,int DMV[][2], 
+               int *dmvector, int mvx, int mvy);
+
+static void clearblock (pict_data_s *picture,
+                                               uint8_t *cur[], int i0, int j0);
+
+/*
+  Initialise prediction - currently purely selection of which
+  versions of the various low level computation routines to use
+  
+  */
+
+static void (*ppred_comp)(
+       pict_data_s *picture,
+       uint8_t *src, uint8_t *dst,
+       int lx, int w, int h, int x, int y, int dx, int dy, int addflag);
+
+void init_predict()
+{
+       int cpucap = cpu_accel();
+
+       if( cpucap  == 0 )      /* No MMX/SSE etc support available */
+       {
+               ppred_comp = pred_comp;
+       }
+
+#ifdef X86_CPU
+       else if(cpucap & ACCEL_X86_MMXEXT ) /* AMD MMX or SSE... */
+       {
+               if(verbose) fprintf( stderr, "SETTING EXTENDED MMX for PREDICTION!\n");
+               ppred_comp = pred_comp_mmxe;
+       }
+    else if(cpucap & ACCEL_X86_MMX ) /* Original MMX... */
+       {
+               if(verbose) fprintf( stderr, "SETTING MMX for PREDICTION!\n");
+               ppred_comp = pred_comp_mmx;
+       }
+#endif
+    else
+       {
+               ppred_comp = pred_comp;
+       }
+}
+
+/* form prediction for a complete picture (frontend for predict_mb)
+ *
+ * reff: reference frame for forward prediction
+ * refb: reference frame for backward prediction
+ * cur:  destination (current) frame
+ * secondfield: predict second field of a frame
+ * mbi:  macroblock info
+ *
+ * Notes:
+ * - cf. predict_mb
+ */
+
+void predict(pict_data_s *picture, 
+                        uint8_t *reff[],
+                        uint8_t *refb[],
+                        uint8_t *cur[3],
+                        int secondfield)
+{
+       int i, j, k;
+       mbinfo_s *mbi = picture->mbinfo;
+       k = 0;
+
+       /* loop through all macroblocks of the picture */
+       for (j=0; j<height2; j+=16)
+               for (i=0; i<width; i+=16)
+               {
+                       predict_mb(picture,reff,refb,cur,width,i,j,
+                                          &mbi[k], secondfield );
+                       k++;
+               }
+}
+
+/* form prediction for one macroblock
+ *
+ * oldref: reference frame for forward prediction
+ * newref: reference frame for backward prediction
+ * cur:    destination (current) frame
+ * lx:     frame width (identical to global var `width')
+ * bx,by:  picture (field or frame) coordinates of macroblock to be predicted
+ * pict_type: I, P or B
+ * pict_struct: FRAME_PICTURE, TOP_FIELD, BOTTOM_FIELD
+ * mb_type:     MB_FORWARD, MB_BACKWARD, MB_INTRA
+ * motion_type: MC_FRAME, MC_FIELD, MC_16X8, MC_DMV
+ * secondfield: predict second field of a frame
+ * PMV[2][2][2]: motion vectors (in half pel picture coordinates)
+ * mv_field_sel[2][2]: motion vertical field selects (for field predictions)
+ * dmvector: differential motion vectors (for dual prime)
+ *
+ * Notes:
+ * - when predicting a P type picture which is the second field of
+ *   a frame, the same parity reference field is in oldref, while the
+ *   opposite parity reference field is assumed to be in newref!
+ * - intra macroblocks are modelled to have a constant prediction of 128
+ *   for all pels; this results in a DC DCT coefficient symmetric to 0
+ * - vectors for field prediction in frame pictures are in half pel frame
+ *   coordinates (vertical component is twice the field value and always
+ *   even)
+ *
+ * already covers dual prime (not yet used)
+ */
+
+static void predict_mb (
+       pict_data_s *picture,
+       uint8_t *oldref[], uint8_t *newref[], uint8_t *cur[],
+       int lx, int bx, int by, mbinfo_s *mbi,  int secondfield
+       )
+{
+       int addflag, currentfield;
+       uint8_t **predframe;
+       int DMV[2][2];
+
+       if (mbi->mb_type&MB_INTRA)
+       {
+               clearblock(picture,cur,bx,by);
+               return;
+       }
+
+       addflag = 0; /* first prediction is stored, second is added and averaged */
+
+       if ((mbi->mb_type & MB_FORWARD) || (picture->pict_type==P_TYPE))
+       {
+               /* forward prediction, including zero MV in P pictures */
+
+               if (picture->pict_struct==FRAME_PICTURE)
+               {
+                       /* frame picture */
+
+                       if ((mbi->motion_type==MC_FRAME) || !(mbi->mb_type & MB_FORWARD))
+                       {
+                               /* frame-based prediction in frame picture */
+                               pred(picture,
+                                        oldref,0,cur,0,
+                                        lx,16,16,bx,by,mbi->MV[0][0][0],mbi->MV[0][0][1],0);
+                       }
+                       else if (mbi->motion_type==MC_FIELD)
+                       {
+                               /* field-based prediction in frame picture
+                                *
+                                * note scaling of the vertical coordinates (by, mbi->MV[][0][1])
+                                * from frame to field!
+                                */
+
+                               /* top field prediction */
+                               pred(picture,oldref,mbi->mv_field_sel[0][0],cur,0,
+                                        lx<<1,16,8,bx,by>>1,mbi->MV[0][0][0],mbi->MV[0][0][1]>>1,0);
+
+                               /* bottom field prediction */
+                               pred(picture,oldref,mbi->mv_field_sel[1][0],cur,1,
+                                        lx<<1,16,8,bx,by>>1,mbi->MV[1][0][0],mbi->MV[1][0][1]>>1,0);
+                       }
+                       else if (mbi->motion_type==MC_DMV)
+                       {
+                               /* dual prime prediction */
+
+                               /* calculate derived motion vectors */
+                               calc_DMV(picture,DMV,mbi->dmvector,mbi->MV[0][0][0],mbi->MV[0][0][1]>>1);
+
+                               /* predict top field from top field */
+                               pred(picture,oldref,0,cur,0,
+                                        lx<<1,16,8,bx,by>>1,mbi->MV[0][0][0],mbi->MV[0][0][1]>>1,0);
+
+                               /* predict bottom field from bottom field */
+                               pred(picture,oldref,1,cur,1,
+                                        lx<<1,16,8,bx,by>>1,mbi->MV[0][0][0],mbi->MV[0][0][1]>>1,0);
+
+                               /* predict and add to top field from bottom field */
+                               pred(picture,oldref,1,cur,0,
+                                        lx<<1,16,8,bx,by>>1,DMV[0][0],DMV[0][1],1);
+
+                               /* predict and add to bottom field from top field */
+                               pred(picture,oldref,0,cur,1,
+                                        lx<<1,16,8,bx,by>>1,DMV[1][0],DMV[1][1],1);
+                       }
+                       else
+                       {
+                               /* invalid mbi->motion_type in frame picture */
+                               fprintf(stderr,"invalid motion_type\n");
+                       }
+               }
+               else /* TOP_FIELD or BOTTOM_FIELD */
+               {
+                       /* field picture */
+
+                       currentfield = (picture->pict_struct==BOTTOM_FIELD);
+
+                       /* determine which frame to use for prediction */
+                       if ((picture->pict_type==P_TYPE) && secondfield
+                               && (currentfield!=mbi->mv_field_sel[0][0]))
+                               predframe = newref; /* same frame */
+                       else
+                               predframe = oldref; /* previous frame */
+
+                       if ((mbi->motion_type==MC_FIELD) || !(mbi->mb_type & MB_FORWARD))
+                       {
+                               /* field-based prediction in field picture */
+                               pred(picture,predframe,mbi->mv_field_sel[0][0],cur,currentfield,
+                                        lx<<1,16,16,bx,by,mbi->MV[0][0][0],mbi->MV[0][0][1],0);
+                       }
+                       else if (mbi->motion_type==MC_16X8)
+                       {
+                               /* 16 x 8 motion compensation in field picture */
+
+                               /* upper half */
+                               pred(picture,predframe,mbi->mv_field_sel[0][0],cur,currentfield,
+                                        lx<<1,16,8,bx,by,mbi->MV[0][0][0],mbi->MV[0][0][1],0);
+
+                               /* determine which frame to use for lower half prediction */
+                               if ((picture->pict_type==P_TYPE) && secondfield
+                                       && (currentfield!=mbi->mv_field_sel[1][0]))
+                                       predframe = newref; /* same frame */
+                               else
+                                       predframe = oldref; /* previous frame */
+
+                               /* lower half */
+                               pred(picture,predframe,mbi->mv_field_sel[1][0],cur,currentfield,
+                                        lx<<1,16,8,bx,by+8,mbi->MV[1][0][0],mbi->MV[1][0][1],0);
+                       }
+                       else if (mbi->motion_type==MC_DMV)
+                       {
+                               /* dual prime prediction */
+
+                               /* determine which frame to use for prediction */
+                               if (secondfield)
+                                       predframe = newref; /* same frame */
+                               else
+                                       predframe = oldref; /* previous frame */
+
+                               /* calculate derived motion vectors */
+                               calc_DMV(picture,DMV,mbi->dmvector,mbi->MV[0][0][0],mbi->MV[0][0][1]);
+
+                               /* predict from field of same parity */
+                               pred(picture,oldref,currentfield,cur,currentfield,
+                                        lx<<1,16,16,bx,by,mbi->MV[0][0][0],mbi->MV[0][0][1],0);
+
+                               /* predict from field of opposite parity */
+                               pred(picture,predframe,!currentfield,cur,currentfield,
+                                        lx<<1,16,16,bx,by,DMV[0][0],DMV[0][1],1);
+                       }
+                       else
+                       {
+                               /* invalid motion_type in field picture */
+                               fprintf(stderr,"invalid motion_type\n");
+                       }
+               }
+               addflag = 1; /* next prediction (if any) will be averaged with this one */
+       }
+
+       if (mbi->mb_type & MB_BACKWARD)
+       {
+               /* backward prediction */
+
+               if (picture->pict_struct==FRAME_PICTURE)
+               {
+                       /* frame picture */
+
+                       if (mbi->motion_type==MC_FRAME)
+                       {
+                               /* frame-based prediction in frame picture */
+                               pred(picture,newref,0,cur,0,
+                                        lx,16,16,bx,by,mbi->MV[0][1][0],mbi->MV[0][1][1],addflag);
+                       }
+                       else
+                       {
+                               /* field-based prediction in frame picture
+                                *
+                                * note scaling of the vertical coordinates (by, mbi->MV[][1][1])
+                                * from frame to field!
+                                */
+
+                               /* top field prediction */
+                               pred(picture,newref,mbi->mv_field_sel[0][1],cur,0,
+                                        lx<<1,16,8,bx,by>>1,mbi->MV[0][1][0],mbi->MV[0][1][1]>>1,addflag);
+
+                               /* bottom field prediction */
+                               pred(picture,newref,mbi->mv_field_sel[1][1],cur,1,
+                                        lx<<1,16,8,bx,by>>1,mbi->MV[1][1][0],mbi->MV[1][1][1]>>1,addflag);
+                       }
+               }
+               else /* TOP_FIELD or BOTTOM_FIELD */
+               {
+                       /* field picture */
+
+                       currentfield = (picture->pict_struct==BOTTOM_FIELD);
+
+                       if (mbi->motion_type==MC_FIELD)
+                       {
+                               /* field-based prediction in field picture */
+                               pred(picture,newref,mbi->mv_field_sel[0][1],cur,currentfield,
+                                        lx<<1,16,16,bx,by,mbi->MV[0][1][0],mbi->MV[0][1][1],addflag);
+                       }
+                       else if (mbi->motion_type==MC_16X8)
+                       {
+                               /* 16 x 8 motion compensation in field picture */
+
+                               /* upper half */
+                               pred(picture,newref,mbi->mv_field_sel[0][1],cur,currentfield,
+                                        lx<<1,16,8,bx,by,mbi->MV[0][1][0],mbi->MV[0][1][1],addflag);
+
+                               /* lower half */
+                               pred(picture,newref,mbi->mv_field_sel[1][1],cur,currentfield,
+                                        lx<<1,16,8,bx,by+8,mbi->MV[1][1][0],mbi->MV[1][1][1],addflag);
+                       }
+                       else
+                       {
+                               /* invalid motion_type in field picture */
+                               fprintf(stderr,"invalid motion_type\n");
+                       }
+               }
+       }
+}
+
+/* predict a rectangular block (all three components)
+ *
+ * src:     source frame (Y,U,V)
+ * sfield:  source field select (0: frame or top field, 1: bottom field)
+ * dst:     destination frame (Y,U,V)
+ * dfield:  destination field select (0: frame or top field, 1: bottom field)
+ *
+ * the following values are in luminance picture (frame or field) dimensions
+ * lx:      distance of vertically adjacent pels (selects frame or field pred.)
+ * w,h:     width and height of block (only 16x16 or 16x8 are used)
+ * x,y:     coordinates of destination block
+ * dx,dy:   half pel motion vector
+ * addflag: store or add (= average) prediction
+ */
+
+static void pred (
+       pict_data_s *picture,
+       uint8_t *src[], int sfield,
+       uint8_t *dst[], int dfield,
+       int lx, int w, int h, int x, int y, int dx, int dy, int addflag
+       )
+{
+       int cc;
+
+       for (cc=0; cc<3; cc++)
+       {
+               if (cc==1)
+               {
+                       /* scale for color components */
+                       if (chroma_format==CHROMA420)
+                       {
+                               /* vertical */
+                               h >>= 1; y >>= 1; dy /= 2;
+                       }
+                       if (chroma_format!=CHROMA444)
+                       {
+                               /* horizontal */
+                               w >>= 1; x >>= 1; dx /= 2;
+                               lx >>= 1;
+                       }
+               }
+               (*ppred_comp)(  picture,
+                                               src[cc]+(sfield?lx>>1:0),dst[cc]+(dfield?lx>>1:0),
+                                               lx,w,h,x,y,dx,dy,addflag);
+       }
+}
+
+/* low level prediction routine
+ *
+ * src:     prediction source
+ * dst:     prediction destination
+ * lx:      line width (for both src and dst)
+ * x,y:     destination coordinates
+ * dx,dy:   half pel motion vector
+ * w,h:     size of prediction block
+ * addflag: store or add prediction
+ *
+ * There are also SIMD versions of this routine...
+ */
+
+static void pred_comp(
+       pict_data_s *picture,
+       uint8_t *src,
+       uint8_t *dst,
+       int lx,
+       int w, int h,
+       int x, int y,
+       int dx, int dy,
+       int addflag)
+{
+       int xint, xh, yint, yh;
+       int i, j;
+       uint8_t *s, *d;
+
+       /* half pel scaling */
+       xint = dx>>1; /* integer part */
+       xh = dx & 1;  /* half pel flag */
+       yint = dy>>1;
+       yh = dy & 1;
+
+       /* origins */
+       s = src + lx*(y+yint) + (x+xint); /* motion vector */
+       d = dst + lx*y + x;
+
+       if (!xh && !yh)
+               if (addflag)
+                       for (j=0; j<h; j++)
+                       {
+                               for (i=0; i<w; i++)
+                                       d[i] = (unsigned int)(d[i]+s[i]+1)>>1;
+                               s+= lx;
+                               d+= lx;
+                       }
+               else
+                       for (j=0; j<h; j++)
+                       {
+                               for (i=0; i<w; i++)
+                                       d[i] = s[i];
+                               s+= lx;
+                               d+= lx;
+                       }
+       else if (!xh && yh)
+               if (addflag)
+                       for (j=0; j<h; j++)
+                       {
+                               for (i=0; i<w; i++)
+                                       d[i] = (d[i] + ((unsigned int)(s[i]+s[i+lx]+1)>>1)+1)>>1;
+                               s+= lx;
+                               d+= lx;
+                       }
+               else
+                       for (j=0; j<h; j++)
+                       {
+                               for (i=0; i<w; i++)
+                                       d[i] = (unsigned int)(s[i]+s[i+lx]+1)>>1;
+                               s+= lx;
+                               d+= lx;
+                       }
+       else if (xh && !yh)
+               if (addflag)
+                       for (j=0; j<h; j++)
+                       {
+                               for (i=0; i<w; i++)
+                                       d[i] = (d[i] + ((unsigned int)(s[i]+s[i+1]+1)>>1)+1)>>1;
+                               s+= lx;
+                               d+= lx;
+                       }
+               else
+                       for (j=0; j<h; j++)
+                       {
+                               for (i=0; i<w; i++)
+                                       d[i] = (unsigned int)(s[i]+s[i+1]+1)>>1;
+                               s+= lx;
+                               d+= lx;
+                       }
+       else /* if (xh && yh) */
+               if (addflag)
+                       for (j=0; j<h; j++)
+                       {
+                               for (i=0; i<w; i++)
+                                       d[i] = (d[i] + ((unsigned int)(s[i]+s[i+1]+s[i+lx]+s[i+lx+1]+2)>>2)+1)>>1;
+                               s+= lx;
+                               d+= lx;
+                       }
+               else
+                       for (j=0; j<h; j++)
+                       {
+                               for (i=0; i<w; i++)
+                                       d[i] = (unsigned int)(s[i]+s[i+1]+s[i+lx]+s[i+lx+1]+2)>>2;
+                               s+= lx;
+                               d+= lx;
+                       }
+}
+
+#ifdef X86_CPU
+static void pred_comp_mmxe(
+       pict_data_s *picture,
+       uint8_t *src,
+       uint8_t *dst,
+       int lx,
+       int w, int h,
+       int x, int y,
+       int dx, int dy,
+       int addflag)
+{
+       int xint, xh, yint, yh;
+       uint8_t *s, *d;
+       
+       /* half pel scaling */
+       xint = dx>>1; /* integer part */
+       xh = dx & 1;  /* half pel flag */
+       yint = dy>>1;
+       yh = dy & 1;
+
+       /* origins */
+       s = src + lx*(y+yint) + (x+xint); /* motion vector */
+       d = dst + lx*y + x;
+
+       if( xh )
+       {
+               if( yh ) 
+                       predcomp_11_mmxe(s,d,lx,w,h,addflag);
+               else /* !yh */
+                       predcomp_10_mmxe(s,d,lx,w,h,addflag);
+       }
+       else /* !xh */
+       {
+               if( yh ) 
+                       predcomp_01_mmxe(s,d,lx,w,h,addflag);
+               else /* !yh */
+                       predcomp_00_mmxe(s,d,lx,w,h,addflag);
+       }
+               
+}
+
+static void pred_comp_mmx(
+       pict_data_s *picture,
+       uint8_t *src,
+       uint8_t *dst,
+       int lx,
+       int w, int h,
+       int x, int y,
+       int dx, int dy,
+       int addflag)
+{
+       int xint, xh, yint, yh;
+       uint8_t *s, *d;
+       
+       /* half pel scaling */
+       xint = dx>>1; /* integer part */
+       xh = dx & 1;  /* half pel flag */
+       yint = dy>>1;
+       yh = dy & 1;
+
+       /* origins */
+       s = src + lx*(y+yint) + (x+xint); /* motion vector */
+       d = dst + lx*y + x;
+
+       if( xh )
+       {
+               if( yh ) 
+                       predcomp_11_mmx(s,d,lx,w,h,addflag);
+               else /* !yh */
+                       predcomp_10_mmx(s,d,lx,w,h,addflag);
+       }
+       else /* !xh */
+       {
+               if( yh ) 
+                       predcomp_01_mmx(s,d,lx,w,h,addflag);
+               else /* !yh */
+                       predcomp_00_mmx(s,d,lx,w,h,addflag);
+       }
+               
+}
+#endif
+
+/* calculate derived motion vectors (DMV) for dual prime prediction
+ * dmvector[2]: differential motion vectors (-1,0,+1)
+ * mvx,mvy: motion vector (for same parity)
+ *
+ * DMV[2][2]: derived motion vectors (for opposite parity)
+ *
+ * uses global variables pict_struct and topfirst
+ *
+ * Notes:
+ *  - all vectors are in field coordinates (even for frame pictures)
+ */
+
+static void calc_DMV(
+       pict_data_s *picture,int DMV[][2], 
+       int *dmvector, int mvx, int mvy
+)
+{
+  if (picture->pict_struct==FRAME_PICTURE)
+  {
+    if (picture->topfirst)
+    {
+      /* vector for prediction of top field from bottom field */
+      DMV[0][0] = ((mvx  +(mvx>0))>>1) + dmvector[0];
+      DMV[0][1] = ((mvy  +(mvy>0))>>1) + dmvector[1] - 1;
+
+      /* vector for prediction of bottom field from top field */
+      DMV[1][0] = ((3*mvx+(mvx>0))>>1) + dmvector[0];
+      DMV[1][1] = ((3*mvy+(mvy>0))>>1) + dmvector[1] + 1;
+    }
+    else
+    {
+      /* vector for prediction of top field from bottom field */
+      DMV[0][0] = ((3*mvx+(mvx>0))>>1) + dmvector[0];
+      DMV[0][1] = ((3*mvy+(mvy>0))>>1) + dmvector[1] - 1;
+
+      /* vector for prediction of bottom field from top field */
+      DMV[1][0] = ((mvx  +(mvx>0))>>1) + dmvector[0];
+      DMV[1][1] = ((mvy  +(mvy>0))>>1) + dmvector[1] + 1;
+    }
+  }
+  else
+  {
+    /* vector for prediction from field of opposite 'parity' */
+    DMV[0][0] = ((mvx+(mvx>0))>>1) + dmvector[0];
+    DMV[0][1] = ((mvy+(mvy>0))>>1) + dmvector[1];
+
+    /* correct for vertical field shift */
+    if (picture->pict_struct==TOP_FIELD)
+      DMV[0][1]--;
+    else
+      DMV[0][1]++;
+  }
+}
+
+static void clearblock(
+       pict_data_s *picture,
+       uint8_t *cur[], int i0, int j0
+       )
+{
+  int i, j, w, h;
+  uint8_t *p;
+
+  p = cur[0] + ((picture->pict_struct==BOTTOM_FIELD) ? width : 0) + i0 + width2*j0;
+
+  for (j=0; j<16; j++)
+  {
+    for (i=0; i<16; i++)
+      p[i] = 128;
+    p+= width2;
+  }
+
+  w = h = 16;
+
+  if (chroma_format!=CHROMA444)
+  {
+    i0>>=1; w>>=1;
+  }
+
+  if (chroma_format==CHROMA420)
+  {
+    j0>>=1; h>>=1;
+  }
+
+  p = cur[1] + ((picture->pict_struct==BOTTOM_FIELD) ? chrom_width : 0) + i0
+             + chrom_width2*j0;
+
+  for (j=0; j<h; j++)
+  {
+    for (i=0; i<w; i++)
+      p[i] = 128;
+    p+= chrom_width2;
+  }
+
+  p = cur[2] + ((picture->pict_struct==BOTTOM_FIELD) ? chrom_width : 0) + i0
+             + chrom_width2*j0;
+
+  for (j=0; j<h; j++)
+  {
+    for (i=0; i<w; i++)
+      p[i] = 128;
+    p+= chrom_width2;
+  }
+}
diff --git a/mpeg2enc/putbits.c b/mpeg2enc/putbits.c
new file mode 100644 (file)
index 0000000..ab6b9f7
--- /dev/null
@@ -0,0 +1,170 @@
+/* putbits.c, bit-level output                                              */
+
+/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
+
+/*
+ * Disclaimer of Warranty
+ *
+ * These software programs are available to the user without any license fee or
+ * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
+ * any and all warranties, whether express, implied, or statuary, including any
+ * implied warranties or merchantability or of fitness for a particular
+ * purpose.  In no event shall the copyright-holder be liable for any
+ * incidental, punitive, or consequential damages of any kind whatsoever
+ * arising from the use of these programs.
+ *
+ * This disclaimer of warranty extends to the user of these programs and user's
+ * customers, employees, agents, transferees, successors, and assigns.
+ *
+ * The MPEG Software Simulation Group does not represent or warrant that the
+ * programs furnished hereunder are free of infringement of any third-party
+ * patents.
+ *
+ * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
+ * are subject to royalty fees to patent holders.  Many of these patents are
+ * general enough such that they are unavoidable regardless of implementation
+ * design.
+ *
+ */
+
+#include <stdio.h>
+#include "config.h"
+#include "global.h"
+
+extern FILE *outfile; /* the only global var we need here */
+
+/* private data */
+static unsigned char outbfr;
+static int outcnt;
+static int bytecnt;
+
+void slice_initbits(slice_engine_t *engine)
+{
+       engine->outcnt = 8;
+       engine->slice_size = 0;
+}
+
+void slice_testbits(slice_engine_t *engine)
+{
+       int i;
+       printf("slice test size %x outcnt %d outbfr %x\n", engine->slice_size, engine->outcnt, engine->outbfr << engine->outcnt);
+/*
+ *     for(i = 0; i < engine->slice_size; i++)
+ *             printf("%02x ", engine->slice_buffer[i]);
+ *     printf("%x\n", engine->outbfr << engine->outcnt);
+ */
+}
+
+void slice_putc(slice_engine_t *engine, unsigned char c)
+{
+       if(engine->slice_size >= engine->slice_allocated)
+       {
+               long new_allocation = (engine->slice_allocated > 0) ? (engine->slice_allocated * 2) : 64;
+               unsigned char *new_buffer = calloc(1, new_allocation);
+               if(engine->slice_buffer)
+               {
+                       memcpy(new_buffer, engine->slice_buffer, engine->slice_size);
+                       free(engine->slice_buffer);
+               }
+               engine->slice_buffer = new_buffer;
+               engine->slice_allocated = new_allocation;
+       }
+       engine->slice_buffer[engine->slice_size++] = c;
+}
+
+void slice_putbits(slice_engine_t *engine, long val, int n)
+{
+       int i;
+       unsigned int mask;
+
+       mask = 1 << (n - 1); /* selects first (leftmost) bit */
+
+       for(i = 0; i < n; i++)
+       {
+       engine->outbfr <<= 1;
+
+       if(val & mask)
+               engine->outbfr |= 1;
+
+       mask >>= 1; /* select next bit */
+       engine->outcnt--;
+
+       if(engine->outcnt == 0) /* 8 bit buffer full */
+       {
+               slice_putc(engine, engine->outbfr);
+               engine->outcnt = 8;
+                       engine->outbfr = 0;
+       }
+       }
+}
+
+/* zero bit stuffing to next byte boundary (5.2.3, 6.2.1) */
+void slice_alignbits(slice_engine_t *engine)
+{
+    if(engine->outcnt != 8)
+        slice_putbits(engine, 0, engine->outcnt);
+}
+
+void slice_finishslice(slice_engine_t *engine)
+{
+       int i;
+       slice_alignbits(engine);
+
+       if(!fwrite(engine->slice_buffer, 1, engine->slice_size, outfile))
+       {
+               perror("Write error");
+       }
+       bytecnt += engine->slice_size;
+}
+
+
+
+/* initialize buffer, call once before first putbits or alignbits */
+void mpeg2_initbits()
+{
+       outcnt = 8;
+       bytecnt = 0;
+}
+
+
+/* write rightmost n (0<=n<=32) bits of val to outfile */
+void mpeg2enc_putbits(val,n)
+int val;
+int n;
+{
+  int i;
+  unsigned int mask;
+
+  mask = 1 << (n-1); /* selects first (leftmost) bit */
+
+  for (i=0; i<n; i++)
+  {
+    outbfr <<= 1;
+
+    if (val & mask)
+      outbfr|= 1;
+
+    mask >>= 1; /* select next bit */
+    outcnt--;
+
+    if (outcnt==0) /* 8 bit buffer full */
+    {
+      putc(outbfr,outfile);
+      outcnt = 8;
+      bytecnt++;
+    }
+  }
+}
+
+/* zero bit stuffing to next byte boundary (5.2.3, 6.2.1) */
+void alignbits()
+{
+  if (outcnt!=8)
+    mpeg2enc_putbits(0,outcnt);
+}
+
+/* return total number of generated bits */
+double bitcount()
+{
+       return (double)8 * bytecnt + (8 - outcnt);
+}
diff --git a/mpeg2enc/puthdr.c b/mpeg2enc/puthdr.c
new file mode 100644 (file)
index 0000000..b275db9
--- /dev/null
@@ -0,0 +1,226 @@
+/* puthdr.c, generation of headers                                          */
+
+/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
+
+/*
+ * Disclaimer of Warranty
+ *
+ * These software programs are available to the user without any license fee or
+ * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
+ * any and all warranties, whether express, implied, or statuary, including any
+ * implied warranties or merchantability or of fitness for a particular
+ * purpose.  In no event shall the copyright-holder be liable for any
+ * incidental, punitive, or consequential damages of any kind whatsoever
+ * arising from the use of these programs.
+ *
+ * This disclaimer of warranty extends to the user of these programs and user's
+ * customers, employees, agents, transferees, successors, and assigns.
+ *
+ * The MPEG Software Simulation Group does not represent or warrant that the
+ * programs furnished hereunder are free of infringement of any third-party
+ * patents.
+ *
+ * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
+ * are subject to royalty fees to patent holders.  Many of these patents are
+ * general enough such that they are unavoidable regardless of implementation
+ * design.
+ *
+ */
+
+#include <stdio.h>
+#include <math.h>
+#include "config.h"
+#include "global.h"
+
+/* private prototypes */
+static int frametotc _ANSI_ARGS_((int frame));
+
+/* generate sequence header (6.2.2.1, 6.3.3)
+ *
+ * matrix download not implemented
+ */
+void putseqhdr()
+{
+  int i;
+
+  alignbits();
+  mpeg2enc_putbits(SEQ_START_CODE,32); /* sequence_header_code */
+  mpeg2enc_putbits(horizontal_size,12); /* horizontal_size_value */
+  mpeg2enc_putbits(vertical_size,12); /* vertical_size_value */
+  mpeg2enc_putbits(aspectratio,4); /* aspect_ratio_information */
+  mpeg2enc_putbits(frame_rate_code,4); /* frame_rate_code */
+  mpeg2enc_putbits((int)ceil(bit_rate/400.0),18); /* bit_rate_value */
+  mpeg2enc_putbits(1,1); /* marker_bit */
+  mpeg2enc_putbits(vbv_buffer_size,10); /* vbv_buffer_size_value */
+  mpeg2enc_putbits(constrparms,1); /* constrained_parameters_flag */
+
+  mpeg2enc_putbits(load_iquant,1); /* load_intra_quantizer_matrix */
+  if (load_iquant)
+    for (i=0; i<64; i++)  /* matrices are always downloaded in zig-zag order */
+      mpeg2enc_putbits(intra_q[mpeg2_zig_zag_scan[i]],8); /* intra_quantizer_matrix */
+
+  mpeg2enc_putbits(load_niquant,1); /* load_non_intra_quantizer_matrix */
+  if (load_niquant)
+    for (i=0; i<64; i++)
+      mpeg2enc_putbits(inter_q[mpeg2_zig_zag_scan[i]],8); /* non_intra_quantizer_matrix */
+       if (!mpeg1)
+       {
+               putseqext();
+               putseqdispext();
+       }
+}
+
+/* generate sequence extension (6.2.2.3, 6.3.5) header (MPEG-2 only) */
+void putseqext()
+{
+  alignbits();
+  mpeg2enc_putbits(EXT_START_CODE,32); /* extension_start_code */
+  mpeg2enc_putbits(SEQ_ID,4); /* extension_start_code_identifier */
+  mpeg2enc_putbits((profile<<4)|level,8); /* profile_and_level_indication */
+  mpeg2enc_putbits(prog_seq,1); /* progressive sequence */
+  mpeg2enc_putbits(chroma_format,2); /* chroma_format */
+  mpeg2enc_putbits(horizontal_size>>12,2); /* horizontal_size_extension */
+  mpeg2enc_putbits(vertical_size>>12,2); /* vertical_size_extension */
+  mpeg2enc_putbits(((int)ceil(bit_rate/400.0))>>18,12); /* bit_rate_extension */
+  mpeg2enc_putbits(1,1); /* marker_bit */
+  mpeg2enc_putbits(vbv_buffer_size>>10,8); /* vbv_buffer_size_extension */
+  mpeg2enc_putbits(0,1); /* low_delay  -- currently not implemented */
+  mpeg2enc_putbits(0,2); /* frame_rate_extension_n */
+  mpeg2enc_putbits(0,5); /* frame_rate_extension_d */
+}
+
+/* generate sequence display extension (6.2.2.4, 6.3.6)
+ *
+ * content not yet user setable
+ */
+void putseqdispext()
+{
+  alignbits();
+  mpeg2enc_putbits(EXT_START_CODE,32); /* extension_start_code */
+  mpeg2enc_putbits(DISP_ID,4); /* extension_start_code_identifier */
+  mpeg2enc_putbits(video_format,3); /* video_format */
+  mpeg2enc_putbits(1,1); /* colour_description */
+  mpeg2enc_putbits(color_primaries,8); /* colour_primaries */
+  mpeg2enc_putbits(transfer_characteristics,8); /* transfer_characteristics */
+  mpeg2enc_putbits(matrix_coefficients,8); /* matrix_coefficients */
+  mpeg2enc_putbits(display_horizontal_size,14); /* display_horizontal_size */
+  mpeg2enc_putbits(1,1); /* marker_bit */
+  mpeg2enc_putbits(display_vertical_size,14); /* display_vertical_size */
+}
+
+/* output a zero terminated string as user data (6.2.2.2.2, 6.3.4.1)
+ *
+ * string must not emulate start codes
+ */
+void putuserdata(userdata)
+char *userdata;
+{
+  alignbits();
+  mpeg2enc_putbits(USER_START_CODE,32); /* user_data_start_code */
+  while (*userdata)
+    mpeg2enc_putbits(*userdata++,8);
+}
+
+/* generate group of pictures header (6.2.2.6, 6.3.9)
+ *
+ * uses tc0 (timecode of first frame) and frame0 (number of first frame)
+ */
+void putgophdr(frame,closed_gop)
+int frame,closed_gop;
+{
+  int tc;
+
+  alignbits();
+  mpeg2enc_putbits(GOP_START_CODE,32); /* group_start_code */
+  tc = frametotc(tc0+frame);
+  mpeg2enc_putbits(tc,25); /* time_code */
+  mpeg2enc_putbits(closed_gop,1); /* closed_gop */
+  mpeg2enc_putbits(0,1); /* broken_link */
+}
+
+/* convert frame number to time_code
+ *
+ * drop_frame not implemented
+ */
+static int frametotc(frame)
+int frame;
+{
+  int fps, pict, sec, minute, hour, tc;
+
+  fps = (int)(frame_rate+0.5);
+  pict = frame%fps;
+  frame = (frame-pict)/fps;
+  sec = frame%60;
+  frame = (frame-sec)/60;
+  minute = frame%60;
+  frame = (frame-minute)/60;
+  hour = frame%24;
+  tc = (hour<<19) | (minute<<13) | (1<<12) | (sec<<6) | pict;
+
+  return tc;
+}
+
+/* generate picture header (6.2.3, 6.3.10) */
+void putpicthdr(pict_data_s *picture)
+{
+  alignbits();
+  mpeg2enc_putbits(PICTURE_START_CODE,32); /* picture_start_code */
+  mpeg2enc_putbits(picture->temp_ref,10); /* temporal_reference */
+  mpeg2enc_putbits(picture->pict_type,3); /* picture_coding_type */
+  mpeg2enc_putbits(picture->vbv_delay,16); /* vbv_delay */
+//printf("putpicthdr %d %d %d\n", picture->temp_ref, picture->pict_type, picture->vbv_delay);
+
+  if (picture->pict_type==P_TYPE || picture->pict_type==B_TYPE)
+  {
+    mpeg2enc_putbits(0,1); /* full_pel_forward_vector */
+    if (mpeg1)
+      mpeg2enc_putbits(picture->forw_hor_f_code,3);
+    else
+      mpeg2enc_putbits(7,3); /* forward_f_code */
+  }
+
+  if (picture->pict_type==B_TYPE)
+  {
+    mpeg2enc_putbits(0,1); /* full_pel_backward_vector */
+    if (mpeg1)
+      mpeg2enc_putbits(picture->back_hor_f_code,3);
+    else
+      mpeg2enc_putbits(7,3); /* backward_f_code */
+  }
+
+  mpeg2enc_putbits(0,1); /* extra_bit_picture */
+}
+
+/* generate picture coding extension (6.2.3.1, 6.3.11)
+ *
+ * composite display information (v_axis etc.) not implemented
+ */
+void putpictcodext(pict_data_s *picture)
+{
+  alignbits();
+  mpeg2enc_putbits(EXT_START_CODE,32); /* extension_start_code */
+  mpeg2enc_putbits(CODING_ID,4); /* extension_start_code_identifier */
+  mpeg2enc_putbits(picture->forw_hor_f_code,4); /* forward_horizontal_f_code */
+  mpeg2enc_putbits(picture->forw_vert_f_code,4); /* forward_vertical_f_code */
+  mpeg2enc_putbits(picture->back_hor_f_code,4); /* backward_horizontal_f_code */
+  mpeg2enc_putbits(picture->back_vert_f_code,4); /* backward_vertical_f_code */
+  mpeg2enc_putbits(picture->dc_prec,2); /* intra_dc_precision */
+  mpeg2enc_putbits(picture->pict_struct,2); /* picture_structure */
+  mpeg2enc_putbits((picture->pict_struct==FRAME_PICTURE)?picture->topfirst:0,1); /* top_field_first */
+  mpeg2enc_putbits(picture->frame_pred_dct,1); /* frame_pred_frame_dct */
+  mpeg2enc_putbits(0,1); /* concealment_motion_vectors  -- currently not implemented */
+  mpeg2enc_putbits(picture->q_scale_type,1); /* q_scale_type */
+  mpeg2enc_putbits(picture->intravlc,1); /* intra_vlc_format */
+  mpeg2enc_putbits(picture->altscan,1); /* alternate_scan */
+  mpeg2enc_putbits(picture->repeatfirst,1); /* repeat_first_field */
+  mpeg2enc_putbits(picture->prog_frame,1); /* chroma_420_type */
+  mpeg2enc_putbits(picture->prog_frame,1); /* progressive_frame */
+  mpeg2enc_putbits(0,1); /* composite_display_flag */
+}
+
+/* generate sequence_end_code (6.2.2) */
+void putseqend()
+{
+  alignbits();
+  mpeg2enc_putbits(SEQ_END_CODE,32);
+}
diff --git a/mpeg2enc/putmpg.c b/mpeg2enc/putmpg.c
new file mode 100644 (file)
index 0000000..4a55db8
--- /dev/null
@@ -0,0 +1,145 @@
+/* putmpg.c, block and motion vector encoding routines                      */
+
+/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
+
+/*
+ * Disclaimer of Warranty
+ *
+ * These software programs are available to the user without any license fee or
+ * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
+ * any and all warranties, whether express, implied, or statuary, including any
+ * implied warranties or merchantability or of fitness for a particular
+ * purpose.  In no event shall the copyright-holder be liable for any
+ * incidental, punitive, or consequential damages of any kind whatsoever
+ * arising from the use of these programs.
+ *
+ * This disclaimer of warranty extends to the user of these programs and user's
+ * customers, employees, agents, transferees, successors, and assigns.
+ *
+ * The MPEG Software Simulation Group does not represent or warrant that the
+ * programs furnished hereunder are free of infringement of any third-party
+ * patents.
+ *
+ * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
+ * are subject to royalty fees to patent holders.  Many of these patents are
+ * general enough such that they are unavoidable regardless of implementation
+ * design.
+ *
+ */
+
+#include <stdio.h>
+#include "config.h"
+#include "global.h"
+
+/* generate variable length codes for an intra-coded block (6.2.6, 6.3.17) */
+void putintrablk(slice_engine_t *engine, 
+       pict_data_s *picture, 
+       short *blk, 
+       int cc)
+{
+       int n, dct_diff, run, signed_level;
+
+/* DC coefficient (7.2.1) */
+       dct_diff = blk[0] - engine->dc_dct_pred[cc]; /* difference to previous block */
+       engine->dc_dct_pred[cc] = blk[0];
+
+       if(cc == 0)
+       putDClum(engine, dct_diff);
+       else
+       putDCchrom(engine, dct_diff);
+
+/* AC coefficients (7.2.2) */
+       run = 0;
+       for(n = 1; n < 64; n++)
+       {
+/* use appropriate entropy scanning pattern */
+       signed_level = blk[(picture->altscan ? alternate_scan : mpeg2_zig_zag_scan)[n]];
+       if (signed_level!=0)
+       {
+//printf("putintrablk 1 %d\n", picture->altscan);slice_testbits(engine);
+               putAC(engine, run, signed_level, picture->intravlc);
+//printf("putintrablk 2\n");slice_testbits(engine);
+               run = 0;
+       }
+       else
+               run++; /* count zero coefficients */
+       }
+
+/* End of Block -- normative block punctuation */
+       if (picture->intravlc)
+       slice_putbits(engine, 6, 4); /* 0110 (Table B-15) */
+       else
+       slice_putbits(engine, 2, 2); /* 10 (Table B-14) */
+}
+
+/* generate variable length codes for a non-intra-coded block (6.2.6, 6.3.17) */
+void putnonintrablk(slice_engine_t *engine, 
+       pict_data_s *picture, 
+       short *blk)
+{
+       int n, run, signed_level, first;
+
+       run = 0;
+       first = 1;
+
+       for(n = 0; n < 64; n++)
+       {
+/* use appropriate entropy scanning pattern */
+      signed_level = blk[(picture->altscan ? alternate_scan : mpeg2_zig_zag_scan)[n]];
+
+      if (signed_level!=0)
+      {
+         if (first)
+         {
+/* first coefficient in non-intra block */
+                 putACfirst(engine, run, signed_level);
+                 first = 0;
+         }
+         else
+               putAC(engine, run, signed_level, 0);
+
+         run = 0;
+      }
+      else
+          run++; /* count zero coefficients */
+       }
+
+/* End of Block -- normative block punctuation  */
+       slice_putbits(engine, 2, 2);
+}
+
+/* generate variable length code for a motion vector component (7.6.3.1) */
+void putmv(slice_engine_t *engine, int dmv, int f_code)
+{
+       int r_size, f, vmin, vmax, dv, temp, motion_code, motion_residual;
+
+       r_size = f_code - 1; /* number of fixed length code ('residual') bits */
+       f = 1 << r_size;
+       vmin = -16 * f; /* lower range limit */
+       vmax = 16 * f - 1; /* upper range limit */
+       dv = 32 * f;
+
+/* fold vector difference into [vmin...vmax] */
+       if(dmv > vmax)
+       dmv -= dv;
+       else 
+       if(dmv < vmin)
+       dmv += dv;
+
+/* check value */
+       if(dmv < vmin || dmv > vmax)
+       if(!quiet)
+               fprintf(stderr,"invalid motion vector\n");
+
+/* split dmv into motion_code and motion_residual */
+       temp = ((dmv < 0) ? -dmv : dmv) + f - 1;
+       motion_code = temp >> r_size;
+       if(dmv < 0)
+        motion_code = -motion_code;
+       motion_residual = temp & (f - 1);
+
+       putmotioncode(engine, motion_code); /* variable length code */
+
+       if (r_size != 0 && motion_code != 0)
+       slice_putbits(engine, motion_residual, r_size); /* fixed length code */
+}
diff --git a/mpeg2enc/putpic.c b/mpeg2enc/putpic.c
new file mode 100644 (file)
index 0000000..1b2e00c
--- /dev/null
@@ -0,0 +1,506 @@
+/* putpic.c, block and motion vector encoding routines                      */
+
+/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
+
+/*
+ * Disclaimer of Warranty
+ *
+ * These software programs are available to the user without any license fee or
+ * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
+ * any and all warranties, whether express, implied, or statuary, including any
+ * implied warranties or merchantability or of fitness for a particular
+ * purpose.  In no event shall the copyright-holder be liable for any
+ * incidental, punitive, or consequential damages of any kind whatsoever
+ * arising from the use of these programs.
+ *
+ * This disclaimer of warranty extends to the user of these programs and user's
+ * customers, employees, agents, transferees, successors, and assigns.
+ *
+ * The MPEG Software Simulation Group does not represent or warrant that the
+ * programs furnished hereunder are free of infringement of any third-party
+ * patents.
+ *
+ * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
+ * are subject to royalty fees to patent holders.  Many of these patents are
+ * general enough such that they are unavoidable regardless of implementation
+ * design.
+ *
+ */
+
+#include <stdio.h>
+#include "config.h"
+#include "global.h"
+
+/* output motion vectors (6.2.5.2, 6.3.16.2)
+ *
+ * this routine also updates the predictions for motion vectors (PMV)
+ */
+static void putmvs(slice_engine_t *engine,
+       pict_data_s *picture,
+       mbinfo_s *mb,
+       int PMV[2][2][2],
+       int back)
+{
+       int hor_f_code;
+       int vert_f_code;
+
+       if( back )
+       {
+               hor_f_code = picture->back_hor_f_code;
+               vert_f_code = picture->back_vert_f_code;
+       }
+       else
+       {
+               hor_f_code = picture->forw_hor_f_code;
+               vert_f_code = picture->forw_vert_f_code;
+       }
+
+
+       if(picture->pict_struct == FRAME_PICTURE)
+       {
+       if(mb->motion_type == MC_FRAME)
+       {
+/* frame prediction */
+               putmv(engine, mb->MV[0][back][0] - PMV[0][back][0], hor_f_code);
+               putmv(engine, mb->MV[0][back][1] - PMV[0][back][1], vert_f_code);
+               PMV[0][back][0] = PMV[1][back][0] = mb->MV[0][back][0];
+               PMV[0][back][1] = PMV[1][back][1] = mb->MV[0][back][1];
+       }
+       else 
+               if (mb->motion_type == MC_FIELD)
+       {
+               /* field prediction */
+               slice_putbits(engine, mb->mv_field_sel[0][back], 1);
+               putmv(engine, mb->MV[0][back][0] - PMV[0][back][0], hor_f_code);
+               putmv(engine, (mb->MV[0][back][1] >> 1) - (PMV[0][back][1] >> 1), vert_f_code);
+               slice_putbits(engine, mb->mv_field_sel[1][back], 1);
+               putmv(engine, mb->MV[1][back][0] - PMV[1][back][0], hor_f_code);
+               putmv(engine, (mb->MV[1][back][1] >> 1) - (PMV[1][back][1] >> 1), vert_f_code);
+               PMV[0][back][0] = mb->MV[0][back][0];
+               PMV[0][back][1] = mb->MV[0][back][1];
+               PMV[1][back][0] = mb->MV[1][back][0];
+               PMV[1][back][1] = mb->MV[1][back][1];
+       }
+       else
+       {
+               /* dual prime prediction */
+               putmv(engine, mb->MV[0][back][0] - PMV[0][back][0], hor_f_code);
+               putdmv(engine, mb->dmvector[0]);
+               putmv(engine, (mb->MV[0][back][1] >> 1) - (PMV[0][back][1] >> 1), vert_f_code);
+               putdmv(engine, mb->dmvector[1]);
+               PMV[0][back][0] = PMV[1][back][0] = mb->MV[0][back][0];
+               PMV[0][back][1] = PMV[1][back][1] = mb->MV[0][back][1];
+       }
+       }
+       else
+       {
+       /* field picture */
+       if(mb->motion_type == MC_FIELD)
+       {
+               /* field prediction */
+               slice_putbits(engine, mb->mv_field_sel[0][back], 1);
+               putmv(engine, mb->MV[0][back][0] - PMV[0][back][0], hor_f_code);
+               putmv(engine, mb->MV[0][back][1] - PMV[0][back][1], vert_f_code);
+               PMV[0][back][0] = PMV[1][back][0] = mb->MV[0][back][0];
+               PMV[0][back][1] = PMV[1][back][1] = mb->MV[0][back][1];
+       }
+       else 
+               if(mb->motion_type == MC_16X8)
+       {
+               /* 16x8 prediction */
+               slice_putbits(engine, mb->mv_field_sel[0][back], 1);
+               putmv(engine, mb->MV[0][back][0] - PMV[0][back][0], hor_f_code);
+               putmv(engine, mb->MV[0][back][1] - PMV[0][back][1], vert_f_code);
+               slice_putbits(engine, mb->mv_field_sel[1][back], 1);
+               putmv(engine, mb->MV[1][back][0] - PMV[1][back][0], hor_f_code);
+               putmv(engine, mb->MV[1][back][1] - PMV[1][back][1], vert_f_code);
+               PMV[0][back][0] = mb->MV[0][back][0];
+               PMV[0][back][1] = mb->MV[0][back][1];
+               PMV[1][back][0] = mb->MV[1][back][0];
+               PMV[1][back][1] = mb->MV[1][back][1];
+       }
+       else
+       {
+               /* dual prime prediction */
+               putmv(engine, mb->MV[0][back][0] - PMV[0][back][0], hor_f_code);
+               putdmv(engine, mb->dmvector[0]);
+               putmv(engine, mb->MV[0][back][1] - PMV[0][back][1], vert_f_code);
+               putdmv(engine, mb->dmvector[1]);
+               PMV[0][back][0] = PMV[1][back][0] = mb->MV[0][back][0];
+               PMV[0][back][1] = PMV[1][back][1] = mb->MV[0][back][1];
+       }
+       }
+}
+
+
+void* slice_engine_loop(slice_engine_t *engine)
+{
+       while(!engine->done)
+       {
+               pthread_mutex_lock(&(engine->input_lock));
+
+               if(!engine->done)
+               {
+                       pict_data_s *picture = engine->picture;
+                       int i, j, k, comp, cc;
+                       int mb_type;
+                       int PMV[2][2][2];
+                       int cbp, MBAinc = 0;
+                       short (*quant_blocks)[64] = picture->qblocks;
+
+                       k = engine->start_row * mb_width;
+                       for(j = engine->start_row; j < engine->end_row; j++)
+                       {
+/* macroblock row loop */
+//printf("putpic 1\n");
+//slice_testbits(engine);
+                       for(i = 0; i < mb_width; i++)
+                       {
+                                       mbinfo_s *cur_mb = &picture->mbinfo[k];
+                                       int cur_mb_blocks = k * block_count;
+//pthread_mutex_lock(&test_lock);
+/* macroblock loop */
+                               if(i == 0)
+                               {
+                                               slice_alignbits(engine);
+/* slice header (6.2.4) */
+                                       if(mpeg1 || vertical_size <= 2800)
+                                               slice_putbits(engine, SLICE_MIN_START + j, 32); /* slice_start_code */
+                                       else
+                                       {
+                                               slice_putbits(engine, SLICE_MIN_START + (j & 127), 32); /* slice_start_code */
+                                               slice_putbits(engine, j >> 7, 3); /* slice_vertical_position_extension */
+                                       }
+/* quantiser_scale_code */
+//printf("putpic 1\n");slice_testbits(engine);
+                                       slice_putbits(engine, 
+                                                       picture->q_scale_type ? map_non_linear_mquant[engine->prev_mquant] : engine->prev_mquant >> 1, 
+                                                       5);
+
+                                       slice_putbits(engine, 0, 1); /* extra_bit_slice */
+
+//printf("putpic 1 %d %d %d\n",engine->prev_mquant, map_non_linear_mquant[engine->prev_mquant], engine->prev_mquant >> 1);
+//slice_testbits(engine);
+                                       /* reset predictors */
+
+                                       for(cc = 0; cc < 3; cc++)
+                                               engine->dc_dct_pred[cc] = 0;
+
+                                       PMV[0][0][0] = PMV[0][0][1] = PMV[1][0][0] = PMV[1][0][1] = 0;
+                                       PMV[0][1][0] = PMV[0][1][1] = PMV[1][1][0] = PMV[1][1][1] = 0;
+
+                                       MBAinc = i + 1; /* first MBAinc denotes absolute position */
+                               }
+
+                               mb_type = cur_mb->mb_type;
+
+/* determine mquant (rate control) */
+                               cur_mb->mquant = ratectl_calc_mquant(engine->ratectl, picture, k);
+
+/* quantize macroblock */
+//printf("putpic 1\n");
+//printf("putpic 1\n");
+//slice_testbits(engine);
+                               if(mb_type & MB_INTRA)
+                               {
+//printf("putpic 2 %d\n", cur_mb->mquant);
+                                               quant_intra( picture,
+                                                                picture->blocks[cur_mb_blocks],
+                                                                        quant_blocks[cur_mb_blocks],
+                                                                        cur_mb->mquant, 
+                                                                        &cur_mb->mquant );
+
+//printf("putpic 3\n");
+                                               cur_mb->cbp = cbp = (1<<block_count) - 1;
+                               }
+                               else
+                               {
+//printf("putpic 4 %p %d\n", picture->blocks[cur_mb_blocks], cur_mb_blocks);
+                                               cbp = (*pquant_non_intra)(picture,
+                                                                                         picture->blocks[cur_mb_blocks],
+                                                                                         quant_blocks[cur_mb_blocks],
+                                                                                         cur_mb->mquant,
+                                                                                         &cur_mb->mquant);
+//printf("putpic 5\n");
+                                               cur_mb->cbp = cbp;
+                                               if (cbp)
+                                                       mb_type|= MB_PATTERN;
+                               }
+//printf("putpic 6\n");
+//printf("putpic 2\n");
+//slice_testbits(engine);
+
+/* output mquant if it has changed */
+                               if(cbp && engine->prev_mquant != cur_mb->mquant)
+                                       mb_type |= MB_QUANT;
+
+/* check if macroblock can be skipped */
+                               if(i != 0 && i != mb_width - 1 && !cbp)
+                               {
+/* no DCT coefficients and neither first nor last macroblock of slice */
+
+                                       if(picture->pict_type == P_TYPE && 
+                                                       !(mb_type & MB_FORWARD))
+                                       {
+                                               /* P picture, no motion vectors -> skip */
+
+                                               /* reset predictors */
+
+                                               for(cc = 0; cc < 3; cc++)
+                                                engine->dc_dct_pred[cc] = 0;
+
+                                               PMV[0][0][0] = PMV[0][0][1] = PMV[1][0][0] = PMV[1][0][1] = 0;
+                                               PMV[0][1][0] = PMV[0][1][1] = PMV[1][1][0] = PMV[1][1][1] = 0;
+
+                                               cur_mb->mb_type = mb_type;
+                                               cur_mb->skipped = 1;
+                                               MBAinc++;
+                                               k++;
+                                               continue;
+                                       }
+
+                                       if(picture->pict_type == B_TYPE && 
+                                                       picture->pict_struct == FRAME_PICTURE &&
+                                       cur_mb->motion_type == MC_FRAME && 
+                                       ((picture->mbinfo[k - 1].mb_type ^ mb_type) & (MB_FORWARD | MB_BACKWARD)) == 0 &&
+                                       (!(mb_type & MB_FORWARD) ||
+                                               (PMV[0][0][0] == cur_mb->MV[0][0][0] &&
+                                                PMV[0][0][1] == cur_mb->MV[0][0][1])) && 
+                                       (!(mb_type & MB_BACKWARD) ||
+                                               (PMV[0][1][0] == cur_mb->MV[0][1][0] &&
+                                                PMV[0][1][1] == cur_mb->MV[0][1][1])))
+                                       {
+/* conditions for skipping in B frame pictures:
+ * - must be frame predicted
+ * - must be the same prediction type (forward/backward/interp.)
+ *   as previous macroblock
+ * - relevant vectors (forward/backward/both) have to be the same
+ *   as in previous macroblock
+ */
+
+                                               cur_mb->mb_type = mb_type;
+                                               cur_mb->skipped = 1;
+                                               MBAinc++;
+                                               k++;
+                                               continue;
+                                       }
+
+                                       if (picture->pict_type == B_TYPE && 
+                                                       picture->pict_struct != FRAME_PICTURE && 
+                                       cur_mb->motion_type == MC_FIELD && 
+                                       ((picture->mbinfo[k - 1].mb_type ^ mb_type) & (MB_FORWARD | MB_BACKWARD))==0 && 
+                                       (!(mb_type & MB_FORWARD) ||
+                                               (PMV[0][0][0] == cur_mb->MV[0][0][0] &&
+                                                PMV[0][0][1] == cur_mb->MV[0][0][1] &&
+                                                cur_mb->mv_field_sel[0][0] == (picture->pict_struct == BOTTOM_FIELD))) && 
+                                       (!(mb_type & MB_BACKWARD) ||
+                                               (PMV[0][1][0] == cur_mb->MV[0][1][0] &&
+                                                PMV[0][1][1] == cur_mb->MV[0][1][1] &&
+                                                cur_mb->mv_field_sel[0][1] == (picture->pict_struct == BOTTOM_FIELD))))
+                                       {
+/* conditions for skipping in B field pictures:
+ * - must be field predicted
+ * - must be the same prediction type (forward/backward/interp.)
+ *   as previous macroblock
+ * - relevant vectors (forward/backward/both) have to be the same
+ *   as in previous macroblock
+ * - relevant motion_vertical_field_selects have to be of same
+ *   parity as current field
+ */
+
+                                               cur_mb->mb_type = mb_type;
+                                               cur_mb->skipped = 1;
+                                               MBAinc++;
+                                               k++;
+                                               continue;
+                                       }
+                               }
+//printf("putpic 3\n");
+//slice_testbits(engine);
+
+/* macroblock cannot be skipped */
+                               cur_mb->skipped = 0;
+
+/* there's no VLC for 'No MC, Not Coded':
+ * we have to transmit (0,0) motion vectors
+ */
+                               if(picture->pict_type == P_TYPE && 
+                                               !cbp && 
+                                               !(mb_type & MB_FORWARD))
+                                       mb_type |= MB_FORWARD;
+
+/* macroblock_address_increment */
+                               putaddrinc(engine, MBAinc); 
+                               MBAinc = 1;
+                               putmbtype(engine, picture->pict_type, mb_type); /* macroblock type */
+//printf("putpic 3\n");
+//slice_testbits(engine);
+
+                               if(mb_type & (MB_FORWARD | MB_BACKWARD) && 
+                                               !picture->frame_pred_dct)
+                                       slice_putbits(engine, cur_mb->motion_type, 2);
+
+//printf("putpic %x %d %d %d\n", mb_type, picture->pict_struct, cbp, picture->frame_pred_dct);
+                               if(picture->pict_struct == FRAME_PICTURE && 
+                                               cbp && 
+                                               !picture->frame_pred_dct)
+                                       slice_putbits(engine, cur_mb->dct_type, 1);
+
+                               if(mb_type & MB_QUANT)
+                               {
+                                       slice_putbits(engine, 
+                                                       picture->q_scale_type ? map_non_linear_mquant[cur_mb->mquant] : cur_mb->mquant >> 1,
+                                                       5);
+                                       engine->prev_mquant = cur_mb->mquant;
+                               }
+
+                               if(mb_type & MB_FORWARD)
+                               {
+/* forward motion vectors, update predictors */
+                                       putmvs(engine, picture, cur_mb, PMV, 0);
+                               }
+
+                               if(mb_type & MB_BACKWARD)
+                               {
+/* backward motion vectors, update predictors */
+                                       putmvs(engine, picture,  cur_mb, PMV, 1);
+                               }
+
+                               if(mb_type & MB_PATTERN)
+                               {
+                                       putcbp(engine, (cbp >> (block_count - 6)) & 63);
+                                       if(chroma_format != CHROMA420)
+                                               slice_putbits(engine, cbp, block_count - 6);
+                               }
+
+//printf("putpic 4\n");
+//slice_testbits(engine);
+
+                               for(comp = 0; comp < block_count; comp++)
+                               {
+/* block loop */
+                                       if(cbp & (1 << (block_count - 1 - comp)))
+                                       {
+                                         if(mb_type & MB_INTRA)
+                                         {
+                                         cc = (comp < 4) ? 0 : (comp & 1) + 1;
+                                         putintrablk(engine, picture, quant_blocks[cur_mb_blocks + comp], cc);
+
+                                         }
+                                         else
+                                         putnonintrablk(engine, picture, quant_blocks[cur_mb_blocks + comp]);
+                                       }
+                               }
+//printf("putpic 5\n");
+//slice_testbits(engine);
+//sleep(1);
+
+                               /* reset predictors */
+                               if(!(mb_type & MB_INTRA))
+                                       for(cc = 0; cc < 3; cc++)
+                                               engine->dc_dct_pred[cc] = 0;
+
+                               if(mb_type & MB_INTRA || 
+                                               (picture->pict_type == P_TYPE && !(mb_type & MB_FORWARD)))
+                               {
+                                       PMV[0][0][0] = PMV[0][0][1] = PMV[1][0][0] = PMV[1][0][1] = 0;
+                                       PMV[0][1][0] = PMV[0][1][1] = PMV[1][1][0] = PMV[1][1][1] = 0;
+                               }
+
+                               cur_mb->mb_type = mb_type;
+                               k++;
+//pthread_mutex_unlock(&test_lock);
+                       }
+                       }
+               }
+               pthread_mutex_unlock(&(engine->output_lock));
+       }
+}
+
+
+/* quantization / variable length encoding of a complete picture */
+void putpict(pict_data_s *picture)
+{
+       int i, prev_mquant;
+
+       for(i = 0; i < processors; i++)
+       {
+               ratectl_init_pict(ratectl[i], picture); /* set up rate control */
+       }
+
+/* picture header and picture coding extension */
+       putpicthdr(picture);
+       if(!mpeg1) putpictcodext(picture);
+/* Flush buffer before switching to slice mode */
+       alignbits();
+
+/* Start loop */
+       for(i = 0; i < processors; i++)
+       {
+               slice_engines[i].prev_mquant = ratectl_start_mb(ratectl[i], picture);
+               slice_engines[i].picture = picture;
+               slice_engines[i].ratectl = ratectl[i];
+               slice_initbits(&slice_engines[i]);
+
+               pthread_mutex_unlock(&(slice_engines[i].input_lock));
+       }
+
+/* Wait for completion and write slices */
+       for(i = 0; i < processors; i++)
+       {
+               pthread_mutex_lock(&(slice_engines[i].output_lock));
+               slice_finishslice(&slice_engines[i]);
+       }
+
+       for(i = 0; i < processors; i++)
+               ratectl_update_pict(ratectl[i], picture);
+}
+
+void start_slice_engines()
+{
+       int i;
+       int rows_per_processor = (int)((float)mb_height2 / processors + 0.5);
+       int current_row = 0;
+       pthread_attr_t attr;
+       pthread_mutexattr_t mutex_attr;
+
+       pthread_mutexattr_init(&mutex_attr);
+       pthread_attr_init(&attr);
+//     pthread_mutex_init(&test_lock, &mutex_attr);
+
+       slice_engines = calloc(1, sizeof(slice_engine_t) * processors);
+       for(i = 0; i < processors; i++)
+       {
+               slice_engines[i].start_row = current_row;
+               current_row += rows_per_processor;
+       &nbs