r1019: Fix crash in Ogg file handling.
[cinelerra/simeon] / mpeg2enc / motion.c
1 /* motion.c, motion estimation                                              */
2
3 /* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
4
5 /*
6  * Disclaimer of Warranty
7  *
8  * These software programs are available to the user without any license fee or
9  * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
10  * any and all warranties, whether express, implied, or statuary, including any
11  * implied warranties or merchantability or of fitness for a particular
12  * purpose.  In no event shall the copyright-holder be liable for any
13  * incidental, punitive, or consequential damages of any kind whatsoever
14  * arising from the use of these programs.
15  *
16  * This disclaimer of warranty extends to the user of these programs and user's
17  * customers, employees, agents, transferees, successors, and assigns.
18  *
19  * The MPEG Software Simulation Group does not represent or warrant that the
20  * programs furnished hereunder are free of infringement of any third-party
21  * patents.
22  *
23  * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
24  * are subject to royalty fees to patent holders.  Many of these patents are
25  * general enough such that they are unavoidable regardless of implementation
26  * design.
27  *
28  */
29
30 #include <limits.h>
31 #include <stdio.h>
32 #include "config.h"
33 #include "global.h"
34 #include "cpu_accel.h"
35 #include "simd.h"
36
37 /* Macro-block Motion compensation results record */
38
39 typedef struct _blockcrd {
40         int16_t x;
41         int16_t y;
42 } blockxy;
43
44 struct mb_motion
45 {
46         blockxy pos;        // Half-pel co-ordinates of source block
47         int sad;                        // Sum of absolute difference
48         int var;
49         uint8_t *blk ;          // Source block data (in luminace data array)
50         int hx, hy;                     // Half-pel offsets
51         int fieldsel;           // 0 = top 1 = bottom
52         int fieldoff;       // Offset from start of frame data to first line
53                                                 // of field.    (top = 0, bottom = width );
54 };
55
56 typedef struct mb_motion mb_motion_s;
57
58
59 struct subsampled_mb
60 {
61         uint8_t *mb;            // One pel
62         uint8_t *fmb;           // Two-pel subsampled
63         uint8_t *qmb;           // Four-pel subsampled
64         uint8_t *umb;           // U compoenent one-pel
65         uint8_t *vmb;       // V component  one-pel
66 };
67
68 typedef struct subsampled_mb subsampled_mb_s;
69
70
71 static void frame_ME (motion_engine_t *engine,
72                                                                         pict_data_s *picture,
73                                                                   motion_comp_s *mc,
74                                                                   int mboffset,
75                                                                   int i, int j, struct mbinfo *mbi);
76
77 static void field_ME (motion_engine_t *engine,
78                                                                         pict_data_s *picture,
79                                                                   motion_comp_s *mc,
80                                                                   int mboffset,
81                                                                   int i, int j, 
82                                                                   struct mbinfo *mbi, 
83                                                                   int secondfield, 
84                                                                   int ipflag);
85
86 static void frame_estimate (motion_engine_t *engine,
87         uint8_t *org,
88          uint8_t *ref, 
89          subsampled_mb_s *ssmb,
90          int i, int j,
91          int sx, int sy, 
92           mb_motion_s *bestfr,
93           mb_motion_s *besttop,
94           mb_motion_s *bestbot,
95          int imins[2][2], int jmins[2][2]);
96
97 static void field_estimate (motion_engine_t *engine,
98                                                         pict_data_s *picture,
99                                                         uint8_t *toporg,
100                                                         uint8_t *topref, 
101                                                         uint8_t *botorg, 
102                                                         uint8_t *botref,
103                                                         subsampled_mb_s *ssmb,
104                                                         int i, int j, int sx, int sy, int ipflag,
105                                                         mb_motion_s *bestfr,
106                                                         mb_motion_s *best8u,
107                                                         mb_motion_s *best8l,
108                                                         mb_motion_s *bestsp);
109
110 static void dpframe_estimate (motion_engine_t *engine,
111         pict_data_s *picture,
112         uint8_t *ref,
113         subsampled_mb_s *ssmb,
114         int i, int j, int iminf[2][2], int jminf[2][2],
115         mb_motion_s *dpbest,
116         int *imindmvp, int *jmindmvp, 
117         int *vmcp);
118
119 static void dpfield_estimate (motion_engine_t *engine,
120         pict_data_s *picture,
121         uint8_t *topref,
122         uint8_t *botref, 
123         uint8_t *mb,
124         int i, int j, 
125         int imins, int jmins, 
126         mb_motion_s *dpbest,
127         int *vmcp);
128
129 static void fullsearch (motion_engine_t *engine, 
130         uint8_t *org, uint8_t *ref,
131         subsampled_mb_s *ssblk,
132         int lx, int i0, int j0, 
133         int sx, int sy, int h, 
134         int xmax, int ymax,
135         mb_motion_s *motion );
136
137 static void find_best_one_pel( motion_engine_t *engine, 
138                                                                 uint8_t *org, uint8_t *blk,
139                                                            int searched_size,
140                                                            int i0, int j0,
141                                                            int ilow, int jlow,
142                                                            int xmax, int ymax,
143                                                            int lx, int h, 
144                                                            mb_motion_s *res
145         );
146 static int build_sub22_mcomps( motion_engine_t *engine, 
147                                                                 int i0,  int j0, int ihigh, int jhigh, 
148                                                                 int null_mc_sad,
149                                                                 uint8_t *s22org,  uint8_t *s22blk, 
150                                                            int flx, int fh,  int searched_sub44_size );
151
152 #ifdef X86_CPU
153 static void find_best_one_pel_mmxe( motion_engine_t *engine, 
154                                                                 uint8_t *org, uint8_t *blk,
155                                                            int searched_size,
156                                                            int i0, int j0,
157                                                            int ilow, int jlow,
158                                                            int xmax, int ymax,
159                                                            int lx, int h, 
160                                                            mb_motion_s *res
161         );
162
163 static int build_sub22_mcomps_mmxe( motion_engine_t *engine, int i0,  int j0, int ihigh, int jhigh, 
164                                                          int null_mc_sad,
165                                                          uint8_t *s22org,  uint8_t *s22blk, 
166                                                          int flx, int fh,  int searched_sub44_size );
167 static int (*pmblock_sub44_dists)( uint8_t *blk,  uint8_t *ref,
168                                                         int ilow, int jlow,
169                                                         int ihigh, int jhigh, 
170                                                         int h, int rowstride, 
171                                                         int threshold,
172                                                         mc_result_s *resvec);
173 #endif
174
175 static int unidir_pred_var( const mb_motion_s *motion, 
176                                                         uint8_t *mb,  int lx, int h);
177 static int bidir_pred_var( const mb_motion_s *motion_f,  
178                                                    const mb_motion_s *motion_b, 
179                                                    uint8_t *mb,  int lx, int h);
180 static int bidir_pred_sad( const mb_motion_s *motion_f,  
181                                                    const mb_motion_s *motion_b, 
182                                                    uint8_t *mb,  int lx, int h);
183
184 static int variance(  uint8_t *mb, int size, int lx);
185
186 static int dist22 ( uint8_t *blk1, uint8_t *blk2,  int qlx, int qh);
187
188 static int dist44 ( uint8_t *blk1, uint8_t *blk2,  int flx, int fh);
189 static int dist2_22( uint8_t *blk1, uint8_t *blk2,
190                                          int lx, int h);
191 static int bdist2_22( uint8_t *blk1f, uint8_t *blk1b, 
192                                           uint8_t *blk2,
193                                           int lx, int h);
194
195
196 static void (*pfind_best_one_pel)( motion_engine_t *engine,
197                                                                 uint8_t *org, uint8_t *blk,
198                                                            int searched_size,
199                                                            int i0, int j0,
200                                                            int ilow, int jlow,
201                                                            int xmax, int ymax,
202                                                            int lx, int h, 
203                                                            mb_motion_s *res
204         );
205 static int (*pbuild_sub22_mcomps)( motion_engine_t *engine,
206                                                                         int i0,  int j0, int ihigh, int jhigh, 
207                                                                    int null_mc_sad,
208                                                                    uint8_t *s22org,  uint8_t *s22blk, 
209                                                                    int flx, int fh,  int searched_sub44_size );
210
211 static int (*pdist2_22)( uint8_t *blk1, uint8_t *blk2,
212                                                  int lx, int h);
213 static int (*pbdist2_22)( uint8_t *blk1f, uint8_t *blk1b, 
214                                                   uint8_t *blk2,
215                                                   int lx, int h);
216
217 static int dist1_00( uint8_t *blk1, uint8_t *blk2,  int lx, int h, int distlim);
218 static int dist1_01(uint8_t *blk1, uint8_t *blk2, int lx, int h);
219 static int dist1_10(uint8_t *blk1, uint8_t *blk2, int lx, int h);
220 static int dist1_11(uint8_t *blk1, uint8_t *blk2, int lx, int h);
221 static int dist2 (uint8_t *blk1, uint8_t *blk2,
222                                                           int lx, int hx, int hy, int h);
223 static int bdist2 (uint8_t *pf, uint8_t *pb,
224         uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
225 static int bdist1 (uint8_t *pf, uint8_t *pb,
226                                    uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
227
228
229 static int (*pdist22) ( uint8_t *blk1, uint8_t *blk2,  int flx, int fh);
230 static int (*pdist44) ( uint8_t *blk1, uint8_t *blk2,  int qlx, int qh);
231 static int (*pdist1_00) ( uint8_t *blk1, uint8_t *blk2,  int lx, int h, int distlim);
232 static int (*pdist1_01) ( uint8_t *blk1, uint8_t *blk2, int lx, int h);
233 static int (*pdist1_10) ( uint8_t *blk1, uint8_t *blk2, int lx, int h);
234 static int (*pdist1_11) ( uint8_t *blk1, uint8_t *blk2, int lx, int h);
235
236 static int (*pdist2) (uint8_t *blk1, uint8_t *blk2,
237                                           int lx, int hx, int hy, int h);
238   
239   
240 static int (*pbdist2) (uint8_t *pf, uint8_t *pb,
241                                            uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
242
243 static int (*pbdist1) (uint8_t *pf, uint8_t *pb,
244                                            uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
245
246
247 /* DEBUGGER...
248 static void check_mc( const mb_motion_s *motion, int rx, int ry, int i, int j, char *str )
249 {
250         rx *= 2; ry *= 2;
251         if( motion->sad < 0 || motion->sad > 0x10000 )
252         {
253                 printf( "SAD ooops %s\n", str );
254                 exit(1);
255         }
256         if( motion->pos.x-i*2 < -rx || motion->pos.x-i*2 >= rx )
257         {
258                 printf( "X MC ooops %s = %d [%d]\n", str, motion->pos.x-i*2,rx );
259                 exit(1);
260         }
261         if( motion->pos.y-j*2 < -ry || motion->pos.y-j*2 >= ry )
262         {
263                 printf( "Y MC ooops %s %d [%d]\n", str, motion->pos.y-j*2, ry );
264                 exit(1);
265         }
266 }
267
268 static void init_mc( mb_motion_s *motion )
269 {
270         motion->sad = -123;
271         motion->pos.x = -1000;
272         motion->pos.y = -1000;
273 }
274 */
275
276 /* unidir_NI_var_sum
277    Compute the combined variance of luminance and chrominance information
278    for a particular non-intra macro block after unidirectional
279    motion compensation...  
280
281    Note: results are scaled to give chrominance equal weight to
282    chrominance.  The variance of the luminance portion is computed
283    at the time the motion compensation is computed.
284
285    TODO: Perhaps we should compute the whole thing in fullsearch not
286    seperate it.  However, that would involve a lot of fiddling with
287    field_* and until its thoroughly debugged and tested I think I'll
288    leave that alone. Furthermore, it is unclear if its really worth
289    doing these computations for B *and* P frames.
290
291    TODO: BUG: ONLY works for 420 video...
292
293 */
294
295 static int unidir_chrom_var_sum( mb_motion_s *lum_mc, 
296                                                           uint8_t **ref, 
297                                                           subsampled_mb_s *ssblk,
298                                                           int lx, int h )
299 {
300         int uvlx = (lx>>1);
301         int uvh = (h>>1);
302         /* N.b. MC co-ordinates are computed in half-pel units! */
303         int cblkoffset = (lum_mc->fieldoff>>1) +
304                 (lum_mc->pos.x>>2) + (lum_mc->pos.y>>2)*uvlx;
305         
306         return  ((*pdist2_22)( ref[1] + cblkoffset, ssblk->umb, uvlx, uvh) +
307                          (*pdist2_22)( ref[2] + cblkoffset, ssblk->vmb, uvlx, uvh))*2;
308 }
309
310 /*
311   bidir_NI_var_sum
312    Compute the combined variance of luminance and chrominance information
313    for a particular non-intra macro block after unidirectional
314    motion compensation...  
315
316    Note: results are scaled to give chrominance equal weight to
317    chrominance.  The variance of the luminance portion is computed
318    at the time the motion compensation is computed.
319
320    Note: results scaled to give chrominance equal weight to chrominance.
321   
322   TODO: BUG: ONLY works for 420 video...
323
324   NOTE: Currently unused but may be required if it turns out that taking
325   chrominance into account in B frames is needed.
326
327  */
328
329 int bidir_chrom_var_sum( mb_motion_s *lum_mc_f, 
330                                            mb_motion_s *lum_mc_b, 
331                                            uint8_t **ref_f, 
332                                            uint8_t **ref_b,
333                                            subsampled_mb_s *ssblk,
334                                            int lx, int h )
335 {
336         int uvlx = (lx>>1);
337         int uvh = (h>>1);
338         /* N.b. MC co-ordinates are computed in half-pel units! */
339         int cblkoffset_f = (lum_mc_f->fieldoff>>1) + 
340                 (lum_mc_f->pos.x>>2) + (lum_mc_f->pos.y>>2)*uvlx;
341         int cblkoffset_b = (lum_mc_b->fieldoff>>1) + 
342                 (lum_mc_b->pos.x>>2) + (lum_mc_f->pos.y>>2)*uvlx;
343         
344         return  (
345                 (*pbdist2_22)( ref_f[1] + cblkoffset_f, ref_b[1] + cblkoffset_b,
346                                            ssblk->umb, uvlx, uvh ) +
347                 (*pbdist2_22)( ref_f[2] + cblkoffset_f, ref_b[2] + cblkoffset_b,
348                                            ssblk->vmb, uvlx, uvh ))*2;
349
350 }
351
352 static int chrom_var_sum( subsampled_mb_s *ssblk, int h, int lx )
353 {
354         return (variance(ssblk->umb,(h>>1),(lx>>1)) + 
355                         variance(ssblk->vmb,(h>>1),(lx>>1))) * 2;
356 }
357
358 /*
359  * frame picture motion estimation
360  *
361  * org: top left pel of source reference frame
362  * ref: top left pel of reconstructed reference frame
363  * ssmb:  macroblock to be matched
364  * i,j: location of mb relative to ref (=center of search window)
365  * sx,sy: half widths of search window
366  * besfr: location and SAD of best frame prediction
367  * besttop: location of best field pred. for top field of mb
368  * bestbo : location of best field pred. for bottom field of mb
369  */
370
371 static void frame_estimate(motion_engine_t *engine, 
372         uint8_t *org,
373         uint8_t *ref,
374         subsampled_mb_s *ssmb,
375         int i, int j, int sx, int sy,
376         mb_motion_s *bestfr,
377         mb_motion_s *besttop,
378         mb_motion_s *bestbot,
379         int imins[2][2],
380         int jmins[2][2]
381         )
382 {
383         subsampled_mb_s  botssmb;
384         mb_motion_s topfld_mc;
385         mb_motion_s botfld_mc;
386
387         botssmb.mb = ssmb->mb+width;
388         botssmb.fmb = ssmb->mb+(width>>1);
389         botssmb.qmb = ssmb->qmb+(width>>2);
390         botssmb.umb = ssmb->umb+(width>>1);
391         botssmb.vmb = ssmb->vmb+(width>>1);
392
393         /* frame prediction */
394         fullsearch(engine, org,ref,ssmb,width,i,j,sx,sy,16,width,height,
395                                                   bestfr );
396         bestfr->fieldsel = 0;
397         bestfr->fieldoff = 0;
398
399         /* predict top field from top field */
400         fullsearch(engine, org,ref,ssmb,width<<1,i,j>>1,sx,sy>>1,8,width,height>>1,
401                            &topfld_mc);
402
403         /* predict top field from bottom field */
404         fullsearch(engine, org+width,ref+width,ssmb, width<<1,i,j>>1,sx,sy>>1,8,
405                            width,height>>1, &botfld_mc);
406
407         /* set correct field selectors... */
408         topfld_mc.fieldsel = 0;
409         botfld_mc.fieldsel = 1;
410         topfld_mc.fieldoff = 0;
411         botfld_mc.fieldoff = width;
412
413         imins[0][0] = topfld_mc.pos.x;
414         jmins[0][0] = topfld_mc.pos.y;
415         imins[1][0] = botfld_mc.pos.x;
416         jmins[1][0] = botfld_mc.pos.y;
417
418         /* select prediction for top field */
419         if (topfld_mc.sad<=botfld_mc.sad)
420         {
421                 *besttop = topfld_mc;
422         }
423         else
424         {
425                 *besttop = botfld_mc;
426         }
427
428         /* predict bottom field from top field */
429         fullsearch(engine, org,ref,&botssmb,
430                                         width<<1,i,j>>1,sx,sy>>1,8,width,height>>1,
431                                         &topfld_mc);
432
433         /* predict bottom field from bottom field */
434         fullsearch(engine, org+width,ref+width,&botssmb,
435                                         width<<1,i,j>>1,sx,sy>>1,8,width,height>>1,
436                                         &botfld_mc);
437
438         /* set correct field selectors... */
439         topfld_mc.fieldsel = 0;
440         botfld_mc.fieldsel = 1;
441         topfld_mc.fieldoff = 0;
442         botfld_mc.fieldoff = width;
443
444         imins[0][1] = topfld_mc.pos.x;
445         jmins[0][1] = topfld_mc.pos.y;
446         imins[1][1] = botfld_mc.pos.x;
447         jmins[1][1] = botfld_mc.pos.y;
448
449         /* select prediction for bottom field */
450         if (botfld_mc.sad<=topfld_mc.sad)
451         {
452                 *bestbot = botfld_mc;
453         }
454         else
455         {
456                 *bestbot = topfld_mc;
457         }
458 }
459
460 /*
461  * field picture motion estimation subroutine
462  *
463  * toporg: address of original top reference field
464  * topref: address of reconstructed top reference field
465  * botorg: address of original bottom reference field
466  * botref: address of reconstructed bottom reference field
467  * ssmmb:  macroblock to be matched
468  * i,j: location of mb (=center of search window)
469  * sx,sy: half width/height of search window
470  *
471  * bestfld: location and distance of best field prediction
472  * best8u: location of best 16x8 pred. for upper half of mb
473  * best8lp: location of best 16x8 pred. for lower half of mb
474  * bdestsp: location and distance of best same parity field
475  *                    prediction (needed for dual prime, only valid if
476  *                    ipflag==0)
477  */
478
479 static void field_estimate (motion_engine_t *engine,
480         pict_data_s *picture,
481         uint8_t *toporg,
482         uint8_t *topref, 
483         uint8_t *botorg, 
484         uint8_t *botref,
485         subsampled_mb_s *ssmb,
486         int i, int j, int sx, int sy, int ipflag,
487         mb_motion_s *bestfld,
488         mb_motion_s *best8u,
489         mb_motion_s *best8l,
490         mb_motion_s *bestsp)
491
492 {
493         mb_motion_s topfld_mc;
494         mb_motion_s botfld_mc;
495         int dt, db;
496         int notop, nobot;
497         subsampled_mb_s botssmb;
498
499         botssmb.mb = ssmb->mb+width;
500         botssmb.umb = ssmb->umb+(width>>1);
501         botssmb.vmb = ssmb->vmb+(width>>1);
502         botssmb.fmb = ssmb->fmb+(width>>1);
503         botssmb.qmb = ssmb->qmb+(width>>2);
504
505         /* if ipflag is set, predict from field of opposite parity only */
506         notop = ipflag && (picture->pict_struct==TOP_FIELD);
507         nobot = ipflag && (picture->pict_struct==BOTTOM_FIELD);
508
509         /* field prediction */
510
511         /* predict current field from top field */
512         if (notop)
513                 topfld_mc.sad = dt = 65536; /* infinity */
514         else
515                 fullsearch(engine, toporg,topref,ssmb,width<<1,
516                                    i,j,sx,sy>>1,16,width,height>>1,
517                                    &topfld_mc);
518         dt = topfld_mc.sad;
519         /* predict current field from bottom field */
520         if (nobot)
521                 botfld_mc.sad = db = 65536; /* infinity */
522         else
523                 fullsearch(engine, botorg,botref,ssmb,width<<1,
524                                    i,j,sx,sy>>1,16,width,height>>1,
525                                    &botfld_mc);
526         db = botfld_mc.sad;
527         /* Set correct field selectors */
528         topfld_mc.fieldsel = 0;
529         botfld_mc.fieldsel = 1;
530         topfld_mc.fieldoff = 0;
531         botfld_mc.fieldoff = width;
532
533         /* same parity prediction (only valid if ipflag==0) */
534         if (picture->pict_struct==TOP_FIELD)
535         {
536                 *bestsp = topfld_mc;
537         }
538         else
539         {
540                 *bestsp = botfld_mc;
541         }
542
543         /* select field prediction */
544         if (dt<=db)
545         {
546                 *bestfld = topfld_mc;
547         }
548         else
549         {
550                 *bestfld = botfld_mc;
551         }
552
553
554         /* 16x8 motion compensation */
555
556         /* predict upper half field from top field */
557         if (notop)
558                 topfld_mc.sad = dt = 65536;
559         else
560                 fullsearch(engine, toporg,topref,ssmb,width<<1,
561                                    i,j,sx,sy>>1,8,width,height>>1,
562                                     &topfld_mc);
563         dt = topfld_mc.sad;
564         /* predict upper half field from bottom field */
565         if (nobot)
566                 botfld_mc.sad = db = 65536;
567         else
568                 fullsearch(engine, botorg,botref,ssmb,width<<1,
569                                    i,j,sx,sy>>1,8,width,height>>1,
570                                     &botfld_mc);
571         db = botfld_mc.sad;
572
573         /* Set correct field selectors */
574         topfld_mc.fieldsel = 0;
575         botfld_mc.fieldsel = 1;
576         topfld_mc.fieldoff = 0;
577         botfld_mc.fieldoff = width;
578
579         /* select prediction for upper half field */
580         if (dt<=db)
581         {
582                 *best8u = topfld_mc;
583         }
584         else
585         {
586                 *best8u = botfld_mc;
587         }
588
589         /* predict lower half field from top field */
590         /*
591           N.b. For interlaced data width<<4 (width*16) takes us 8 rows
592           down in the same field.  
593           Thus for the fast motion data (2*2
594           sub-sampled) we need to go 4 rows down in the same field.
595           This requires adding width*4 = (width<<2).
596           For the 4*4 sub-sampled motion data we need to go down 2 rows.
597           This requires adding width = width
598          
599         */
600         if (notop)
601                 topfld_mc.sad = dt = 65536;
602         else
603                 fullsearch(engine, toporg,topref,&botssmb,
604                                                 width<<1,
605                                                 i,j+8,sx,sy>>1,8,width,height>>1,
606                                    /* &imint,&jmint, &dt,*/ &topfld_mc);
607         dt = topfld_mc.sad;
608         /* predict lower half field from bottom field */
609         if (nobot)
610                 botfld_mc.sad = db = 65536;
611         else
612                 fullsearch(engine, botorg,botref,&botssmb,width<<1,
613                                                 i,j+8,sx,sy>>1,8,width,height>>1,
614                                    /* &iminb,&jminb, &db,*/ &botfld_mc);
615         db = botfld_mc.sad;
616         /* Set correct field selectors */
617         topfld_mc.fieldsel = 0;
618         botfld_mc.fieldsel = 1;
619         topfld_mc.fieldoff = 0;
620         botfld_mc.fieldoff = width;
621
622         /* select prediction for lower half field */
623         if (dt<=db)
624         {
625                 *best8l = topfld_mc;
626         }
627         else
628         {
629                 *best8l = botfld_mc;
630         }
631 }
632
633 static void dpframe_estimate (motion_engine_t *engine,
634         pict_data_s *picture,
635         uint8_t *ref,
636         subsampled_mb_s *ssmb,
637         
638         int i, int j, int iminf[2][2], int jminf[2][2],
639         mb_motion_s *best_mc,
640         int *imindmvp, int *jmindmvp,
641         int *vmcp)
642 {
643         int pref,ppred,delta_x,delta_y;
644         int is,js,it,jt,ib,jb,it0,jt0,ib0,jb0;
645         int imins,jmins,imint,jmint,iminb,jminb,imindmv,jmindmv;
646         int vmc,local_dist;
647
648         /* Calculate Dual Prime distortions for 9 delta candidates
649          * for each of the four minimum field vectors
650          * Note: only for P pictures!
651          */
652
653         /* initialize minimum dual prime distortion to large value */
654         vmc = INT_MAX;
655
656         for (pref=0; pref<2; pref++)
657         {
658                 for (ppred=0; ppred<2; ppred++)
659                 {
660                         /* convert Cartesian absolute to relative motion vector
661                          * values (wrt current macroblock address (i,j)
662                          */
663                         is = iminf[pref][ppred] - (i<<1);
664                         js = jminf[pref][ppred] - (j<<1);
665
666                         if (pref!=ppred)
667                         {
668                                 /* vertical field shift adjustment */
669                                 if (ppred==0)
670                                         js++;
671                                 else
672                                         js--;
673
674                                 /* mvxs and mvys scaling*/
675                                 is<<=1;
676                                 js<<=1;
677                                 if (picture->topfirst == ppred)
678                                 {
679                                         /* second field: scale by 1/3 */
680                                         is = (is>=0) ? (is+1)/3 : -((-is+1)/3);
681                                         js = (js>=0) ? (js+1)/3 : -((-js+1)/3);
682                                 }
683                                 else
684                                         continue;
685                         }
686
687                         /* vector for prediction from field of opposite 'parity' */
688                         if (picture->topfirst)
689                         {
690                                 /* vector for prediction of top field from bottom field */
691                                 it0 = ((is+(is>0))>>1);
692                                 jt0 = ((js+(js>0))>>1) - 1;
693
694                                 /* vector for prediction of bottom field from top field */
695                                 ib0 = ((3*is+(is>0))>>1);
696                                 jb0 = ((3*js+(js>0))>>1) + 1;
697                         }
698                         else
699                         {
700                                 /* vector for prediction of top field from bottom field */
701                                 it0 = ((3*is+(is>0))>>1);
702                                 jt0 = ((3*js+(js>0))>>1) - 1;
703
704                                 /* vector for prediction of bottom field from top field */
705                                 ib0 = ((is+(is>0))>>1);
706                                 jb0 = ((js+(js>0))>>1) + 1;
707                         }
708
709                         /* convert back to absolute half-pel field picture coordinates */
710                         is += i<<1;
711                         js += j<<1;
712                         it0 += i<<1;
713                         jt0 += j<<1;
714                         ib0 += i<<1;
715                         jb0 += j<<1;
716
717                         if (is >= 0 && is <= (width-16)<<1 &&
718                                 js >= 0 && js <= (height-16))
719                         {
720                                 for (delta_y=-1; delta_y<=1; delta_y++)
721                                 {
722                                         for (delta_x=-1; delta_x<=1; delta_x++)
723                                         {
724                                                 /* opposite field coordinates */
725                                                 it = it0 + delta_x;
726                                                 jt = jt0 + delta_y;
727                                                 ib = ib0 + delta_x;
728                                                 jb = jb0 + delta_y;
729
730                                                 if (it >= 0 && it <= (width-16)<<1 &&
731                                                         jt >= 0 && jt <= (height-16) &&
732                                                         ib >= 0 && ib <= (width-16)<<1 &&
733                                                         jb >= 0 && jb <= (height-16))
734                                                 {
735                                                         /* compute prediction error */
736                                                         local_dist = (*pbdist2)(
737                                                                 ref + (is>>1) + (width<<1)*(js>>1),
738                                                                 ref + width + (it>>1) + (width<<1)*(jt>>1),
739                                                                 ssmb->mb,             /* current mb location */
740                                                                 width<<1,       /* adjacent line distance */
741                                                                 is&1, js&1, it&1, jt&1, /* half-pel flags */
742                                                                 8);             /* block height */
743                                                         local_dist += (*pbdist2)(
744                                                                 ref + width + (is>>1) + (width<<1)*(js>>1),
745                                                                 ref + (ib>>1) + (width<<1)*(jb>>1),
746                                                                 ssmb->mb + width,     /* current mb location */
747                                                                 width<<1,       /* adjacent line distance */
748                                                                 is&1, js&1, ib&1, jb&1, /* half-pel flags */
749                                                                 8);             /* block height */
750
751                                                         /* update delta with least distortion vector */
752                                                         if (local_dist < vmc)
753                                                         {
754                                                                 imins = is;
755                                                                 jmins = js;
756                                                                 imint = it;
757                                                                 jmint = jt;
758                                                                 iminb = ib;
759                                                                 jminb = jb;
760                                                                 imindmv = delta_x;
761                                                                 jmindmv = delta_y;
762                                                                 vmc = local_dist;
763                                                         }
764                                                 }
765                                         }  /* end delta x loop */
766                                 } /* end delta y loop */
767                         }
768                 }
769         }
770
771         /* TODO: This is now likely to be obsolete... */
772         /* Compute L1 error for decision purposes */
773         local_dist = (*pbdist1)(
774                 ref + (imins>>1) + (width<<1)*(jmins>>1),
775                 ref + width + (imint>>1) + (width<<1)*(jmint>>1),
776                 ssmb->mb,
777                 width<<1,
778                 imins&1, jmins&1, imint&1, jmint&1,
779                 8);
780 //printf("motion 1 %p\n", pbdist1);
781         local_dist += (*pbdist1)(
782                 ref + width + (imins>>1) + (width<<1)*(jmins>>1),
783                 ref + (iminb>>1) + (width<<1)*(jminb>>1),
784                 ssmb->mb + width,
785                 width<<1,
786                 imins&1, jmins&1, iminb&1, jminb&1,
787                 8);
788 //printf("motion 2\n");
789
790         best_mc->sad = local_dist;
791         best_mc->pos.x = imins;
792         best_mc->pos.y = jmins;
793         *vmcp = vmc;
794         *imindmvp = imindmv;
795         *jmindmvp = jmindmv;
796 //printf("motion 2\n");
797 }
798
799 static void dpfield_estimate(motion_engine_t *engine,
800         pict_data_s *picture,
801         uint8_t *topref,
802         uint8_t *botref, 
803         uint8_t *mb,
804         int i, int j, int imins, int jmins, 
805         mb_motion_s *bestdp_mc,
806         int *vmcp
807         )
808
809 {
810         uint8_t *sameref, *oppref;
811         int io0,jo0,io,jo,delta_x,delta_y,mvxs,mvys,mvxo0,mvyo0;
812         int imino,jmino,imindmv,jmindmv,vmc_dp,local_dist;
813
814         /* Calculate Dual Prime distortions for 9 delta candidates */
815         /* Note: only for P pictures! */
816
817         /* Assign opposite and same reference pointer */
818         if (picture->pict_struct==TOP_FIELD)
819         {
820                 sameref = topref;    
821                 oppref = botref;
822         }
823         else 
824         {
825                 sameref = botref;
826                 oppref = topref;
827         }
828
829         /* convert Cartesian absolute to relative motion vector
830          * values (wrt current macroblock address (i,j)
831          */
832         mvxs = imins - (i<<1);
833         mvys = jmins - (j<<1);
834
835         /* vector for prediction from field of opposite 'parity' */
836         mvxo0 = (mvxs+(mvxs>0)) >> 1;  /* mvxs / / */
837         mvyo0 = (mvys+(mvys>0)) >> 1;  /* mvys / / 2*/
838
839                         /* vertical field shift correction */
840         if (picture->pict_struct==TOP_FIELD)
841                 mvyo0--;
842         else
843                 mvyo0++;
844
845                         /* convert back to absolute coordinates */
846         io0 = mvxo0 + (i<<1);
847         jo0 = mvyo0 + (j<<1);
848
849                         /* initialize minimum dual prime distortion to large value */
850         vmc_dp = 1 << 30;
851
852         for (delta_y = -1; delta_y <= 1; delta_y++)
853         {
854                 for (delta_x = -1; delta_x <=1; delta_x++)
855                 {
856                         /* opposite field coordinates */
857                         io = io0 + delta_x;
858                         jo = jo0 + delta_y;
859
860                         if (io >= 0 && io <= (width-16)<<1 &&
861                                 jo >= 0 && jo <= (height2-16)<<1)
862                         {
863                                 /* compute prediction error */
864                                 local_dist = (*pbdist2)(
865                                         sameref + (imins>>1) + width2*(jmins>>1),
866                                         oppref  + (io>>1)    + width2*(jo>>1),
867                                         mb,             /* current mb location */
868                                         width2,         /* adjacent line distance */
869                                         imins&1, jmins&1, io&1, jo&1, /* half-pel flags */
870                                         16);            /* block height */
871
872                                 /* update delta with least distortion vector */
873                                 if (local_dist < vmc_dp)
874                                 {
875                                         imino = io;
876                                         jmino = jo;
877                                         imindmv = delta_x;
878                                         jmindmv = delta_y;
879                                         vmc_dp = local_dist;
880                                 }
881                         }
882                 }  /* end delta x loop */
883         } /* end delta y loop */
884
885         /* Compute L1 error for decision purposes */
886         bestdp_mc->sad =
887                 (*pbdist1)(
888                         sameref + (imins>>1) + width2*(jmins>>1),
889                         oppref  + (imino>>1) + width2*(jmino>>1),
890                         mb,             /* current mb location */
891                         width2,         /* adjacent line distance */
892                         imins&1, jmins&1, imino&1, jmino&1, /* half-pel flags */
893                         16);            /* block height */
894
895         bestdp_mc->pos.x = imindmv;
896         bestdp_mc->pos.x = imindmv;
897         *vmcp = vmc_dp;
898 }
899
900
901 /* 
902  *   Vectors of motion compensations 
903  */
904
905 /*
906         Take a vector of motion compensations and repeatedly make passes
907         discarding all elements whose sad "weight" is above the current mean weight.
908 */
909
910 static void sub_mean_reduction( mc_result_s *matches, int len, 
911                                                                 int times,
912                                                             int *newlen_res, int *minweight_res)
913 {
914         int i,j;
915         int weight_sum;
916         int mean_weight;
917         int min_weight = 100000;
918         if( len == 0 )
919         {
920                 *minweight_res = 100000;
921                 *newlen_res = 0;
922                 return;
923         }
924
925         for(;;)
926         {
927                 weight_sum = 0;
928                 for( i = 0; i < len ; ++i )
929                         weight_sum += matches[i].weight;
930                 mean_weight = weight_sum / len;
931                 
932                 if( times <= 0)
933                         break;
934                         
935                 j = 0;
936                 for( i =0; i < len; ++i )
937                 {
938                         if( matches[i].weight <= mean_weight )
939                         {
940                                 if( times == 1)
941                                 {
942                                         min_weight = matches[i].weight ;
943                                 }
944                                 matches[j] = matches[i];
945                                 ++j;
946                         }
947                 }
948                 len = j;
949                 --times;
950         }
951         *newlen_res = len;
952         *minweight_res = mean_weight;
953 }
954
955 /*
956   Build a vector of the top   4*4 sub-sampled motion compensations in the box
957   (ilow,jlow) to (ihigh,jhigh).
958
959         The algorithm is as follows:
960         1. coarse matches on an 8*8 grid of positions are collected that fall below
961         a (very conservative) sad threshold (basically 50% more than moving average of
962         the mean sad of such matches).
963         
964         2. The worse than-average matches are discarded.
965         
966         3. The remaining coarse matches are expanded with the left/lower neighbouring
967         4*4 grid matches. Again only those better than a threshold (this time the mean
968         of the 8*8 grid matches are retained.
969         
970         4. Multiple passes are made discarding worse than-average matches.  The number of
971         passes is specified by the user.  The default it 2 (leaving roughly 1/4 of the matches).
972         
973         The net result is very fast and find good matches if they're to be found.  I.e. the
974         penalty over exhaustive search is pretty low.
975         
976         NOTE: The "discard below average" trick depends critically on having some variation in
977         the matches.  The slight penalty imposed for distant matches (reasonably since the 
978         motion vectors have to be encoded) is *vital* as otherwise pathologically bad
979         performance results on highly uniform images.
980         
981         TODO: We should probably allow the user to eliminate the initial thinning of 8*8
982         grid matches if ultimate quality is demanded (e.g. for low bit-rate applications).
983
984 */
985
986
987 static int build_sub44_mcomps(motion_engine_t *engine,
988          int ilow, int jlow, int ihigh, int jhigh, 
989                                                         int i0, int j0,
990                                                                 int null_mc_sad,
991                                                         uint8_t *s44org, uint8_t *s44blk, int qlx, int qh )
992 {
993         uint8_t *s44orgblk;
994         int istrt = ilow-i0;
995         int jstrt = jlow-j0;
996         int iend = ihigh-i0;
997         int jend = jhigh-j0;
998         int mean_weight;
999         int threshold;
1000
1001 #ifdef X86_CPU
1002
1003         /*int rangex, rangey;
1004         static int rough_num_mcomps;
1005         static mc_result_s rough_mcomps[MAX_44_MATCHES];
1006         int k;
1007         */
1008 #else
1009         int i,j;
1010         int s1;
1011         uint8_t *old_s44orgblk;
1012 #endif
1013         /* N.b. we may ignore the right hand block of the pair going over the
1014            right edge as we have carefully allocated the buffer oversize to ensure
1015            no memory faults.  The later motion compensation calculations
1016            performed on the results of this pass will filter out
1017            out-of-range blocks...
1018         */
1019
1020
1021         engine->sub44_num_mcomps = 0;
1022         
1023         threshold = 6*null_mc_sad / (4*4*mc_44_red);
1024         s44orgblk = s44org+(ilow>>2)+qlx*(jlow>>2);
1025         
1026         /* Exhaustive search on 4*4 sub-sampled data.  This is affordable because
1027                 (a)     it is only 16th of the size of the real 1-pel data 
1028                 (b) we ignore those matches with an sad above our threshold.    
1029         */
1030 #ifndef X86_CPU
1031
1032                 /* Invariant:  s44orgblk = s44org+(i>>2)+qlx*(j>>2) */
1033                 s44orgblk = s44org+(ilow>>2)+qlx*(jlow>>2);
1034                 for( j = jstrt; j <= jend; j += 4 )
1035                 {
1036                         old_s44orgblk = s44orgblk;
1037                         for( i = istrt; i <= iend; i += 4 )
1038                         {
1039                                 s1 = ((*pdist44)( s44orgblk,s44blk,qlx,qh) & 0xffff) + abs(i-i0) + abs(j-j0);
1040                                 if( s1 < threshold )
1041                                 {
1042                                         engine->sub44_mcomps[engine->sub44_num_mcomps].x = i;
1043                                         engine->sub44_mcomps[engine->sub44_num_mcomps].y = j;
1044                                         engine->sub44_mcomps[engine->sub44_num_mcomps].weight = s1 ;
1045                                         ++engine->sub44_num_mcomps;
1046                                 }
1047                                 s44orgblk += 1;
1048                         }
1049                         s44orgblk = old_s44orgblk + qlx;
1050                 }
1051                         
1052 #else
1053
1054                 engine->sub44_num_mcomps
1055                         = (*pmblock_sub44_dists)( s44orgblk, s44blk,
1056                                                                   istrt, jstrt,
1057                                                                   iend, jend, 
1058                                                                   qh, qlx, 
1059                                                                   threshold,
1060                                                                   engine->sub44_mcomps);
1061
1062 #endif  
1063                 /* If we're really pushing quality we reduce once otherwise twice. */
1064                         
1065                 sub_mean_reduction( engine->sub44_mcomps, engine->sub44_num_mcomps, 1+(mc_44_red>1),
1066                                                     &engine->sub44_num_mcomps, &mean_weight);
1067
1068
1069         return engine->sub44_num_mcomps;
1070 }
1071
1072
1073 /* Build a vector of the best 2*2 sub-sampled motion
1074   compensations using the best 4*4 matches as starting points.  As
1075   with with the 4*4 matches We don't collect them densely as they're
1076   just search starting points for 1-pel search and ones that are 1 out
1077   should still give better than average matches...
1078
1079 */
1080
1081
1082 static int build_sub22_mcomps(motion_engine_t *engine,
1083                 int i0,  int j0, int ihigh, int jhigh, 
1084                 int null_mc_sad,
1085                 uint8_t *s22org,  uint8_t *s22blk, 
1086                 int flx, int fh,  int searched_sub44_size )
1087 {
1088         int i,k,s;
1089         int threshold = 6*null_mc_sad / (2 * 2*mc_22_red);
1090
1091         int min_weight;
1092         int ilim = ihigh-i0;
1093         int jlim = jhigh-j0;
1094         blockxy matchrec;
1095         uint8_t *s22orgblk;
1096
1097         engine->sub22_num_mcomps = 0;
1098         for( k = 0; k < searched_sub44_size; ++k )
1099         {
1100
1101                 matchrec.x = engine->sub44_mcomps[k].x;
1102                 matchrec.y = engine->sub44_mcomps[k].y;
1103
1104                 s22orgblk =  s22org +((matchrec.y+j0)>>1)*flx +((matchrec.x+i0)>>1);
1105                 for( i = 0; i < 4; ++i )
1106                 {
1107                         if( matchrec.x <= ilim && matchrec.y <= jlim )
1108                         {       
1109                                 s = (*pdist22)( s22orgblk,s22blk,flx,fh);
1110                                 if( s < threshold )
1111                                 {
1112                                         engine->sub22_mcomps[engine->sub22_num_mcomps].x = (int8_t)matchrec.x;
1113                                         engine->sub22_mcomps[engine->sub22_num_mcomps].y = (int8_t)matchrec.y;
1114                                         engine->sub22_mcomps[engine->sub22_num_mcomps].weight = s;
1115                                         ++engine->sub22_num_mcomps;
1116                                 }
1117                         }
1118
1119                         if( i == 1 )
1120                         {
1121                                 s22orgblk += flx-1;
1122                                 matchrec.x -= 2;
1123                                 matchrec.y += 2;
1124                         }
1125                         else
1126                         {
1127                                 s22orgblk += 1;
1128                                 matchrec.x += 2;
1129                                 
1130                         }
1131                 }
1132
1133         }
1134
1135         
1136         sub_mean_reduction( engine->sub22_mcomps, engine->sub22_num_mcomps, 2,
1137                                                 &engine->sub22_num_mcomps, &min_weight );
1138         return engine->sub22_num_mcomps;
1139 }
1140
1141 #ifdef X86_CPU
1142 int build_sub22_mcomps_mmxe(motion_engine_t *engine,
1143         int i0,  int j0, int ihigh, int jhigh, 
1144         int null_mc_sad,
1145         uint8_t *s22org,  uint8_t *s22blk, 
1146         int flx, int fh,  int searched_sub44_size )
1147 {
1148         int i,k,s;
1149         int threshold = 6*null_mc_sad / (2 * 2*mc_22_red);
1150
1151         int min_weight;
1152         int ilim = ihigh-i0;
1153         int jlim = jhigh-j0;
1154         blockxy matchrec;
1155         uint8_t *s22orgblk;
1156         int resvec[4];
1157
1158         /* TODO: The calculation of the lstrow offset really belongs in
1159        asm code... */
1160         int lstrow=(fh-1)*flx;
1161
1162         engine->sub22_num_mcomps = 0;
1163         for( k = 0; k < searched_sub44_size; ++k )
1164         {
1165
1166                 matchrec.x = engine->sub44_mcomps[k].x;
1167                 matchrec.y = engine->sub44_mcomps[k].y;
1168
1169                 s22orgblk =  s22org +((matchrec.y+j0)>>1)*flx +((matchrec.x+i0)>>1);
1170                 mblockq_dist22_mmxe(s22orgblk+lstrow, s22blk+lstrow, flx, fh, resvec);
1171                 for( i = 0; i < 4; ++i )
1172                 {
1173                         if( matchrec.x <= ilim && matchrec.y <= jlim )
1174                         {       
1175                                 s =resvec[i];
1176                                 if( s < threshold )
1177                                 {
1178                                         engine->sub22_mcomps[engine->sub22_num_mcomps].x = (int8_t)matchrec.x;
1179                                         engine->sub22_mcomps[engine->sub22_num_mcomps].y = (int8_t)matchrec.y;
1180                                         engine->sub22_mcomps[engine->sub22_num_mcomps].weight = s;
1181                                         ++engine->sub22_num_mcomps;
1182                                 }
1183                         }
1184
1185                         if( i == 1 )
1186                         {
1187                                 matchrec.x -= 2;
1188                                 matchrec.y += 2;
1189                         }
1190                         else
1191                         {
1192                                 matchrec.x += 2;
1193                         }
1194                 }
1195
1196         }
1197
1198         
1199         sub_mean_reduction( engine->sub22_mcomps, engine->sub22_num_mcomps, 2,
1200                                                 &engine->sub22_num_mcomps, &min_weight );
1201         return engine->sub22_num_mcomps;
1202 }
1203
1204 #endif
1205
1206 /*
1207   Search for the best 1-pel match within 1-pel of a good 2*2-pel match.
1208         TODO: Its a bit silly to cart around absolute M/C co-ordinates that
1209         eventually get turned into relative ones anyway...
1210 */
1211
1212
1213 static void find_best_one_pel(motion_engine_t *engine,
1214          uint8_t *org, uint8_t *blk,
1215          int searched_size,
1216          int i0, int j0,
1217          int ilow, int jlow,
1218          int xmax, int ymax,
1219          int lx, int h, 
1220          mb_motion_s *res
1221         )
1222
1223 {
1224         int i,k;
1225         int d;
1226         blockxy minpos = res->pos;
1227         int dmin = INT_MAX;
1228         uint8_t *orgblk;
1229         int penalty;
1230         int init_search;
1231         int init_size;
1232         blockxy matchrec;
1233  
1234         init_search = searched_size;
1235         init_size = engine->sub22_num_mcomps;
1236         for( k = 0; k < searched_size; ++k )
1237         {       
1238                 matchrec.x = i0 + engine->sub22_mcomps[k].x;
1239                 matchrec.y = j0 + engine->sub22_mcomps[k].y;
1240                 orgblk = org + matchrec.x+lx*matchrec.y;
1241                 penalty = abs(matchrec.x)+abs(matchrec.y);
1242
1243                 for( i = 0; i < 4; ++i )
1244                 {
1245                         if( matchrec.x <= xmax && matchrec.y <= ymax )
1246                         {
1247                 
1248                                 d = penalty+(*pdist1_00)(orgblk,blk,lx,h, dmin);
1249                                 if (d<dmin)
1250                                 {
1251                                         dmin = d;
1252                                         minpos = matchrec;
1253                                 }
1254                         }
1255                         if( i == 1 )
1256                         {
1257                                 orgblk += lx-1;
1258                                 matchrec.x -= 1;
1259                                 matchrec.y += 1;
1260                         }
1261                         else
1262                         {
1263                                 orgblk += 1;
1264                                 matchrec.x += 1;
1265                         }
1266                 }
1267         }
1268
1269         res->pos = minpos;
1270         res->blk = org + minpos.x+lx*minpos.y;
1271         res->sad = dmin;
1272
1273 }
1274
1275 #ifdef X86_CPU
1276 void find_best_one_pel_mmxe(motion_engine_t *engine,
1277         uint8_t *org, uint8_t *blk,
1278         int searched_size,
1279         int i0, int j0,
1280         int ilow, int jlow,
1281         int xmax, int ymax,
1282         int lx, int h, 
1283         mb_motion_s *res
1284         )
1285
1286 {
1287         int i,k;
1288         int d;
1289         blockxy minpos = res->pos;
1290         int dmin = INT_MAX;
1291         uint8_t *orgblk;
1292         int penalty;
1293         int init_search;
1294         int init_size;
1295         blockxy matchrec;
1296         int resvec[4];
1297
1298  
1299         init_search = searched_size;
1300         init_size = engine->sub22_num_mcomps;
1301         for( k = 0; k < searched_size; ++k )
1302         {       
1303                 matchrec.x = i0 + engine->sub22_mcomps[k].x;
1304                 matchrec.y = j0 + engine->sub22_mcomps[k].y;
1305                 orgblk = org + matchrec.x+lx*matchrec.y;
1306                 penalty = abs(matchrec.x)+abs(matchrec.y);
1307                 
1308
1309                 mblockq_dist1_mmxe(orgblk,blk,lx,h, resvec);
1310
1311                 for( i = 0; i < 4; ++i )
1312                 {
1313                         if( matchrec.x <= xmax && matchrec.y <= ymax )
1314                         {
1315                 
1316                                 d = penalty+resvec[i];
1317                                 if (d<dmin)
1318                                 {
1319                                         dmin = d;
1320                                         minpos = matchrec;
1321                                 }
1322                         }
1323                         if( i == 1 )
1324                         {
1325                                 orgblk += lx-1;
1326                                 matchrec.x -= 1;
1327                                 matchrec.y += 1;
1328                         }
1329                         else
1330                         {
1331                                 orgblk += 1;
1332                                 matchrec.x += 1;
1333                         }
1334                 }
1335         }
1336
1337         res->pos = minpos;
1338         res->blk = org + minpos.x+lx*minpos.y;
1339         res->sad = dmin;
1340
1341 }
1342 #endif 
1343  
1344 /*
1345  * full search block matching
1346  *
1347  * A.Stevens 2000: This is now a big misnomer.  The search is now a hierarchical/sub-sampling
1348  * search not a full search.  However, experiments have shown it is always close to
1349  * optimal and almost always very close or optimal.
1350  *
1351  * blk: top left pel of (16*h) block
1352  * s22blk: top element of fast motion compensation block corresponding to blk
1353  * h: height of block
1354  * lx: distance (in bytes) of vertically adjacent pels in ref,blk
1355  * org: top left pel of source reference picture
1356  * ref: top left pel of reconstructed reference picture
1357  * i0,j0: center of search window
1358  * sx,sy: half widths of search window
1359  * xmax,ymax: right/bottom limits of search area
1360  * iminp,jminp: pointers to where the result is stored
1361  *              result is given as half pel offset from ref(0,0)
1362  *              i.e. NOT relative to (i0,j0)
1363  */
1364
1365
1366 static void fullsearch(motion_engine_t *engine,
1367         uint8_t *org,
1368         uint8_t *ref,
1369         subsampled_mb_s *ssblk,
1370         int lx, int i0, int j0, 
1371         int sx, int sy, int h,
1372         int xmax, int ymax,
1373         /* int *iminp, int *jminp, int *sadminp, */
1374         mb_motion_s *res
1375         )
1376 {
1377         mb_motion_s best;
1378         /* int imin, jmin, dmin */
1379         int i,j,ilow,ihigh,jlow,jhigh;
1380         int d;
1381
1382         /* NOTE: Surprisingly, the initial motion compensation search
1383            works better when the original image not the reference (reconstructed)
1384            image is used. 
1385         */
1386         uint8_t *s22org = (uint8_t*)(org+fsubsample_offset);
1387         uint8_t *s44org = (uint8_t*)(org+qsubsample_offset);
1388         uint8_t *orgblk;
1389         int flx = lx >> 1;
1390         int qlx = lx >> 2;
1391         int fh = h >> 1;
1392         int qh = h >> 2;
1393
1394
1395         /* xmax and ymax into more useful form... */
1396         xmax -= 16;
1397         ymax -= h;
1398   
1399   
1400         /* The search radii are *always* multiples of 4 to avoid messiness
1401            in the initial 4*4 pel search.  This is handled by the
1402            parameter checking/processing code in readparmfile() */
1403   
1404         /* Create a distance-order mcomps of possible motion compensations
1405           based on the fast estimation data - 4*4 pel sums (4*4
1406           sub-sampled) rather than actual pel's.  1/16 the size...  */
1407         jlow = j0-sy;
1408         jlow = jlow < 0 ? 0 : jlow;
1409         jhigh =  j0+(sy-1);
1410         jhigh = jhigh > ymax ? ymax : jhigh;
1411         ilow = i0-sx;
1412         ilow = ilow < 0 ? 0 : ilow;
1413         ihigh =  i0+(sx-1);
1414         ihigh = ihigh > xmax ? xmax : ihigh;
1415
1416         /*
1417            Very rarely this may fail to find matchs due to all the good
1418            looking ones being over threshold. hence we make sure we
1419            fall back to a 0 motion compensation in this case.
1420            
1421                  The sad for the 0 motion compensation is also very useful as
1422                  a basis for setting thresholds for rejecting really dud 4*4
1423                  and 2*2 sub-sampled matches.
1424         */
1425         best.sad = (*pdist1_00)(ref + i0 + j0 * lx,
1426                 ssblk->mb,
1427                 lx,
1428                 h,
1429                 best.sad);
1430         best.pos.x = i0;
1431         best.pos.y = j0;
1432
1433         engine->sub44_num_mcomps = build_sub44_mcomps(engine,
1434                                                                          ilow, jlow, ihigh, jhigh,
1435                                                                           i0, j0,
1436                                                                           best.sad,
1437                                                                           s44org, 
1438                                                                           ssblk->qmb, qlx, qh );
1439
1440         
1441         /* Now create a distance-ordered mcomps of possible motion
1442            compensations based on the fast estimation data - 2*2 pel sums
1443            using the best fraction of the 4*4 estimates However we cover
1444            only coarsely... on 4-pel boundaries...  */
1445
1446         engine->sub22_num_mcomps = (*pbuild_sub22_mcomps)( engine, i0, j0, ihigh,  jhigh, 
1447                                                                                            best.sad,
1448                                                                                            s22org, ssblk->fmb, flx, fh, 
1449                                                                                            engine->sub44_num_mcomps );
1450
1451                 
1452     /* Now choose best 1-pel match from what approximates (not exact
1453            due to the pre-processing trick with the mean) the top 1/2 of
1454            the 2*2 matches 
1455         */
1456         
1457
1458         (*pfind_best_one_pel)( engine, ref, ssblk->mb, engine->sub22_num_mcomps,
1459                                            i0, j0,
1460                                            ilow, jlow, xmax, ymax, 
1461                                            lx, h, &best );
1462
1463         /* Final polish: half-pel search of best candidate against
1464            reconstructed image. 
1465         */
1466
1467         best.pos.x <<= 1; 
1468         best.pos.y <<= 1;
1469         best.hx = 0;
1470         best.hy = 0;
1471
1472         ilow = best.pos.x - (best.pos.x>(ilow<<1));
1473         ihigh = best.pos.x + (best.pos.x<((ihigh)<<1));
1474         jlow = best.pos.y - (best.pos.y>(jlow<<1));
1475         jhigh =  best.pos.y+ (best.pos.y<((jhigh)<<1));
1476
1477         for (j=jlow; j<=jhigh; j++)
1478         {
1479                 for (i=ilow; i<=ihigh; i++)
1480                 {
1481                         orgblk = ref+(i>>1)+((j>>1)*lx);
1482                         if( i&1 )
1483                         {
1484                                 if( j & 1 )
1485                                         d = (*pdist1_11)(orgblk,ssblk->mb,lx,h);
1486                                 else
1487                                         d = (*pdist1_01)(orgblk,ssblk->mb,lx,h);
1488                         }
1489                         else
1490                         {
1491                                 if( j & 1 )
1492                                         d = (*pdist1_10)(orgblk,ssblk->mb,lx,h);
1493                                 else
1494                                         d = (*pdist1_00)(orgblk,ssblk->mb,lx,h,best.sad);
1495                         }
1496                         if (d<best.sad)
1497                         {
1498                                 best.sad = d;
1499                                 best.pos.x = i;
1500                                 best.pos.y = j;
1501                                 best.blk = orgblk;
1502                                 best.hx = i&1;
1503                                 best.hy = j&1;
1504                         }
1505                 }
1506         }
1507         best.var = (*pdist2)(best.blk, ssblk->mb, lx, best.hx, best.hy, h);
1508         *res = best;
1509 }
1510
1511 /*
1512  * total absolute difference between two (16*h) blocks
1513  * including optional half pel interpolation of blk1 (hx,hy)
1514  * blk1,blk2: addresses of top left pels of both blocks
1515  * lx:        distance (in bytes) of vertically adjacent pels
1516  * hx,hy:     flags for horizontal and/or vertical interpolation
1517  * h:         height of block (usually 8 or 16)
1518  * distlim:   bail out if sum exceeds this value
1519  */
1520
1521 /* A.Stevens 2000: New version for highly pipelined CPUs where branching is
1522    costly.  Really it sucks that C doesn't define a stdlib abs that could
1523    be realised as a compiler intrinsic using appropriate CPU instructions.
1524    That 1970's heritage...
1525 */
1526
1527
1528 static int dist1_00(uint8_t *blk1,uint8_t *blk2,
1529                                         int lx, int h,int distlim)
1530 {
1531         uint8_t *p1,*p2;
1532         int j;
1533         int s;
1534         register int v;
1535
1536         s = 0;
1537         p1 = blk1;
1538         p2 = blk2;
1539         for (j=0; j<h; j++)
1540         {
1541
1542 #define pipestep(o) v = p1[o]-p2[o]; s+= abs(v);
1543                 pipestep(0);  pipestep(1);  pipestep(2);  pipestep(3);
1544                 pipestep(4);  pipestep(5);  pipestep(6);  pipestep(7);
1545                 pipestep(8);  pipestep(9);  pipestep(10); pipestep(11);
1546                 pipestep(12); pipestep(13); pipestep(14); pipestep(15);
1547 #undef pipestep
1548
1549                 if (s >= distlim)
1550                         break;
1551                         
1552                 p1+= lx;
1553                 p2+= lx;
1554         }
1555         return s;
1556 }
1557
1558 static int dist1_01(uint8_t *blk1,uint8_t *blk2,
1559                                         int lx, int h)
1560 {
1561         uint8_t *p1,*p2;
1562         int i,j;
1563         int s;
1564         register int v;
1565
1566         s = 0;
1567         p1 = blk1;
1568         p2 = blk2;
1569         for (j=0; j<h; j++)
1570         {
1571                 for (i=0; i<16; i++)
1572                 {
1573
1574                         v = ((unsigned int)(p1[i]+p1[i+1])>>1) - p2[i];
1575                         /*
1576                           v = ((p1[i]>>1)+(p1[i+1]>>1)>>1) - (p2[i]>>1);
1577                         */
1578                         s+=abs(v);
1579                 }
1580                 p1+= lx;
1581                 p2+= lx;
1582         }
1583         return s;
1584 }
1585
1586 static int dist1_10(uint8_t *blk1,uint8_t *blk2,
1587                                         int lx, int h)
1588 {
1589         uint8_t *p1,*p1a,*p2;
1590         int i,j;
1591         int s;
1592         register int v;
1593
1594         s = 0;
1595         p1 = blk1;
1596         p2 = blk2;
1597         p1a = p1 + lx;
1598         for (j=0; j<h; j++)
1599         {
1600                 for (i=0; i<16; i++)
1601                 {
1602                         v = ((unsigned int)(p1[i]+p1a[i])>>1) - p2[i];
1603                         s+=abs(v);
1604                 }
1605                 p1 = p1a;
1606                 p1a+= lx;
1607                 p2+= lx;
1608         }
1609
1610         return s;
1611 }
1612
1613 static int dist1_11(uint8_t *blk1,uint8_t *blk2,
1614                                         int lx, int h)
1615 {
1616         uint8_t *p1,*p1a,*p2;
1617         int i,j;
1618         int s;
1619         register int v;
1620
1621         s = 0;
1622         p1 = blk1;
1623         p2 = blk2;
1624         p1a = p1 + lx;
1625           
1626         for (j=0; j<h; j++)
1627         {
1628                 for (i=0; i<16; i++)
1629                 {
1630                         v = ((unsigned int)(p1[i]+p1[i+1]+p1a[i]+p1a[i+1])>>2) - p2[i];
1631                         s+=abs(v);
1632                 }
1633                 p1 = p1a;
1634                 p1a+= lx;
1635                 p2+= lx;
1636         }
1637         return s;
1638 }
1639
1640 /* USED only during debugging...
1641 void check_fast_motion_data(uint8_t *blk, char *label )
1642 {
1643   uint8_t *b, *nb;
1644   uint8_t *pb,*p;
1645   uint8_t *qb;
1646   uint8_t *start_s22blk, *start_s44blk;
1647   int i;
1648   unsigned int mismatch;
1649   int nextfieldline;
1650   start_s22blk = (blk+height*width);
1651   start_s44blk = (blk+height*width+(height>>1)*(width>>1));
1652
1653   if (pict_struct==FRAME_PICTURE)
1654         {
1655           nextfieldline = width;
1656         }
1657   else
1658         {
1659           nextfieldline = 2*width;
1660         }
1661
1662         mismatch = 0;
1663         pb = start_s22blk;
1664         qb = start_s44blk;
1665         p = blk;
1666         while( pb < qb )
1667         {
1668                 for( i = 0; i < nextfieldline/2; ++i )
1669                 {
1670                         mismatch |= ((p[0] + p[1] + p[nextfieldline] + p[nextfieldline+1])>>2) != *pb;
1671                         p += 2;
1672                         ++pb;                   
1673                 }
1674                 p += nextfieldline;
1675         }
1676                 if( mismatch )
1677                         {
1678                                 printf( "Mismatch detected check %s for buffer %08x\n", label,  blk );
1679                                         exit(1);
1680                         }
1681 }
1682 */
1683
1684 /* 
1685    Append fast motion estimation data to original luminance
1686    data.  N.b. memory allocation for luminance data allows space
1687    for this information...
1688  */
1689
1690 void fast_motion_data(uint8_t *blk, int picture_struct )
1691 {
1692         uint8_t *b, *nb;
1693         uint8_t *pb;
1694         uint8_t *qb;
1695         uint8_t *start_s22blk, *start_s44blk;
1696         uint16_t *start_rowblk, *start_colblk;
1697         int i;
1698         int nextfieldline;
1699 #ifdef TEST_RCSEARCH
1700         uint16_t *pc, *pr,*p;
1701         int rowsum;
1702         int j,s;
1703         int down16 = width*16;
1704         uint16_t sums[32];
1705         uint16_t rowsums[2048];
1706         uint16_t colsums[2048];  /* TODO: BUG: should resize with width */
1707 #endif 
1708   
1709
1710         /* In an interlaced field the "next" line is 2 width's down 
1711            rather than 1 width down                                 */
1712
1713         if (picture_struct==FRAME_PICTURE)
1714         {
1715                 nextfieldline = width;
1716         }
1717         else
1718         {
1719                 nextfieldline = 2*width;
1720         }
1721
1722         start_s22blk   = blk+fsubsample_offset;
1723         start_s44blk   = blk+qsubsample_offset;
1724         start_rowblk = (uint16_t *)blk+rowsums_offset;
1725         start_colblk = (uint16_t *)blk+colsums_offset;
1726         b = blk;
1727         nb = (blk+nextfieldline);
1728         /* Sneaky stuff here... we can do lines in both fields at once */
1729         pb = (uint8_t *) start_s22blk;
1730
1731         while( nb < start_s22blk )
1732         {
1733                 for( i = 0; i < nextfieldline/4; ++i ) /* We're doing 4 pels horizontally at once */
1734                 {
1735                         /* TODO: A.Stevens this has to be the most word-length dependent
1736                            code in the world.  Better than MMX assembler though I guess... */
1737                         pb[0] = (b[0]+b[1]+nb[0]+nb[1])>>2;
1738                         pb[1] = (b[2]+b[3]+nb[2]+nb[3])>>2;     
1739                         pb += 2;
1740                         b += 4;
1741                         nb += 4;
1742                 }
1743                 b += nextfieldline;
1744                 nb = b + nextfieldline;
1745         }
1746
1747
1748         /* Now create the 4*4 sub-sampled data from the 2*2 
1749            N.b. the 2*2 sub-sampled motion data preserves the interlace structure of the
1750            original.  Albeit half as many lines and pixels...
1751         */
1752
1753         nextfieldline = nextfieldline >> 1;
1754
1755         qb = start_s44blk;
1756         b  = start_s22blk;
1757         nb = (start_s22blk+nextfieldline);
1758
1759         while( nb < start_s44blk )
1760         {
1761                 for( i = 0; i < nextfieldline/4; ++i )
1762                 {
1763                         /* TODO: BRITTLE: A.Stevens - this only works for uint8_t = uint8_t */
1764                         qb[0] = (b[0]+b[1]+nb[0]+nb[1])>>2;
1765                         qb[1] = (b[2]+b[3]+nb[2]+nb[3])>>2;
1766                         qb += 2;
1767                         b += 4;
1768                         nb += 4;
1769                 }
1770                 b += nextfieldline;
1771                 nb = b + nextfieldline;
1772         }
1773
1774 #ifdef TEST_RCSEARCH
1775         /* TODO: BUG: THIS CODE DOES NOT YET ALLOW FOR INTERLACED FIELDS.... */
1776   
1777         /*
1778           Initial row sums....
1779         */
1780         pb = blk;
1781         for(j = 0; j < height; ++j )
1782         {
1783                 rowsum = 0;
1784                 for( i = 0; i < 16; ++ i )
1785                 {
1786                         rowsum += pb[i];
1787                 }
1788                 rowsums[j] = rowsum;
1789                 pb += width;
1790         }
1791   
1792         /*
1793           Initial column sums
1794         */
1795         for( i = 0; i < width; ++i )
1796         {
1797                 colsums[i] = 0;
1798         }
1799         pb = blk;
1800         for( j = 0; j < 16; ++j )
1801         {
1802                 for( i = 0; i < width; ++i )
1803                 {
1804                         colsums[i] += *pb;
1805                         ++pb;
1806                 }
1807         }
1808   
1809         /* Now fill in the row/column sum tables...
1810            Note: to allow efficient construction of sum/col differences for a
1811            given position row sums are held in a *column major* array
1812            (changing y co-ordinate makes for small index changes)
1813            the col sums are held in a *row major* array
1814         */
1815   
1816         pb = blk;
1817         pc = start_colblk;
1818         for(j = 0; j <32; ++j )
1819         {
1820                 pr = start_rowblk;
1821                 rowsum = rowsums[j];
1822                 for( i = 0; i < width-16; ++i )
1823                 {
1824                         pc[i] = colsums[i];
1825                         pr[j] = rowsum;
1826                         colsums[i] = (colsums[i] + pb[down16] )-pb[0];
1827                         rowsum = (rowsum + pb[16]) - pb[0];
1828                         ++pb;
1829                         pr += height;
1830                 }
1831                 pb += 16;   /* move pb on to next row... rememember we only did width-16! */
1832                 pc += width;
1833         }
1834 #endif          
1835 }
1836
1837
1838 static int dist22( uint8_t *s22blk1, uint8_t *s22blk2,int flx,int fh)
1839 {
1840         uint8_t *p1 = s22blk1;
1841         uint8_t *p2 = s22blk2;
1842         int s = 0;
1843         int j;
1844
1845         for( j = 0; j < fh; ++j )
1846         {
1847                 register int diff;
1848 #define pipestep(o) diff = p1[o]-p2[o]; s += abs(diff)
1849                 pipestep(0); pipestep(1);
1850                 pipestep(2); pipestep(3);
1851                 pipestep(4); pipestep(5);
1852                 pipestep(6); pipestep(7);
1853                 p1 += flx;
1854                 p2 += flx;
1855 #undef pipestep
1856         }
1857
1858         return s;
1859 }
1860
1861
1862
1863 /*
1864   Sum absolute differences for 4*4 sub-sampled data.  
1865
1866   TODO: currently assumes  only 16*16 or 16*8 motion compensation will be used...
1867   I.e. 4*4 or 4*2 sub-sampled blocks will be compared.
1868  */
1869
1870
1871 static int dist44( uint8_t *s44blk1, uint8_t *s44blk2,int qlx,int qh)
1872 {
1873         register uint8_t *p1 = s44blk1;
1874         register uint8_t *p2 = s44blk2;
1875         int s = 0;
1876         register int diff;
1877
1878         /* #define pipestep(o) diff = p1[o]-p2[o]; s += abs(diff) */
1879 #define pipestep(o) diff = p1[o]-p2[o]; s += diff < 0 ? -diff : diff;
1880         pipestep(0); pipestep(1);        pipestep(2); pipestep(3);
1881         if( qh > 1 )
1882         {
1883                 p1 += qlx; p2 += qlx;
1884                 pipestep(0); pipestep(1);        pipestep(2); pipestep(3);
1885                 if( qh > 2 )
1886                 {
1887                         p1 += qlx; p2 += qlx;
1888                         pipestep(0); pipestep(1);        pipestep(2); pipestep(3);
1889                         p1 += qlx; p2 += qlx;
1890                         pipestep(0); pipestep(1);        pipestep(2); pipestep(3);
1891                 }
1892         }
1893
1894
1895         return s;
1896 }
1897
1898 /*
1899  * total squared difference between two (8*h) blocks of 2*2 sub-sampled pels
1900  * blk1,blk2: addresses of top left pels of both blocks
1901  * lx:        distance (in bytes) of vertically adjacent pels
1902  * h:         height of block (usually 8 or 16)
1903  */
1904  
1905 static int dist2_22(uint8_t *blk1, uint8_t *blk2, int lx, int h)
1906 {
1907         uint8_t *p1 = blk1, *p2 = blk2;
1908         int i,j,v;
1909         int s = 0;
1910         for (j=0; j<h; j++)
1911         {
1912                 for (i=0; i<8; i++)
1913                 {
1914                         v = p1[i] - p2[i];
1915                         s+= v*v;
1916                 }
1917                 p1+= lx;
1918                 p2+= lx;
1919         }
1920         return s;
1921 }
1922
1923 /* total squared difference between bidirection prediction of (8*h)
1924  * blocks of 2*2 sub-sampled pels and reference 
1925  * blk1f, blk1b,blk2: addresses of top left
1926  * pels of blocks 
1927  * lx: distance (in bytes) of vertically adjacent
1928  * pels 
1929  * h: height of block (usually 4 or 8)
1930  */
1931  
1932 static int bdist2_22(uint8_t *blk1f, uint8_t *blk1b, uint8_t *blk2, 
1933                                          int lx, int h)
1934 {
1935         uint8_t *p1f = blk1f,*p1b = blk1b,*p2 = blk2;
1936         int i,j,v;
1937         int s = 0;
1938         for (j=0; j<h; j++)
1939         {
1940                 for (i=0; i<8; i++)
1941                 {
1942                         v = ((p1f[i]+p1b[i]+1)>>1) - p2[i];
1943                         s+= v*v;
1944                 }
1945                 p1f+= lx;
1946                 p1b+= lx;
1947                 p2+= lx;
1948         }
1949         return s;
1950 }
1951
1952 /*
1953  * total squared difference between two (16*h) blocks
1954  * including optional half pel interpolation of blk1 (hx,hy)
1955  * blk1,blk2: addresses of top left pels of both blocks
1956  * lx:        distance (in bytes) of vertically adjacent pels
1957  * hx,hy:     flags for horizontal and/or vertical interpolation
1958  * h:         height of block (usually 8 or 16)
1959  */
1960  
1961
1962 static int dist2(blk1,blk2,lx,hx,hy,h)
1963         uint8_t *blk1,*blk2;
1964         int lx,hx,hy,h;
1965 {
1966         uint8_t *p1,*p1a,*p2;
1967         int i,j;
1968         int s,v;
1969
1970         s = 0;
1971         p1 = blk1;
1972         p2 = blk2;
1973         if (!hx && !hy)
1974                 for (j=0; j<h; j++)
1975                 {
1976                         for (i=0; i<16; i++)
1977                         {
1978                                 v = p1[i] - p2[i];
1979                                 s+= v*v;
1980                         }
1981                         p1+= lx;
1982                         p2+= lx;
1983                 }
1984         else if (hx && !hy)
1985                 for (j=0; j<h; j++)
1986                 {
1987                         for (i=0; i<16; i++)
1988                         {
1989                                 v = ((unsigned int)(p1[i]+p1[i+1]+1)>>1) - p2[i];
1990                                 s+= v*v;
1991                         }
1992                         p1+= lx;
1993                         p2+= lx;
1994                 }
1995         else if (!hx && hy)
1996         {
1997                 p1a = p1 + lx;
1998                 for (j=0; j<h; j++)
1999                 {
2000                         for (i=0; i<16; i++)
2001                         {
2002                                 v = ((unsigned int)(p1[i]+p1a[i]+1)>>1) - p2[i];
2003                                 s+= v*v;
2004                         }
2005                         p1 = p1a;
2006                         p1a+= lx;
2007                         p2+= lx;
2008                 }
2009         }
2010         else /* if (hx && hy) */
2011         {
2012                 p1a = p1 + lx;
2013                 for (j=0; j<h; j++)
2014                 {
2015                         for (i=0; i<16; i++)
2016                         {
2017                                 v = ((unsigned int)(p1[i]+p1[i+1]+p1a[i]+p1a[i+1]+2)>>2) - p2[i];
2018                                 s+= v*v;
2019                         }
2020                         p1 = p1a;
2021                         p1a+= lx;
2022                         p2+= lx;
2023                 }
2024         }
2025  
2026         return s;
2027 }
2028
2029
2030 /*
2031  * absolute difference error between a (16*h) block and a bidirectional
2032  * prediction
2033  *
2034  * p2: address of top left pel of block
2035  * pf,hxf,hyf: address and half pel flags of forward ref. block
2036  * pb,hxb,hyb: address and half pel flags of backward ref. block
2037  * h: height of block
2038  * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb
2039  */
2040  
2041
2042 static int bdist1(pf,pb,p2,lx,hxf,hyf,hxb,hyb,h)
2043         uint8_t *pf,*pb,*p2;
2044         int lx,hxf,hyf,hxb,hyb,h;
2045 {
2046         uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
2047         int i,j;
2048         int s,v;
2049
2050         pfa = pf + hxf;
2051         pfb = pf + lx*hyf;
2052         pfc = pfb + hxf;
2053
2054         pba = pb + hxb;
2055         pbb = pb + lx*hyb;
2056         pbc = pbb + hxb;
2057
2058         s = 0;
2059
2060         for (j=0; j<h; j++)
2061         {
2062                 for (i=0; i<16; i++)
2063                 {
2064                         v = ((((unsigned int)(*pf++ + *pfa++ + *pfb++ + *pfc++ + 2)>>2) +
2065                                   ((unsigned int)(*pb++ + *pba++ + *pbb++ + *pbc++ + 2)>>2) + 1)>>1)
2066                                 - *p2++;
2067                         s += abs(v);
2068                 }
2069                 p2+= lx-16;
2070                 pf+= lx-16;
2071                 pfa+= lx-16;
2072                 pfb+= lx-16;
2073                 pfc+= lx-16;
2074                 pb+= lx-16;
2075                 pba+= lx-16;
2076                 pbb+= lx-16;
2077                 pbc+= lx-16;
2078         }
2079
2080         return s;
2081 }
2082
2083 /*
2084  * squared error between a (16*h) block and a bidirectional
2085  * prediction
2086  *
2087  * p2: address of top left pel of block
2088  * pf,hxf,hyf: address and half pel flags of forward ref. block
2089  * pb,hxb,hyb: address and half pel flags of backward ref. block
2090  * h: height of block
2091  * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb
2092  */
2093  
2094
2095 static int bdist2(pf,pb,p2,lx,hxf,hyf,hxb,hyb,h)
2096         uint8_t *pf,*pb,*p2;
2097         int lx,hxf,hyf,hxb,hyb,h;
2098 {
2099         uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
2100         int i,j;
2101         int s,v;
2102
2103         pfa = pf + hxf;
2104         pfb = pf + lx*hyf;
2105         pfc = pfb + hxf;
2106
2107         pba = pb + hxb;
2108         pbb = pb + lx*hyb;
2109         pbc = pbb + hxb;
2110
2111         s = 0;
2112
2113         for (j=0; j<h; j++)
2114         {
2115                 for (i=0; i<16; i++)
2116                 {
2117                         v = ((((unsigned int)(*pf++ + *pfa++ + *pfb++ + *pfc++ + 2)>>2) +
2118                                   ((unsigned int)(*pb++ + *pba++ + *pbb++ + *pbc++ + 2)>>2) + 1)>>1)
2119                                 - *p2++;
2120                         s+=v*v;
2121                 }
2122                 p2+= lx-16;
2123                 pf+= lx-16;
2124                 pfa+= lx-16;
2125                 pfb+= lx-16;
2126                 pfc+= lx-16;
2127                 pb+= lx-16;
2128                 pba+= lx-16;
2129                 pbb+= lx-16;
2130                 pbc+= lx-16;
2131         }
2132
2133         return s;
2134 }
2135
2136
2137 /*
2138  * variance of a (size*size) block, multiplied by 256
2139  * p:  address of top left pel of block
2140  * lx: distance (in bytes) of vertically adjacent pels
2141  */
2142 static int variance(uint8_t *p, int size,       int lx)
2143 {
2144         int i,j;
2145         unsigned int v,s,s2;
2146
2147         s = s2 = 0;
2148
2149         for (j=0; j<size; j++)
2150         {
2151                 for (i=0; i<size; i++)
2152                 {
2153                         v = *p++;
2154                         s+= v;
2155                         s2+= v*v;
2156                 }
2157                 p+= lx-size;
2158         }
2159         return s2 - (s*s)/(size*size);
2160 }
2161
2162 /*
2163   Compute the variance of the residual of uni-directionally motion
2164   compensated block.
2165  */
2166
2167 static int unidir_pred_var( const mb_motion_s *motion,
2168                                                         uint8_t *mb,  
2169                                                         int lx, 
2170                                                         int h)
2171 {
2172         return (*pdist2)(motion->blk, mb, lx, motion->hx, motion->hy, h);
2173 }
2174
2175
2176 /*
2177   Compute the variance of the residual of bi-directionally motion
2178   compensated block.
2179  */
2180
2181 static int bidir_pred_var( const mb_motion_s *motion_f, 
2182                                                    const mb_motion_s *motion_b,
2183                                                    uint8_t *mb,  
2184                                                    int lx, int h)
2185 {
2186         return (*pbdist2)( motion_f->blk, motion_b->blk,
2187                                            mb, lx, 
2188                                            motion_f->hx, motion_f->hy,
2189                                            motion_b->hx, motion_b->hy,
2190                                            h);
2191 }
2192
2193 /*
2194   Compute SAD for bi-directionally motion compensated blocks...
2195  */
2196
2197 static int bidir_pred_sad( const mb_motion_s *motion_f, 
2198                                                    const mb_motion_s *motion_b,
2199                                                    uint8_t *mb,  
2200                                                    int lx, int h)
2201 {
2202         return (*pbdist1)(motion_f->blk, motion_b->blk, 
2203                                          mb, lx, 
2204                                          motion_f->hx, motion_f->hy,
2205                                          motion_b->hx, motion_b->hy,
2206                                          h);
2207 }
2208
2209 static void frame_ME(motion_engine_t *engine,
2210                                         pict_data_s *picture,
2211                                          motion_comp_s *mc,
2212                                          int mb_row_start,
2213                                          int i, int j, 
2214                                          mbinfo_s *mbi)
2215 {
2216         mb_motion_s framef_mc;
2217         mb_motion_s frameb_mc;
2218         mb_motion_s dualpf_mc;
2219         mb_motion_s topfldf_mc;
2220         mb_motion_s botfldf_mc;
2221         mb_motion_s topfldb_mc;
2222         mb_motion_s botfldb_mc;
2223
2224         int var,v0;
2225         int vmc,vmcf,vmcr,vmci;
2226         int vmcfieldf,vmcfieldr,vmcfieldi;
2227         subsampled_mb_s ssmb;
2228         int imins[2][2],jmins[2][2];
2229         int imindmv,jmindmv,vmc_dp;
2230 //printf("frame_ME 1\n");
2231
2232
2233         /* A.Stevens fast motion estimation data is appended to actual
2234            luminance information 
2235         */
2236         ssmb.mb = mc->cur[0] + mb_row_start + i;
2237         ssmb.umb = (uint8_t*)(mc->cur[1] + (i>>1) + (mb_row_start>>2));
2238         ssmb.vmb = (uint8_t*)(mc->cur[2] + (i>>1) + (mb_row_start>>2));
2239         ssmb.fmb = (uint8_t*)(mc->cur[0] + fsubsample_offset + 
2240                                                   ((i>>1) + (mb_row_start>>2)));
2241         ssmb.qmb = (uint8_t*)(mc->cur[0] + qsubsample_offset + 
2242                                                   (i>>2) + (mb_row_start>>4));
2243
2244         /* Compute variance MB as a measure of Intra-coding complexity
2245            We include chrominance information here, scaled to compensate
2246            for sub-sampling.  Silly MPEG forcing chrom/lum to have same
2247            quantisations...
2248          */
2249         var = variance(ssmb.mb,16,width);
2250
2251 //printf("motion %d\n", picture->pict_type);
2252         if (picture->pict_type==I_TYPE)
2253         {
2254                 mbi->mb_type = MB_INTRA;
2255         }
2256         else if (picture->pict_type==P_TYPE)
2257         {
2258                 /* For P pictures we take into account chrominance. This
2259                    provides much better performance at scene changes */
2260                 var += chrom_var_sum(&ssmb,16,width);
2261
2262                 if (picture->frame_pred_dct)
2263                 {
2264                         fullsearch(engine, mc->oldorg[0],mc->oldref[0],&ssmb,
2265                                            width,i,j,mc->sxf,mc->syf,16,width,height,
2266                                             &framef_mc);
2267                         framef_mc.fieldoff = 0;
2268                         vmc = framef_mc.var +
2269                                 unidir_chrom_var_sum( &framef_mc, mc->oldref, &ssmb, width, 16 );
2270                         mbi->motion_type = MC_FRAME;
2271                 }
2272                 else
2273                 {
2274 //printf("frame_ME 2 %p %p\n", mc->oldorg[0], mc->oldref[0]);
2275                         frame_estimate(engine, mc->oldorg[0],
2276                                 mc->oldref[0],
2277                                 &ssmb,
2278                                 i,
2279                                 j,
2280                                 mc->sxf,
2281                                 mc->syf,
2282                                 &framef_mc,
2283                                 &topfldf_mc,
2284                                 &botfldf_mc,
2285                                 imins,
2286                                 jmins);
2287 //printf("frame_ME 3\n");
2288                         vmcf = framef_mc.var + 
2289                                 unidir_chrom_var_sum( &framef_mc, mc->oldref, &ssmb, width, 16 );
2290                         vmcfieldf = 
2291                                 topfldf_mc.var + 
2292                                 unidir_chrom_var_sum( &topfldf_mc, mc->oldref, &ssmb, (width<<1), 8 ) +
2293                                 botfldf_mc.var + 
2294                                 unidir_chrom_var_sum( &botfldf_mc, mc->oldref, &ssmb, (width<<1), 8 );
2295                         /* DEBUG DP currently disabled... */
2296 //                      if ( M==1)
2297 //                      {
2298 //                              dpframe_estimate(engine, picture,mc->oldref[0],&ssmb,
2299 //                                                               i,j>>1,imins,jmins,
2300 //                                                               &dualpf_mc,
2301 //                                                               &imindmv,&jmindmv, &vmc_dp);
2302 //                      }
2303
2304                         /* NOTE: Typically M =3 so DP actually disabled... */
2305                         /* select between dual prime, frame and field prediction */
2306 //                      if ( M==1 && vmc_dp<vmcf && vmc_dp<vmcfieldf)
2307 //                      {
2308 //                              mbi->motion_type = MC_DMV;
2309 //                              /* No chrominance squared difference  measure yet.
2310 //                                 Assume identical to luminance */
2311 //                              vmc = vmc_dp + vmc_dp;
2312 //                      }
2313 //                      else 
2314                         if ( vmcf < vmcfieldf)
2315                         {
2316                                 mbi->motion_type = MC_FRAME;
2317                                 vmc = vmcf;
2318                                         
2319                         }
2320                         else
2321                         {
2322                                 mbi->motion_type = MC_FIELD;
2323                                 vmc = vmcfieldf;
2324                         }
2325                 }
2326
2327
2328                 /* select between intra or non-intra coding:
2329                  *
2330                  * selection is based on intra block variance (var) vs.
2331                  * prediction error variance (vmc)
2332                  *
2333                  * Used to be: blocks with small prediction error are always 
2334                  * coded non-intra even if variance is smaller (is this reasonable?
2335                  *
2336                  * TODO: A.Stevens Jul 2000
2337                  * The bbmpeg guys have found this to be *unreasonable*.
2338                  * I'm not sure I buy their solution using vmc*2.  It is probabably
2339                  * the vmc>= 9*256 test that is suspect.
2340                  * 
2341                  */
2342
2343
2344                 if (vmc>var /*&& vmc>=(3*3)*16*16*2*/ )
2345                 {
2346                         mbi->mb_type = MB_INTRA;
2347                         mbi->var = var;
2348                 }
2349                 
2350                 else
2351                 {
2352                         /* select between MC / No-MC
2353                          *
2354                          * use No-MC if var(No-MC) <= 1.25*var(MC)
2355                          * (i.e slightly biased towards No-MC)
2356                          *
2357                          * blocks with small prediction error are always coded as No-MC
2358                          * (requires no motion vectors, allows skipping)
2359                          */
2360                         v0 = (*pdist2)(mc->oldref[0]+i+width*j,ssmb.mb,width,0,0,16);
2361
2362                         if (4*v0>5*vmc )
2363                         {
2364                                 /* use MC */
2365                                 var = vmc;
2366                                 mbi->mb_type = MB_FORWARD;
2367                                 if (mbi->motion_type==MC_FRAME)
2368                                 {
2369                                         mbi->MV[0][0][0] = framef_mc.pos.x - (i<<1);
2370                                         mbi->MV[0][0][1] = framef_mc.pos.y - (j<<1);
2371                                 }
2372                                 else if (mbi->motion_type==MC_DMV)
2373                                 {
2374                                         /* these are FRAME vectors */
2375                                         /* same parity vector */
2376                                         mbi->MV[0][0][0] = dualpf_mc.pos.x - (i<<1);
2377                                         mbi->MV[0][0][1] = (dualpf_mc.pos.y<<1) - (j<<1);
2378
2379                                         /* opposite parity vector */
2380                                         mbi->dmvector[0] = imindmv;
2381                                         mbi->dmvector[1] = jmindmv;
2382                                 }
2383                                 else
2384                                 {
2385                                         /* these are FRAME vectors */
2386                                         mbi->MV[0][0][0] = topfldf_mc.pos.x - (i<<1);
2387                                         mbi->MV[0][0][1] = (topfldf_mc.pos.y<<1) - (j<<1);
2388                                         mbi->MV[1][0][0] = botfldf_mc.pos.x - (i<<1);
2389                                         mbi->MV[1][0][1] = (botfldf_mc.pos.y<<1) - (j<<1);
2390                                         mbi->mv_field_sel[0][0] = topfldf_mc.fieldsel;
2391                                         mbi->mv_field_sel[1][0] = botfldf_mc.fieldsel;
2392                                 }
2393                         }
2394                         else
2395                         {
2396
2397                                 /* No-MC */
2398                                 var = v0;
2399                                 mbi->mb_type = 0;
2400                                 mbi->motion_type = MC_FRAME;
2401                                 mbi->MV[0][0][0] = 0;
2402                                 mbi->MV[0][0][1] = 0;
2403                         }
2404                 }
2405         }
2406         else /* if (pict_type==B_TYPE) */
2407         {
2408                 if (picture->frame_pred_dct)
2409                 {
2410                         var = variance(ssmb.mb,16,width);
2411                         /* forward */
2412                         fullsearch(engine, mc->oldorg[0],mc->oldref[0],&ssmb,
2413                                            width,i,j,mc->sxf,mc->syf,
2414                                            16,width,height,
2415                                            &framef_mc
2416                                            );
2417                         framef_mc.fieldoff = 0;
2418                         vmcf = framef_mc.var;
2419
2420                         /* backward */
2421                         fullsearch(engine, mc->neworg[0],mc->newref[0],&ssmb,
2422                                            width,i,j,mc->sxb,mc->syb,
2423                                            16,width,height,
2424                                            &frameb_mc);
2425                         frameb_mc.fieldoff = 0;
2426                         vmcr = frameb_mc.var;
2427
2428                         /* interpolated (bidirectional) */
2429
2430                         vmci = bidir_pred_var( &framef_mc, &frameb_mc, ssmb.mb, width, 16 );
2431
2432                         /* decisions */
2433
2434                         /* select between forward/backward/interpolated prediction:
2435                          * use the one with smallest mean sqaured prediction error
2436                          */
2437                         if (vmcf<=vmcr && vmcf<=vmci)
2438                         {
2439                                 vmc = vmcf;
2440                                 mbi->mb_type = MB_FORWARD;
2441                         }
2442                         else if (vmcr<=vmci)
2443                         {
2444                                 vmc = vmcr;
2445                                 mbi->mb_type = MB_BACKWARD;
2446                         }
2447                         else
2448                         {
2449                                 vmc = vmci;
2450                                 mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2451                         }
2452
2453                         mbi->motion_type = MC_FRAME;
2454                 }
2455                 else
2456                 {
2457                         /* forward prediction */
2458                         frame_estimate(engine, mc->oldorg[0],mc->oldref[0],&ssmb,
2459                                                    i,j,mc->sxf,mc->syf,
2460                                                    &framef_mc,
2461                                                    &topfldf_mc,
2462                                                    &botfldf_mc,
2463                                                    imins,jmins);
2464
2465
2466                         /* backward prediction */
2467                         frame_estimate(engine, mc->neworg[0],mc->newref[0],&ssmb,
2468                                                    i,j,mc->sxb,mc->syb,
2469                                                    &frameb_mc,
2470                                                    &topfldb_mc,
2471                                                    &botfldb_mc,
2472                                                    imins,jmins);
2473
2474                         vmcf = framef_mc.var;
2475                         vmcr = frameb_mc.var;
2476                         vmci = bidir_pred_var( &framef_mc, &frameb_mc, ssmb.mb, width, 16 );
2477
2478                         vmcfieldf = topfldf_mc.var + botfldf_mc.var;
2479                         vmcfieldr = topfldb_mc.var + botfldb_mc.var;
2480                         vmcfieldi = bidir_pred_var( &topfldf_mc, &topfldb_mc, ssmb.mb, 
2481                                                                                 width<<1, 8) +
2482                                         bidir_pred_var( &botfldf_mc, &botfldb_mc, ssmb.mb, 
2483                                                                                 width<<1, 8);
2484
2485                         /* select prediction type of minimum distance from the
2486                          * six candidates (field/frame * forward/backward/interpolated)
2487                          */
2488                         if (vmci<vmcfieldi && vmci<vmcf && vmci<vmcfieldf
2489                                   && vmci<vmcr && vmci<vmcfieldr)
2490                         {
2491                                 /* frame, interpolated */
2492                                 mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2493                                 mbi->motion_type = MC_FRAME;
2494                                 vmc = vmci;
2495                         }
2496                         else if ( vmcfieldi<vmcf && vmcfieldi<vmcfieldf
2497                                            && vmcfieldi<vmcr && vmcfieldi<vmcfieldr)
2498                         {
2499                                 /* field, interpolated */
2500                                 mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2501                                 mbi->motion_type = MC_FIELD;
2502                                 vmc = vmcfieldi;
2503                         }
2504                         else if (vmcf<vmcfieldf && vmcf<vmcr && vmcf<vmcfieldr)
2505                         {
2506                                 /* frame, forward */
2507                                 mbi->mb_type = MB_FORWARD;
2508                                 mbi->motion_type = MC_FRAME;
2509                                 vmc = vmcf;
2510
2511                         }
2512                         else if ( vmcfieldf<vmcr && vmcfieldf<vmcfieldr)
2513                         {
2514                                 /* field, forward */
2515                                 mbi->mb_type = MB_FORWARD;
2516                                 mbi->motion_type = MC_FIELD;
2517                                 vmc = vmcfieldf;
2518                         }
2519                         else if (vmcr<vmcfieldr)
2520                         {
2521                                 /* frame, backward */
2522                                 mbi->mb_type = MB_BACKWARD;
2523                                 mbi->motion_type = MC_FRAME;
2524                                 vmc = vmcr;
2525                         }
2526                         else
2527                         {
2528                                 /* field, backward */
2529                                 mbi->mb_type = MB_BACKWARD;
2530                                 mbi->motion_type = MC_FIELD;
2531                                 vmc = vmcfieldr;
2532
2533                         }
2534                 }
2535
2536                 /* select between intra or non-intra coding:
2537                  *
2538                  * selection is based on intra block variance (var) vs.
2539                  * prediction error variance (vmc)
2540                  *
2541                  * Used to be: blocks with small prediction error are always 
2542                  * coded non-intra even if variance is smaller (is this reasonable?
2543                  *
2544                  * TODO: A.Stevens Jul 2000
2545                  * The bbmpeg guys have found this to be *unreasonable*.
2546                  * I'm not sure I buy their solution using vmc*2 in the first comparison.
2547                  * It is probabably the vmc>= 9*256 test that is suspect.
2548                  *
2549                  */
2550                 if (vmc>var && vmc>=9*256)
2551                         mbi->mb_type = MB_INTRA;
2552                 else
2553                 {
2554                         var = vmc;
2555                         if (mbi->motion_type==MC_FRAME)
2556                         {
2557                                 /* forward */
2558                                 mbi->MV[0][0][0] = framef_mc.pos.x - (i<<1);
2559                                 mbi->MV[0][0][1] = framef_mc.pos.y - (j<<1);
2560                                 /* backward */
2561                                 mbi->MV[0][1][0] = frameb_mc.pos.x - (i<<1);
2562                                 mbi->MV[0][1][1] = frameb_mc.pos.y - (j<<1);
2563                         }
2564                         else
2565                         {
2566                                 /* these are FRAME vectors */
2567                                 /* forward */
2568                                 mbi->MV[0][0][0] = topfldf_mc.pos.x - (i<<1);
2569                                 mbi->MV[0][0][1] = (topfldf_mc.pos.y<<1) - (j<<1);
2570                                 mbi->MV[1][0][0] = botfldf_mc.pos.x - (i<<1);
2571                                 mbi->MV[1][0][1] = (botfldf_mc.pos.y<<1) - (j<<1);
2572                                 mbi->mv_field_sel[0][0] = topfldf_mc.fieldsel;
2573                                 mbi->mv_field_sel[1][0] = botfldf_mc.fieldsel;
2574                                 /* backward */
2575                                 mbi->MV[0][1][0] = topfldb_mc.pos.x - (i<<1);
2576                                 mbi->MV[0][1][1] = (topfldb_mc.pos.y<<1) - (j<<1);
2577                                 mbi->MV[1][1][0] = botfldb_mc.pos.x - (i<<1);
2578                                 mbi->MV[1][1][1] = (botfldb_mc.pos.y<<1) - (j<<1);
2579                                 mbi->mv_field_sel[0][1] = topfldb_mc.fieldsel;
2580                                 mbi->mv_field_sel[1][1] = botfldb_mc.fieldsel;
2581                         }
2582                 }
2583         }
2584 }
2585
2586
2587
2588 /*
2589  * motion estimation for field pictures
2590  *
2591  * mbi:    pointer to macroblock info structure
2592  * secondfield: indicates second field of a frame (in P fields this means
2593  *              that reference field of opposite parity is in curref instead
2594  *              of oldref)
2595  * ipflag: indicates a P type field which is the second field of a frame
2596  *         in which the first field is I type (this restricts predictions
2597  *         to be based only on the opposite parity (=I) field)
2598  *
2599  * results:
2600  * mbi->
2601  *  mb_type: 0, MB_INTRA, MB_FORWARD, MB_BACKWARD, MB_FORWARD|MB_BACKWARD
2602  *  MV[][][]: motion vectors (field format)
2603  *  mv_field_sel: top/bottom field
2604  *  motion_type: MC_FIELD, MC_16X8
2605  *
2606  * uses global vars: pict_type, pict_struct
2607  */
2608 static void field_ME(motion_engine_t *engine,
2609         pict_data_s *picture,
2610         motion_comp_s *mc,
2611         int mb_row_start,
2612         int i, int j, 
2613         mbinfo_s *mbi, 
2614         int secondfield, int ipflag)
2615 {
2616         int w2;
2617         uint8_t *toporg, *topref, *botorg, *botref;
2618         subsampled_mb_s ssmb;
2619         mb_motion_s fields_mc, dualp_mc;
2620         mb_motion_s fieldf_mc, fieldb_mc;
2621         mb_motion_s field8uf_mc, field8lf_mc;
2622         mb_motion_s field8ub_mc, field8lb_mc;
2623         int var,vmc,v0,dmc,dmcfieldi,dmcfield,dmcfieldf,dmcfieldr,dmc8i;
2624         int imins,jmins;
2625         int dmc8f,dmc8r;
2626         int vmc_dp,dmc_dp;
2627
2628         w2 = width<<1;
2629
2630         /* Fast motion data sits at the end of the luminance buffer */
2631         ssmb.mb = mc->cur[0] + i + w2*j;
2632         ssmb.umb = mc->cur[1] + ((i>>1)+(w2>>1)*(j>>1));
2633         ssmb.vmb = mc->cur[2] + ((i>>1)+(w2>>1)*(j>>1));
2634         ssmb.fmb = mc->cur[0] + fsubsample_offset+((i>>1)+(w2>>1)*(j>>1));
2635         ssmb.qmb = mc->cur[0] + qsubsample_offset+ (i>>2)+(w2>>2)*(j>>2);
2636
2637         if (picture->pict_struct==BOTTOM_FIELD)
2638         {
2639                 ssmb.mb += width;
2640                 ssmb.umb += (width >> 1);
2641                 ssmb.vmb += (width >> 1);
2642                 ssmb.fmb += (width >> 1);
2643                 ssmb.qmb += (width >> 2);
2644         }
2645
2646         var = variance(ssmb.mb,16,w2) + 
2647                 ( variance(ssmb.umb,8,(width>>1)) + variance(ssmb.vmb,8,(width>>1)))*2;
2648
2649         if (picture->pict_type==I_TYPE)
2650                 mbi->mb_type = MB_INTRA;
2651         else if (picture->pict_type==P_TYPE)
2652         {
2653                 toporg = mc->oldorg[0];
2654                 topref = mc->oldref[0];
2655                 botorg = mc->oldorg[0] + width;
2656                 botref = mc->oldref[0] + width;
2657
2658                 if (secondfield)
2659                 {
2660                         /* opposite parity field is in same frame */
2661                         if (picture->pict_struct==TOP_FIELD)
2662                         {
2663                                 /* current is top field */
2664                                 botorg = mc->cur[0] + width;
2665                                 botref = mc->curref[0] + width;
2666                         }
2667                         else
2668                         {
2669                                 /* current is bottom field */
2670                                 toporg = mc->cur[0];
2671                                 topref = mc->curref[0];
2672                         }
2673                 }
2674
2675                 field_estimate(engine,
2676                                                 picture,
2677                                            toporg,topref,botorg,botref,&ssmb,
2678                                            i,j,mc->sxf,mc->syf,ipflag,
2679                                            &fieldf_mc,
2680                                            &field8uf_mc,
2681                                            &field8lf_mc,
2682                                            &fields_mc);
2683                 dmcfield = fieldf_mc.sad;
2684                 dmc8f = field8uf_mc.sad + field8lf_mc.sad;
2685
2686 //              if (M==1 && !ipflag)  /* generic condition which permits Dual Prime */
2687 //              {
2688 //                      dpfield_estimate(engine,
2689 //                                                      picture,
2690 //                                                       topref,botref,ssmb.mb,i,j,imins,jmins,
2691 //                                                       &dualp_mc,
2692 //                                                       &vmc_dp);
2693 //                      dmc_dp = dualp_mc.sad;
2694 //              }
2695                 
2696 //              /* select between dual prime, field and 16x8 prediction */
2697 //              if (M==1 && !ipflag && dmc_dp<dmc8f && dmc_dp<dmcfield)
2698 //              {
2699 //                      /* Dual Prime prediction */
2700 //                      mbi->motion_type = MC_DMV;
2701 //                      dmc = dualp_mc.sad;
2702 //                      vmc = vmc_dp;
2703 //
2704 //              }
2705 //              else 
2706                 if (dmc8f<dmcfield)
2707                 {
2708                         /* 16x8 prediction */
2709                         mbi->motion_type = MC_16X8;
2710                         /* upper and lower half blocks */
2711                         vmc =  unidir_pred_var( &field8uf_mc, ssmb.mb, w2, 8);
2712                         vmc += unidir_pred_var( &field8lf_mc, ssmb.mb, w2, 8);
2713                 }
2714                 else
2715                 {
2716                         /* field prediction */
2717                         mbi->motion_type = MC_FIELD;
2718                         vmc = unidir_pred_var( &fieldf_mc, ssmb.mb, w2, 16 );
2719                 }
2720
2721                 /* select between intra and non-intra coding */
2722
2723                 if ( vmc>var && vmc>=9*256)
2724                         mbi->mb_type = MB_INTRA;
2725                 else
2726                 {
2727                         /* zero MV field prediction from same parity ref. field
2728                          * (not allowed if ipflag is set)
2729                          */
2730                         if (!ipflag)
2731                                 v0 = (*pdist2)(((picture->pict_struct==BOTTOM_FIELD)?botref:topref) + i + w2*j,
2732                                                            ssmb.mb,w2,0,0,16);
2733                         if (ipflag  || (4*v0>5*vmc && v0>=9*256))
2734                         {
2735                                 var = vmc;
2736                                 mbi->mb_type = MB_FORWARD;
2737                                 if (mbi->motion_type==MC_FIELD)
2738                                 {
2739                                         mbi->MV[0][0][0] = fieldf_mc.pos.x - (i<<1);
2740                                         mbi->MV[0][0][1] = fieldf_mc.pos.y - (j<<1);
2741                                         mbi->mv_field_sel[0][0] = fieldf_mc.fieldsel;
2742                                 }
2743                                 else if (mbi->motion_type==MC_DMV)
2744                                 {
2745                                         /* same parity vector */
2746                                         mbi->MV[0][0][0] = imins - (i<<1);
2747                                         mbi->MV[0][0][1] = jmins - (j<<1);
2748
2749                                         /* opposite parity vector */
2750                                         mbi->dmvector[0] = dualp_mc.pos.x;
2751                                         mbi->dmvector[1] = dualp_mc.pos.y;
2752                                 }
2753                                 else
2754                                 {
2755                                         mbi->MV[0][0][0] = field8uf_mc.pos.x - (i<<1);
2756                                         mbi->MV[0][0][1] = field8uf_mc.pos.y - (j<<1);
2757                                         mbi->MV[1][0][0] = field8lf_mc.pos.x - (i<<1);
2758                                         mbi->MV[1][0][1] = field8lf_mc.pos.y - ((j+8)<<1);
2759                                         mbi->mv_field_sel[0][0] = field8uf_mc.fieldsel;
2760                                         mbi->mv_field_sel[1][0] = field8lf_mc.fieldsel;
2761                                 }
2762                         }
2763                         else
2764                         {
2765                                 /* No MC */
2766                                 var = v0;
2767                                 mbi->mb_type = 0;
2768                                 mbi->motion_type = MC_FIELD;
2769                                 mbi->MV[0][0][0] = 0;
2770                                 mbi->MV[0][0][1] = 0;
2771                                 mbi->mv_field_sel[0][0] = (picture->pict_struct==BOTTOM_FIELD);
2772                         }
2773                 }
2774         }
2775         else /* if (pict_type==B_TYPE) */
2776         {
2777                 /* forward prediction */
2778                 field_estimate(engine, 
2779                                                 picture,
2780                                            mc->oldorg[0],mc->oldref[0],
2781                                            mc->oldorg[0]+width,mc->oldref[0]+width,&ssmb,
2782                                            i,j,mc->sxf,mc->syf,0,
2783                                            &fieldf_mc,
2784                                            &field8uf_mc,
2785                                            &field8lf_mc,
2786                                            &fields_mc);
2787                 dmcfieldf = fieldf_mc.sad;
2788                 dmc8f = field8uf_mc.sad + field8lf_mc.sad;
2789
2790                 /* backward prediction */
2791                 field_estimate(engine,
2792                                                 picture,
2793                                            mc->neworg[0],mc->newref[0],mc->neworg[0]+width,mc->newref[0]+width,
2794                                            &ssmb,
2795                                            i,j,mc->sxb,mc->syb,0,
2796                                            &fieldb_mc,
2797                                            &field8ub_mc,
2798                                            &field8lb_mc,
2799                                            &fields_mc);
2800                 dmcfieldr = fieldb_mc.sad;
2801                 dmc8r = field8ub_mc.sad + field8lb_mc.sad;
2802
2803                 /* calculate distances for bidirectional prediction */
2804                 /* field */
2805                 dmcfieldi = bidir_pred_sad( &fieldf_mc, &fieldb_mc, ssmb.mb, w2, 16);
2806
2807                 /* 16x8 upper and lower half blocks */
2808                 dmc8i =  bidir_pred_sad( &field8uf_mc, &field8ub_mc, ssmb.mb, w2, 16 );
2809                 dmc8i += bidir_pred_sad( &field8lf_mc, &field8lb_mc, ssmb.mb, w2, 16 );
2810
2811                 /* select prediction type of minimum distance */
2812                 if (dmcfieldi<dmc8i && dmcfieldi<dmcfieldf && dmcfieldi<dmc8f
2813                         && dmcfieldi<dmcfieldr && dmcfieldi<dmc8r)
2814                 {
2815                         /* field, interpolated */
2816                         mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2817                         mbi->motion_type = MC_FIELD;
2818                         vmc = bidir_pred_var( &fieldf_mc, &fieldb_mc, ssmb.mb, w2, 16);
2819                 }
2820                 else if (dmc8i<dmcfieldf && dmc8i<dmc8f
2821                                  && dmc8i<dmcfieldr && dmc8i<dmc8r)
2822                 {
2823                         /* 16x8, interpolated */
2824                         mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2825                         mbi->motion_type = MC_16X8;
2826
2827                         /* upper and lower half blocks */
2828                         vmc =  bidir_pred_var( &field8uf_mc, &field8ub_mc, ssmb.mb, w2, 8);
2829                         vmc += bidir_pred_var( &field8lf_mc, &field8lb_mc, ssmb.mb, w2, 8);
2830                 }
2831                 else if (dmcfieldf<dmc8f && dmcfieldf<dmcfieldr && dmcfieldf<dmc8r)
2832                 {
2833                         /* field, forward */
2834                         mbi->mb_type = MB_FORWARD;
2835                         mbi->motion_type = MC_FIELD;
2836                         vmc = unidir_pred_var( &fieldf_mc, ssmb.mb, w2, 16);
2837                 }
2838                 else if (dmc8f<dmcfieldr && dmc8f<dmc8r)
2839                 {
2840                         /* 16x8, forward */
2841                         mbi->mb_type = MB_FORWARD;
2842                         mbi->motion_type = MC_16X8;
2843
2844                         /* upper and lower half blocks */
2845                         vmc =  unidir_pred_var( &field8uf_mc, ssmb.mb, w2, 8);
2846                         vmc += unidir_pred_var( &field8lf_mc, ssmb.mb, w2, 8);
2847                 }
2848                 else if (dmcfieldr<dmc8r)
2849                 {
2850                         /* field, backward */
2851                         mbi->mb_type = MB_BACKWARD;
2852                         mbi->motion_type = MC_FIELD;
2853                         vmc = unidir_pred_var( &fieldb_mc, ssmb.mb, w2, 16 );
2854                 }
2855                 else
2856                 {
2857                         /* 16x8, backward */
2858                         mbi->mb_type = MB_BACKWARD;
2859                         mbi->motion_type = MC_16X8;
2860
2861                         /* upper and lower half blocks */
2862                         vmc =  unidir_pred_var( &field8ub_mc, ssmb.mb, w2, 8);
2863                         vmc += unidir_pred_var( &field8lb_mc, ssmb.mb, w2, 8);
2864
2865                 }
2866
2867                 /* select between intra and non-intra coding */
2868                 if (vmc>var && vmc>=9*256)
2869                         mbi->mb_type = MB_INTRA;
2870                 else
2871                 {
2872                         var = vmc;
2873                         if (mbi->motion_type==MC_FIELD)
2874                         {
2875                                 /* forward */
2876                                 mbi->MV[0][0][0] = fieldf_mc.pos.x - (i<<1);
2877                                 mbi->MV[0][0][1] = fieldf_mc.pos.y - (j<<1);
2878                                 mbi->mv_field_sel[0][0] = fieldf_mc.fieldsel;
2879                                 /* backward */
2880                                 mbi->MV[0][1][0] = fieldb_mc.pos.x - (i<<1);
2881                                 mbi->MV[0][1][1] = fieldb_mc.pos.y - (j<<1);
2882                                 mbi->mv_field_sel[0][1] = fieldb_mc.fieldsel;
2883                         }
2884                         else /* MC_16X8 */
2885                         {
2886                                 /* forward */
2887                                 mbi->MV[0][0][0] = field8uf_mc.pos.x - (i<<1);
2888                                 mbi->MV[0][0][1] = field8uf_mc.pos.y - (j<<1);
2889                                 mbi->mv_field_sel[0][0] = field8uf_mc.fieldsel;
2890                                 mbi->MV[1][0][0] = field8lf_mc.pos.x - (i<<1);
2891                                 mbi->MV[1][0][1] = field8lf_mc.pos.y - ((j+8)<<1);
2892                                 mbi->mv_field_sel[1][0] = field8lf_mc.fieldsel;
2893                                 /* backward */
2894                                 mbi->MV[0][1][0] = field8ub_mc.pos.x - (i<<1);
2895                                 mbi->MV[0][1][1] = field8ub_mc.pos.y - (j<<1);
2896                                 mbi->mv_field_sel[0][1] = field8ub_mc.fieldsel;
2897                                 mbi->MV[1][1][0] = field8lb_mc.pos.x - (i<<1);
2898                                 mbi->MV[1][1][1] = field8lb_mc.pos.y - ((j+8)<<1);
2899                                 mbi->mv_field_sel[1][1] = field8lb_mc.fieldsel;
2900                         }
2901                 }
2902         }
2903
2904 }
2905
2906
2907
2908 /*
2909   Initialise motion compensation - currently purely selection of which
2910   versions of the various low level computation routines to use
2911   
2912   */
2913
2914 void init_motion()
2915 {
2916         int cpucap = cpu_accel();
2917
2918         if( cpucap  == 0 )      /* No MMX/SSE etc support available */
2919         {
2920                 pdist22 = dist22;
2921                 pdist44 = dist44;
2922                 pdist1_00 = dist1_00;
2923                 pdist1_01 = dist1_01;
2924                 pdist1_10 = dist1_10;
2925                 pdist1_11 = dist1_11;
2926                 pbdist1 = bdist1;
2927                 pdist2 = dist2;
2928                 pbdist2 = bdist2;
2929                 pdist2_22 = dist2_22;
2930                 pbdist2_22 = bdist2_22;
2931                 pfind_best_one_pel = find_best_one_pel;
2932                 pbuild_sub22_mcomps     = build_sub22_mcomps;
2933          }
2934 #ifdef X86_CPU
2935         else if(cpucap & ACCEL_X86_MMXEXT ) /* AMD MMX or SSE... */
2936         {
2937                 if(verbose) fprintf( stderr, "SETTING EXTENDED MMX for MOTION!\n");
2938                 pdist22 = dist22_mmxe;
2939                 pdist44 = dist44_mmxe;
2940                 pdist1_00 = dist1_00_mmxe;
2941                 pdist1_01 = dist1_01_mmxe;
2942                 pdist1_10 = dist1_10_mmxe;
2943                 pdist1_11 = dist1_11_mmxe;
2944                 pbdist1 = bdist1_mmx;
2945                 pdist2 = dist2_mmx;
2946                 pbdist2 = bdist2_mmx;
2947                 pdist2_22 = dist2_22_mmx;
2948                 pbdist2_22 = bdist2_22_mmx;
2949                 pfind_best_one_pel = find_best_one_pel_mmxe;
2950                 pbuild_sub22_mcomps     = build_sub22_mcomps_mmxe;
2951                 pmblock_sub44_dists = mblock_sub44_dists_mmxe;
2952
2953         }
2954         else if(cpucap & ACCEL_X86_MMX) /* Ordinary MMX CPU */
2955         {
2956                 if(verbose) fprintf( stderr, "SETTING MMX for MOTION!\n");
2957                 pdist22 = dist22_mmx;
2958                 pdist44 = dist44_mmx;
2959                 pdist1_00 = dist1_00_mmx;
2960                 pdist1_01 = dist1_01_mmx;
2961                 pdist1_10 = dist1_10_mmx;
2962                 pdist1_11 = dist1_11_mmx;
2963                 pbdist1 = bdist1_mmx;
2964                 pdist2 = dist2_mmx;
2965                 pbdist2 = bdist2_mmx;
2966                 pdist2_22 = dist2_22_mmx;
2967                 pbdist2_22 = bdist2_22_mmx;
2968                 pfind_best_one_pel = find_best_one_pel;
2969                 pbuild_sub22_mcomps     = build_sub22_mcomps;
2970                 pmblock_sub44_dists = mblock_sub44_dists_mmx;
2971         }
2972 #endif
2973 }
2974
2975
2976 void motion_engine_loop(motion_engine_t *engine)
2977 {
2978         while(!engine->done)
2979         {
2980                 pthread_mutex_lock(&(engine->input_lock));
2981
2982                 if(!engine->done)
2983                 {
2984                         pict_data_s *picture = engine->pict_data;
2985                         motion_comp_s *mc_data = engine->motion_comp;
2986                         int secondfield = engine->secondfield;
2987                         int ipflag = engine->ipflag;
2988                         mbinfo_s *mbi = picture->mbinfo + (engine->start_row / 16) * (width / 16);
2989                         int i, j;
2990                         int mb_row_incr;                        /* Offset increment to go down 1 row of mb's */
2991                         int mb_row_start;
2992
2993                         if (picture->pict_struct == FRAME_PICTURE)
2994                         {                       
2995                                 mb_row_incr = 16*width;
2996                                 mb_row_start = engine->start_row / 16 * mb_row_incr;
2997 /* loop through all macroblocks of a frame picture */
2998                                 for (j=engine->start_row; j < engine->end_row; j+=16)
2999                                 {
3000                                         for (i=0; i<width; i+=16)
3001                                         {
3002                                                 frame_ME(engine, picture, mc_data, mb_row_start, i,j, mbi);
3003                                                 mbi++;
3004                                         }
3005                                         mb_row_start += mb_row_incr;
3006                                 }
3007                         }
3008                         else
3009                         {               
3010                                 mb_row_incr = (16 * 2) * width;
3011                                 mb_row_start = engine->start_row / 16 * mb_row_incr;
3012 /* loop through all macroblocks of a field picture */
3013                                 for (j=engine->start_row; j < engine->end_row; j+=16)
3014                                 {
3015                                         for (i=0; i<width; i+=16)
3016                                         {
3017                                                 field_ME(engine, picture, mc_data, mb_row_start, i,j,
3018                                                                  mbi,secondfield,ipflag);
3019                                                 mbi++;
3020                                         }
3021                                         mb_row_start += mb_row_incr;
3022                                 }
3023                         }
3024                 }
3025
3026                 pthread_mutex_unlock(&(engine->output_lock));
3027         }
3028 }
3029
3030
3031 void motion_estimation(pict_data_s *picture,
3032         motion_comp_s *mc_data,
3033         int secondfield, int ipflag)
3034 {
3035         int i;
3036 /* Start loop */
3037         for(i = 0; i < processors; i++)
3038         {
3039                 motion_engines[i].motion_comp = mc_data;
3040
3041                 motion_engines[i].pict_data = picture;
3042
3043                 motion_engines[i].secondfield = secondfield;
3044                 motion_engines[i].ipflag = ipflag;
3045                 pthread_mutex_unlock(&(motion_engines[i].input_lock));
3046         }
3047
3048 /* Wait for completion */
3049         for(i = 0; i < processors; i++)
3050         {
3051                 pthread_mutex_lock(&(motion_engines[i].output_lock));
3052         }
3053 }
3054
3055 void start_motion_engines()
3056 {
3057         int i;
3058         int rows_per_processor = (int)((float)height2 / 16 / processors + 0.5);
3059         int current_row = 0;
3060         pthread_attr_t  attr;
3061         pthread_mutexattr_t mutex_attr;
3062
3063         pthread_mutexattr_init(&mutex_attr);
3064         pthread_attr_init(&attr);
3065         motion_engines = calloc(1, sizeof(motion_engine_t) * processors);
3066         for(i = 0; i < processors; i++)
3067         {
3068                 motion_engines[i].start_row = current_row * i * 16;
3069                 current_row += rows_per_processor;
3070                 if(current_row > height2 / 16) current_row = height2 / 16;
3071                 motion_engines[i].end_row = current_row * 16;
3072                 pthread_mutex_init(&(motion_engines[i].input_lock), &mutex_attr);
3073                 pthread_mutex_lock(&(motion_engines[i].input_lock));
3074                 pthread_mutex_init(&(motion_engines[i].output_lock), &mutex_attr);
3075                 pthread_mutex_lock(&(motion_engines[i].output_lock));
3076                 motion_engines[i].done = 0;
3077                 pthread_create(&(motion_engines[i].tid), 
3078                         &attr, 
3079                         (void*)motion_engine_loop, 
3080                         &motion_engines[i]);
3081         }
3082 }
3083
3084 void stop_motion_engines()
3085 {
3086         int i;
3087         for(i = 0; i < processors; i++)
3088         {
3089                 motion_engines[i].done = 1;
3090                 pthread_mutex_unlock(&(motion_engines[i].input_lock));
3091                 pthread_join(motion_engines[i].tid, 0);
3092                 pthread_mutex_destroy(&(motion_engines[i].input_lock));
3093                 pthread_mutex_destroy(&(motion_engines[i].output_lock));
3094         }
3095         free(motion_engines);
3096 }
3097
3098