Libav
dct-test.c
Go to the documentation of this file.
1 /*
2  * (c) 2001 Fabrice Bellard
3  * 2007 Marc Hoffman <marc.hoffman@analog.com>
4  *
5  * This file is part of Libav.
6  *
7  * Libav is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * Libav is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with Libav; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
28 #include "config.h"
29 #include <stdlib.h>
30 #include <stdio.h>
31 #include <string.h>
32 #if HAVE_UNISTD_H
33 #include <unistd.h>
34 #endif
35 #include <math.h>
36 
37 #include "libavutil/cpu.h"
38 #include "libavutil/common.h"
39 #include "libavutil/lfg.h"
40 #include "libavutil/time.h"
41 
42 #include "dct.h"
43 #include "simple_idct.h"
44 #include "aandcttab.h"
45 #include "faandct.h"
46 #include "faanidct.h"
47 #include "x86/idct_xvid.h"
48 #include "dctref.h"
49 
50 // BFIN
51 void ff_bfin_idct(int16_t *block);
52 void ff_bfin_fdct(int16_t *block);
53 
54 // ALTIVEC
55 void ff_fdct_altivec(int16_t *block);
56 
57 // ARM
58 void ff_j_rev_dct_arm(int16_t *data);
59 void ff_simple_idct_arm(int16_t *data);
60 void ff_simple_idct_armv5te(int16_t *data);
61 void ff_simple_idct_armv6(int16_t *data);
62 void ff_simple_idct_neon(int16_t *data);
63 
/**
 * Description of one (I)DCT implementation under test.
 */
struct algo {
    const char *name;               /* label printed in reports; NULL ends a table */
    void (*func)(int16_t *block);   /* in-place transform of a 64-coefficient block */
    /* Coefficient layout tag: selects the reordering applied by permute()
     * and, for SCALE_PERM, the output rescaling done in dct_error(). */
    enum formattag {
        NO_PERM,
        MMX_PERM,
        MMX_SIMPLE_PERM,
        SCALE_PERM,
        SSE2_PERM,
        PARTTRANS_PERM,
    } format;
    int mm_support;                 /* AV_CPU_FLAG_* bits required to run func, 0 if none */
    int nonspec;                    /* non-zero: an accuracy failure does not fail the run */
};

/* CPU capability flags that main() matches against algo.mm_support. */
static int cpu_flags;
74 
75 static const struct algo fdct_tab[] = {
76  { "REF-DBL", ff_ref_fdct, NO_PERM },
77  { "FAAN", ff_faandct, NO_PERM },
78  { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
79  { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
80 
81 #if HAVE_MMX_INLINE
82  { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
83 #endif
84 #if HAVE_MMXEXT_INLINE
86 #endif
87 #if HAVE_SSE2_INLINE
89 #endif
90 
91 #if HAVE_ALTIVEC
92  { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
93 #endif
94 
95 #if ARCH_BFIN
96  { "BFINfdct", ff_bfin_fdct, NO_PERM },
97 #endif
98 
99  { 0 }
100 };
101 
102 static const struct algo idct_tab[] = {
103  { "FAANI", ff_faanidct, NO_PERM },
104  { "REF-DBL", ff_ref_idct, NO_PERM },
105  { "INT", ff_j_rev_dct, MMX_PERM },
106  { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
107 
108 #if HAVE_MMX_INLINE
110  { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
111 #endif
112 #if HAVE_MMXEXT_INLINE
113  { "XVID-MMXEXT", ff_idct_xvid_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT, 1 },
114 #endif
115 #if HAVE_SSE2_INLINE
116  { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
117 #endif
118 
119 #if ARCH_BFIN
120  { "BFINidct", ff_bfin_idct, NO_PERM },
121 #endif
122 
123 #if ARCH_ARM
124  { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
125  { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
126 #endif
127 #if HAVE_ARMV5TE
128  { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM, AV_CPU_FLAG_ARMV5TE },
129 #endif
130 #if HAVE_ARMV6
131  { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM, AV_CPU_FLAG_ARMV6 },
132 #endif
133 #if HAVE_NEON && ARCH_ARM
135 #endif
136 
137  { 0 }
138 };
139 
/* Number of fractional bits in the fixed-point factor dct_error() uses to
 * undo the output scaling of SCALE_PERM transforms. */
140 #define AANSCALE_BITS 12
141 
/* Number of generated blocks per accuracy run. */
142 #define NB_ITS 20000
/* Number of transform calls between two clock reads in the speed tests. */
143 #define NB_ITS_SPEED 50000
144 
/* Destination index of each source coefficient for MMX_PERM transforms;
 * filled in at start-up by idct_mmx_init(). */
static short idct_mmx_perm[64];

/* Destination index of each source coefficient for MMX_SIMPLE_PERM
 * transforms. Never written, hence const. */
static const short idct_simple_mmx_perm[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

/* Destination column of each source column (within a row of eight) for
 * SSE2_PERM transforms. */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
159 
160 static void idct_mmx_init(void)
161 {
162  int i;
163 
164  /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
165  for (i = 0; i < 64; i++) {
166  idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
167  }
168 }
169 
/* Scratch coefficient buffers shared by dct_error() and idct248_error().
 * The transform under test always runs in-place on `block`; `block1` holds
 * the matching input or reference data. The first macro argument is the
 * requested alignment (presumably in bytes — confirm in libavutil/mem.h). */
170 DECLARE_ALIGNED(16, static int16_t, block)[64];
171 DECLARE_ALIGNED(8, static int16_t, block1)[64];
172 
173 static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng)
174 {
175  int i, j;
176 
177  memset(block, 0, 64 * sizeof(*block));
178 
179  switch (test) {
180  case 0:
181  for (i = 0; i < 64; i++)
182  block[i] = (av_lfg_get(prng) % 512) - 256;
183  if (is_idct) {
184  ff_ref_fdct(block);
185  for (i = 0; i < 64; i++)
186  block[i] >>= 3;
187  }
188  break;
189  case 1:
190  j = av_lfg_get(prng) % 10 + 1;
191  for (i = 0; i < j; i++)
192  block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % 512 - 256;
193  break;
194  case 2:
195  block[ 0] = av_lfg_get(prng) % 4096 - 2048;
196  block[63] = (block[0] & 1) ^ 1;
197  break;
198  }
199 }
200 
201 static void permute(int16_t dst[64], const int16_t src[64], int perm)
202 {
203  int i;
204 
205  if (perm == MMX_PERM) {
206  for (i = 0; i < 64; i++)
207  dst[idct_mmx_perm[i]] = src[i];
208  } else if (perm == MMX_SIMPLE_PERM) {
209  for (i = 0; i < 64; i++)
210  dst[idct_simple_mmx_perm[i]] = src[i];
211  } else if (perm == SSE2_PERM) {
212  for (i = 0; i < 64; i++)
213  dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
214  } else if (perm == PARTTRANS_PERM) {
215  for (i = 0; i < 64; i++)
216  dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
217  } else {
218  for (i = 0; i < 64; i++)
219  dst[i] = src[i];
220  }
221 }
222 
223 static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
224 {
225  void (*ref)(int16_t *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
226  int it, i, scale;
227  int err_inf, v;
228  int64_t err2, ti, ti1, it1, err_sum = 0;
229  int64_t sysErr[64], sysErrMax = 0;
230  int maxout = 0;
231  int blockSumErrMax = 0, blockSumErr;
232  AVLFG prng;
233  double omse, ome;
234  int spec_err;
235 
236  av_lfg_init(&prng, 1);
237 
238  err_inf = 0;
239  err2 = 0;
240  for (i = 0; i < 64; i++)
241  sysErr[i] = 0;
242  for (it = 0; it < NB_ITS; it++) {
243  init_block(block1, test, is_idct, &prng);
244  permute(block, block1, dct->format);
245 
246  dct->func(block);
247  emms_c();
248 
249  if (dct->format == SCALE_PERM) {
250  for (i = 0; i < 64; i++) {
251  scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
252  block[i] = (block[i] * scale) >> AANSCALE_BITS;
253  }
254  }
255 
256  ref(block1);
257 
258  blockSumErr = 0;
259  for (i = 0; i < 64; i++) {
260  int err = block[i] - block1[i];
261  err_sum += err;
262  v = abs(err);
263  if (v > err_inf)
264  err_inf = v;
265  err2 += v * v;
266  sysErr[i] += block[i] - block1[i];
267  blockSumErr += v;
268  if (abs(block[i]) > maxout)
269  maxout = abs(block[i]);
270  }
271  if (blockSumErrMax < blockSumErr)
272  blockSumErrMax = blockSumErr;
273  }
274  for (i = 0; i < 64; i++)
275  sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
276 
277  for (i = 0; i < 64; i++) {
278  if (i % 8 == 0)
279  printf("\n");
280  printf("%7d ", (int) sysErr[i]);
281  }
282  printf("\n");
283 
284  omse = (double) err2 / NB_ITS / 64;
285  ome = (double) err_sum / NB_ITS / 64;
286 
287  spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
288 
289  printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
290  is_idct ? "IDCT" : "DCT", dct->name, err_inf,
291  omse, ome, (double) sysErrMax / NB_ITS,
292  maxout, blockSumErrMax);
293 
294  if (spec_err && !dct->nonspec)
295  return 1;
296 
297  if (!speed)
298  return 0;
299 
300  /* speed test */
301  init_block(block, test, is_idct, &prng);
302  permute(block1, block, dct->format);
303 
304  ti = av_gettime();
305  it1 = 0;
306  do {
307  for (it = 0; it < NB_ITS_SPEED; it++) {
308  memcpy(block, block1, sizeof(block));
309  dct->func(block);
310  }
311  it1 += NB_ITS_SPEED;
312  ti1 = av_gettime() - ti;
313  } while (ti1 < 1000000);
314  emms_c();
315 
316  printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
317  (double) it1 * 1000.0 / (double) ti1);
318 
319  return 0;
320 }
321 
324 
325 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
326 {
327  static int init;
328  static double c8[8][8];
329  static double c4[4][4];
330  double block1[64], block2[64], block3[64];
331  double s, sum, v;
332  int i, j, k;
333 
334  if (!init) {
335  init = 1;
336 
337  for (i = 0; i < 8; i++) {
338  sum = 0;
339  for (j = 0; j < 8; j++) {
340  s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
341  c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
342  sum += c8[i][j] * c8[i][j];
343  }
344  }
345 
346  for (i = 0; i < 4; i++) {
347  sum = 0;
348  for (j = 0; j < 4; j++) {
349  s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
350  c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
351  sum += c4[i][j] * c4[i][j];
352  }
353  }
354  }
355 
356  /* butterfly */
357  s = 0.5 * sqrt(2.0);
358  for (i = 0; i < 4; i++) {
359  for (j = 0; j < 8; j++) {
360  block1[8 * (2 * i) + j] =
361  (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
362  block1[8 * (2 * i + 1) + j] =
363  (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
364  }
365  }
366 
367  /* idct8 on lines */
368  for (i = 0; i < 8; i++) {
369  for (j = 0; j < 8; j++) {
370  sum = 0;
371  for (k = 0; k < 8; k++)
372  sum += c8[k][j] * block1[8 * i + k];
373  block2[8 * i + j] = sum;
374  }
375  }
376 
377  /* idct4 */
378  for (i = 0; i < 8; i++) {
379  for (j = 0; j < 4; j++) {
380  /* top */
381  sum = 0;
382  for (k = 0; k < 4; k++)
383  sum += c4[k][j] * block2[8 * (2 * k) + i];
384  block3[8 * (2 * j) + i] = sum;
385 
386  /* bottom */
387  sum = 0;
388  for (k = 0; k < 4; k++)
389  sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
390  block3[8 * (2 * j + 1) + i] = sum;
391  }
392  }
393 
394  /* clamp and store the result */
395  for (i = 0; i < 8; i++) {
396  for (j = 0; j < 8; j++) {
397  v = block3[8 * i + j];
398  if (v < 0) v = 0;
399  else if (v > 255) v = 255;
400  dest[i * linesize + j] = (int) rint(v);
401  }
402  }
403 }
404 
405 static void idct248_error(const char *name,
406  void (*idct248_put)(uint8_t *dest, int line_size,
407  int16_t *block),
408  int speed)
409 {
410  int it, i, it1, ti, ti1, err_max, v;
411  AVLFG prng;
412 
413  av_lfg_init(&prng, 1);
414 
415  /* just one test to see if code is correct (precision is less
416  important here) */
417  err_max = 0;
418  for (it = 0; it < NB_ITS; it++) {
419  /* XXX: use forward transform to generate values */
420  for (i = 0; i < 64; i++)
421  block1[i] = av_lfg_get(&prng) % 256 - 128;
422  block1[0] += 1024;
423 
424  for (i = 0; i < 64; i++)
425  block[i] = block1[i];
426  idct248_ref(img_dest1, 8, block);
427 
428  for (i = 0; i < 64; i++)
429  block[i] = block1[i];
430  idct248_put(img_dest, 8, block);
431 
432  for (i = 0; i < 64; i++) {
433  v = abs((int) img_dest[i] - (int) img_dest1[i]);
434  if (v == 255)
435  printf("%d %d\n", img_dest[i], img_dest1[i]);
436  if (v > err_max)
437  err_max = v;
438  }
439  }
440  printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
441 
442  if (!speed)
443  return;
444 
445  ti = av_gettime();
446  it1 = 0;
447  do {
448  for (it = 0; it < NB_ITS_SPEED; it++) {
449  for (i = 0; i < 64; i++)
450  block[i] = block1[i];
451  idct248_put(img_dest, 8, block);
452  }
453  it1 += NB_ITS_SPEED;
454  ti1 = av_gettime() - ti;
455  } while (ti1 < 1000000);
456  emms_c();
457 
458  printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
459  (double) it1 * 1000.0 / (double) ti1);
460 }
461 
/**
 * Print the command-line usage summary to stdout.
 */
static void help(void)
{
    static const char usage[] =
        "dct-test [-i] [<test-number>]\n"
        "test-number 0 -> test with random matrixes\n"
        " 1 -> test with random sparse matrixes\n"
        " 2 -> do 3. test from mpeg4 std\n"
        "-i test IDCT implementations\n"
        "-4 test IDCT248 implementations\n"
        "-t speed test\n";

    fputs(usage, stdout);
}
472 
473 #if !HAVE_GETOPT
474 #include "compat/getopt.c"
475 #endif
476 
477 int main(int argc, char **argv)
478 {
479  int test_idct = 0, test_248_dct = 0;
480  int c, i;
481  int test = 1;
482  int speed = 0;
483  int err = 0;
484 
486 
487  ff_ref_dct_init();
488  idct_mmx_init();
489 
490  for (;;) {
491  c = getopt(argc, argv, "ih4t");
492  if (c == -1)
493  break;
494  switch (c) {
495  case 'i':
496  test_idct = 1;
497  break;
498  case '4':
499  test_248_dct = 1;
500  break;
501  case 't':
502  speed = 1;
503  break;
504  default:
505  case 'h':
506  help();
507  return 0;
508  }
509  }
510 
511  if (optind < argc)
512  test = atoi(argv[optind]);
513 
514  printf("Libav DCT/IDCT test\n");
515 
516  if (test_248_dct) {
517  idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
518  } else {
519  const struct algo *algos = test_idct ? idct_tab : fdct_tab;
520  for (i = 0; algos[i].name; i++)
521  if (!(~cpu_flags & algos[i].mm_support)) {
522  err |= dct_error(&algos[i], test, test_idct, speed);
523  }
524  }
525 
526  if (err)
527  printf("Error: %d.\n", err);
528 
529  return !!err;
530 }
Definition: lfg.h:25
#define AV_CPU_FLAG_ALTIVEC
standard
Definition: cpu.h:53
static double rint(double x)
Definition: libm.h:130
void ff_fdct_ifast(int16_t *data)
Definition: jfdctfst.c:208
static const struct algo idct_tab[]
Definition: dct-test.c:102
void ff_simple_idct_neon(int16_t *data)
static uint8_t img_dest[64]
Definition: dct-test.c:322
static int optind
Definition: getopt.c:37
const char * name
Definition: dct-test.c:65
void ff_idct_xvid_sse2(short *block)
#define DECLARE_ALIGNED(n, t, v)
Definition: mem.h:58
int main(int argc, char **argv)
Definition: dct-test.c:477
formattag
Definition: dct-test.c:67
av_cold void ff_ref_dct_init(void)
Initialize the double precision discrete cosine transform functions fdct & idct.
Definition: dctref.c:41
int nonspec
Definition: dct-test.c:70
int mm_support
Definition: dct-test.c:69
uint8_t
void ff_faanidct(int16_t block[64])
Definition: faanidct.c:132
void ff_bfin_idct(int16_t *block)
#define AV_CPU_FLAG_NEON
Definition: cpu.h:60
#define emms_c()
Definition: internal.h:46
#define AV_CPU_FLAG_MMXEXT
SSE integer functions or AMD MMX ext.
Definition: cpu.h:30
const char * name
void ff_simple_idct248_put(uint8_t *dest, int line_size, int16_t *block)
Definition: simple_idct.c:88
const char data[16]
Definition: mxf.c:66
const uint16_t ff_aanscales[64]
Definition: aandcttab.c:26
void ff_fdct_mmxext(int16_t *block)
#define AANSCALE_BITS
Definition: dct-test.c:140
void ff_simple_idct_armv6(int16_t *data)
void ff_simple_idct_mmx(int16_t *block)
void ff_fdct_mmx(int16_t *block)
#define FFMAX(a, b)
Definition: common.h:55
static const struct algo fdct_tab[]
Definition: dct-test.c:75
#define AV_CPU_FLAG_ARMV5TE
Definition: cpu.h:55
Definition: dct-test.c:64
void ff_fdct_sse2(int16_t *block)
static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
Definition: dct-test.c:325
static short idct_simple_mmx_perm[64]
Definition: dct-test.c:147
static void test(const char *pattern, const char *host)
Definition: noproxy-test.c:23
#define FFABS(a)
Definition: common.h:52
static int cpu_flags
Definition: dct-test.c:73
static void permute(int16_t dst[64], const int16_t src[64], int perm)
Definition: dct-test.c:201
static short idct_mmx_perm[64]
Definition: dct-test.c:145
int64_t av_gettime(void)
Get the current time in microseconds.
Definition: time.c:37
static void idct_mmx_init(void)
Definition: dct-test.c:160
#define AV_CPU_FLAG_ARMV6
Definition: cpu.h:56
void ff_jpeg_fdct_islow_8(int16_t *data)
void ff_j_rev_dct(int16_t *data)
#define NB_ITS_SPEED
Definition: dct-test.c:143
void ff_faandct(int16_t *data)
Definition: faandct.c:121
#define AV_CPU_FLAG_MMX
standard MMX
Definition: cpu.h:29
static void(WINAPI *cond_broadcast)(pthread_cond_t *cond)
static const uint8_t idct_sse2_row_perm[8]
Definition: dct-test.c:158
static int getopt(int argc, char *argv[], char *opts)
Definition: getopt.c:41
static unsigned int av_lfg_get(AVLFG *c)
Get the next random unsigned 32-bit number using an ALFG.
Definition: lfg.h:38
void ff_bfin_fdct(int16_t *block)
void ff_ref_fdct(short *block)
Transform 8x8 block of data with a double precision forward DCT This is a reference implementation...
Definition: dctref.c:59
av_cold void av_lfg_init(AVLFG *c, unsigned int seed)
Definition: lfg.c:30
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:47
static int16_t block1[64]
Definition: dct-test.c:171
static const uint16_t scale[4]
AAN (Arai Agui Nakajima) (I)DCT tables.
header for Xvid IDCT functions
static uint8_t img_dest1[64]
Definition: dct-test.c:323
common internal and external API header
void ff_fdct_altivec(int16_t *block)
Definition: fdct_altivec.c:197
void ff_simple_idct_armv5te(int16_t *data)
static void idct248_error(const char *name, void(*idct248_put)(uint8_t *dest, int line_size, int16_t *block), int speed)
Definition: dct-test.c:405
static av_cold int init(AVCodecParserContext *s)
Definition: h264_parser.c:498
void ff_ref_idct(short *block)
Transform 8x8 block of data with a double precision inverse DCT This is a reference implementation...
Definition: dctref.c:95
#define c4
Definition: idct_sh4.c:29
simple idct header.
static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
Definition: dct-test.c:223
#define NB_ITS
Definition: dct-test.c:142
#define AV_CPU_FLAG_SSE2
PIV SSE2 functions.
Definition: cpu.h:36
enum algo::formattag format
Floating point AAN DCT
void(* func)(int16_t *block)
Definition: dct-test.c:66
void ff_idct_xvid_mmx(short *block)
static void help(void)
Definition: dct-test.c:462
void ff_idct_xvid_mmxext(short *block)
void ff_j_rev_dct_arm(int16_t *data)
static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng)
Definition: dct-test.c:173
void ff_simple_idct_arm(int16_t *data)
void ff_simple_idct_8(int16_t *block)
static int16_t block[64]
Definition: dct-test.c:170