#ifndef INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
#define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H

/* Per-trellis-step traceback decisions (NUMSTATES = 64 one-bit decisions),
 * viewable as bytes, shorts, or 32-bit words. */
typedef union {
    unsigned char t[64 / 8];
    unsigned int w[64 / 32];
    unsigned short s[64 / 16];
    unsigned char c[64 / 8];
} decision_t;
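
/* Each trellis step produces NUMSTATES = 64 one-bit survivor decisions, stored
 * as a single decision_t; the `dec` buffer handed to the kernels below is
 * therefore indexed as one decision_t per decoded step (see BFLY()). */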

static inline void renormalize(unsigned char* X, unsigned char threshold)
{
    int NUMSTATES = 64;
    int i;

    /* Find the smallest of the 64 path metrics ... */
    unsigned char min = X[0];
    for (i = 0; i < NUMSTATES; i++)
        if (min > X[i])
            min = X[i];
    /* ... and subtract it from every metric so they stay within 8-bit range. */
    for (i = 0; i < NUMSTATES; i++)
        X[i] -= min;
}
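
/* Illustrative only: a minimal sketch (not part of the kernel) showing the
 * effect of renormalize() on a drifting metric array.  The buffer name
 * `path_metrics` and the values used here are hypothetical. */
#if 0
static void example_renormalize_usage(void)
{
    unsigned char path_metrics[64];
    int k;
    for (k = 0; k < 64; k++)
        path_metrics[k] = (unsigned char)(200 + (k % 7)); /* metrics near the 8-bit ceiling */
    renormalize(path_metrics, 210);
    /* The smallest metric is now 0 and all relative differences are preserved. */
}
#endif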

/* Add-Compare-Select butterfly used by the generic implementation: combines
 * old metrics X[i] and X[i + NUMSTATES / 2] into new metrics Y[2 * i] and
 * Y[2 * i + 1] and records the two survivor decisions in *d. */
static inline void BFLY(int i,
                        int s,
                        unsigned char* syms,
                        unsigned char* Y,
                        unsigned char* X,
                        decision_t* d,
                        unsigned char* Branchtab)
{
    int j, decision0, decision1;
    unsigned char metric, m0, m1, m2, m3;

    int NUMSTATES = 64;
    int RATE = 2;
    int METRICSHIFT = 2;
    int PRECISIONSHIFT = 2;

    metric = 0;
    for (j = 0; j < RATE; j++)
        metric += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]) >> METRICSHIFT;
    metric = metric >> PRECISIONSHIFT;

    unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT);
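    /* With RATE = 2 and METRICSHIFT = PRECISIONSHIFT = 2, `metric` is at most
     * (63 + 63) >> 2 = 31 and `max` also evaluates to 31, so (max - metric)
     * below acts as the metric of the complementary branch of the butterfly. */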
    m0 = X[i] + metric;
    m1 = X[i + NUMSTATES / 2] + (max - metric);
    m2 = X[i] + (max - metric);
    m3 = X[i + NUMSTATES / 2] + metric;
    decision0 = (signed int)(m0 - m1) > 0;
    decision1 = (signed int)(m2 - m3) > 0;

    Y[2 * i] = decision0 ? m1 : m0;
    Y[2 * i + 1] = decision1 ? m3 : m2;
    /* Pack the two decision bits into the per-step decision word. */
    d->w[i / (sizeof(unsigned int) * 8 / 2) +
         s * (sizeof(decision_t) / sizeof(unsigned int))] |=
        (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1));
}
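
/* Illustrative only: a hypothetical helper (not part of this kernel) showing
 * how a traceback pass could read back the two bits that BFLY() packs for
 * butterfly `i` of trellis step `s`: bit 0 is decision0, bit 1 is decision1. */
#if 0
static inline int example_read_decisions(const decision_t* d, int s, int i)
{
    unsigned int word = d[s].w[i / (sizeof(unsigned int) * 8 / 2)];
    return (int)((word >> ((2 * i) & (sizeof(unsigned int) * 8 - 1))) & 3u);
}
#endif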

#if LV_HAVE_AVX2

#include <immintrin.h>

static inline void volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y,
                                                 unsigned char* X,
                                                 unsigned char* syms,
                                                 unsigned char* dec,
                                                 unsigned int framebits,
                                                 unsigned int excess,
                                                 unsigned char* Branchtab)
{
    unsigned int i9;
    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
        unsigned char a75, a81;
        int s20, s21;
        unsigned char *a80, *b6;
        int *a110, *a91, *a93;
        __m256i *a112, *a71, *a72, *a77, *a83, *a95;
        __m256i a86, a87;
        __m256i a76, a78, a79, a82, a84, a85, a88, a89, a90, d10, d9, m23, m24, m25, m26,
            s18, s19, s22, s23, s24, s25, t13, t14, t15;
        s22 = _mm256_permute2x128_si256(s18, s19, 0x20);
        s19 = _mm256_permute2x128_si256(s18, s19, 0x31);
        a76 = _mm256_set1_epi8(a75);
        a77 = ((__m256i*)Branchtab);
        a79 = _mm256_xor_si256(a76, a78);
        a82 = _mm256_set1_epi8(a81);
        a85 = _mm256_xor_si256(a82, a84);
        t13 = _mm256_avg_epu8(a79, a85);
        a86 = ((__m256i)t13);
        a87 = _mm256_srli_epi16(a86, 2);
        a88 = ((__m256i)a87);
        t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
        t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
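        /* t14 is the branch metric for this symbol pair, clamped to 0..63;
         * t15 is its complement (63 - t14).  Small metrics keep the saturating
         * byte adds below from overflowing between renormalizations. */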
        m23 = _mm256_adds_epu8(s18, t14);
        m24 = _mm256_adds_epu8(s19, t15);
        m25 = _mm256_adds_epu8(s18, t15);
        m26 = _mm256_adds_epu8(s19, t14);
        a89 = _mm256_min_epu8(m24, m23);
        d9 = _mm256_cmpeq_epi8(a89, m24);
        a90 = _mm256_min_epu8(m26, m25);
        d10 = _mm256_cmpeq_epi8(a90, m26);
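        /* Add-compare-select: m23..m26 are the four candidate path metrics,
         * a89/a90 keep the smaller of each pair, and d9/d10 hold 0xFF in the
         * lanes where m24/m26 supplied the minimum; these become the traceback
         * decision bits. */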
        s22 = _mm256_unpacklo_epi8(d9, d10);
        s23 = _mm256_unpackhi_epi8(d9, d10);
        s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
        s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
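        /* Each movemask gathers 32 interleaved decision bits; s20 and s21 are
         * written out to dec through the int pointers declared above. */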
        s22 = _mm256_unpacklo_epi8(a89, a90);
        s23 = _mm256_unpackhi_epi8(a89, a90);
        s24 = _mm256_permute2x128_si256(s22, s23, 0x20);
        s23 = _mm256_permute2x128_si256(s22, s23, 0x31);
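        /* Conditional renormalization: once the first new path metric exceeds
         * 210, subtract the minimum metric from all of Y to avoid overflow. */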
        if (((unsigned char*)Y)[0] > 210) {
            __m256i m5, m6, m7;
            m5 = ((__m256i*)Y)[0];
            m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]);
            m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
            m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 32)),
                                           ((__m256i)m7)));
            m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 16)),
                                           ((__m256i)m7)));
            m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 8)),
                                           ((__m256i)m7)));
            m7 = _mm256_unpacklo_epi8(m7, m7);
            m7 = _mm256_shufflelo_epi16(m7, 0);
            m6 = _mm256_unpacklo_epi64(m7, m7);
            m6 = _mm256_permute2x128_si256(m6, m6, 0);
            ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6);
            ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6);
        }
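        /* Second trellis step of the unrolled pair: the same butterfly sequence
         * with Y as the old metrics and X as the destination. */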
        unsigned char a188, a194;
        int s48, s54;
        unsigned char *a187, *a193;
        int *a204, *a206, *a223, *b16;
        __m256i *a184, *a185, *a190, *a196, *a208, *a225;
        __m256i a199, a200;
        __m256i a189, a191, a192, a195, a197, a198, a201, a202, a203, d17, d18, m39, m40,
            m41, m42, s46, s47, s50, s51, t25, t26, t27;
        a184 = ((__m256i*)Y);
        s50 = _mm256_permute2x128_si256(s46, s47, 0x20);
        s47 = _mm256_permute2x128_si256(s46, s47, 0x31);
        a189 = _mm256_set1_epi8(a188);
        a190 = ((__m256i*)Branchtab);
        a192 = _mm256_xor_si256(a189, a191);
        a195 = _mm256_set1_epi8(a194);
        a198 = _mm256_xor_si256(a195, a197);
        t25 = _mm256_avg_epu8(a192, a198);
        a199 = ((__m256i)t25);
        a200 = _mm256_srli_epi16(a199, 2);
        a201 = ((__m256i)a200);
        t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63));
        t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26);
        m39 = _mm256_adds_epu8(s46, t26);
        m40 = _mm256_adds_epu8(s47, t27);
        m41 = _mm256_adds_epu8(s46, t27);
        m42 = _mm256_adds_epu8(s47, t26);
        a202 = _mm256_min_epu8(m40, m39);
        d17 = _mm256_cmpeq_epi8(a202, m40);
        a203 = _mm256_min_epu8(m42, m41);
        d18 = _mm256_cmpeq_epi8(a203, m42);
        s24 = _mm256_unpacklo_epi8(d17, d18);
        s25 = _mm256_unpackhi_epi8(d17, d18);
        s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
        s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
        s50 = _mm256_unpacklo_epi8(a202, a203);
        s51 = _mm256_unpackhi_epi8(a202, a203);
        s25 = _mm256_permute2x128_si256(s50, s51, 0x20);
        s51 = _mm256_permute2x128_si256(s50, s51, 0x31);
        a208 = ((__m256i*)X);
        if (((unsigned char*)X)[0] > 210) {
            __m256i m12, m13, m14;
            m12 = ((__m256i*)X)[0];
            m12 = _mm256_min_epu8(m12, ((__m256i*)X)[1]);
            m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12);
            m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 32)),
                                            ((__m256i)m14)));
            m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 16)),
                                            ((__m256i)m14)));
            m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 8)),
                                            ((__m256i)m14)));
            m14 = _mm256_unpacklo_epi8(m14, m14);
            m14 = _mm256_shufflelo_epi16(m14, 0);
            m13 = _mm256_unpacklo_epi64(m14, m14);
            m13 = _mm256_permute2x128_si256(m13, m13, 0);
            ((__m256i*)X)[0] = _mm256_subs_epu8(((__m256i*)X)[0], m13);
            ((__m256i*)X)[1] = _mm256_subs_epu8(((__m256i*)X)[1], m13);
        }
    }

    /* Handle a possible odd trailing trellis step with the scalar butterfly. */
    unsigned int j;
    for (j = 0; j < (framebits + excess) % 2; ++j) {
        int i;
        for (i = 0; i < 64 / 2; i++) {
            BFLY(i,
                 (((framebits + excess) >> 1) << 1) + j,
                 syms, Y, X, (decision_t*)dec, Branchtab);
        }
    }
}
#endif /* LV_HAVE_AVX2 */

#if LV_HAVE_SSE3

#include <emmintrin.h>
#include <mmintrin.h>
#include <pmmintrin.h>
#include <xmmintrin.h>

static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
                                                   unsigned char* X,
                                                   unsigned char* syms,
                                                   unsigned char* dec,
                                                   unsigned int framebits,
                                                   unsigned int excess,
                                                   unsigned char* Branchtab)
{
    unsigned int i9;
    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
        unsigned char a75, a81;
        short int s20, s21, s26, s27;
        unsigned char *a74, *a80, *b6;
        short int *a110, *a111, *a91, *a93, *a94;
        __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99;
        __m128i a105, a106, a86, a87;
        __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85,
            a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
            s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18;
        a76 = _mm_set1_epi8(a75);
        a77 = ((__m128i*)Branchtab);
        a79 = _mm_xor_si128(a76, a78);
        a82 = _mm_set1_epi8(a81);
        a85 = _mm_xor_si128(a82, a84);
        t13 = _mm_avg_epu8(a79, a85);
        a86 = ((__m128i)t13);
        a87 = _mm_srli_epi16(a86, 2);
        a88 = ((__m128i)a87);
        t14 = _mm_and_si128(
            a88,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t15 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t14);
        m23 = _mm_adds_epu8(s18, t14);
        m24 = _mm_adds_epu8(s19, t15);
        m25 = _mm_adds_epu8(s18, t15);
        m26 = _mm_adds_epu8(s19, t14);
        a89 = _mm_min_epu8(m24, m23);
        d9 = _mm_cmpeq_epi8(a89, m24);
        a90 = _mm_min_epu8(m26, m25);
        d10 = _mm_cmpeq_epi8(a90, m26);
        s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
        a91 = ((short int*)dec);
        s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
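        /* In the SSE path each movemask yields 16 interleaved decision bits,
         * so the results are written to dec through short int pointers. */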
        s22 = _mm_unpacklo_epi8(a89, a90);
        s23 = _mm_unpackhi_epi8(a89, a90);
        a101 = _mm_xor_si128(a76, a100);
        a104 = _mm_xor_si128(a82, a103);
        t16 = _mm_avg_epu8(a101, a104);
        a105 = ((__m128i)t16);
        a106 = _mm_srli_epi16(a105, 2);
        a107 = ((__m128i)a106);
        t17 = _mm_and_si128(
            a107,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t18 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t17);
        m27 = _mm_adds_epu8(s24, t17);
        m28 = _mm_adds_epu8(s25, t18);
        m29 = _mm_adds_epu8(s24, t18);
        m30 = _mm_adds_epu8(s25, t17);
        a108 = _mm_min_epu8(m28, m27);
        d11 = _mm_cmpeq_epi8(a108, m28);
        a109 = _mm_min_epu8(m30, m29);
        d12 = _mm_cmpeq_epi8(a109, m30);
        s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
        s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
        s28 = _mm_unpacklo_epi8(a108, a109);
        s29 = _mm_unpackhi_epi8(a108, a109);
        if (((unsigned char*)Y)[0] > 210) {
            __m128i m5, m6, m7;
            m5 = ((__m128i*)Y)[0];
            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
            m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
            m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7)));
            m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7)));
            m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
            m7 = _mm_unpacklo_epi8(m7, m7);
            m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
            m6 = _mm_unpacklo_epi64(m7, m7);
            ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
            ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
            ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
            ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);
        }
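        /* Second trellis step of the unrolled pair: repeat the butterflies with
         * Y as the old metrics and X as the destination. */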
        unsigned char a188, a194;
        short int s48, s49, s54, s55;
        unsigned char *a187, *a193, *b15;
        short int *a204, *a206, *a207, *a223, *a224, *b16;
        __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215;
        __m128i a199, a200, a218, a219;
        __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216,
            a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45,
            m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30;
        a184 = ((__m128i*)Y);
        a189 = _mm_set1_epi8(a188);
        a190 = ((__m128i*)Branchtab);
        a192 = _mm_xor_si128(a189, a191);
        a195 = _mm_set1_epi8(a194);
        a198 = _mm_xor_si128(a195, a197);
        t25 = _mm_avg_epu8(a192, a198);
        a199 = ((__m128i)t25);
        a200 = _mm_srli_epi16(a199, 2);
        a201 = ((__m128i)a200);
        t26 = _mm_and_si128(
            a201,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t27 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t26);
        m39 = _mm_adds_epu8(s46, t26);
        m40 = _mm_adds_epu8(s47, t27);
        m41 = _mm_adds_epu8(s46, t27);
        m42 = _mm_adds_epu8(s47, t26);
        a202 = _mm_min_epu8(m40, m39);
        d17 = _mm_cmpeq_epi8(a202, m40);
        a203 = _mm_min_epu8(m42, m41);
        d18 = _mm_cmpeq_epi8(a203, m42);
        s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18));
        a204 = ((short int*)dec);
        s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18));
        s50 = _mm_unpacklo_epi8(a202, a203);
        s51 = _mm_unpackhi_epi8(a202, a203);
        a208 = ((__m128i*)X);
        a214 = _mm_xor_si128(a189, a213);
        a217 = _mm_xor_si128(a195, a216);
        t28 = _mm_avg_epu8(a214, a217);
        a218 = ((__m128i)t28);
        a219 = _mm_srli_epi16(a218, 2);
        a220 = ((__m128i)a219);
        t29 = _mm_and_si128(
            a220,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t30 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t29);
        m43 = _mm_adds_epu8(s52, t29);
        m44 = _mm_adds_epu8(s53, t30);
        m45 = _mm_adds_epu8(s52, t30);
        m46 = _mm_adds_epu8(s53, t29);
        a221 = _mm_min_epu8(m44, m43);
        d19 = _mm_cmpeq_epi8(a221, m44);
        a222 = _mm_min_epu8(m46, m45);
        d20 = _mm_cmpeq_epi8(a222, m46);
        s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20));
        s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20));
        s56 = _mm_unpacklo_epi8(a221, a222);
        s57 = _mm_unpackhi_epi8(a221, a222);
        if (((unsigned char*)X)[0] > 210) {
            __m128i m12, m13, m14;
            m12 = ((__m128i*)X)[0];
            m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]);
            m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]);
            m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]);
            m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)),
                                         ((__m128i)m14)));
            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)),
                                         ((__m128i)m14)));
            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)),
                                         ((__m128i)m14)));
            m14 = _mm_unpacklo_epi8(m14, m14);
            m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
            m13 = _mm_unpacklo_epi64(m14, m14);
            ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13);
            ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13);
            ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13);
            ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13);
        }
    }

    /* Handle a possible odd trailing trellis step with the scalar butterfly. */
    unsigned int j;
    for (j = 0; j < (framebits + excess) % 2; ++j) {
        int i;
        for (i = 0; i < 64 / 2; i++) {
            BFLY(i,
                 (((framebits + excess) >> 1) << 1) + j,
                 syms, Y, X, (decision_t*)dec, Branchtab);
        }
    }
}
#endif /* LV_HAVE_SSE3 */

#if LV_HAVE_GENERIC

static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y,
                                                    unsigned char* X,
                                                    unsigned char* syms,
                                                    unsigned char* dec,
                                                    unsigned int framebits,
                                                    unsigned int excess,
                                                    unsigned char* Branchtab)
{
    int nbits = framebits + excess;
    int NUMSTATES = 64;
    int RENORMALIZE_THRESHOLD = 210;

    int s, i;
    for (s = 0; s < nbits; s++) {
        void* tmp;
        for (i = 0; i < NUMSTATES / 2; i++) {
            BFLY(i, s, syms, Y, X, (decision_t*)dec, Branchtab);
        }

        renormalize(Y, RENORMALIZE_THRESHOLD);

        /* Swap the old- and new-metric buffers for the next trellis step. */
        tmp = (void*)X;
        X = Y;
        Y = (unsigned char*)tmp;