#include <visp3/core/vpCPUFeatures.h>
#include <visp3/core/vpImageConvert.h>
#include <visp3/core/vpImageTools.h>
#if defined __SSE2__ || defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP >= 2)
#include <emmintrin.h>
#define VISP_HAVE_SSE2 1

#if defined __SSE3__ || (defined _MSC_VER && _MSC_VER >= 1500)
#include <pmmintrin.h>
#define VISP_HAVE_SSE3 1
#endif
#if defined __SSSE3__ || (defined _MSC_VER && _MSC_VER >= 1500)
#include <tmmintrin.h>
#define VISP_HAVE_SSSE3 1
#endif
#endif
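
// The macros above record compile-time availability of the SSE2/SSE3/SSSE3
// intrinsics. vpCPUFeatures (included above) complements them with runtime
// checks (e.g. vpCPUFeatures::checkSSE2(), vpCPUFeatures::checkSSSE3()), so
// the vectorized kernels below can be skipped on CPUs lacking the feature.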

                             unsigned char B_star)

  double factor = (double)(B_star - A_star) / (double)(B - A);

  for (unsigned int i = 0; i < I.getHeight(); i++)
    for (unsigned int j = 0; j < I.getWidth(); j++) {
      unsigned char v = I[i][j];
      if (v <= A)
        I[i][j] = A_star;
      else if (v >= B)
        I[i][j] = B_star;
      else
        I[i][j] = (unsigned char)(A_star + factor * (v - A));
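
// SSSE3 image-difference kernel: _mm_shuffle_epi8 with the masks below widens
// the 16 input bytes into two vectors of eight 16-bit lanes (a -1 mask entry
// zeroes the destination byte). The signed difference plus an offset of 128 is
// clamped to [0, 255] in 16-bit arithmetic, and the even/odd halves are
// re-interleaved on the store, i.e. Idiff = I1 - I2 + 128, saturated.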
  const __m128i mask1 = _mm_set_epi8(-1, 14, -1, 12, -1, 10, -1, 8, -1, 6, -1, 4, -1, 2, -1, 0);
  const __m128i mask2 = _mm_set_epi8(-1, 15, -1, 13, -1, 11, -1, 9, -1, 7, -1, 5, -1, 3, -1, 1);

  const __m128i mask_out2 = _mm_set_epi8(14, -1, 12, -1, 10, -1, 8, -1, 6, -1, 4, -1, 2, -1, 0, -1);
  for (; i + 16 <= I1.getSize(); i += 16) { // i + 16 <= size avoids unsigned wrap-around on tiny images
    const __m128i vdata1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(I1.bitmap + i));
    const __m128i vdata2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(I2.bitmap + i));

    __m128i vdata1_reorg = _mm_shuffle_epi8(vdata1, mask1);
    __m128i vdata2_reorg = _mm_shuffle_epi8(vdata2, mask1);

    const __m128i vshift = _mm_set1_epi16(128);
    __m128i vdata_diff = _mm_add_epi16(_mm_sub_epi16(vdata1_reorg, vdata2_reorg), vshift);

    const __m128i v255 = _mm_set1_epi16(255);
    const __m128i vzero = _mm_setzero_si128();
    const __m128i vdata_diff_min_max1 = _mm_max_epi16(_mm_min_epi16(vdata_diff, v255), vzero);

    vdata1_reorg = _mm_shuffle_epi8(vdata1, mask2);
    vdata2_reorg = _mm_shuffle_epi8(vdata2, mask2);

    vdata_diff = _mm_add_epi16(_mm_sub_epi16(vdata1_reorg, vdata2_reorg), vshift);
    const __m128i vdata_diff_min_max2 = _mm_max_epi16(_mm_min_epi16(vdata_diff, v255), vzero);

    _mm_storeu_si128(reinterpret_cast<__m128i *>(Idiff.bitmap + i),
                     _mm_or_si128(_mm_shuffle_epi8(vdata_diff_min_max1, mask1),
                                  _mm_shuffle_epi8(vdata_diff_min_max2, mask_out2)));
  }

  for (; i < I1.getSize(); i++) {

                      "(%ux%u) and (%ux%u) do not have the same size",
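
// Same SSSE3 difference kernel, specialized for vpRGBa images: one pixel is
// 4 bytes (R, G, B, A), so a single 128-bit load covers 4 pixels and the
// byte-wise I1 - I2 + 128 computation applies to every channel alike.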
  const __m128i mask1 = _mm_set_epi8(-1, 14, -1, 12, -1, 10, -1, 8, -1, 6, -1, 4, -1, 2, -1, 0);
  const __m128i mask2 = _mm_set_epi8(-1, 15, -1, 13, -1, 11, -1, 9, -1, 7, -1, 5, -1, 3, -1, 1);

  const __m128i mask_out2 = _mm_set_epi8(14, -1, 12, -1, 10, -1, 8, -1, 6, -1, 4, -1, 2, -1, 0, -1);
  for (; i + 4 <= I1.getSize(); i += 4) { // 4 vpRGBa pixels = 16 bytes per iteration
    const __m128i vdata1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(I1.bitmap + i));
    const __m128i vdata2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(I2.bitmap + i));

    __m128i vdata1_reorg = _mm_shuffle_epi8(vdata1, mask1);
    __m128i vdata2_reorg = _mm_shuffle_epi8(vdata2, mask1);

    const __m128i vshift = _mm_set1_epi16(128);
    __m128i vdata_diff = _mm_add_epi16(_mm_sub_epi16(vdata1_reorg, vdata2_reorg), vshift);

    const __m128i v255 = _mm_set1_epi16(255);
    const __m128i vzero = _mm_setzero_si128();
    const __m128i vdata_diff_min_max1 = _mm_max_epi16(_mm_min_epi16(vdata_diff, v255), vzero);

    vdata1_reorg = _mm_shuffle_epi8(vdata1, mask2);
    vdata2_reorg = _mm_shuffle_epi8(vdata2, mask2);

    vdata_diff = _mm_add_epi16(_mm_sub_epi16(vdata1_reorg, vdata2_reorg), vshift);
    const __m128i vdata_diff_min_max2 = _mm_max_epi16(_mm_min_epi16(vdata_diff, v255), vzero);

    _mm_storeu_si128(reinterpret_cast<__m128i *>(Idiff.bitmap + i),
                     _mm_or_si128(_mm_shuffle_epi8(vdata_diff_min_max1, mask1),
                                  _mm_shuffle_epi8(vdata_diff_min_max2, mask_out2)));
  }

  for (; i < I1.getSize(); i++) {

  for (unsigned int b = 0; b < n; b++) {

  for (unsigned int b = 0; b < n; b++) {

  for (unsigned int b = 0; b < n; b++) {

  unsigned char *ptr_I1 = I1.bitmap;
  unsigned char *ptr_I2 = I2.bitmap;
  unsigned char *ptr_Ires = Ires.bitmap;
  unsigned int cpt = 0;
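
// SSE2 addition kernel, 16 pixels per iteration: with saturate=true,
// _mm_adds_epu8 clamps each 8-bit sum at 255, otherwise _mm_add_epi8 wraps
// modulo 256. The scalar tail below handles the remaining pixels identically.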
  for (; cpt + 16 <= Ires.getSize(); cpt += 16, ptr_I1 += 16, ptr_I2 += 16, ptr_Ires += 16) {
    const __m128i v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr_I1));
    const __m128i v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr_I2));
    const __m128i vres = saturate ? _mm_adds_epu8(v1, v2) : _mm_add_epi8(v1, v2);

    _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr_Ires), vres);
  }

  for (; cpt < Ires.getSize(); cpt++, ++ptr_I1, ++ptr_I2, ++ptr_Ires) {
    *ptr_Ires = saturate ? vpMath::saturate<unsigned char>((short int)*ptr_I1 + (short int)*ptr_I2) : *ptr_I1 + *ptr_I2;

  unsigned char *ptr_I1 = I1.bitmap;
  unsigned char *ptr_I2 = I2.bitmap;
  unsigned char *ptr_Ires = Ires.bitmap;
  unsigned int cpt = 0;
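
// SSE2 subtraction kernel, symmetric to the addition above: _mm_subs_epu8
// clamps negative differences at 0 when saturate=true, while _mm_sub_epi8
// wraps modulo 256.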
  for (; cpt + 16 <= Ires.getSize(); cpt += 16, ptr_I1 += 16, ptr_I2 += 16, ptr_Ires += 16) {
    const __m128i v1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr_I1));
    const __m128i v2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr_I2));
    const __m128i vres = saturate ? _mm_subs_epu8(v1, v2) : _mm_sub_epi8(v1, v2);

    _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr_Ires), vres);
  }

  for (; cpt < Ires.getSize(); cpt++, ++ptr_I1, ++ptr_I2, ++ptr_Ires) {
    *ptr_Ires = saturate ?
                vpMath::saturate<unsigned char>(static_cast<short int>(*ptr_I1) - static_cast<short int>(*ptr_I2)) :
                *ptr_I1 - *ptr_I2;

  mapU.resize(height, width, false, false);
  mapV.resize(height, width, false, false);
  mapDu.resize(height, width, false, false);
  mapDv.resize(height, width, false, false);

  float u0 = static_cast<float>(cam.get_u0());
  float v0 = static_cast<float>(cam.get_v0());
  float px = static_cast<float>(cam.get_px());
  float py = static_cast<float>(cam.get_py());
  float kud = static_cast<float>(cam.get_kud());

  if (std::fabs(static_cast<double>(kud)) <= std::numeric_limits<double>::epsilon()) {
    // No distortion: fill the maps with the identity transformation
    for (unsigned int i = 0; i < height; i++) {
      for (unsigned int j = 0; j < width; j++) {
        mapU[i][j] = static_cast<int>(j);
        mapV[i][j] = static_cast<int>(i);
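
  // Undistorted-to-distorted mapping with the kud radial model:
  //   r^2 = ((u - u0)/px)^2 + ((v - v0)/py)^2
  //   u_d = u0 + (u - u0) * (1 + kud * r^2)
  //   v_d = v0 + (v - v0) * (1 + kud * r^2)
  // fr1 below caches the row-constant term 1 + kud * ((v - v0)/py)^2 so that
  // only the u-dependent part is added in the inner loop. The integer part of
  // the distorted coordinates goes to mapU/mapV, the fractional part to
  // mapDu/mapDv for the bilinear interpolation performed in remap().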
  float invpx = 1.0f / px;
  float invpy = 1.0f / py;

  float kud_px2 = kud * invpx * invpx;
  float kud_py2 = kud * invpy * invpy;

  for (unsigned int v = 0; v < height; v++) {
    float deltav = v - v0;
    float fr1 = 1.0f + kud_py2 * deltav * deltav;

    for (unsigned int u = 0; u < width; u++) {
      float deltau = u - u0;
      float fr2 = fr1 + kud_px2 * deltau * deltau;

      float u_float = deltau * fr2 + u0;
      float v_float = deltav * fr2 + v0;

      int u_round = static_cast<int>(u_float);
      int v_round = static_cast<int>(v_float);

      mapU[v][u] = u_round;
      mapV[v][u] = v_round;

      mapDu[v][u] = u_float - u_round;
      mapDv[v][u] = v_float - v_round;
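
// Integral images (summed-area tables): II(i, j) is the sum of all pixels
// above and to the left of (i, j), IIsq the analogous sum of squares, both
// padded with a leading row and column of zeros. Any rectangular sum can then
// be read back with four corner lookups, which the template-matching score
// below relies on.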
    std::cerr << "Error, input image is empty." << std::endl;

  for (unsigned int i = 1; i < II.getHeight(); i++) {
    for (unsigned int j = 1; j < II.getWidth(); j++) {
      II[i][j] = I[i - 1][j - 1] + II[i - 1][j] + II[i][j - 1] - II[i - 1][j - 1];
      IIsq[i][j] = vpMath::sqr(I[i - 1][j - 1]) + IIsq[i - 1][j] + IIsq[i][j - 1] - IIsq[i - 1][j - 1];

                                            const bool useOptimized)

                      "image dimension mismatch between I1=%ux%u and I2=%ux%u",

  unsigned int cpt = 0;
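
// Zero-mean normalized cross-correlation (ZNCC): with a = mean(I1) and
// b = mean(I2), the score is
//   sum((I1 - a)(I2 - b)) / sqrt(sum((I1 - a)^2) * sum((I2 - b)^2))
// and lies in [-1, 1]. The SSE2 block below accumulates the three sums two
// doubles at a time, then reduces each vector accumulator horizontally.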
  const double *ptr_I1 = I1.bitmap;
  const double *ptr_I2 = I2.bitmap;

  const __m128d v_mean_a = _mm_set1_pd(a);
  const __m128d v_mean_b = _mm_set1_pd(b);
  __m128d v_ab = _mm_setzero_pd();
  __m128d v_a2 = _mm_setzero_pd();
  __m128d v_b2 = _mm_setzero_pd();

  for (; cpt + 2 <= I1.getSize(); cpt += 2, ptr_I1 += 2, ptr_I2 += 2) {
    const __m128d v1 = _mm_loadu_pd(ptr_I1);
    const __m128d v2 = _mm_loadu_pd(ptr_I2);
    const __m128d norm_a = _mm_sub_pd(v1, v_mean_a);
    const __m128d norm_b = _mm_sub_pd(v2, v_mean_b);
    v_ab = _mm_add_pd(v_ab, _mm_mul_pd(norm_a, norm_b));
    v_a2 = _mm_add_pd(v_a2, _mm_mul_pd(norm_a, norm_a));
    v_b2 = _mm_add_pd(v_b2, _mm_mul_pd(norm_b, norm_b));
  }

  double v_res_ab[2], v_res_a2[2], v_res_b2[2];
  _mm_storeu_pd(v_res_ab, v_ab);
  _mm_storeu_pd(v_res_a2, v_a2);
  _mm_storeu_pd(v_res_b2, v_b2);

  ab = v_res_ab[0] + v_res_ab[1];
  a2 = v_res_a2[0] + v_res_a2[1];
  b2 = v_res_b2[0] + v_res_b2[1];

  for (; cpt < I1.getSize(); cpt++) {

  return ab / sqrt(a2 * b2);

  for (unsigned int i = 0; i < height; ++i)
    for (unsigned int j = 0; j < width; ++j)

  for (unsigned int j = 0; j < width; ++j)

  for (unsigned int i = 0; i < I.getHeight(); ++i)
    for (unsigned int j = 0; j < I.getWidth(); ++j)
      I(i, j, I(i, j) / s);

                                 const vpImageInterpolationType &method)

    int x1 = (int)floor(point.get_i());
    int x2 = (int)ceil(point.get_i());
    int y1 = (int)floor(point.get_j());
    int y2 = (int)ceil(point.get_j());
    v1 = (x2 - point.get_i()) * I(x1, y1) + (point.get_i() - x1) * I(x2, y1);
    v2 = (x2 - point.get_i()) * I(x1, y2) + (point.get_i() - x1) * I(x2, y2);

    return (y2 - point.get_j()) * v1 + (point.get_j() - y1) * v2;

                      "vpImageTools::interpolate(): bi-cubic interpolation is not implemented.");

  for (unsigned int x = 0; x < x_d; ++x) {
    for (unsigned int y = 0; y < y_d; ++y) {

  for (unsigned int x = 0; x < x_d; ++x) {
    for (unsigned int y = 0; y < y_d; ++y) {

                                   vpImage<double> &I_score, const unsigned int step_u, const unsigned int step_v,
                                   const bool useOptimized)

    std::cerr << "Error, input image is empty." << std::endl;

    std::cerr << "Error, template image is empty." << std::endl;

    std::cerr << "Error, template image is bigger than input image." << std::endl;
  const double sum2 = (II_tpl[height_tpl][width_tpl] + II_tpl[0][0] - II_tpl[0][width_tpl] - II_tpl[height_tpl][0]);
  const double mean2 = sum2 / I_tpl.getSize();
  for (unsigned int cpt = 0; cpt < I_tpl_double.getSize(); cpt++) {
    I_tpl_double.bitmap[cpt] -= mean2;

#if defined _OPENMP && _OPENMP >= 201107 // OpenMP 3.1
#pragma omp parallel for schedule(dynamic)
  for (unsigned int i = 0; i < I.getHeight() - height_tpl; i += step_v) {
    for (unsigned int j = 0; j < I.getWidth() - width_tpl; j += step_u) {

  int end = (int)((I.getHeight() - height_tpl) / step_v) + 1;
  std::vector<unsigned int> vec_step_v((size_t)end);
  for (unsigned int cpt = 0, idx = 0; cpt < I.getHeight() - height_tpl; cpt += step_v, idx++) {
    vec_step_v[(size_t)idx] = cpt;
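
// Older OpenMP implementations only accept signed loop counters in a
// "parallel for", hence this fallback: the row offsets are precomputed into
// vec_step_v so the parallel loop below can iterate over a plain int index.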
#if defined _OPENMP // only to disable warning: ignoring #pragma omp parallel [-Wunknown-pragmas]
#pragma omp parallel for schedule(dynamic)
#endif
  for (int cpt = 0; cpt < end; cpt++) {
    for (unsigned int j = 0; j < I.getWidth() - width_tpl; j += step_u) {
      I_score[vec_step_v[cpt]][j] =

  for (unsigned int i = 0; i < I.getHeight() - height_tpl; i += step_v) {
    for (unsigned int j = 0; j < I.getWidth() - width_tpl; j += step_u) {

float vpImageTools::cubicHermite(const float A, const float B, const float C, const float D, const float t)
{
  float a = (-A + 3.0f * B - 3.0f * C + D) / 2.0f;
  float b = A + 2.0f * C - (5.0f * B + D) / 2.0f;
  float c = (-A + C) / 2.0f;
  float d = B;

  return a * t * t * t + b * t * t + c * t + d;
}
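
// These coefficients make cubicHermite() a Catmull-Rom spline through the four
// samples A, B, C, D: it returns B at t = 0 and C at t = 1, with end slopes
// (C - A)/2 and (D - B)/2, so consecutive segments join with continuous first
// derivatives.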

float vpImageTools::lerp(const float A, const float B, const float t) {
  return A * (1.0f - t) + B * t;
}

                                                                  const unsigned int i0, const unsigned int j0)

  bool use_sse_version = true;

  const double *ptr_I1 = I1.bitmap;
  const double *ptr_I2 = I2.bitmap;

  __m128d v_ab = _mm_setzero_pd();

  for (unsigned int i = 0; i < I2.getHeight(); i++) {

    for (; j + 2 <= I2.getWidth(); j += 2, ptr_I1 += 2, ptr_I2 += 2) {
      const __m128d v1 = _mm_loadu_pd(ptr_I1);
      const __m128d v2 = _mm_loadu_pd(ptr_I2);
      v_ab = _mm_add_pd(v_ab, _mm_mul_pd(v1, v2));
    }

      ab += (I1[i0 + i][j0 + j]) * I2[i][j];

  double v_res_ab[2];
  _mm_storeu_pd(v_res_ab, v_ab);

  ab += v_res_ab[0] + v_res_ab[1];

    use_sse_version = false;

  bool use_sse_version = false;

  if (!use_sse_version) {
    for (unsigned int i = 0; i < I2.getHeight(); i++) {
      for (unsigned int j = 0; j < I2.getWidth(); j++) {
        ab += (I1[i0 + i][j0 + j]) * I2[i][j];

      (II[i0 + height_tpl][j0 + width_tpl] + II[i0][j0] - II[i0][j0 + width_tpl] - II[i0 + height_tpl][j0]);
  const double sum2 = (II_tpl[height_tpl][width_tpl] + II_tpl[0][0] - II_tpl[0][width_tpl] - II_tpl[height_tpl][0]);

  return ab / sqrt(a2 * b2);
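
// remap(): for each destination pixel (i, j), mapU/mapV hold the integer
// source coordinates and mapDu/mapDv the fractional offsets computed by
// initUndistortMap(). The output value is bilinearly interpolated from the
// 2x2 source neighborhood: two horizontal lerps followed by one vertical lerp.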
#if defined _OPENMP // only to disable warning: ignoring #pragma omp parallel [-Wunknown-pragmas]
#pragma omp parallel for schedule(dynamic)
#endif
  for (int i_ = 0; i_ < static_cast<int>(I.getHeight()); i_++) {
    const unsigned int i = static_cast<unsigned int>(i_);
    for (unsigned int j = 0; j < I.getWidth(); j++) {

      int u_round = mapU[i][j];
      int v_round = mapV[i][j];

      float du = mapDu[i][j];
      float dv = mapDv[i][j];

      if (0 <= u_round && 0 <= v_round && u_round < static_cast<int>(I.getWidth()) - 1
          && v_round < static_cast<int>(I.getHeight()) - 1) {

        float col0 = lerp(I[v_round][u_round], I[v_round][u_round + 1], du);
        float col1 = lerp(I[v_round + 1][u_round], I[v_round + 1][u_round + 1], du);
        float value = lerp(col0, col1, dv);

        Iundist[i][j] = static_cast<unsigned char>(value);
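
// vpRGBa variant: the SSE2 path packs the four channels of each corner pixel
// into one __m128 (note the _mm_set_ps argument order A, B, G, R, which puts R
// in the lowest lane) and blends all four channels with three vector lerps
// instead of twelve scalar ones.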
#if defined VISP_HAVE_SSE2
#if defined _OPENMP // only to disable warning: ignoring #pragma omp parallel [-Wunknown-pragmas]
#pragma omp parallel for schedule(dynamic)
#endif
  for (int i_ = 0; i_ < static_cast<int>(I.getHeight()); i_++) {
    const unsigned int i = static_cast<unsigned int>(i_);
    for (unsigned int j = 0; j < I.getWidth(); j++) {

      int u_round = mapU[i][j];
      int v_round = mapV[i][j];

      const __m128 vdu = _mm_set1_ps(mapDu[i][j]);
      const __m128 vdv = _mm_set1_ps(mapDv[i][j]);

      if (0 <= u_round && 0 <= v_round && u_round < static_cast<int>(I.getWidth()) - 1
          && v_round < static_cast<int>(I.getHeight()) - 1) {
#define VLERP(va, vb, vt) _mm_add_ps(va, _mm_mul_ps(_mm_sub_ps(vb, va), vt))

        const __m128 vdata1 =
            _mm_set_ps(static_cast<float>(I[v_round][u_round].A), static_cast<float>(I[v_round][u_round].B),
                       static_cast<float>(I[v_round][u_round].G), static_cast<float>(I[v_round][u_round].R));

        const __m128 vdata2 =
            _mm_set_ps(static_cast<float>(I[v_round][u_round + 1].A), static_cast<float>(I[v_round][u_round + 1].B),
                       static_cast<float>(I[v_round][u_round + 1].G), static_cast<float>(I[v_round][u_round + 1].R));

        const __m128 vdata3 =
            _mm_set_ps(static_cast<float>(I[v_round + 1][u_round].A), static_cast<float>(I[v_round + 1][u_round].B),
                       static_cast<float>(I[v_round + 1][u_round].G), static_cast<float>(I[v_round + 1][u_round].R));

        const __m128 vdata4 = _mm_set_ps(
            static_cast<float>(I[v_round + 1][u_round + 1].A), static_cast<float>(I[v_round + 1][u_round + 1].B),
            static_cast<float>(I[v_round + 1][u_round + 1].G), static_cast<float>(I[v_round + 1][u_round + 1].R));

        const __m128 vcol0 = VLERP(vdata1, vdata2, vdu);
        const __m128 vcol1 = VLERP(vdata3, vdata4, vdu);
        const __m128 vvalue = VLERP(vcol0, vcol1, vdv);

        float values[4];
        _mm_storeu_ps(values, vvalue);
        Iundist[i][j].R = static_cast<unsigned char>(values[0]);
        Iundist[i][j].G = static_cast<unsigned char>(values[1]);
        Iundist[i][j].B = static_cast<unsigned char>(values[2]);
        Iundist[i][j].A = static_cast<unsigned char>(values[3]);
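
// Scalar fallback for the vpRGBa remap: the same 2x2 bilinear blend applied
// channel by channel with the scalar lerp() defined above.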
#if defined _OPENMP // only to disable warning: ignoring #pragma omp parallel [-Wunknown-pragmas]
#pragma omp parallel for schedule(dynamic)
#endif
  for (int i_ = 0; i_ < static_cast<int>(I.getHeight()); i_++) {
    const unsigned int i = static_cast<unsigned int>(i_);
    for (unsigned int j = 0; j < I.getWidth(); j++) {

      int u_round = mapU[i][j];
      int v_round = mapV[i][j];

      float du = mapDu[i][j];
      float dv = mapDv[i][j];

      if (0 <= u_round && 0 <= v_round && u_round < static_cast<int>(I.getWidth()) - 1
          && v_round < static_cast<int>(I.getHeight()) - 1) {

        float col0 = lerp(I[v_round][u_round].R, I[v_round][u_round + 1].R, du);
        float col1 = lerp(I[v_round + 1][u_round].R, I[v_round + 1][u_round + 1].R, du);
        float value = lerp(col0, col1, dv);

        Iundist[i][j].R = static_cast<unsigned char>(value);

        col0 = lerp(I[v_round][u_round].G, I[v_round][u_round + 1].G, du);
        col1 = lerp(I[v_round + 1][u_round].G, I[v_round + 1][u_round + 1].G, du);
        value = lerp(col0, col1, dv);

        Iundist[i][j].G = static_cast<unsigned char>(value);

        col0 = lerp(I[v_round][u_round].B, I[v_round][u_round + 1].B, du);
        col1 = lerp(I[v_round + 1][u_round].B, I[v_round + 1][u_round + 1].B, du);
        value = lerp(col0, col1, dv);

        Iundist[i][j].B = static_cast<unsigned char>(value);

        col0 = lerp(I[v_round][u_round].A, I[v_round][u_round + 1].A, du);
        col1 = lerp(I[v_round + 1][u_round].A, I[v_round + 1][u_round + 1].A, du);
        value = lerp(col0, col1, dv);

        Iundist[i][j].A = static_cast<unsigned char>(value);