#ifndef GDALSSE_PRIV_H_INCLUDED
#define GDALSSE_PRIV_H_INCLUDED

#include "cpl_port.h"

/* x86_64 / _M_X64 targets are guaranteed to support SSE2. */
#if (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) && \
    !defined(USE_SSE2_EMULATION)

/* SSE2 intrinsics */
#include <emmintrin.h>
#include <string.h> /* for memcpy() */

#ifdef __SSE4_1__
/* SSE4.1 intrinsics */
#include <smmintrin.h>
#endif

#include "gdal_priv_templates.hpp"
static inline __m128i GDALCopyInt16ToXMM(const void *ptr)
{
    unsigned short s;
    memcpy(&s, ptr, 2);
    return _mm_cvtsi32_si128(s);
}

static inline __m128i GDALCopyInt32ToXMM(const void *ptr)
{
    GInt32 i;
    memcpy(&i, ptr, 4);
    return _mm_cvtsi32_si128(i);
}

static inline __m128i GDALCopyInt64ToXMM(const void *ptr)
{
#if defined(__i386__) || defined(_M_IX86)
    return _mm_loadl_epi64(static_cast<const __m128i *>(ptr));
#else
    GInt64 i;
    memcpy(&i, ptr, 8);
    return _mm_cvtsi64_si128(i);
#endif
}

static inline void GDALCopyXMMToInt16(const __m128i xmm, void *pDest)
{
    GInt16 i = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
    memcpy(pDest, &i, 2);
}
class XMMReg2Double
{
  public:
    __m128d xmm;

#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
#endif
    XMMReg2Double() = default;
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif

    XMMReg2Double(double val) : xmm(_mm_load_sd(&val))
    {
    }

    XMMReg2Double(const XMMReg2Double &other) : xmm(other.xmm)
    {
    }

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }
    static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
                                       const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
                                          const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
                                        const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
                                        const XMMReg2Double &true_expr,
                                        const XMMReg2Double &false_expr)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_or_pd(_mm_and_pd(cond.xmm, true_expr.xmm),
                            _mm_andnot_pd(cond.xmm, false_expr.xmm));
        return reg;
    }
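    // Ternary() above is a branchless per-lane select: `cond` is expected to
    // be an all-ones / all-zeros mask (as produced by Equals/NotEquals/
    // Greater), so (cond & true_expr) | (~cond & false_expr) keeps true_expr
    // where the mask is set and false_expr elsewhere.
    // Illustrative sketch (hypothetical caller code, not part of this header):
    //   XMMReg2Double mask = XMMReg2Double::Greater(a, b);
    //   XMMReg2Double res = XMMReg2Double::Ternary(mask, b, a);  // min(a, b)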
    static inline XMMReg2Double Min(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        xmm = _mm_load1_pd(ptr);
    }

    inline void nsLoad2Val(const double *ptr)
    {
        xmm = _mm_loadu_pd(ptr);
    }

    inline void nsLoad2ValAligned(const double *ptr)
    {
        xmm = _mm_load_pd(ptr);
    }

    inline void nsLoad2Val(const float *ptr)
    {
        xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
    }

    inline void nsLoad2Val(const unsigned char *ptr)
    {
        __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const short *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
#else
        /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
        xmm_i = _mm_unpacklo_epi16(xmm_i, xmm_i);
        /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
        xmm_i = _mm_srai_epi32(xmm_i, 16);
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const unsigned short *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
#else
        /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }
    static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        low.xmm = _mm_cvtepi32_pd(xmm_i);
        high.xmm =
            _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i, _MM_SHUFFLE(3, 2, 3, 2)));
    }

    static inline void Load4Val(const short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const double *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const float *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        __m128 temp1 = _mm_loadu_ps(ptr);
        __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3, 2, 3, 2));
        low.xmm = _mm_cvtps_pd(temp1);
        high.xmm = _mm_cvtps_pd(temp2);
    }
    inline void Zeroize()
    {
        xmm = _mm_setzero_pd();
    }

    inline XMMReg2Double &operator=(const XMMReg2Double &other)
    {
        xmm = other.xmm;
        return *this;
    }

    inline XMMReg2Double &operator+=(const XMMReg2Double &other)
    {
        xmm = _mm_add_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double &operator*=(const XMMReg2Double &other)
    {
        xmm = _mm_mul_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double operator+(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_add_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator-(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_sub_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator*(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_mul_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator/(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_div_pd(xmm, other.xmm);
        return ret;
    }
    inline double GetHorizSum() const
    {
        __m128d xmm2;
        /* Transfer the high word into the low word of xmm2 */
        xmm2 = _mm_shuffle_pd(xmm, xmm, _MM_SHUFFLE2(0, 1));
        return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
    }
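    // GetHorizSum() adds the two lanes of the register: the shuffle moves the
    // high lane into the low position, so the scalar result is xmm[0] + xmm[1].
    // Illustrative sketch (hypothetical caller code, not part of this header):
    //   XMMReg2Double v = XMMReg2Double::Load2Val(padfVals);
    //   double dfSum = v.GetHorizSum();  // padfVals[0] + padfVals[1]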
    inline void Store2Val(double *ptr) const
    {
        _mm_storeu_pd(ptr, xmm);
    }

    inline void Store2ValAligned(double *ptr) const
    {
        _mm_store_pd(ptr, xmm);
    }

    inline void Store2Val(float *ptr) const
    {
        __m128i xmm_i = _mm_castps_si128(_mm_cvtpd_ps(xmm));
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
    }

    inline void Store2Val(unsigned char *ptr) const
    {
        /* Convert the 2 double values to 2 integers with rounding */
        __m128i tmp =
            _mm_cvttpd_epi32(_mm_add_pd(xmm, _mm_set1_pd(0.5)));
        tmp = _mm_packs_epi32(tmp, tmp);
        tmp = _mm_packus_epi16(tmp, tmp);
        GDALCopyXMMToInt16(tmp, reinterpret_cast<GInt16 *>(ptr));
    }
    inline void Store2Val(unsigned short *ptr) const
    {
        /* Convert the 2 double values to 2 integers with rounding */
        __m128i tmp =
            _mm_cvttpd_epi32(_mm_add_pd(xmm, _mm_set1_pd(0.5)));
        /* X X X X 0 B 0 A --> X X X X A A B A */
        tmp = _mm_shufflelo_epi16(tmp, 0 | (2 << 2));
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr),
                         _mm_castpd_si128(xmm));
    }

    inline operator double() const
    {
        return _mm_cvtsd_f64(xmm);
    }
};

#else /* software emulation of SSE2 */
#ifndef NO_WARN_USE_SSE2_EMULATION
#warning "Software emulation of SSE2 !"
#endif

class XMMReg2Double
{
  public:
    double low;
    double high;

    XMMReg2Double() = default;

    XMMReg2Double(double val)
    {
        low = val;
        high = 0.0;
    }

    XMMReg2Double(const XMMReg2Double &other) : low(other.low), high(other.high)
    {
    }

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }
    static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
                                       const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low == expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high == expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
                                          const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low != expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high != expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
                                        const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low > expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high > expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }
    static inline XMMReg2Double And(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        int low1[2], high1[2];
        int low2[2], high2[2];
        memcpy(low1, &expr1.low, sizeof(double));
        memcpy(high1, &expr1.high, sizeof(double));
        memcpy(low2, &expr2.low, sizeof(double));
        memcpy(high2, &expr2.high, sizeof(double));
        low1[0] &= low2[0];
        low1[1] &= low2[1];
        high1[0] &= high2[0];
        high1[1] &= high2[1];
        memcpy(&reg.low, low1, sizeof(double));
        memcpy(&reg.high, high1, sizeof(double));
        return reg;
    }

    static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
                                        const XMMReg2Double &true_expr,
                                        const XMMReg2Double &false_expr)
    {
        XMMReg2Double reg;
        if (cond.low != 0)
            reg.low = true_expr.low;
        else
            reg.low = false_expr.low;
        if (cond.high != 0)
            reg.high = true_expr.high;
        else
            reg.high = false_expr.high;
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
        reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
        return reg;
    }
    static inline XMMReg2Double Load2Val(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        low = ptr[0];
        high = ptr[0];
    }

    inline void nsLoad2Val(const double *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2ValAligned(const double *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const float *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned char *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const short *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned short *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.low = ptr[0];
        low.high = ptr[1];
        high.low = ptr[2];
        high.high = ptr[3];
    }

    static inline void Load4Val(const short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const double *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const float *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }
    inline void Zeroize()
    {
        low = 0.0;
        high = 0.0;
    }

    inline XMMReg2Double &operator=(const XMMReg2Double &other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg2Double &operator+=(const XMMReg2Double &other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg2Double &operator*=(const XMMReg2Double &other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg2Double operator+(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg2Double operator-(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg2Double operator*(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg2Double operator/(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    inline double GetHorizSum() const
    {
        return low + high;
    }

    inline void Store2Val(double *ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2ValAligned(double *ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }
    inline void Store2Val(float *ptr) const
    {
        ptr[0] = static_cast<float>(low);
        ptr[1] = static_cast<float>(high);
    }

    void Store2Val(unsigned char *ptr) const
    {
        ptr[0] = (unsigned char)(low + 0.5);
        ptr[1] = (unsigned char)(high + 0.5);
    }

    void Store2Val(unsigned short *ptr) const
    {
        ptr[0] = (GUInt16)(low + 0.5);
        ptr[1] = (GUInt16)(high + 0.5);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        memcpy(ptr, &low, 8);
        memcpy(ptr + 8, &high, 8);
    }

    inline operator double() const
    {
        return low;
    }
};

#endif /* defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2) */
#if defined(__AVX__) && !defined(USE_SSE2_EMULATION)

#include <immintrin.h>

class XMMReg4Double
{
  public:
    __m256d ymm;

    XMMReg4Double() : ymm(_mm256_setzero_pd())
    {
    }

    XMMReg4Double(const XMMReg4Double &other) : ymm(other.ymm)
    {
    }

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.Zeroize();
        return reg;
    }

    inline void Zeroize()
    {
        ymm = _mm256_setzero_pd();
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        ymm = _mm256_set1_pd(*ptr);
    }
    static inline XMMReg4Double Load4Val(const unsigned char *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned char *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const short *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const short *ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const unsigned short *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned short *ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
        /* Signed conversion is fine here: the values are in the unsigned
         * short range, so they cannot be interpreted as negative int32. */
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const double *ptr)
    {
        ymm = _mm256_loadu_pd(ptr);
    }

    static inline XMMReg4Double Load4ValAligned(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4ValAligned(ptr);
        return reg;
    }

    inline void nsLoad4ValAligned(const double *ptr)
    {
        ymm = _mm256_load_pd(ptr);
    }

    static inline XMMReg4Double Load4Val(const float *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const float *ptr)
    {
        ymm = _mm256_cvtps_pd(_mm_loadu_ps(ptr));
    }
    static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
                                       const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
                                          const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
                                        const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
                                        const XMMReg4Double &true_expr,
                                        const XMMReg4Double &false_expr)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_or_pd(_mm256_and_pd(cond.ymm, true_expr.ymm),
                               _mm256_andnot_pd(cond.ymm, false_expr.ymm));
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);
        return reg;
    }
    inline XMMReg4Double &operator=(const XMMReg4Double &other)
    {
        ymm = other.ymm;
        return *this;
    }

    inline XMMReg4Double &operator+=(const XMMReg4Double &other)
    {
        ymm = _mm256_add_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double &operator*=(const XMMReg4Double &other)
    {
        ymm = _mm256_mul_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double operator+(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_add_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator-(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_sub_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator*(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_mul_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator/(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_div_pd(ymm, other.ymm);
        return ret;
    }

    void AddToLow(const XMMReg2Double &other)
    {
        __m256d ymm2 = _mm256_setzero_pd();
        ymm2 = _mm256_insertf128_pd(ymm2, other.xmm, 0);
        ymm = _mm256_add_pd(ymm, ymm2);
    }
    inline double GetHorizSum() const
    {
        __m256d ymm_tmp1, ymm_tmp2;
        ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);
        ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1);
        ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);
        return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));
    }
    inline void Store4Val(unsigned char *ptr) const
    {
        __m128i xmm_i =
            _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        /* SSSE3: gather the low byte of each 32-bit lane into bytes 0..3 */
        xmm_i = _mm_shuffle_epi8(xmm_i,
                                 _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) |
                                                   (12 << 24)));
        GDALCopyXMMToInt32(xmm_i, reinterpret_cast<GInt32 *>(ptr));
    }
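    // The shuffle control word above selects source bytes 0, 4, 8 and 12,
    // i.e. the least significant byte of each rounded int32, so the four
    // values are narrowed to uint8 with a single _mm_shuffle_epi8 (the values
    // are assumed to already be within the 0..255 range).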
    inline void Store4Val(unsigned short *ptr) const
    {
        __m128i xmm_i =
            _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        xmm_i = _mm_packus_epi32(xmm_i, xmm_i); /* Pack uint32 to uint16 */
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
    }
    inline void Store4Val(float *ptr) const
    {
        _mm_storeu_ps(ptr, _mm256_cvtpd_ps(ymm));
    }

    inline void Store4Val(double *ptr) const
    {
        _mm256_storeu_pd(ptr, ymm);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr),
                            _mm256_castpd_si256(ymm));
    }
};

#else /* no AVX: compose XMMReg4Double from two XMMReg2Double */
class XMMReg4Double
{
  public:
    XMMReg2Double low, high;

#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
#endif
    XMMReg4Double() = default;
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif

    XMMReg4Double(const XMMReg4Double &other) : low(other.low), high(other.high)
    {
    }

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.low.Zeroize();
        reg.high.Zeroize();
        return reg;
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad1ValHighAndLow(ptr);
        reg.high = reg.low;
        return reg;
    }
    static inline XMMReg4Double Load4Val(const unsigned char *ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const short *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned short *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4ValAligned(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2ValAligned(ptr);
        reg.high.nsLoad2ValAligned(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const float *ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
                                       const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
                                          const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
                                        const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::And(expr1.low, expr2.low);
        reg.high = XMMReg2Double::And(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
                                        const XMMReg4Double &true_expr,
                                        const XMMReg4Double &false_expr)
    {
        XMMReg4Double reg;
        reg.low =
            XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
        reg.high =
            XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
        return reg;
    }
    inline XMMReg4Double &operator=(const XMMReg4Double &other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg4Double &operator+=(const XMMReg4Double &other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg4Double &operator*=(const XMMReg4Double &other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg4Double operator+(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg4Double operator-(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg4Double operator*(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg4Double operator/(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    void AddToLow(const XMMReg2Double &other)
    {
        low += other;
    }

    inline double GetHorizSum() const
    {
        return (low + high).GetHorizSum();
    }
    inline void Store4Val(unsigned char *ptr) const
    {
#ifdef USE_SSE2_EMULATION
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
#else
        /* Convert the 4 double values to 4 integers with rounding */
        __m128i tmpLow =
            _mm_cvttpd_epi32(_mm_add_pd(low.xmm, _mm_set1_pd(0.5)));
        __m128i tmpHigh =
            _mm_cvttpd_epi32(_mm_add_pd(high.xmm, _mm_set1_pd(0.5)));
        auto tmp = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmpLow),
                                                   _mm_castsi128_ps(tmpHigh),
                                                   _MM_SHUFFLE(1, 0, 1, 0)));
        tmp = _mm_packs_epi32(tmp, tmp);
        tmp = _mm_packus_epi16(tmp, tmp);
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
#endif
    }
    inline void Store4Val(unsigned short *ptr) const
    {
#if 1
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
#else
        __m128i xmm0 = _mm_cvtpd_epi32(low.xmm);
        __m128i xmm1 = _mm_cvtpd_epi32(high.xmm);
        xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));
#if __SSE4_1__
        xmm0 = _mm_packus_epi32(xmm0, xmm0); /* Pack uint32 to uint16 */
#else
        xmm0 = _mm_add_epi32(xmm0, _mm_set1_epi32(-32768));
        xmm0 = _mm_packs_epi32(xmm0, xmm0);
        xmm0 = _mm_sub_epi16(xmm0, _mm_set1_epi16(-32768));
#endif
        GDALCopyXMMToInt64(xmm0, (GInt64 *)ptr);
#endif
    }
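    // Note on the disabled branch above: SSE2 has no unsigned 32->16 pack
    // (_mm_packus_epi32 is SSE4.1 only), so the fallback biases the values by
    // -32768, uses the signed pack _mm_packs_epi32, then undoes the bias with
    // a 16-bit subtraction, which preserves the full 0..65535 range.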
    inline void Store4Val(float *ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
    }

    inline void Store4Val(double *ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        low.StoreMask(ptr);
        high.StoreMask(ptr + 16);
    }