#ifndef GDALSSE_PRIV_H_INCLUDED
#define GDALSSE_PRIV_H_INCLUDED

#include "cpl_port.h"  // GInt16, GUInt16, GInt32, GInt64

/* We restrict to 64bit processors because they are guaranteed to have SSE2 */
#if (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) && \
    !defined(USE_SSE2_EMULATION)

/* Requires SSE2 */
#include <emmintrin.h>
#include <string.h>

#ifdef __SSE4_1__
#include <smmintrin.h>
#endif

#include "gdal_priv_templates.hpp"
static inline __m128i GDALCopyInt16ToXMM(const void *ptr)
{
    GInt16 s;
    memcpy(&s, ptr, 2);
    return _mm_cvtsi32_si128(s);
}

static inline __m128i GDALCopyInt32ToXMM(const void *ptr)
{
    GInt32 i;
    memcpy(&i, ptr, 4);
    return _mm_cvtsi32_si128(i);
}

static inline __m128i GDALCopyInt64ToXMM(const void *ptr)
{
#if defined(__i386__) || defined(_M_IX86)
    return _mm_loadl_epi64(static_cast<const __m128i *>(ptr));
#else
    GInt64 i;
    memcpy(&i, ptr, 8);
    return _mm_cvtsi64_si128(i);
#endif
}

static inline void GDALCopyXMMToInt16(const __m128i xmm, void *pDest)
{
    GInt16 i = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
    memcpy(pDest, &i, 2);
}
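
// Added note (not part of the original header): these helpers go through
// memcpy / _mm_cvtsi*_si128 rather than dereferencing casted pointers, so
// narrow or unaligned buffers can be moved into and out of XMM registers
// without alignment or strict-aliasing issues. A minimal sketch of the idea,
// with hypothetical variable names:
//
//   unsigned char pixels[2] = {12, 200};
//   __m128i v = GDALCopyInt16ToXMM(pixels);   // the 2 bytes land in the low 16 bits
//   short back;
//   GDALCopyXMMToInt16(v, &back);             // round-trips the same 2 bytes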
class XMMReg2Double
{
  public:
    __m128d xmm;

#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
#endif
    XMMReg2Double() = default;
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif

    XMMReg2Double(double val) : xmm(_mm_load_sd(&val))
    {
    }

    XMMReg2Double(const XMMReg2Double &other) : xmm(other.xmm)
    {
    }

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }
    static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }
    static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
                                       const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
                                          const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
                                        const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
                                        const XMMReg2Double &true_expr,
                                        const XMMReg2Double &false_expr)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_or_pd(_mm_and_pd(cond.xmm, true_expr.xmm),
                            _mm_andnot_pd(cond.xmm, false_expr.xmm));
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
        return reg;
    }
    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        xmm = _mm_load1_pd(ptr);
    }

    inline void nsLoad2Val(const double *ptr)
    {
        xmm = _mm_loadu_pd(ptr);
    }

    inline void nsLoad2ValAligned(const double *ptr)
    {
        xmm = _mm_load_pd(ptr);
    }

    inline void nsLoad2Val(const float *ptr)
    {
        xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
    }

    inline void nsLoad2Val(const unsigned char *ptr)
    {
        __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const short *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
#else
        /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
        xmm_i = _mm_unpacklo_epi16(xmm_i, xmm_i);
        /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
        xmm_i = _mm_srai_epi32(xmm_i, 16);
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const unsigned short *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
#else
        /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }
    static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#ifdef __SSE4_1__
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        low.xmm = _mm_cvtepi32_pd(xmm_i);
        high.xmm =
            _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i, _MM_SHUFFLE(3, 2, 3, 2)));
    }

    static inline void Load4Val(const short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const double *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const float *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        __m128 temp1 = _mm_loadu_ps(ptr);
        __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3, 2, 3, 2));
        low.xmm = _mm_cvtps_pd(temp1);
        high.xmm = _mm_cvtps_pd(temp2);
    }
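
    // Added note (not in the original header): the static Load4Val overloads
    // above split 4 consecutive source values across two XMMReg2Double
    // registers, e.g. (hypothetical caller):
    //
    //   XMMReg2Double lo, hi;
    //   XMMReg2Double::Load4Val(pabyPixels, lo, hi);  // pixels 0,1 -> lo ; 2,3 -> hi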
    inline void Zeroize()
    {
        xmm = _mm_setzero_pd();
    }

    inline XMMReg2Double &operator=(const XMMReg2Double &other)
    {
        xmm = other.xmm;
        return *this;
    }

    inline XMMReg2Double &operator+=(const XMMReg2Double &other)
    {
        xmm = _mm_add_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double &operator*=(const XMMReg2Double &other)
    {
        xmm = _mm_mul_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double operator+(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_add_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator-(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_sub_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator*(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_mul_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator/(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_div_pd(xmm, other.xmm);
        return ret;
    }
    inline double GetHorizSum() const
    {
        __m128d xmm2;
        xmm2 = _mm_shuffle_pd(
            xmm, xmm, _MM_SHUFFLE2(0, 1)); /* transfer high word into low word of xmm2 */
        return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
    }

    inline void Store2Val(double *ptr) const
    {
        _mm_storeu_pd(ptr, xmm);
    }

    inline void Store2ValAligned(double *ptr) const
    {
        _mm_store_pd(ptr, xmm);
    }

    inline void Store2Val(float *ptr) const
    {
        __m128i xmm_i = _mm_castps_si128(_mm_cvtpd_ps(xmm));
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
    }

    inline void Store2Val(unsigned char *ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
            xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        tmp = _mm_packs_epi32(tmp, tmp);
        tmp = _mm_packus_epi16(tmp, tmp);
        GDALCopyXMMToInt16(tmp, reinterpret_cast<GInt16 *>(ptr));
    }

    inline void Store2Val(unsigned short *ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
            xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        // X X X X 0 B 0 A --> X X X X A A B A
        tmp = _mm_shufflelo_epi16(tmp, 0 | (2 << 2));
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr),
                         _mm_castpd_si128(xmm));
    }
    inline operator double() const
    {
        return _mm_cvtsd_f64(xmm);
    }
};

#else  // USE_SSE2_EMULATION

#ifndef NO_WARN_USE_SSE2_EMULATION
#warning "Software emulation of SSE2 !"
#endif

class XMMReg2Double
{
  public:
    double low;
    double high;

    XMMReg2Double() = default;

    XMMReg2Double(double val)
    {
        low = val;
        high = 0.0;
    }

    XMMReg2Double(const XMMReg2Double &other) : low(other.low), high(other.high)
    {
    }

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }
    static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
                                       const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low == expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high == expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
                                          const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low != expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high != expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
                                        const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low > expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high > expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        int low1[2], high1[2];
        int low2[2], high2[2];
        memcpy(low1, &expr1.low, sizeof(double));
        memcpy(high1, &expr1.high, sizeof(double));
        memcpy(low2, &expr2.low, sizeof(double));
        memcpy(high2, &expr2.high, sizeof(double));
        low1[0] &= low2[0];
        low1[1] &= low2[1];
        high1[0] &= high2[0];
        high1[1] &= high2[1];
        memcpy(&reg.low, low1, sizeof(double));
        memcpy(&reg.high, high1, sizeof(double));
        return reg;
    }

    static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
                                        const XMMReg2Double &true_expr,
                                        const XMMReg2Double &false_expr)
    {
        XMMReg2Double reg;
        if (cond.low != 0)
            reg.low = true_expr.low;
        else
            reg.low = false_expr.low;
        if (cond.high != 0)
            reg.high = true_expr.high;
        else
            reg.high = false_expr.high;
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
        reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
        return reg;
    }
    static inline XMMReg2Double Load2Val(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        low = ptr[0];
        high = ptr[0];
    }

    inline void nsLoad2Val(const double *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2ValAligned(const double *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const float *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned char *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const short *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned short *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.low = ptr[0];
        low.high = ptr[1];
        high.low = ptr[2];
        high.high = ptr[3];
    }

    static inline void Load4Val(const short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const double *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const float *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }
    inline void Zeroize()
    {
        low = 0.0;
        high = 0.0;
    }

    inline XMMReg2Double &operator=(const XMMReg2Double &other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg2Double &operator+=(const XMMReg2Double &other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg2Double &operator*=(const XMMReg2Double &other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg2Double operator+(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg2Double operator-(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg2Double operator*(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg2Double operator/(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    inline double GetHorizSum() const
    {
        return low + high;
    }

    inline void Store2Val(double *ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2ValAligned(double *ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2Val(float *ptr) const
    {
        ptr[0] = static_cast<float>(low);
        ptr[1] = static_cast<float>(high);
    }

    void Store2Val(unsigned char *ptr) const
    {
        ptr[0] = (unsigned char)(low + 0.5);
        ptr[1] = (unsigned char)(high + 0.5);
    }

    void Store2Val(unsigned short *ptr) const
    {
        ptr[0] = (GUInt16)(low + 0.5);
        ptr[1] = (GUInt16)(high + 0.5);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        memcpy(ptr, &low, 8);
        memcpy(ptr + 8, &high, 8);
    }

    inline operator double() const
    {
        return low;
    }
};

#endif /* defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2) */
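
// Illustrative usage sketch (added commentary, not part of the original GDAL
// header). Shows how the 2-double wrapper is typically consumed; the function
// and variable names below are hypothetical:
//
//   static inline double WeightedSum2(const double *padfVals,
//                                     const double *padfWeights)
//   {
//       XMMReg2Double v = XMMReg2Double::Load2Val(padfVals);
//       XMMReg2Double w = XMMReg2Double::Load2Val(padfWeights);
//       // padfVals[0]*padfWeights[0] + padfVals[1]*padfWeights[1]
//       return (v * w).GetHorizSum();
//   }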
#if defined(__AVX__) && !defined(USE_SSE2_EMULATION)

#include <immintrin.h>

class XMMReg4Double
{
  public:
    __m256d ymm;

    XMMReg4Double() : ymm(_mm256_setzero_pd())
    {
    }

    XMMReg4Double(const XMMReg4Double &other) : ymm(other.ymm)
    {
    }

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.Zeroize();
        return reg;
    }

    inline void Zeroize()
    {
        ymm = _mm256_setzero_pd();
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        ymm = _mm256_set1_pd(*ptr);
    }

    static inline XMMReg4Double Load4Val(const unsigned char *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned char *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const short *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const short *ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline XMMReg4Double Load4Val(const unsigned short *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned short *ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(
            xmm_i); /* signed conversion is fine: values are within [0, 65535] */
    }

    static inline XMMReg4Double Load4Val(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const double *ptr)
    {
        ymm = _mm256_loadu_pd(ptr);
    }

    static inline XMMReg4Double Load4ValAligned(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4ValAligned(ptr);
        return reg;
    }

    inline void nsLoad4ValAligned(const double *ptr)
    {
        ymm = _mm256_load_pd(ptr);
    }

    static inline XMMReg4Double Load4Val(const float *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const float *ptr)
    {
        ymm = _mm256_cvtps_pd(_mm_loadu_ps(ptr));
    }
    static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
                                       const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
                                          const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
                                        const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
                                        const XMMReg4Double &true_expr,
                                        const XMMReg4Double &false_expr)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_or_pd(_mm256_and_pd(cond.ymm, true_expr.ymm),
                               _mm256_andnot_pd(cond.ymm, false_expr.ymm));
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    inline XMMReg4Double &operator=(const XMMReg4Double &other)
    {
        ymm = other.ymm;
        return *this;
    }

    inline XMMReg4Double &operator+=(const XMMReg4Double &other)
    {
        ymm = _mm256_add_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double &operator*=(const XMMReg4Double &other)
    {
        ymm = _mm256_mul_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double operator+(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_add_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator-(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_sub_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator*(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_mul_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator/(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_div_pd(ymm, other.ymm);
        return ret;
    }

    void AddToLow(const XMMReg2Double &other)
    {
        __m256d ymm2 = _mm256_setzero_pd();
        ymm2 = _mm256_insertf128_pd(ymm2, other.xmm, 0);
        ymm = _mm256_add_pd(ymm, ymm2);
    }

    inline double GetHorizSum() const
    {
        __m256d ymm_tmp1, ymm_tmp2;
        ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);
        ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1);
        ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);
        return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));
    }

    inline void Store4Val(unsigned char *ptr) const
    {
        __m128i xmm_i =
            _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        xmm_i =
            _mm_shuffle_epi8(xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) |
                                                      (12 << 24)));  // SSSE3
        GDALCopyXMMToInt32(xmm_i, reinterpret_cast<GInt32 *>(ptr));
    }

    inline void Store4Val(unsigned short *ptr) const
    {
        __m128i xmm_i =
            _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        xmm_i = _mm_packus_epi32(xmm_i, xmm_i);  // Pack int32 to uint16
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
    }

    inline void Store4Val(float *ptr) const
    {
        _mm_storeu_ps(ptr, _mm256_cvtpd_ps(ymm));
    }

    inline void Store4Val(double *ptr) const
    {
        _mm256_storeu_pd(ptr, ymm);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr),
                            _mm256_castpd_si256(ymm));
    }
};

#else  // !(defined(__AVX__) && !defined(USE_SSE2_EMULATION))
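
// Added note (not in the original header): without AVX, XMMReg4Double is
// composed of two XMMReg2Double registers ("low" holds elements 0-1, "high"
// holds elements 2-3) while exposing the same interface as the AVX version
// above.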
class XMMReg4Double
{
  public:
    XMMReg2Double low, high;

#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
#endif
    XMMReg4Double() = default;
#if defined(__GNUC__)
#pragma GCC diagnostic pop
#endif

    XMMReg4Double(const XMMReg4Double &other) : low(other.low), high(other.high)
    {
    }

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.low.Zeroize();
        reg.high.Zeroize();
        return reg;
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad1ValHighAndLow(ptr);
        reg.high = reg.low;
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned char *ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const short *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned short *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4ValAligned(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2ValAligned(ptr);
        reg.high.nsLoad2ValAligned(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const float *ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }
    static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
                                       const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
                                          const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
                                        const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::And(expr1.low, expr2.low);
        reg.high = XMMReg2Double::And(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
                                        const XMMReg4Double &true_expr,
                                        const XMMReg4Double &false_expr)
    {
        XMMReg4Double reg;
        reg.low =
            XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
        reg.high =
            XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
        return reg;
    }
    inline XMMReg4Double &operator=(const XMMReg4Double &other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg4Double &operator+=(const XMMReg4Double &other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg4Double &operator*=(const XMMReg4Double &other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg4Double operator+(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg4Double operator-(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg4Double operator*(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg4Double operator/(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    void AddToLow(const XMMReg2Double &other)
    {
        low += other;
    }

    inline double GetHorizSum() const
    {
        return (low + high).GetHorizSum();
    }
    inline void Store4Val(unsigned char *ptr) const
    {
#ifdef USE_SSE2_EMULATION
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
#else
        __m128i tmpLow = _mm_cvttpd_epi32(_mm_add_pd(
            low.xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        __m128i tmpHigh = _mm_cvttpd_epi32(_mm_add_pd(
            high.xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        auto tmp = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmpLow),
                                                   _mm_castsi128_ps(tmpHigh),
                                                   _MM_SHUFFLE(1, 0, 1, 0)));
        tmp = _mm_packs_epi32(tmp, tmp);
        tmp = _mm_packus_epi16(tmp, tmp);
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
#endif
    }

    inline void Store4Val(unsigned short *ptr) const
    {
#ifdef USE_SSE2_EMULATION
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
#else
        __m128i xmm0 = _mm_cvtpd_epi32(low.xmm);
        __m128i xmm1 = _mm_cvtpd_epi32(high.xmm);
        xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));
#ifdef __SSE4_1__
        xmm0 = _mm_packus_epi32(xmm0, xmm0);  // Pack int32 to uint16
#else
        xmm0 = _mm_add_epi32(xmm0, _mm_set1_epi32(-32768));
        xmm0 = _mm_packs_epi32(xmm0, xmm0);
        xmm0 = _mm_sub_epi16(xmm0, _mm_set1_epi16(-32768));
#endif
        GDALCopyXMMToInt64(xmm0, (GInt64 *)ptr);
#endif
    }

    inline void Store4Val(float *ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
    }

    inline void Store4Val(double *ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        low.StoreMask(ptr);
        high.StoreMask(ptr + 16);
    }
};

#endif /* defined(__AVX__) && !defined(USE_SSE2_EMULATION) */

#endif /* GDALSSE_PRIV_H_INCLUDED */
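
// Illustrative usage sketch (added commentary, not part of the original GDAL
// header). Processing 4 pixels at once works identically whether the AVX or
// the paired-SSE2 implementation of XMMReg4Double is compiled in; the names
// below are hypothetical:
//
//   static inline void ScaleBytes4(const unsigned char *pabySrc, double dfScale,
//                                  unsigned char *pabyDst)
//   {
//       XMMReg4Double v = XMMReg4Double::Load4Val(pabySrc);       // 4 uint8 -> 4 double
//       XMMReg4Double s = XMMReg4Double::Load1ValHighAndLow(&dfScale);
//       (v * s).Store4Val(pabyDst);                               // rounds to nearest and stores 4 uint8
//   }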