GDAL: gdalsse_priv.h
1 /******************************************************************************
2  * $Id$
3  *
4  * Project: GDAL
5  * Purpose: SSE2 helper
6  * Author: Even Rouault <even dot rouault at spatialys dot com>
7  *
8  ******************************************************************************
9  * Copyright (c) 2014, Even Rouault <even dot rouault at spatialys dot com>
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a
12  * copy of this software and associated documentation files (the "Software"),
13  * to deal in the Software without restriction, including without limitation
14  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
15  * and/or sell copies of the Software, and to permit persons to whom the
16  * Software is furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included
19  * in all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
22  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
24  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27  * DEALINGS IN THE SOFTWARE.
28  ****************************************************************************/
29 
30 #ifndef GDALSSE_PRIV_H_INCLUDED
31 #define GDALSSE_PRIV_H_INCLUDED
32 
33 #ifndef DOXYGEN_SKIP
34 
35 #include "cpl_port.h"
36 
37 /* We restrict to 64 bit processors because they are guaranteed to have SSE2 */
38 /* It could possibly be used on 32 bit too, but we would need a runtime SSE2 check */
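/* (Illustration only, not part of GDAL: on GCC/Clang such a runtime test could
   be written as if (__builtin_cpu_supports("sse2")) ...; this header instead
   relies on the compile-time switches below.) */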
39 #if (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) && \
40  !defined(USE_SSE2_EMULATION)
41 
42 /* Requires SSE2 */
43 #include <emmintrin.h>
44 #include <string.h>
45 
46 #ifdef __SSE4_1__
47 #include <smmintrin.h>
48 #endif
49 
50 #include "gdal_priv_templates.hpp"
51 
52 static inline __m128i GDALCopyInt16ToXMM(const void *ptr)
53 {
54  unsigned short s;
55  memcpy(&s, ptr, 2);
56  return _mm_cvtsi32_si128(s);
57 }
58 
59 static inline __m128i GDALCopyInt32ToXMM(const void *ptr)
60 {
61  GInt32 i;
62  memcpy(&i, ptr, 4);
63  return _mm_cvtsi32_si128(i);
64 }
65 
66 static inline __m128i GDALCopyInt64ToXMM(const void *ptr)
67 {
68 #if defined(__i386__) || defined(_M_IX86)
69  return _mm_loadl_epi64(static_cast<const __m128i *>(ptr));
70 #else
71  GInt64 i;
72  memcpy(&i, ptr, 8);
73  return _mm_cvtsi64_si128(i);
74 #endif
75 }
76 
77 static inline void GDALCopyXMMToInt16(const __m128i xmm, void *pDest)
78 {
79  GInt16 i = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
80  memcpy(pDest, &i, 2);
81 }
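/* Note: the GDALCopyXMMToInt32() and GDALCopyXMMToInt64() store helpers used
 * further down are not defined in this listing; they are presumably provided
 * by gdal_priv_templates.hpp included above. A minimal sketch of the expected
 * behaviour (mirroring GDALCopyXMMToInt16 above, shown for reference only):
 *
 *   static inline void GDALCopyXMMToInt32(const __m128i xmm, void *pDest)
 *   {
 *       GInt32 i = _mm_cvtsi128_si32(xmm);  // low 32 bits of the register
 *       memcpy(pDest, &i, 4);               // unaligned-safe store
 *   }
 *
 *   static inline void GDALCopyXMMToInt64(const __m128i xmm, void *pDest)
 *   {
 *       _mm_storel_epi64(static_cast<__m128i *>(pDest), xmm);  // low 64 bits
 *   }
 */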
82 
83 class XMMReg2Double
84 {
85  public:
86  __m128d xmm;
87 
88 #if defined(__GNUC__)
89 #pragma GCC diagnostic push
90 #pragma GCC diagnostic ignored "-Weffc++"
91 #endif
92  /* coverity[uninit_member] */
93  XMMReg2Double() = default;
94 #if defined(__GNUC__)
95 #pragma GCC diagnostic pop
96 #endif
97 
98  XMMReg2Double(double val) : xmm(_mm_load_sd(&val))
99  {
100  }
101  XMMReg2Double(const XMMReg2Double &other) : xmm(other.xmm)
102  {
103  }
104 
105  static inline XMMReg2Double Zero()
106  {
107  XMMReg2Double reg;
108  reg.Zeroize();
109  return reg;
110  }
111 
112  static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
113  {
114  XMMReg2Double reg;
115  reg.nsLoad1ValHighAndLow(ptr);
116  return reg;
117  }
118 
119  static inline XMMReg2Double Load2Val(const double *ptr)
120  {
121  XMMReg2Double reg;
122  reg.nsLoad2Val(ptr);
123  return reg;
124  }
125 
126  static inline XMMReg2Double Load2Val(const float *ptr)
127  {
128  XMMReg2Double reg;
129  reg.nsLoad2Val(ptr);
130  return reg;
131  }
132 
133  static inline XMMReg2Double Load2ValAligned(const double *ptr)
134  {
135  XMMReg2Double reg;
136  reg.nsLoad2ValAligned(ptr);
137  return reg;
138  }
139 
140  static inline XMMReg2Double Load2Val(const unsigned char *ptr)
141  {
142  XMMReg2Double reg;
143  reg.nsLoad2Val(ptr);
144  return reg;
145  }
146 
147  static inline XMMReg2Double Load2Val(const short *ptr)
148  {
149  XMMReg2Double reg;
150  reg.nsLoad2Val(ptr);
151  return reg;
152  }
153 
154  static inline XMMReg2Double Load2Val(const unsigned short *ptr)
155  {
156  XMMReg2Double reg;
157  reg.nsLoad2Val(ptr);
158  return reg;
159  }
160 
161  static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
162  const XMMReg2Double &expr2)
163  {
164  XMMReg2Double reg;
165  reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
166  return reg;
167  }
168 
169  static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
170  const XMMReg2Double &expr2)
171  {
172  XMMReg2Double reg;
173  reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
174  return reg;
175  }
176 
177  static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
178  const XMMReg2Double &expr2)
179  {
180  XMMReg2Double reg;
181  reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
182  return reg;
183  }
184 
185  static inline XMMReg2Double And(const XMMReg2Double &expr1,
186  const XMMReg2Double &expr2)
187  {
188  XMMReg2Double reg;
189  reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
190  return reg;
191  }
192 
193  static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
194  const XMMReg2Double &true_expr,
195  const XMMReg2Double &false_expr)
196  {
197  XMMReg2Double reg;
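 /* cond is expected to be an all-ones / all-zeros per-lane mask (e.g. the
    result of Equals() or Greater()): lanes where the mask is set come from
    true_expr, the others from false_expr */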
198  reg.xmm = _mm_or_pd(_mm_and_pd(cond.xmm, true_expr.xmm),
199  _mm_andnot_pd(cond.xmm, false_expr.xmm));
200  return reg;
201  }
202 
203  static inline XMMReg2Double Min(const XMMReg2Double &expr1,
204  const XMMReg2Double &expr2)
205  {
206  XMMReg2Double reg;
207  reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
208  return reg;
209  }
210 
211  inline void nsLoad1ValHighAndLow(const double *ptr)
212  {
213  xmm = _mm_load1_pd(ptr);
214  }
215 
216  inline void nsLoad2Val(const double *ptr)
217  {
218  xmm = _mm_loadu_pd(ptr);
219  }
220 
221  inline void nsLoad2ValAligned(const double *ptr)
222  {
223  xmm = _mm_load_pd(ptr);
224  }
225 
226  inline void nsLoad2Val(const float *ptr)
227  {
228  xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
229  }
230 
231  inline void nsLoad2Val(const unsigned char *ptr)
232  {
233  __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
234 #ifdef __SSE4_1__
235  xmm_i = _mm_cvtepu8_epi32(xmm_i);
236 #else
237  xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
238  xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
239 #endif
240  xmm = _mm_cvtepi32_pd(xmm_i);
241  }
242 
243  inline void nsLoad2Val(const short *ptr)
244  {
245  __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
246 #ifdef __SSE4_1__
247  xmm_i = _mm_cvtepi16_epi32(xmm_i);
248 #else
249  xmm_i = _mm_unpacklo_epi16(
250  xmm_i, xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
251  xmm_i = _mm_srai_epi32(
252  xmm_i, 16); /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
253 #endif
254  xmm = _mm_cvtepi32_pd(xmm_i);
255  }
256 
257  inline void nsLoad2Val(const unsigned short *ptr)
258  {
259  __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
260 #ifdef __SSE4_1__
261  xmm_i = _mm_cvtepu16_epi32(xmm_i);
262 #else
263  xmm_i = _mm_unpacklo_epi16(
264  xmm_i,
265  _mm_setzero_si128()); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
266 #endif
267  xmm = _mm_cvtepi32_pd(xmm_i);
268  }
269 
270  static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
271  XMMReg2Double &high)
272  {
273  __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
274 #ifdef __SSE4_1__
275  xmm_i = _mm_cvtepu8_epi32(xmm_i);
276 #else
277  xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
278  xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
279 #endif
280  low.xmm = _mm_cvtepi32_pd(xmm_i);
281  high.xmm =
282  _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i, _MM_SHUFFLE(3, 2, 3, 2)));
283  }
284 
285  static inline void Load4Val(const short *ptr, XMMReg2Double &low,
286  XMMReg2Double &high)
287  {
288  low.nsLoad2Val(ptr);
289  high.nsLoad2Val(ptr + 2);
290  }
291 
292  static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
293  XMMReg2Double &high)
294  {
295  low.nsLoad2Val(ptr);
296  high.nsLoad2Val(ptr + 2);
297  }
298 
299  static inline void Load4Val(const double *ptr, XMMReg2Double &low,
300  XMMReg2Double &high)
301  {
302  low.nsLoad2Val(ptr);
303  high.nsLoad2Val(ptr + 2);
304  }
305 
306  static inline void Load4Val(const float *ptr, XMMReg2Double &low,
307  XMMReg2Double &high)
308  {
309  __m128 temp1 = _mm_loadu_ps(ptr);
310  __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3, 2, 3, 2));
311  low.xmm = _mm_cvtps_pd(temp1);
312  high.xmm = _mm_cvtps_pd(temp2);
313  }
314 
315  inline void Zeroize()
316  {
317  xmm = _mm_setzero_pd();
318  }
319 
320  inline XMMReg2Double &operator=(const XMMReg2Double &other)
321  {
322  xmm = other.xmm;
323  return *this;
324  }
325 
326  inline XMMReg2Double &operator+=(const XMMReg2Double &other)
327  {
328  xmm = _mm_add_pd(xmm, other.xmm);
329  return *this;
330  }
331 
332  inline XMMReg2Double &operator*=(const XMMReg2Double &other)
333  {
334  xmm = _mm_mul_pd(xmm, other.xmm);
335  return *this;
336  }
337 
338  inline XMMReg2Double operator+(const XMMReg2Double &other) const
339  {
340  XMMReg2Double ret;
341  ret.xmm = _mm_add_pd(xmm, other.xmm);
342  return ret;
343  }
344 
345  inline XMMReg2Double operator-(const XMMReg2Double &other) const
346  {
347  XMMReg2Double ret;
348  ret.xmm = _mm_sub_pd(xmm, other.xmm);
349  return ret;
350  }
351 
352  inline XMMReg2Double operator*(const XMMReg2Double &other) const
353  {
354  XMMReg2Double ret;
355  ret.xmm = _mm_mul_pd(xmm, other.xmm);
356  return ret;
357  }
358 
359  inline XMMReg2Double operator/(const XMMReg2Double &other) const
360  {
361  XMMReg2Double ret;
362  ret.xmm = _mm_div_pd(xmm, other.xmm);
363  return ret;
364  }
365 
366  inline double GetHorizSum() const
367  {
368  __m128d xmm2;
369  xmm2 = _mm_shuffle_pd(
370  xmm, xmm,
371  _MM_SHUFFLE2(0, 1)); /* transfer high word into low word of xmm2 */
372  return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
373  }
374 
375  inline void Store2Val(double *ptr) const
376  {
377  _mm_storeu_pd(ptr, xmm);
378  }
379 
380  inline void Store2ValAligned(double *ptr) const
381  {
382  _mm_store_pd(ptr, xmm);
383  }
384 
385  inline void Store2Val(float *ptr) const
386  {
387  __m128i xmm_i = _mm_castps_si128(_mm_cvtpd_ps(xmm));
388  GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
389  }
390 
391  inline void Store2Val(unsigned char *ptr) const
392  {
393  __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
394  xmm,
395  _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
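 /* Pack int32 -> int16 (signed saturation), then int16 -> uint8 (unsigned
    saturation), clamping the rounded values to the 0..255 byte range */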
396  tmp = _mm_packs_epi32(tmp, tmp);
397  tmp = _mm_packus_epi16(tmp, tmp);
398  GDALCopyXMMToInt16(tmp, reinterpret_cast<GInt16 *>(ptr));
399  }
400 
401  inline void Store2Val(unsigned short *ptr) const
402  {
403  __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
404  xmm,
405  _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
406  // X X X X 0 B 0 A --> X X X X A A B A
407  tmp = _mm_shufflelo_epi16(tmp, 0 | (2 << 2));
408  GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
409  }
410 
411  inline void StoreMask(unsigned char *ptr) const
412  {
413  _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr),
414  _mm_castpd_si128(xmm));
415  }
416 
417  inline operator double() const
418  {
419  return _mm_cvtsd_f64(xmm);
420  }
421 };
422 
423 #else
424 
425 #ifndef NO_WARN_USE_SSE2_EMULATION
426 #warning "Software emulation of SSE2 !"
427 #endif
428 
429 class XMMReg2Double
430 {
431  public:
432  double low;
433  double high;
434 
435  XMMReg2Double()
436  {
437  }
438  XMMReg2Double(double val)
439  {
440  low = val;
441  high = 0.0;
442  }
443  XMMReg2Double(const XMMReg2Double &other) : low(other.low), high(other.high)
444  {
445  }
446 
447  static inline XMMReg2Double Zero()
448  {
449  XMMReg2Double reg;
450  reg.Zeroize();
451  return reg;
452  }
453 
454  static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
455  {
456  XMMReg2Double reg;
457  reg.nsLoad1ValHighAndLow(ptr);
458  return reg;
459  }
460 
461  static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
462  const XMMReg2Double &expr2)
463  {
464  XMMReg2Double reg;
465 
466  if (expr1.low == expr2.low)
467  memset(&(reg.low), 0xFF, sizeof(double));
468  else
469  reg.low = 0;
470 
471  if (expr1.high == expr2.high)
472  memset(&(reg.high), 0xFF, sizeof(double));
473  else
474  reg.high = 0;
475 
476  return reg;
477  }
478 
479  static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
480  const XMMReg2Double &expr2)
481  {
482  XMMReg2Double reg;
483 
484  if (expr1.low != expr2.low)
485  memset(&(reg.low), 0xFF, sizeof(double));
486  else
487  reg.low = 0;
488 
489  if (expr1.high != expr2.high)
490  memset(&(reg.high), 0xFF, sizeof(double));
491  else
492  reg.high = 0;
493 
494  return reg;
495  }
496 
497  static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
498  const XMMReg2Double &expr2)
499  {
500  XMMReg2Double reg;
501 
502  if (expr1.low > expr2.low)
503  memset(&(reg.low), 0xFF, sizeof(double));
504  else
505  reg.low = 0;
506 
507  if (expr1.high > expr2.high)
508  memset(&(reg.high), 0xFF, sizeof(double));
509  else
510  reg.high = 0;
511 
512  return reg;
513  }
514 
515  static inline XMMReg2Double And(const XMMReg2Double &expr1,
516  const XMMReg2Double &expr2)
517  {
518  XMMReg2Double reg;
519  int low1[2], high1[2];
520  int low2[2], high2[2];
521  memcpy(low1, &expr1.low, sizeof(double));
522  memcpy(high1, &expr1.high, sizeof(double));
523  memcpy(low2, &expr2.low, sizeof(double));
524  memcpy(high2, &expr2.high, sizeof(double));
525  low1[0] &= low2[0];
526  low1[1] &= low2[1];
527  high1[0] &= high2[0];
528  high1[1] &= high2[1];
529  memcpy(&reg.low, low1, sizeof(double));
530  memcpy(&reg.high, high1, sizeof(double));
531  return reg;
532  }
533 
534  static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
535  const XMMReg2Double &true_expr,
536  const XMMReg2Double &false_expr)
537  {
538  XMMReg2Double reg;
539  if (cond.low != 0)
540  reg.low = true_expr.low;
541  else
542  reg.low = false_expr.low;
543  if (cond.high != 0)
544  reg.high = true_expr.high;
545  else
546  reg.high = false_expr.high;
547  return reg;
548  }
549 
550  static inline XMMReg2Double Min(const XMMReg2Double &expr1,
551  const XMMReg2Double &expr2)
552  {
553  XMMReg2Double reg;
554  reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
555  reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
556  return reg;
557  }
558 
559  static inline XMMReg2Double Load2Val(const double *ptr)
560  {
561  XMMReg2Double reg;
562  reg.nsLoad2Val(ptr);
563  return reg;
564  }
565 
566  static inline XMMReg2Double Load2ValAligned(const double *ptr)
567  {
568  XMMReg2Double reg;
569  reg.nsLoad2ValAligned(ptr);
570  return reg;
571  }
572 
573  static inline XMMReg2Double Load2Val(const float *ptr)
574  {
575  XMMReg2Double reg;
576  reg.nsLoad2Val(ptr);
577  return reg;
578  }
579 
580  static inline XMMReg2Double Load2Val(const unsigned char *ptr)
581  {
582  XMMReg2Double reg;
583  reg.nsLoad2Val(ptr);
584  return reg;
585  }
586 
587  static inline XMMReg2Double Load2Val(const short *ptr)
588  {
589  XMMReg2Double reg;
590  reg.nsLoad2Val(ptr);
591  return reg;
592  }
593 
594  static inline XMMReg2Double Load2Val(const unsigned short *ptr)
595  {
596  XMMReg2Double reg;
597  reg.nsLoad2Val(ptr);
598  return reg;
599  }
600 
601  inline void nsLoad1ValHighAndLow(const double *ptr)
602  {
603  low = ptr[0];
604  high = ptr[0];
605  }
606 
607  inline void nsLoad2Val(const double *ptr)
608  {
609  low = ptr[0];
610  high = ptr[1];
611  }
612 
613  inline void nsLoad2ValAligned(const double *ptr)
614  {
615  low = ptr[0];
616  high = ptr[1];
617  }
618 
619  inline void nsLoad2Val(const float *ptr)
620  {
621  low = ptr[0];
622  high = ptr[1];
623  }
624 
625  inline void nsLoad2Val(const unsigned char *ptr)
626  {
627  low = ptr[0];
628  high = ptr[1];
629  }
630 
631  inline void nsLoad2Val(const short *ptr)
632  {
633  low = ptr[0];
634  high = ptr[1];
635  }
636 
637  inline void nsLoad2Val(const unsigned short *ptr)
638  {
639  low = ptr[0];
640  high = ptr[1];
641  }
642 
643  static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
644  XMMReg2Double &high)
645  {
646  low.low = ptr[0];
647  low.high = ptr[1];
648  high.low = ptr[2];
649  high.high = ptr[3];
650  }
651 
652  static inline void Load4Val(const short *ptr, XMMReg2Double &low,
653  XMMReg2Double &high)
654  {
655  low.nsLoad2Val(ptr);
656  high.nsLoad2Val(ptr + 2);
657  }
658 
659  static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
660  XMMReg2Double &high)
661  {
662  low.nsLoad2Val(ptr);
663  high.nsLoad2Val(ptr + 2);
664  }
665 
666  static inline void Load4Val(const double *ptr, XMMReg2Double &low,
667  XMMReg2Double &high)
668  {
669  low.nsLoad2Val(ptr);
670  high.nsLoad2Val(ptr + 2);
671  }
672 
673  static inline void Load4Val(const float *ptr, XMMReg2Double &low,
674  XMMReg2Double &high)
675  {
676  low.nsLoad2Val(ptr);
677  high.nsLoad2Val(ptr + 2);
678  }
679 
680  inline void Zeroize()
681  {
682  low = 0.0;
683  high = 0.0;
684  }
685 
686  inline XMMReg2Double &operator=(const XMMReg2Double &other)
687  {
688  low = other.low;
689  high = other.high;
690  return *this;
691  }
692 
693  inline XMMReg2Double &operator+=(const XMMReg2Double &other)
694  {
695  low += other.low;
696  high += other.high;
697  return *this;
698  }
699 
700  inline XMMReg2Double &operator*=(const XMMReg2Double &other)
701  {
702  low *= other.low;
703  high *= other.high;
704  return *this;
705  }
706 
707  inline XMMReg2Double operator+(const XMMReg2Double &other) const
708  {
709  XMMReg2Double ret;
710  ret.low = low + other.low;
711  ret.high = high + other.high;
712  return ret;
713  }
714 
715  inline XMMReg2Double operator-(const XMMReg2Double &other) const
716  {
717  XMMReg2Double ret;
718  ret.low = low - other.low;
719  ret.high = high - other.high;
720  return ret;
721  }
722 
723  inline XMMReg2Double operator*(const XMMReg2Double &other) const
724  {
725  XMMReg2Double ret;
726  ret.low = low * other.low;
727  ret.high = high * other.high;
728  return ret;
729  }
730 
731  inline XMMReg2Double operator/(const XMMReg2Double &other) const
732  {
733  XMMReg2Double ret;
734  ret.low = low / other.low;
735  ret.high = high / other.high;
736  return ret;
737  }
738 
739  inline double GetHorizSum() const
740  {
741  return low + high;
742  }
743 
744  inline void Store2Val(double *ptr) const
745  {
746  ptr[0] = low;
747  ptr[1] = high;
748  }
749 
750  inline void Store2ValAligned(double *ptr) const
751  {
752  ptr[0] = low;
753  ptr[1] = high;
754  }
755 
756  inline void Store2Val(float *ptr) const
757  {
758  ptr[0] = static_cast<float>(low);
759  ptr[1] = static_cast<float>(high);
760  }
761 
762  void Store2Val(unsigned char *ptr) const
763  {
764  ptr[0] = (unsigned char)(low + 0.5);
765  ptr[1] = (unsigned char)(high + 0.5);
766  }
767 
768  void Store2Val(unsigned short *ptr) const
769  {
770  ptr[0] = (GUInt16)(low + 0.5);
771  ptr[1] = (GUInt16)(high + 0.5);
772  }
773 
774  inline void StoreMask(unsigned char *ptr) const
775  {
776  memcpy(ptr, &low, 8);
777  memcpy(ptr + 8, &high, 8);
778  }
779 
780  inline operator double() const
781  {
782  return low;
783  }
784 };
785 
786 #endif /* (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) && !defined(USE_SSE2_EMULATION) */
787 
788 #if defined(__AVX__) && !defined(USE_SSE2_EMULATION)
789 
790 #include <immintrin.h>
791 
792 class XMMReg4Double
793 {
794  public:
795  __m256d ymm;
796 
797  XMMReg4Double() : ymm(_mm256_setzero_pd())
798  {
799  }
800  XMMReg4Double(const XMMReg4Double &other) : ymm(other.ymm)
801  {
802  }
803 
804  static inline XMMReg4Double Zero()
805  {
806  XMMReg4Double reg;
807  reg.Zeroize();
808  return reg;
809  }
810 
811  inline void Zeroize()
812  {
813  ymm = _mm256_setzero_pd();
814  }
815 
816  static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
817  {
818  XMMReg4Double reg;
819  reg.nsLoad1ValHighAndLow(ptr);
820  return reg;
821  }
822 
823  inline void nsLoad1ValHighAndLow(const double *ptr)
824  {
825  ymm = _mm256_set1_pd(*ptr);
826  }
827 
828  static inline XMMReg4Double Load4Val(const unsigned char *ptr)
829  {
830  XMMReg4Double reg;
831  reg.nsLoad4Val(ptr);
832  return reg;
833  }
834 
835  inline void nsLoad4Val(const unsigned char *ptr)
836  {
837  __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
838  xmm_i = _mm_cvtepu8_epi32(xmm_i);
839  ymm = _mm256_cvtepi32_pd(xmm_i);
840  }
841 
842  static inline XMMReg4Double Load4Val(const short *ptr)
843  {
844  XMMReg4Double reg;
845  reg.nsLoad4Val(ptr);
846  return reg;
847  }
848 
849  inline void nsLoad4Val(const short *ptr)
850  {
851  __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
852  xmm_i = _mm_cvtepi16_epi32(xmm_i);
853  ymm = _mm256_cvtepi32_pd(xmm_i);
854  }
855 
856  static inline XMMReg4Double Load4Val(const unsigned short *ptr)
857  {
858  XMMReg4Double reg;
859  reg.nsLoad4Val(ptr);
860  return reg;
861  }
862 
863  inline void nsLoad4Val(const unsigned short *ptr)
864  {
865  __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
866  xmm_i = _mm_cvtepu16_epi32(xmm_i);
867  ymm = _mm256_cvtepi32_pd(
868  xmm_i); // ok to use signed conversion since we are in the ushort
869  // range, so cannot be interpreted as negative int32
870  }
871 
872  static inline XMMReg4Double Load4Val(const double *ptr)
873  {
874  XMMReg4Double reg;
875  reg.nsLoad4Val(ptr);
876  return reg;
877  }
878 
879  inline void nsLoad4Val(const double *ptr)
880  {
881  ymm = _mm256_loadu_pd(ptr);
882  }
883 
884  static inline XMMReg4Double Load4ValAligned(const double *ptr)
885  {
886  XMMReg4Double reg;
887  reg.nsLoad4ValAligned(ptr);
888  return reg;
889  }
890 
891  inline void nsLoad4ValAligned(const double *ptr)
892  {
893  ymm = _mm256_load_pd(ptr);
894  }
895 
896  static inline XMMReg4Double Load4Val(const float *ptr)
897  {
898  XMMReg4Double reg;
899  reg.nsLoad4Val(ptr);
900  return reg;
901  }
902 
903  inline void nsLoad4Val(const float *ptr)
904  {
905  ymm = _mm256_cvtps_pd(_mm_loadu_ps(ptr));
906  }
907 
908  static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
909  const XMMReg4Double &expr2)
910  {
911  XMMReg4Double reg;
912  reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);
913  return reg;
914  }
915 
916  static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
917  const XMMReg4Double &expr2)
918  {
919  XMMReg4Double reg;
920  reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);
921  return reg;
922  }
923 
924  static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
925  const XMMReg4Double &expr2)
926  {
927  XMMReg4Double reg;
928  reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);
929  return reg;
930  }
931 
932  static inline XMMReg4Double And(const XMMReg4Double &expr1,
933  const XMMReg4Double &expr2)
934  {
935  XMMReg4Double reg;
936  reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);
937  return reg;
938  }
939 
940  static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
941  const XMMReg4Double &true_expr,
942  const XMMReg4Double &false_expr)
943  {
944  XMMReg4Double reg;
945  reg.ymm = _mm256_or_pd(_mm256_and_pd(cond.ymm, true_expr.ymm),
946  _mm256_andnot_pd(cond.ymm, false_expr.ymm));
947  return reg;
948  }
949 
950  static inline XMMReg4Double Min(const XMMReg4Double &expr1,
951  const XMMReg4Double &expr2)
952  {
953  XMMReg4Double reg;
954  reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);
955  return reg;
956  }
957 
958  inline XMMReg4Double &operator=(const XMMReg4Double &other)
959  {
960  ymm = other.ymm;
961  return *this;
962  }
963 
964  inline XMMReg4Double &operator+=(const XMMReg4Double &other)
965  {
966  ymm = _mm256_add_pd(ymm, other.ymm);
967  return *this;
968  }
969 
970  inline XMMReg4Double &operator*=(const XMMReg4Double &other)
971  {
972  ymm = _mm256_mul_pd(ymm, other.ymm);
973  return *this;
974  }
975 
976  inline XMMReg4Double operator+(const XMMReg4Double &other) const
977  {
978  XMMReg4Double ret;
979  ret.ymm = _mm256_add_pd(ymm, other.ymm);
980  return ret;
981  }
982 
983  inline XMMReg4Double operator-(const XMMReg4Double &other) const
984  {
985  XMMReg4Double ret;
986  ret.ymm = _mm256_sub_pd(ymm, other.ymm);
987  return ret;
988  }
989 
990  inline XMMReg4Double operator*(const XMMReg4Double &other) const
991  {
992  XMMReg4Double ret;
993  ret.ymm = _mm256_mul_pd(ymm, other.ymm);
994  return ret;
995  }
996 
997  inline XMMReg4Double operator/(const XMMReg4Double &other) const
998  {
999  XMMReg4Double ret;
1000  ret.ymm = _mm256_div_pd(ymm, other.ymm);
1001  return ret;
1002  }
1003 
1004  void AddToLow(const XMMReg2Double &other)
1005  {
1006  __m256d ymm2 = _mm256_setzero_pd();
1007  ymm2 = _mm256_insertf128_pd(ymm2, other.xmm, 0);
1008  ymm = _mm256_add_pd(ymm, ymm2);
1009  }
1010 
1011  inline double GetHorizSum() const
1012  {
1013  __m256d ymm_tmp1, ymm_tmp2;
1014  ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);
1015  ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1);
1016  ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);
1017  return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));
1018  }
1019 
1020  inline void Store4Val(unsigned char *ptr) const
1021  {
1022  __m128i xmm_i =
1023  _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
1024  // xmm_i = _mm_packs_epi32(xmm_i, xmm_i); // Pack int32 to int16
1025  // xmm_i = _mm_packus_epi16(xmm_i, xmm_i); // Pack int16 to uint8
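 /* instead, gather the low byte of each of the 4 int32 lanes (bytes 0, 4, 8
    and 12); unlike the pack sequence above this does not saturate, so the
    rounded values are assumed to already fit in a byte */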
1026  xmm_i =
1027  _mm_shuffle_epi8(xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) |
1028  (12 << 24))); // SSSE3
1029  GDALCopyXMMToInt32(xmm_i, reinterpret_cast<GInt32 *>(ptr));
1030  }
1031 
1032  inline void Store4Val(unsigned short *ptr) const
1033  {
1034  __m128i xmm_i =
1035  _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
1036  xmm_i = _mm_packus_epi32(xmm_i, xmm_i); // Pack uint32 to uint16
1037  GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
1038  }
1039 
1040  inline void Store4Val(float *ptr) const
1041  {
1042  _mm_storeu_ps(ptr, _mm256_cvtpd_ps(ymm));
1043  }
1044 
1045  inline void Store4Val(double *ptr) const
1046  {
1047  _mm256_storeu_pd(ptr, ymm);
1048  }
1049 
1050  inline void StoreMask(unsigned char *ptr) const
1051  {
1052  _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr),
1053  _mm256_castpd_si256(ymm));
1054  }
1055 };
1056 
1057 #else
1058 
1059 class XMMReg4Double
1060 {
1061  public:
1062  XMMReg2Double low, high;
1063 
1064 #if defined(__GNUC__)
1065 #pragma GCC diagnostic push
1066 #pragma GCC diagnostic ignored "-Weffc++"
1067 #endif
1068  /* coverity[uninit_member] */
1069  XMMReg4Double() = default;
1070 #if defined(__GNUC__)
1071 #pragma GCC diagnostic pop
1072 #endif
1073 
1074  XMMReg4Double(const XMMReg4Double &other) : low(other.low), high(other.high)
1075  {
1076  }
1077 
1078  static inline XMMReg4Double Zero()
1079  {
1080  XMMReg4Double reg;
1081  reg.low.Zeroize();
1082  reg.high.Zeroize();
1083  return reg;
1084  }
1085 
1086  static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
1087  {
1088  XMMReg4Double reg;
1089  reg.low.nsLoad1ValHighAndLow(ptr);
1090  reg.high = reg.low;
1091  return reg;
1092  }
1093 
1094  static inline XMMReg4Double Load4Val(const unsigned char *ptr)
1095  {
1096  XMMReg4Double reg;
1097  XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
1098  return reg;
1099  }
1100 
1101  static inline XMMReg4Double Load4Val(const short *ptr)
1102  {
1103  XMMReg4Double reg;
1104  reg.low.nsLoad2Val(ptr);
1105  reg.high.nsLoad2Val(ptr + 2);
1106  return reg;
1107  }
1108 
1109  static inline XMMReg4Double Load4Val(const unsigned short *ptr)
1110  {
1111  XMMReg4Double reg;
1112  reg.low.nsLoad2Val(ptr);
1113  reg.high.nsLoad2Val(ptr + 2);
1114  return reg;
1115  }
1116 
1117  static inline XMMReg4Double Load4Val(const double *ptr)
1118  {
1119  XMMReg4Double reg;
1120  reg.low.nsLoad2Val(ptr);
1121  reg.high.nsLoad2Val(ptr + 2);
1122  return reg;
1123  }
1124 
1125  static inline XMMReg4Double Load4ValAligned(const double *ptr)
1126  {
1127  XMMReg4Double reg;
1128  reg.low.nsLoad2ValAligned(ptr);
1129  reg.high.nsLoad2ValAligned(ptr + 2);
1130  return reg;
1131  }
1132 
1133  static inline XMMReg4Double Load4Val(const float *ptr)
1134  {
1135  XMMReg4Double reg;
1136  XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
1137  return reg;
1138  }
1139 
1140  static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
1141  const XMMReg4Double &expr2)
1142  {
1143  XMMReg4Double reg;
1144  reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
1145  reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
1146  return reg;
1147  }
1148 
1149  static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
1150  const XMMReg4Double &expr2)
1151  {
1152  XMMReg4Double reg;
1153  reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
1154  reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
1155  return reg;
1156  }
1157 
1158  static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
1159  const XMMReg4Double &expr2)
1160  {
1161  XMMReg4Double reg;
1162  reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
1163  reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
1164  return reg;
1165  }
1166 
1167  static inline XMMReg4Double And(const XMMReg4Double &expr1,
1168  const XMMReg4Double &expr2)
1169  {
1170  XMMReg4Double reg;
1171  reg.low = XMMReg2Double::And(expr1.low, expr2.low);
1172  reg.high = XMMReg2Double::And(expr1.high, expr2.high);
1173  return reg;
1174  }
1175 
1176  static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
1177  const XMMReg4Double &true_expr,
1178  const XMMReg4Double &false_expr)
1179  {
1180  XMMReg4Double reg;
1181  reg.low =
1182  XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
1183  reg.high =
1184  XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
1185  return reg;
1186  }
1187 
1188  static inline XMMReg4Double Min(const XMMReg4Double &expr1,
1189  const XMMReg4Double &expr2)
1190  {
1191  XMMReg4Double reg;
1192  reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
1193  reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
1194  return reg;
1195  }
1196 
1197  inline XMMReg4Double &operator=(const XMMReg4Double &other)
1198  {
1199  low = other.low;
1200  high = other.high;
1201  return *this;
1202  }
1203 
1204  inline XMMReg4Double &operator+=(const XMMReg4Double &other)
1205  {
1206  low += other.low;
1207  high += other.high;
1208  return *this;
1209  }
1210 
1211  inline XMMReg4Double &operator*=(const XMMReg4Double &other)
1212  {
1213  low *= other.low;
1214  high *= other.high;
1215  return *this;
1216  }
1217 
1218  inline XMMReg4Double operator+(const XMMReg4Double &other) const
1219  {
1220  XMMReg4Double ret;
1221  ret.low = low + other.low;
1222  ret.high = high + other.high;
1223  return ret;
1224  }
1225 
1226  inline XMMReg4Double operator-(const XMMReg4Double &other) const
1227  {
1228  XMMReg4Double ret;
1229  ret.low = low - other.low;
1230  ret.high = high - other.high;
1231  return ret;
1232  }
1233 
1234  inline XMMReg4Double operator*(const XMMReg4Double &other) const
1235  {
1236  XMMReg4Double ret;
1237  ret.low = low * other.low;
1238  ret.high = high * other.high;
1239  return ret;
1240  }
1241 
1242  inline XMMReg4Double operator/(const XMMReg4Double &other) const
1243  {
1244  XMMReg4Double ret;
1245  ret.low = low / other.low;
1246  ret.high = high / other.high;
1247  return ret;
1248  }
1249 
1250  void AddToLow(const XMMReg2Double &other)
1251  {
1252  low += other;
1253  }
1254 
1255  inline double GetHorizSum() const
1256  {
1257  return (low + high).GetHorizSum();
1258  }
1259 
1260  inline void Store4Val(unsigned char *ptr) const
1261  {
1262 #ifdef USE_SSE2_EMULATION
1263  low.Store2Val(ptr);
1264  high.Store2Val(ptr + 2);
1265 #else
1266  __m128i tmpLow = _mm_cvttpd_epi32(_mm_add_pd(
1267  low.xmm,
1268  _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
1269  __m128i tmpHigh = _mm_cvttpd_epi32(_mm_add_pd(
1270  high.xmm,
1271  _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
1272  auto tmp = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmpLow),
1273  _mm_castsi128_ps(tmpHigh),
1274  _MM_SHUFFLE(1, 0, 1, 0)));
1275  tmp = _mm_packs_epi32(tmp, tmp);
1276  tmp = _mm_packus_epi16(tmp, tmp);
1277  GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
1278 #endif
1279  }
1280 
1281  inline void Store4Val(unsigned short *ptr) const
1282  {
1283 #if 1
1284  low.Store2Val(ptr);
1285  high.Store2Val(ptr + 2);
1286 #else
1287  __m128i xmm0 = _mm_cvtpd_epi32(low.xmm);
1288  __m128i xmm1 = _mm_cvtpd_epi32(high.xmm);
1289  xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));
1290 #if __SSE4_1__
1291  xmm0 = _mm_packus_epi32(xmm0, xmm0); // Pack uint32 to uint16
1292 #else
1293  xmm0 = _mm_add_epi32(xmm0, _mm_set1_epi32(-32768));
1294  xmm0 = _mm_packs_epi32(xmm0, xmm0);
1295  xmm0 = _mm_sub_epi16(xmm0, _mm_set1_epi16(-32768));
1296 #endif
1297  GDALCopyXMMToInt64(xmm0, (GInt64 *)ptr);
1298 #endif
1299  }
1300 
1301  inline void Store4Val(float *ptr) const
1302  {
1303  low.Store2Val(ptr);
1304  high.Store2Val(ptr + 2);
1305  }
1306 
1307  inline void Store4Val(double *ptr) const
1308  {
1309  low.Store2Val(ptr);
1310  high.Store2Val(ptr + 2);
1311  }
1312 
1313  inline void StoreMask(unsigned char *ptr) const
1314  {
1315  low.StoreMask(ptr);
1316  high.StoreMask(ptr + 16);
1317  }
1318 };
1319 
1320 #endif
1321 
1322 #endif /* #ifndef DOXYGEN_SKIP */
1323 
1324 #endif /* GDALSSE_PRIV_H_INCLUDED */
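
As an illustration of how these wrappers are typically consumed (a hypothetical
sketch, not code from GDAL; the function name and loop are invented for the
example):

    // Dot product of two double arrays, four elements per iteration.
    #include "gdalsse_priv.h"

    static double SketchDotProduct(const double *padfA, const double *padfB,
                                   int nCount)
    {
        XMMReg4Double sum = XMMReg4Double::Zero();
        int i = 0;
        for (; i + 3 < nCount; i += 4)
        {
            const XMMReg4Double a = XMMReg4Double::Load4Val(padfA + i);
            const XMMReg4Double b = XMMReg4Double::Load4Val(padfB + i);
            sum += a * b;  // element-wise multiply, then accumulate
        }
        double dfTotal = sum.GetHorizSum();  // horizontal add of the 4 lanes
        for (; i < nCount; ++i)              // scalar tail for the remainder
            dfTotal += padfA[i] * padfB[i];
        return dfTotal;
    }

The same caller compiles against the SSE2, AVX or pure-software implementation
of XMMReg4Double selected by the preprocessor logic above, which is the point
of the wrapper classes.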
Type definitions referenced above (from cpl_port.h):
  GInt16:  short (Int16 type), cpl_port.h:192
  GUInt16: unsigned short (unsigned int16 type), cpl_port.h:194
  GInt32:  int (Int32 type), cpl_port.h:186
  GInt64:  GIntBig (signed 64 bit integer type), cpl_port.h:247