Project Alice
Loading...
Searching...
No Matches
sse.h
Go to the documentation of this file.
1/*
2Copyright 2010-2011, D. E. Shaw Research.
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are
7met:
8
9* Redistributions of source code must retain the above copyright
10 notice, this list of conditions, and the following disclaimer.
11
12* Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions, and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15
16* Neither the name of D. E. Shaw Research nor the names of its
17 contributors may be used to endorse or promote products derived from
18 this software without specific prior written permission.
19
20THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31*/
32#ifndef _Random123_sse_dot_h__
33#define _Random123_sse_dot_h__
34
35#if R123_USE_SSE
36
37#if R123_USE_X86INTRIN_H
38#include <x86intrin.h>
39#endif
40#if R123_USE_IA32INTRIN_H
41#include <ia32intrin.h>
42#endif
43#if R123_USE_XMMINTRIN_H
44#include <xmmintrin.h>
45#endif
46#if R123_USE_EMMINTRIN_H
47#include <emmintrin.h>
48#endif
49#if R123_USE_SMMINTRIN_H
50#include <smmintrin.h>
51#endif
52#if R123_USE_WMMINTRIN_H
53#include <wmmintrin.h>
54#endif
55#if R123_USE_INTRIN_H
56#include <intrin.h>
57#endif
#ifdef __cplusplus
#include <cstdlib>
#include <iostream>
#include <limits>
#include <stdexcept>
#endif
63
64#if R123_USE_ASM_GNU
65
66/* bit25 of CX tells us whether AES is enabled. */
68 unsigned int eax, ebx, ecx, edx;
69 __asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "a"(1));
70 return (ecx >> 25) & 1;
71}
72#elif R123_USE_CPUID_MSVC
74 int CPUInfo[4];
75 __cpuid(CPUInfo, 1);
76 return (CPUInfo[2] >> 25) & 1;
77}
78#else /* R123_USE_CPUID_??? */
79#warning "No R123_USE_CPUID_XXX method chosen. haveAESNI will always return false"
80R123_STATIC_INLINE int haveAESNI() { return 0; }
81#endif /* R123_USE_ASM_GNU || R123_USE_CPUID_MSVC */
82
83// There is a lot of annoying and inexplicable variation in the
84// SSE intrinsics available in different compilation environments.
85// The details seem to depend on the compiler, the version and
86// the target architecture. Rather than insisting on
87// R123_USE_feature tests for each of these in each of the
88// compilerfeatures.h files we just keep the complexity localized
89// to here...
90#if(defined(__ICC) && __ICC < 1210) || (defined(_MSC_VER) && !defined(_WIN64) && _MSC_VER < 1900)
91/* Is there an intrinsic to assemble an __m128i from two 64-bit words?
92 If not, use the 4x32-bit intrisic instead. N.B. It looks like Intel
93 added _mm_set_epi64x to icc version 12.1 in Jan 2012.
94*/
95R123_STATIC_INLINE __m128i _mm_set_epi64x(uint64_t v1, uint64_t v0) {
96 union {
97 uint64_t u64;
98 uint32_t u32[2];
99 } u1, u0;
100 u1.u64 = v1;
101 u0.u64 = v0;
102 return _mm_set_epi32(u1.u32[1], u1.u32[0], u0.u32[1], u0.u32[0]);
103}
104#endif
105/* _mm_extract_lo64 abstracts the task of extracting the low 64-bit
106 word from an __m128i. The _mm_cvtsi128_si64 intrinsic does the job
107 on 64-bit platforms. Unfortunately, both MSVC and Open64 fail
108 assertions in ut_M128.cpp and ut_carray.cpp when we use the
109 _mm_cvtsi128_si64 intrinsic. (See
110 https://bugs.open64.net/show_bug.cgi?id=873 for the Open64 bug).
111 On 32-bit platforms, there's no MOVQ, so there's no intrinsic.
112 Finally, even if the intrinsic exists, it may be spelled with or
113 without the 'x'.
114*/
115#if !defined(__x86_64__) || defined(_MSC_VER) || defined(__OPEN64__)
116R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si) {
117 union {
118 uint64_t u64[2];
119 __m128i m;
120 } u;
121 _mm_store_si128(&u.m, si);
122 return u.u64[0];
123}
124#elif defined(__llvm__) || defined(__ICC)
125R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si) { return (uint64_t)_mm_cvtsi128_si64(si); }
126#else /* GNUC, others */
127/* FWIW, gcc's emmintrin.h has had the 'x' spelling
128 since at least gcc-3.4.4. The no-'x' spelling showed up
129 around 4.2. */
130R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si) { return (uint64_t)_mm_cvtsi128_si64x(si); }
131#endif
132#if defined(__GNUC__) && __GNUC__ < 4
133/* the cast builtins showed up in gcc4. */
134R123_STATIC_INLINE __m128 _mm_castsi128_ps(__m128i si) { return (__m128)si; }
135#endif
136
137#ifdef __cplusplus
138
139struct r123m128i {
140 __m128i m;
141#if R123_USE_CXX11_UNRESTRICTED_UNIONS
142 // C++98 forbids a union member from having *any* constructors.
143 // C++11 relaxes this, and allows union members to have constructors
144 // as long as there is a "trivial" default construtor. So in C++11
145 // we can provide a r123m128i constructor with an __m128i argument, and still
146 // have the default (and hence trivial) default constructor.
147 r123m128i() = default;
148 r123m128i(__m128i _m) : m(_m) { }
149#endif
150 r123m128i& operator=(__m128i const& rhs) {
151 m = rhs;
152 return *this;
153 }
154 r123m128i& operator=(R123_ULONG_LONG n) {
155 m = _mm_set_epi64x(0, n);
156 return *this;
157 }
158#if R123_USE_CXX11_EXPLICIT_CONVERSIONS
159 // With C++11 we can attach explicit to the bool conversion operator
160 // to disambiguate undesired promotions. For g++, this works
161 // only in 4.5 and above.
162 explicit operator bool() const { return _bool(); }
163#else
164 // Pre-C++11, we have to do something else. Google for the "safe bool"
165 // idiom for other ideas...
166 operator void const*() const { return _bool() ? this : 0; }
167#endif
168 operator __m128i() const { return m; }
169
170private:
171#if R123_USE_SSE4_1
172 bool _bool() const { return !_mm_testz_si128(m, m); }
173#else
174 bool _bool() const { return 0xf != _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(m, _mm_setzero_si128()))); }
175#endif
176};
177
178R123_STATIC_INLINE r123m128i& operator++(r123m128i& v) {
179 __m128i& c = v.m;
180 __m128i zeroone = _mm_set_epi64x(R123_64BIT(0), R123_64BIT(1));
181 c = _mm_add_epi64(c, zeroone);
182 // return c;
183#if R123_USE_SSE4_1
184 __m128i zerofff = _mm_set_epi64x(0, ~(R123_64BIT(0)));
185 if(R123_BUILTIN_EXPECT(_mm_testz_si128(c, zerofff), 0)) {
186 __m128i onezero = _mm_set_epi64x(R123_64BIT(1), R123_64BIT(0));
187 c = _mm_add_epi64(c, onezero);
188 }
189#else
190 unsigned mask = _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(c, _mm_setzero_si128())));
191 // The low two bits of mask are 11 iff the low 64 bits of
192 // c are zero.
193 if(R123_BUILTIN_EXPECT((mask & 0x3) == 0x3, 0)) {
194 __m128i onezero = _mm_set_epi64x(1, 0);
195 c = _mm_add_epi64(c, onezero);
196 }
197#endif
198 return v;
199}
200
201R123_STATIC_INLINE r123m128i& operator+=(r123m128i& lhs, R123_ULONG_LONG n) {
202 __m128i c = lhs.m;
203 __m128i incr128 = _mm_set_epi64x(0, n);
204 c = _mm_add_epi64(c, incr128);
205 // return c; // NO CARRY!
206
207 int64_t lo64 = _mm_extract_lo64(c);
208 if((uint64_t)lo64 < n)
209 c = _mm_add_epi64(c, _mm_set_epi64x(R123_64BIT(1), R123_64BIT(0)));
210 lhs.m = c;
211 return lhs;
212}
213
214// We need this one because it's present, but never used in r123array1xm128i::incr
215R123_STATIC_INLINE bool operator<=(R123_ULONG_LONG, r123m128i const&) { std::abort(); }
216
217// The comparisons aren't implemented, but if we leave them out, and
218// somebody writes, e.g., M1 < M2, the compiler will do an implicit
219// conversion through void*. Sigh...
220R123_STATIC_INLINE bool operator<(r123m128i const&, r123m128i const&) { std::abort(); }
221R123_STATIC_INLINE bool operator<=(r123m128i const&, r123m128i const&) { std::abort(); }
222R123_STATIC_INLINE bool operator>(r123m128i const&, r123m128i const&) { std::abort(); }
223R123_STATIC_INLINE bool operator>=(r123m128i const&, r123m128i const&) { std::abort(); }
224
225R123_STATIC_INLINE bool operator==(r123m128i const& lhs, r123m128i const& rhs) { return 0xf == _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(lhs, rhs))); }
226R123_STATIC_INLINE bool operator!=(r123m128i const& lhs, r123m128i const& rhs) { return !(lhs == rhs); }
227R123_STATIC_INLINE bool operator==(R123_ULONG_LONG lhs, r123m128i const& rhs) {
228 r123m128i LHS;
229 LHS.m = _mm_set_epi64x(0, lhs);
230 return LHS == rhs;
231}
232R123_STATIC_INLINE bool operator!=(R123_ULONG_LONG lhs, r123m128i const& rhs) { return !(lhs == rhs); }
233R123_STATIC_INLINE std::ostream& operator<<(std::ostream& os, r123m128i const& m) {
234 union {
235 uint64_t u64[2];
236 __m128i m;
237 } u;
238 _mm_storeu_si128(&u.m, m.m);
239 return os << u.u64[0] << " " << u.u64[1];
240}
241
242R123_STATIC_INLINE std::istream& operator>>(std::istream& is, r123m128i& m) {
243 uint64_t u64[2];
244 is >> u64[0] >> u64[1];
245 m.m = _mm_set_epi64x(u64[1], u64[0]);
246 return is;
247}
248
249template<typename T> inline T assemble_from_u32(uint32_t* p32); // forward declaration
250
251template<> inline r123m128i assemble_from_u32<r123m128i>(uint32_t* p32) {
252 r123m128i ret;
253 ret.m = _mm_set_epi32(p32[3], p32[2], p32[1], p32[0]);
254 return ret;
255}
256
257#else
258
259typedef struct {
260 __m128i m;
261} r123m128i;
262
263#endif /* __cplusplus */
264
265#else /* !R123_USE_SSE */
266R123_STATIC_INLINE int haveAESNI() { return 0; }
267#endif /* R123_USE_SSE */
268
269#endif /* _Random123_sse_dot_h__ */
#define R123_STATIC_INLINE
#define R123_BUILTIN_EXPECT(expr, likely)
#define R123_ULONG_LONG
mask_vector operator==(contiguous_tags_base< tag_type > a, tag_type b)
mask_vector operator!=(contiguous_tags_base< tag_type > a, tag_type b)
uint uint32_t
ulong uint64_t
R123_STATIC_INLINE int haveAESNI()
Definition: sse.h:266