Project Alice
Loading...
Searching...
No Matches
philox.h
Go to the documentation of this file.
1/*
2Copyright 2010-2011, D. E. Shaw Research.
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are
7met:
8
9* Redistributions of source code must retain the above copyright
10 notice, this list of conditions, and the following disclaimer.
11
12* Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions, and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15
16* Neither the name of D. E. Shaw Research nor the names of its
17 contributors may be used to endorse or promote products derived from
18 this software without specific prior written permission.
19
20THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31*/
32#ifndef _philox_dot_h_
33#define _philox_dot_h_
34
38#include "array.h"
39
40/*
41// Macros _Foo_tpl are code generation 'templates' They define
42// inline functions with names obtained by mangling Foo and the
43// macro arguments. E.g.,
44// _mulhilo_tpl(32, uint32_t, uint64_t)
45// expands to a definition of:
46// mulhilo32(uint32_t, uint32_t, uint32_t *, uint32_t *)
47// We then 'instantiate the template' to define
48// several different functions, e.g.,
49// mulhilo32
50// mulhilo64
51// These functions will be visible to user code, and may
52// also be used later in subsequent templates and definitions.
53
54// A template for mulhilo using a temporary of twice the word-width.
55// Gcc figures out that this can be reduced to a single 'mul' instruction,
56// despite the apparent use of double-wide variables, shifts, etc. It's
57// obviously not guaranteed that all compilers will be that smart, so
58// other implementations might be preferable, e.g., using an intrinsic
59// or an asm block. On the other hand, for 32-bit multiplies,
60// this *is* perfectly standard C99 - any C99 compiler should
61// understand it and produce correct code. For 64-bit multiplies,
62// it's only usable if the compiler recognizes that it can do
63// arithmetic on a 128-bit type. That happens to be true for gcc on
64// x86-64, and powerpc64 but not much else.
65*/
66#define _mulhilo_dword_tpl(W, Word, Dword) \
67 R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip) { \
68 Dword product = ((Dword)a) * ((Dword)b); \
69 *hip = product >> W; \
70 return (Word)product; \
71 }
72
73/*
74// A template for mulhilo using gnu-style asm syntax.
75// INSN can be "mulw", "mull" or "mulq".
76// FIXME - porting to other architectures, we'll need still-more conditional
77// branching here. Note that intrinsics are usually preferable.
78*/
79#ifdef __powerpc__
80#define _mulhilo_asm_tpl(W, Word, INSN) \
81 R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word* hip) { \
82 Word dx = 0; \
83 __asm__("\n\t" INSN " %0,%1,%2\n\t" : "=r"(dx) : "r"(b), "r"(ax)); \
84 *hip = dx; \
85 return ax * b; \
86 }
87#else
88#define _mulhilo_asm_tpl(W, Word, INSN) \
89 R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word* hip) { \
90 Word dx; \
91 __asm__("\n\t" INSN " %2\n\t" : "=a"(ax), "=d"(dx) : "r"(b), "0"(ax)); \
92 *hip = dx; \
93 return ax; \
94 }
95#endif /* __powerpc__ */
96
97/*
98// A template for mulhilo using MSVC-style intrinsics
99// For example,_umul128 is an msvc intrinsic, c.f.
100// http://msdn.microsoft.com/en-us/library/3dayytw9.aspx
101*/
102#define _mulhilo_msvc_intrin_tpl(W, Word, INTRIN) \
103 R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip) { return INTRIN(a, b, hip); }
104
105/* N.B. This really should be called _mulhilo_mulhi_intrin. It just
106 happens that CUDA was the first time we used the idiom. */
107#define _mulhilo_cuda_intrin_tpl(W, Word, INTRIN) \
108 R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, R123_METAL_THREAD_ADDRESS_SPACE Word* hip) { \
109 *hip = INTRIN(a, b); \
110 return a * b; \
111 }
112
113/*
114// A template for mulhilo using only word-size operations and
115// C99 operators (no adc, no mulhi). It
116// requires four multiplies and a dozen or so shifts, adds
117// and tests. It's *SLOW*. It can be used to
118// implement philoxNx32 on platforms that completely lack
119// 64-bit types, e.g., Metal.
120// On 32-bit platforms, it could be used to
121// implement philoxNx64, but on such platforms both the philoxNx32
122// and the threefryNx64 cbrngs are going to have much better
123// performance. It is enabled below by R123_USE_MULHILO64_C99,
124// but that is currently (Feb 2019) only set by
125// features/metalfeatures.h headers. It can, of course, be
126// set with a compile-time -D option.
127*/
128#define _mulhilo_c99_tpl(W, Word) \
129 R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, R123_METAL_THREAD_ADDRESS_SPACE Word* hip) { \
130 unsigned const WHALF = W / 2; \
131 const Word LOMASK = ((((Word)1) << WHALF) - 1); \
132 Word lo = a * b; /* full low multiply */ \
133 Word ahi = a >> WHALF; \
134 Word alo = a & LOMASK; \
135 Word bhi = b >> WHALF; \
136 Word blo = b & LOMASK; \
137 \
138 Word ahbl = ahi * blo; \
139 Word albh = alo * bhi; \
140 \
141 Word ahbl_albh = ((ahbl & LOMASK) + (albh & LOMASK)); \
142 Word hi = ahi * bhi + (ahbl >> WHALF) + (albh >> WHALF); \
143 hi += ahbl_albh >> WHALF; /* carry from the sum of lo(ahbl) + lo(albh) ) */ \
144 /* carry from the sum with alo*blo */ \
145 hi += ((lo >> WHALF) < (ahbl_albh & LOMASK)); \
146 *hip = hi; \
147 return lo; \
148 }
149
150/*
151// A template for mulhilo on a platform that can't do it
152// We could put a C version here, but is it better to run *VERY*
153// slowly or to just stop and force the user to find another CBRNG?
154*/
155#define _mulhilo_fail_tpl(W, Word) \
156 R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip) { R123_STATIC_ASSERT(0, "mulhilo" #W " is not implemented on this machine\n"); }
157
158/*
159// N.B. There's an MSVC intrinsic called _emul,
160// which *might* compile into better code than
161// _mulhilo_dword_tpl
162*/
163#if R123_USE_MULHILO32_ASM
164#ifdef __powerpc__
165_mulhilo_asm_tpl(32, uint32_t, "mulhwu")
166#else
167_mulhilo_asm_tpl(32, uint32_t, "mull")
168#endif /* __powerpc__ */
169#else
170#if R123_USE_64BIT
171_mulhilo_dword_tpl(32, uint32_t, uint64_t)
172#elif R123_USE_MULHILO32_MULHI_INTRIN
173_mulhilo_cuda_intrin_tpl(32, uint32_t, R123_MULHILO32_MULHI_INTRIN)
174#else
175_mulhilo_c99_tpl(32, uint32_t)
176#endif
177#endif
178
179#if R123_USE_PHILOX_64BIT
180#if R123_USE_MULHILO64_ASM
181#ifdef __powerpc64__
182 _mulhilo_asm_tpl(64, uint64_t, "mulhdu")
183#else
184 _mulhilo_asm_tpl(64, uint64_t, "mulq")
185#endif /* __powerpc64__ */
186#elif R123_USE_MULHILO64_MSVC_INTRIN
187 _mulhilo_msvc_intrin_tpl(64, uint64_t, _umul128)
188#elif R123_USE_MULHILO64_CUDA_INTRIN
189_mulhilo_cuda_intrin_tpl(64, uint64_t, __umul64hi)
190#elif R123_USE_MULHILO64_OPENCL_INTRIN
191_mulhilo_cuda_intrin_tpl(64, uint64_t, mul_hi)
192#elif R123_USE_MULHILO64_MULHI_INTRIN
193_mulhilo_cuda_intrin_tpl(64, uint64_t, R123_MULHILO64_MULHI_INTRIN)
194#elif R123_USE_GNU_UINT128
195_mulhilo_dword_tpl(64, uint64_t, __uint128_t)
196#elif R123_USE_MULHILO64_C99
197_mulhilo_c99_tpl(64, uint64_t)
198#else
199_mulhilo_fail_tpl(64, uint64_t)
200#endif
201#endif
202
203/*
204// The multipliers and Weyl constants are "hard coded".
205// To change them, you can #define them with different
206// values before #include-ing this file.
207// This isn't terribly elegant, but it works for C as
208// well as C++. A nice C++-only solution would be to
209// use template parameters in the style of <random>
210*/
211#ifndef PHILOX_M2x64_0
212#define PHILOX_M2x64_0 R123_64BIT(0xD2B74407B1CE6E93)
213#endif
214
215#ifndef PHILOX_M4x64_0
216#define PHILOX_M4x64_0 R123_64BIT(0xD2E7470EE14C6C93)
217#endif
218
219#ifndef PHILOX_M4x64_1
220#define PHILOX_M4x64_1 R123_64BIT(0xCA5A826395121157)
221#endif
222
223#ifndef PHILOX_M2x32_0
224#define PHILOX_M2x32_0 ((uint32_t)0xd256d193)
225#endif
226
227#ifndef PHILOX_M4x32_0
228#define PHILOX_M4x32_0 ((uint32_t)0xD2511F53)
229#endif
230#ifndef PHILOX_M4x32_1
231#define PHILOX_M4x32_1 ((uint32_t)0xCD9E8D57)
232#endif
233
234#ifndef PHILOX_W64_0
235#define PHILOX_W64_0 R123_64BIT(0x9E3779B97F4A7C15) /* golden ratio */
236#endif
237#ifndef PHILOX_W64_1
238#define PHILOX_W64_1 R123_64BIT(0xBB67AE8584CAA73B) /* sqrt(3)-1 */
239#endif
240
241#ifndef PHILOX_W32_0
242#define PHILOX_W32_0 ((uint32_t)0x9E3779B9)
243#endif
244#ifndef PHILOX_W32_1
245#define PHILOX_W32_1 ((uint32_t)0xBB67AE85)
246#endif
247
249#ifndef PHILOX2x32_DEFAULT_ROUNDS
250#define PHILOX2x32_DEFAULT_ROUNDS 10
251#endif
252
253#ifndef PHILOX2x64_DEFAULT_ROUNDS
254#define PHILOX2x64_DEFAULT_ROUNDS 10
255#endif
256
257#ifndef PHILOX4x32_DEFAULT_ROUNDS
258#define PHILOX4x32_DEFAULT_ROUNDS 10
259#endif
260
261#ifndef PHILOX4x64_DEFAULT_ROUNDS
262#define PHILOX4x64_DEFAULT_ROUNDS 10
263#endif
266/* The ignored fourth argument allows us to instantiate the
267 same macro regardless of N. */
268#define _philox2xWround_tpl(W, T) \
269 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key)); \
270 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key) { \
271 T hi; \
272 T lo = mulhilo##W(PHILOX_M2x##W##_0, ctr.v[0], &hi); \
273 struct r123array2x##W out = {{hi ^ key.v[0] ^ ctr.v[1], lo}}; \
274 return out; \
275 }
276#define _philox2xWbumpkey_tpl(W) \
277 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array1x##W _philox2x##W##bumpkey(struct r123array1x##W key) { \
278 key.v[0] += PHILOX_W##W##_0; \
279 return key; \
280 }
281
282#define _philox4xWround_tpl(W, T) \
283 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key)); \
284 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key) { \
285 T hi0; \
286 T hi1; \
287 T lo0 = mulhilo##W(PHILOX_M4x##W##_0, ctr.v[0], &hi0); \
288 T lo1 = mulhilo##W(PHILOX_M4x##W##_1, ctr.v[2], &hi1); \
289 struct r123array4x##W out = {{hi1 ^ ctr.v[1] ^ key.v[0], lo1, hi0 ^ ctr.v[3] ^ key.v[1], lo0}}; \
290 return out; \
291 }
292
293#define _philox4xWbumpkey_tpl(W) \
294 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox4x##W##bumpkey(struct r123array2x##W key) { \
295 key.v[0] += PHILOX_W##W##_0; \
296 key.v[1] += PHILOX_W##W##_1; \
297 return key; \
298 }
299
301#define _philoxNxW_tpl(N, Nhalf, W, T) \
302 \
303 enum r123_enum_philox##N##x##W{philox##N##x##W##_rounds = PHILOX##N##x##W##_DEFAULT_ROUNDS}; \
304 typedef struct r123array##N##x##W philox##N##x##W##_ctr_t; \
305 typedef struct r123array##Nhalf##x##W philox##N##x##W##_key_t; \
306 typedef struct r123array##Nhalf##x##W philox##N##x##W##_ukey_t; \
307 R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_key_t philox##N##x##W##keyinit(philox##N##x##W##_ukey_t uk) { return uk; } \
308 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key)); \
309 R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key) { \
310 R123_ASSERT(R <= 16); \
311 if(R > 0) { \
312 ctr = _philox##N##x##W##round(ctr, key); \
313 } \
314 if(R > 1) { \
315 key = _philox##N##x##W##bumpkey(key); \
316 ctr = _philox##N##x##W##round(ctr, key); \
317 } \
318 if(R > 2) { \
319 key = _philox##N##x##W##bumpkey(key); \
320 ctr = _philox##N##x##W##round(ctr, key); \
321 } \
322 if(R > 3) { \
323 key = _philox##N##x##W##bumpkey(key); \
324 ctr = _philox##N##x##W##round(ctr, key); \
325 } \
326 if(R > 4) { \
327 key = _philox##N##x##W##bumpkey(key); \
328 ctr = _philox##N##x##W##round(ctr, key); \
329 } \
330 if(R > 5) { \
331 key = _philox##N##x##W##bumpkey(key); \
332 ctr = _philox##N##x##W##round(ctr, key); \
333 } \
334 if(R > 6) { \
335 key = _philox##N##x##W##bumpkey(key); \
336 ctr = _philox##N##x##W##round(ctr, key); \
337 } \
338 if(R > 7) { \
339 key = _philox##N##x##W##bumpkey(key); \
340 ctr = _philox##N##x##W##round(ctr, key); \
341 } \
342 if(R > 8) { \
343 key = _philox##N##x##W##bumpkey(key); \
344 ctr = _philox##N##x##W##round(ctr, key); \
345 } \
346 if(R > 9) { \
347 key = _philox##N##x##W##bumpkey(key); \
348 ctr = _philox##N##x##W##round(ctr, key); \
349 } \
350 if(R > 10) { \
351 key = _philox##N##x##W##bumpkey(key); \
352 ctr = _philox##N##x##W##round(ctr, key); \
353 } \
354 if(R > 11) { \
355 key = _philox##N##x##W##bumpkey(key); \
356 ctr = _philox##N##x##W##round(ctr, key); \
357 } \
358 if(R > 12) { \
359 key = _philox##N##x##W##bumpkey(key); \
360 ctr = _philox##N##x##W##round(ctr, key); \
361 } \
362 if(R > 13) { \
363 key = _philox##N##x##W##bumpkey(key); \
364 ctr = _philox##N##x##W##round(ctr, key); \
365 } \
366 if(R > 14) { \
367 key = _philox##N##x##W##bumpkey(key); \
368 ctr = _philox##N##x##W##round(ctr, key); \
369 } \
370 if(R > 15) { \
371 key = _philox##N##x##W##bumpkey(key); \
372 ctr = _philox##N##x##W##round(ctr, key); \
373 } \
374 return ctr; \
375 }
376
377 _philox2xWbumpkey_tpl(32) _philox4xWbumpkey_tpl(32) _philox2xWround_tpl(32, uint32_t) /* philox2x32round */
378 _philox4xWround_tpl(32, uint32_t) /* philo4x32round */
379
380 _philoxNxW_tpl(2, 1, 32, uint32_t) /* philox2x32bijection */
381 _philoxNxW_tpl(4, 2, 32, uint32_t) /* philox4x32bijection */
382#if R123_USE_PHILOX_64BIT
384 _philox2xWbumpkey_tpl(64) _philox4xWbumpkey_tpl(64) _philox2xWround_tpl(64, uint64_t) /* philo2x64round */
385 _philox4xWround_tpl(64, uint64_t) /* philo4x64round */
387 _philoxNxW_tpl(2, 1, 64, uint64_t) /* philox2x64bijection */
388 _philoxNxW_tpl(4, 2, 64, uint64_t) /* philox4x64bijection */
389#endif /* R123_USE_PHILOX_64BIT */
390
391#define philox2x32(c, k) philox2x32_R(philox2x32_rounds, c, k)
392#define philox4x32(c, k) philox4x32_R(philox4x32_rounds, c, k)
393#if R123_USE_PHILOX_64BIT
394#define philox2x64(c, k) philox2x64_R(philox2x64_rounds, c, k)
395#define philox4x64(c, k) philox4x64_R(philox4x64_rounds, c, k)
396#endif /* R123_USE_PHILOX_64BIT */
397
398#if defined(__cplusplus)
399
400#define _PhiloxNxW_base_tpl(CType, KType, N, W) \
401 namespace r123 { \
402 template<unsigned int ROUNDS> struct Philox##N##x##W##_R { \
403 typedef CType ctr_type; \
404 typedef KType key_type; \
405 typedef KType ukey_type; \
406 static const R123_METAL_CONSTANT_ADDRESS_SPACE unsigned int rounds = ROUNDS; \
407 inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key) const) { \
408 R123_STATIC_ASSERT(ROUNDS <= 16, "philox is only unrolled up to 16 rounds\n"); \
409 return philox##N##x##W##_R(ROUNDS, ctr, key); \
410 } \
411 }; \
412 typedef Philox##N##x##W##_R<philox##N##x##W##_rounds> Philox##N##x##W; \
413 } // namespace r123
414
415 _PhiloxNxW_base_tpl(r123array2x32, r123array1x32, 2, 32) // Philox2x32_R<R>
416 _PhiloxNxW_base_tpl(r123array4x32, r123array2x32, 4, 32) // Philox4x32_R<R>
417#if R123_USE_PHILOX_64BIT
418 _PhiloxNxW_base_tpl(r123array2x64, r123array1x64, 2, 64) // Philox2x64_R<R>
419 _PhiloxNxW_base_tpl(r123array4x64, r123array2x64, 4, 64) // Philox4x64_R<R>
420#endif
421
422/* The _tpl macros don't quite work to do string-pasting inside comments.
423 so we just write out the boilerplate documentation four times... */
424
519#endif /* __cplusplus */
520
521#endif /* _philox_dot_h_ */
#define R123_MULHILO64_MULHI_INTRIN
#define R123_MULHILO32_MULHI_INTRIN
uint uint32_t
ulong uint64_t
uint32_t _philox4xWround_tpl(32, uint32_t) _philoxNxW_tpl(2
_philox2xWbumpkey_tpl(32) _philox4xWbumpkey_tpl(32) _philox2xWround_tpl(32
#define _philoxNxW_tpl(N, Nhalf, W, T)
Definition: philox.h:301