/*
 * Template: define mulhilo##W by multiplying in a wider integer type.
 *   W     - used to form the function name and as the right-shift count
 *           applied to the product.
 *   Word  - type of both operands, of *hip, and of the return value.
 *   Dword - type both operands are cast to before multiplying.
 *           NOTE(review): presumably at least 2*W bits wide - confirm at
 *           each instantiation.
 * The generated function stores (product >> W) through hip and returns the
 * product converted to Word.
 * NOTE(review): the closing brace of the generated function is not visible
 * in this excerpt.
 */
66#define _mulhilo_dword_tpl(W, Word, Dword) \
67 R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip) { \
68 Dword product = ((Dword)a) * ((Dword)b); \
69 *hip = product >> W; \
70 return (Word)product; \
/*
 * Template: define mulhilo##W around one inline-assembly instruction INSN
 * that is written with three operands ("INSN %0,%1,%2").
 * The asm statement has one output, dx (constraint "=r"), and two inputs,
 * b and ax (both constraint "r").
 * NOTE(review): the declaration of dx, the store through hip and the return
 * statement are not visible in this excerpt.
 */
80#define _mulhilo_asm_tpl(W, Word, INSN) \
81 R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word* hip) { \
83 __asm__("\n\t" INSN " %0,%1,%2\n\t" : "=r"(dx) : "r"(b), "r"(ax)); \
/*
 * Template: define mulhilo##W around one inline-assembly instruction INSN
 * that is written with a single explicit operand ("INSN %2").
 * The asm statement has two outputs, ax (constraint "=a") and dx
 * (constraint "=d"), and two inputs: b (constraint "r") and ax, which is
 * tied to output operand 0 by the matching constraint "0".
 * NOTE(review): this macro has the same name as the three-operand variant;
 * presumably a preprocessor conditional that is not visible here selects
 * exactly one of them - confirm. The declaration of dx, the store through
 * hip and the return statement are also not visible in this excerpt.
 */
88#define _mulhilo_asm_tpl(W, Word, INSN) \
89 R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word* hip) { \
91 __asm__("\n\t" INSN " %2\n\t" : "=a"(ax), "=d"(dx) : "r"(b), "0"(ax)); \
/*
 * Template: define mulhilo##W as a thin wrapper around an intrinsic.
 *   W      - used only to form the function name.
 *   Word   - type of the operands, of *hip, and of the return value.
 *   INTRIN - callable invoked as INTRIN(a, b, hip); its result is returned
 *            unchanged, and hip is passed straight through to it.
 */
102#define _mulhilo_msvc_intrin_tpl(W, Word, INTRIN) \
103 R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip) { return INTRIN(a, b, hip); }
/*
 * Template: define mulhilo##W using a two-argument intrinsic.
 *   W      - used only to form the function name.
 *   Word   - type of the operands, of *hip, and of the return value.
 *   INTRIN - callable invoked as INTRIN(a, b); its result is stored
 *            through hip.
 * The hip pointer is qualified with R123_METAL_THREAD_ADDRESS_SPACE.
 * NOTE(review): the return statement and closing brace are not visible in
 * this excerpt.
 */
107#define _mulhilo_cuda_intrin_tpl(W, Word, INTRIN) \
108 R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, R123_METAL_THREAD_ADDRESS_SPACE Word* hip) { \
109 *hip = INTRIN(a, b); \
/*
 * Template: define mulhilo##W using only arithmetic on Word itself, by
 * splitting each operand into a high and a low half of W/2 bits and
 * combining the partial products.
 *   W    - used to form the function name and to derive WHALF = W / 2.
 *   Word - type of the operands, of *hip, and of the return value.
 * NOTE(review): the declaration of lo, the store through hip, the return
 * statement and the closing brace are not visible in this excerpt.
 */
128#define _mulhilo_c99_tpl(W, Word) \
129 R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, R123_METAL_THREAD_ADDRESS_SPACE Word* hip) { \
/* WHALF is half of W; LOMASK has the low WHALF bits set. */ \
130 unsigned const WHALF = W / 2; \
131 const Word LOMASK = ((((Word)1) << WHALF) - 1); \
/* Split both operands into their high and low halves. */ \
133 Word ahi = a >> WHALF; \
134 Word alo = a & LOMASK; \
135 Word bhi = b >> WHALF; \
136 Word blo = b & LOMASK; \
/* The two cross products (high half of one operand times low half of the other). */ \
138 Word ahbl = ahi * blo; \
139 Word albh = alo * bhi; \
/* Sum of the low halves of the two cross products. */ \
141 Word ahbl_albh = ((ahbl & LOMASK) + (albh & LOMASK)); \
/* hi accumulates ahi*bhi, the high halves of both cross products, and the high half of ahbl_albh. */ \
142 Word hi = ahi * bhi + (ahbl >> WHALF) + (albh >> WHALF); \
143 hi += ahbl_albh >> WHALF; \
/* Adds 1 to hi when the high half of lo is smaller than the low half of ahbl_albh. */ \
/* NOTE(review): presumably detects a carry out of lo, which is computed on a line not visible here - confirm. */ \
145 hi += ((lo >> WHALF) < (ahbl_albh & LOMASK)); \
/*
 * Template: define a mulhilo##W whose body consists solely of
 * R123_STATIC_ASSERT with a condition of 0 and a message stating that
 * mulhilo<W> is not implemented on this machine.
 * NOTE(review): presumably used so that instantiating this template fails
 * the build where no real implementation is selected - confirm against the
 * definition of R123_STATIC_ASSERT.
 */
155#define _mulhilo_fail_tpl(W, Word) \
156 R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip) { R123_STATIC_ASSERT(0, "mulhilo" #W " is not implemented on this machine\n"); }
/*
 * Select the implementation of mulhilo32 from R123_USE_MULHILO32_* feature
 * macros.
 * NOTE(review): two asm instantiations ("mulhwu", then "mull") appear back
 * to back under the same #if; the nested conditional that picks one of
 * them, and the body of the MULHI_INTRIN branch, are not visible in this
 * excerpt.
 */
163#if R123_USE_MULHILO32_ASM
/* mulhilo32 for uint32_t via the "mulhwu" instruction. */
165_mulhilo_asm_tpl(32,
uint32_t,
"mulhwu")
/* mulhilo32 for uint32_t via the "mull" instruction. */
167_mulhilo_asm_tpl(32,
uint32_t,
"mull")
172#elif R123_USE_MULHILO32_MULHI_INTRIN
/*
 * Select the implementation of mulhilo64, only when R123_USE_PHILOX_64BIT
 * is enabled. The branches are tried in the order written: asm, MSVC
 * intrinsic, CUDA intrinsic, OpenCL intrinsic, MULHI intrinsic, GNU
 * 128-bit integer, C99.
 * NOTE(review): the nested conditional choosing between the two asm
 * instantiations, the bodies of the MULHI_INTRIN and C99 branches, and the
 * closing #endif lines are not visible in this excerpt.
 */
179#if R123_USE_PHILOX_64BIT
180#if R123_USE_MULHILO64_ASM
/* mulhilo64 for uint64_t via the "mulhdu" instruction. */
182 _mulhilo_asm_tpl(64,
uint64_t,
"mulhdu")
/* mulhilo64 for uint64_t via the "mulq" instruction. */
184 _mulhilo_asm_tpl(64,
uint64_t,
"mulq")
186#elif R123_USE_MULHILO64_MSVC_INTRIN
/* mulhilo64 forwards to _umul128(a, b, hip). */
187 _mulhilo_msvc_intrin_tpl(64,
uint64_t, _umul128)
188#elif R123_USE_MULHILO64_CUDA_INTRIN
/* mulhilo64 stores __umul64hi(a, b) through hip. */
189_mulhilo_cuda_intrin_tpl(64,
uint64_t, __umul64hi)
190#elif R123_USE_MULHILO64_OPENCL_INTRIN
/* Reuses the CUDA-style template with mul_hi(a, b) as the intrinsic. */
191_mulhilo_cuda_intrin_tpl(64,
uint64_t, mul_hi)
192#elif R123_USE_MULHILO64_MULHI_INTRIN
194#elif R123_USE_GNU_UINT128
/* mulhilo64 computes the product in __uint128_t. */
195_mulhilo_dword_tpl(64,
uint64_t, __uint128_t)
196#elif R123_USE_MULHILO64_C99
/*
 * PHILOX_M<N>x<W>_<i>: constants that the round templates pass as the first
 * operand of mulhilo<W>. The 2x variants use one constant (_0); the 4x
 * variants use two (_0 and _1).
 * Each is wrapped in #ifndef, so a definition made before this point takes
 * precedence.
 * NOTE(review): the matching #endif lines are not visible in this excerpt.
 */
211#ifndef PHILOX_M2x64_0
212#define PHILOX_M2x64_0 R123_64BIT(0xD2B74407B1CE6E93)
215#ifndef PHILOX_M4x64_0
216#define PHILOX_M4x64_0 R123_64BIT(0xD2E7470EE14C6C93)
219#ifndef PHILOX_M4x64_1
220#define PHILOX_M4x64_1 R123_64BIT(0xCA5A826395121157)
223#ifndef PHILOX_M2x32_0
224#define PHILOX_M2x32_0 ((uint32_t)0xd256d193)
227#ifndef PHILOX_M4x32_0
228#define PHILOX_M4x32_0 ((uint32_t)0xD2511F53)
230#ifndef PHILOX_M4x32_1
231#define PHILOX_M4x32_1 ((uint32_t)0xCD9E8D57)
/*
 * PHILOX_W<W>_<i>: constants that the bumpkey templates add to the key
 * words. _0 is added to key.v[0]; _1 is added to key.v[1] in the 4x
 * variants.
 * Unlike the PHILOX_M constants these are defined unconditionally in the
 * visible lines.
 */
235#define PHILOX_W64_0 R123_64BIT(0x9E3779B97F4A7C15)
238#define PHILOX_W64_1 R123_64BIT(0xBB67AE8584CAA73B)
242#define PHILOX_W32_0 ((uint32_t)0x9E3779B9)
245#define PHILOX_W32_1 ((uint32_t)0xBB67AE85)
/*
 * Default round counts, one per Philox variant, all 10. The _philoxNxW_tpl
 * template copies each into the enumerator philox<N>x<W>_rounds.
 * Each is wrapped in #ifndef, so a definition made before this point takes
 * precedence.
 * NOTE(review): the matching #endif lines are not visible in this excerpt.
 */
249#ifndef PHILOX2x32_DEFAULT_ROUNDS
250#define PHILOX2x32_DEFAULT_ROUNDS 10
253#ifndef PHILOX2x64_DEFAULT_ROUNDS
254#define PHILOX2x64_DEFAULT_ROUNDS 10
257#ifndef PHILOX4x32_DEFAULT_ROUNDS
258#define PHILOX4x32_DEFAULT_ROUNDS 10
261#ifndef PHILOX4x64_DEFAULT_ROUNDS
262#define PHILOX4x64_DEFAULT_ROUNDS 10
/*
 * Template: declare and define _philox2x<W>round, one round applied to a
 * two-word counter with a one-word key.
 *   W - word width used to form the type, function and constant names.
 *   T - type of the local that receives the value returned by mulhilo<W>.
 * The function is first declared inside R123_FORCE_INLINE(...) and then
 * defined. The round calls mulhilo<W>(PHILOX_M2x<W>_0, ctr.v[0], &hi) and
 * builds the output as { hi ^ key.v[0] ^ ctr.v[1], lo }.
 * NOTE(review): the declaration of hi, the return statement and the closing
 * brace are not visible in this excerpt.
 */
268#define _philox2xWround_tpl(W, T) \
269 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key)); \
270 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key) { \
272 T lo = mulhilo##W(PHILOX_M2x##W##_0, ctr.v[0], &hi); \
273 struct r123array2x##W out = {{hi ^ key.v[0] ^ ctr.v[1], lo}}; \
/*
 * Template: define _philox2x<W>bumpkey, which takes a one-word key by value
 * and adds PHILOX_W<W>_0 to key.v[0].
 * NOTE(review): the return statement and closing brace are not visible in
 * this excerpt; presumably the modified key is returned - confirm.
 */
276#define _philox2xWbumpkey_tpl(W) \
277 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array1x##W _philox2x##W##bumpkey(struct r123array1x##W key) { \
278 key.v[0] += PHILOX_W##W##_0; \
/*
 * Template: declare and define _philox4x<W>round, one round applied to a
 * four-word counter with a two-word key.
 *   W - word width used to form the type, function and constant names.
 *   T - type of the locals that receive the values returned by mulhilo<W>.
 * The function is first declared inside R123_FORCE_INLINE(...) and then
 * defined. The round performs two multiplications:
 *   lo0/hi0 from mulhilo<W>(PHILOX_M4x<W>_0, ctr.v[0], &hi0)
 *   lo1/hi1 from mulhilo<W>(PHILOX_M4x<W>_1, ctr.v[2], &hi1)
 * and builds the output as
 *   { hi1 ^ ctr.v[1] ^ key.v[0], lo1, hi0 ^ ctr.v[3] ^ key.v[1], lo0 }.
 * NOTE(review): the declarations of hi0 and hi1, the return statement and
 * the closing brace are not visible in this excerpt.
 */
282#define _philox4xWround_tpl(W, T) \
283 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key)); \
284 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key) { \
287 T lo0 = mulhilo##W(PHILOX_M4x##W##_0, ctr.v[0], &hi0); \
288 T lo1 = mulhilo##W(PHILOX_M4x##W##_1, ctr.v[2], &hi1); \
289 struct r123array4x##W out = {{hi1 ^ ctr.v[1] ^ key.v[0], lo1, hi0 ^ ctr.v[3] ^ key.v[1], lo0}}; \
/*
 * Template: define _philox4x<W>bumpkey, which takes a two-word key by value
 * and adds PHILOX_W<W>_0 to key.v[0] and PHILOX_W<W>_1 to key.v[1].
 * NOTE(review): the return statement and closing brace are not visible in
 * this excerpt; presumably the modified key is returned - confirm.
 */
293#define _philox4xWbumpkey_tpl(W) \
294 R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox4x##W##bumpkey(struct r123array2x##W key) { \
295 key.v[0] += PHILOX_W##W##_0; \
296 key.v[1] += PHILOX_W##W##_1; \
/*
 * Template: generate the public C interface of one Philox variant.
 *   N     - number of counter words.
 *   Nhalf - number of key words (used for the key and ukey array types).
 *   W     - word width used to form the type and function names.
 *   T     - not referenced in the lines visible in this excerpt.
 * Generated names:
 *   enum r123_enum_philox<N>x<W> with the enumerator philox<N>x<W>_rounds,
 *       set to PHILOX<N>x<W>_DEFAULT_ROUNDS
 *   philox<N>x<W>_ctr_t            - typedef of struct r123array<N>x<W>
 *   philox<N>x<W>_key_t / _ukey_t  - typedefs of struct r123array<Nhalf>x<W>
 *   philox<N>x<W>keyinit(uk)       - returns uk unchanged
 *   philox<N>x<W>_R(R, ctr, key)   - declared inside R123_FORCE_INLINE(...)
 *                                    and then defined
 * The body of philox<N>x<W>_R asserts R <= 16 and contains 16 calls to
 * _philox<N>x<W>round. From the second call on, each is preceded by a call
 * to _philox<N>x<W>bumpkey that updates key (15 bumpkey calls in total).
 * NOTE(review): the statements that gate each round on R, the final return
 * and the closing brace are not visible in this excerpt; presumably only
 * the first R rounds execute - confirm against the full file.
 */
301#define _philoxNxW_tpl(N, Nhalf, W, T) \
303 enum r123_enum_philox##N##x##W{philox##N##x##W##_rounds = PHILOX##N##x##W##_DEFAULT_ROUNDS}; \
304 typedef struct r123array##N##x##W philox##N##x##W##_ctr_t; \
305 typedef struct r123array##Nhalf##x##W philox##N##x##W##_key_t; \
306 typedef struct r123array##Nhalf##x##W philox##N##x##W##_ukey_t; \
307 R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_key_t philox##N##x##W##keyinit(philox##N##x##W##_ukey_t uk) { return uk; } \
308 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key)); \
309 R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key) { \
310 R123_ASSERT(R <= 16); \
/* First round uses the key as passed in; every later round bumps the key first. */ \
312 ctr = _philox##N##x##W##round(ctr, key); \
315 key = _philox##N##x##W##bumpkey(key); \
316 ctr = _philox##N##x##W##round(ctr, key); \
319 key = _philox##N##x##W##bumpkey(key); \
320 ctr = _philox##N##x##W##round(ctr, key); \
323 key = _philox##N##x##W##bumpkey(key); \
324 ctr = _philox##N##x##W##round(ctr, key); \
327 key = _philox##N##x##W##bumpkey(key); \
328 ctr = _philox##N##x##W##round(ctr, key); \
331 key = _philox##N##x##W##bumpkey(key); \
332 ctr = _philox##N##x##W##round(ctr, key); \
335 key = _philox##N##x##W##bumpkey(key); \
336 ctr = _philox##N##x##W##round(ctr, key); \
339 key = _philox##N##x##W##bumpkey(key); \
340 ctr = _philox##N##x##W##round(ctr, key); \
343 key = _philox##N##x##W##bumpkey(key); \
344 ctr = _philox##N##x##W##round(ctr, key); \
347 key = _philox##N##x##W##bumpkey(key); \
348 ctr = _philox##N##x##W##round(ctr, key); \
351 key = _philox##N##x##W##bumpkey(key); \
352 ctr = _philox##N##x##W##round(ctr, key); \
355 key = _philox##N##x##W##bumpkey(key); \
356 ctr = _philox##N##x##W##round(ctr, key); \
359 key = _philox##N##x##W##bumpkey(key); \
360 ctr = _philox##N##x##W##round(ctr, key); \
363 key = _philox##N##x##W##bumpkey(key); \
364 ctr = _philox##N##x##W##round(ctr, key); \
367 key = _philox##N##x##W##bumpkey(key); \
368 ctr = _philox##N##x##W##round(ctr, key); \
371 key = _philox##N##x##W##bumpkey(key); \
372 ctr = _philox##N##x##W##round(ctr, key); \
382#if R123_USE_PHILOX_64BIT
/*
 * Convenience macros: philox<N>x<W>(c, k) expands to
 * philox<N>x<W>_R(philox<N>x<W>_rounds, c, k), i.e. the corresponding _R
 * function called with that variant's philox<N>x<W>_rounds value as the
 * round count.
 * The 64-bit forms are defined only when R123_USE_PHILOX_64BIT is enabled.
 * NOTE(review): the #endif closing that conditional is not visible in this
 * excerpt.
 */
391#define philox2x32(c, k) philox2x32_R(philox2x32_rounds, c, k)
392#define philox4x32(c, k) philox4x32_R(philox4x32_rounds, c, k)
393#if R123_USE_PHILOX_64BIT
394#define philox2x64(c, k) philox2x64_R(philox2x64_rounds, c, k)
395#define philox4x64(c, k) philox4x64_R(philox4x64_rounds, c, k)
398#if defined(__cplusplus)
/*
 * Template (C++ only): generate a functor class template
 * Philox<N>x<W>_R<ROUNDS> and the typedef Philox<N>x<W>, which is
 * Philox<N>x<W>_R instantiated with philox<N>x<W>_rounds.
 *   CType - counter type, exposed as ctr_type.
 *   KType - key type, exposed as both key_type and ukey_type.
 *   N, W  - used to form the class name and the name of the C function
 *           that operator() calls.
 * The struct exposes the static constant rounds (= ROUNDS). Its operator()
 * statically asserts ROUNDS <= 16 and returns
 * philox<N>x<W>_R(ROUNDS, ctr, key).
 * NOTE(review): the lines closing operator() and the struct are not visible
 * in this excerpt.
 */
400#define _PhiloxNxW_base_tpl(CType, KType, N, W) \
402 template<unsigned int ROUNDS> struct Philox##N##x##W##_R { \
403 typedef CType ctr_type; \
404 typedef KType key_type; \
405 typedef KType ukey_type; \
406 static const R123_METAL_CONSTANT_ADDRESS_SPACE unsigned int rounds = ROUNDS; \
407 inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key) const) { \
408 R123_STATIC_ASSERT(ROUNDS <= 16, "philox is only unrolled up to 16 rounds\n"); \
409 return philox##N##x##W##_R(ROUNDS, ctr, key); \
412 typedef Philox##N##x##W##_R<philox##N##x##W##_rounds> Philox##N##x##W; \
/*
 * Instantiate the C++ wrapper template for each variant. The counter type
 * is r123array<N>x<W>; the key type has half as many words.
 * The 64-bit variants are instantiated only when R123_USE_PHILOX_64BIT is
 * enabled.
 * NOTE(review): the #endif closing that conditional is not visible in this
 * excerpt.
 */
415 _PhiloxNxW_base_tpl(r123array2x32, r123array1x32, 2, 32)
416 _PhiloxNxW_base_tpl(r123array4x32, r123array2x32, 4, 32)
417#if R123_USE_PHILOX_64BIT
418 _PhiloxNxW_base_tpl(r123array2x64, r123array1x64, 2, 64)
419 _PhiloxNxW_base_tpl(r123array4x64, r123array2x64, 4, 64)
#define R123_MULHILO64_MULHI_INTRIN
#define R123_MULHILO32_MULHI_INTRIN
uint32_t _philox4xWround_tpl(32, uint32_t) _philoxNxW_tpl(2
_philox2xWbumpkey_tpl(32) _philox4xWbumpkey_tpl(32) _philox2xWround_tpl(32
#define _philoxNxW_tpl(N, Nhalf, W, T)