Project Alice
Loading...
Searching...
No Matches
threefry.h
Go to the documentation of this file.
1/*
2Copyright 2010-2011, D. E. Shaw Research.
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are
7met:
8
9* Redistributions of source code must retain the above copyright
10 notice, this list of conditions, and the following disclaimer.
11
12* Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions, and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15
16* Neither the name of D. E. Shaw Research nor the names of its
17 contributors may be used to endorse or promote products derived from
18 this software without specific prior written permission.
19
20THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31*/
32#ifndef _threefry_dot_h_
33#define _threefry_dot_h_
35#include "array.h"
36
38/* Significant parts of this file were copied
39 from:
40 Skein_FinalRnd/ReferenceImplementation/skein.h
41 Skein_FinalRnd/ReferenceImplementation/skein_block.c
42
43 in http://csrc.nist.gov/groups/ST/hash/sha-3/Round3/documents/Skein_FinalRnd.zip
44
45 This file has been modified so that it may no longer perform its originally
46 intended function. If you're looking for a Skein or Threefish source code,
47 please consult the original file.
48
49 The original file had the following header:
50**************************************************************************
51**
52** Interface declarations and internal definitions for Skein hashing.
53**
54** Source code author: Doug Whiting, 2008.
55**
56** This algorithm and source code is released to the public domain.
57**
58***************************************************************************
59
60*/
61
62/* See comment at the top of philox.h for the macro pre-process
63 strategy. */
64
65/* Rotation constants: */
/* Rotation amounts for the 4x64 variant.
   These are the R_256 constants from the Threefish reference sources,
   renamed to R_64x4_<d>_<j>: _threefry4x_tpl uses row <d> in every round
   whose index is congruent to <d> modulo 8, with <j> = 0 for the first
   rotation of that round and <j> = 1 for the second. */
enum r123_enum_threefry64x4 {
    R_64x4_0_0 = 14,
    R_64x4_0_1 = 16,
    R_64x4_1_0 = 52,
    R_64x4_1_1 = 57,
    R_64x4_2_0 = 23,
    R_64x4_2_1 = 40,
    R_64x4_3_0 = 5,
    R_64x4_3_1 = 37,
    R_64x4_4_0 = 25,
    R_64x4_4_1 = 33,
    R_64x4_5_0 = 46,
    R_64x4_5_1 = 12,
    R_64x4_6_0 = 58,
    R_64x4_6_1 = 22,
    R_64x4_7_0 = 32,
    R_64x4_7_1 = 32
};
86
/* Rotation amounts for the 2x64 variant, named R_64x2_<d>_0 where <d> is
   the round index modulo 8 (see _threefry2x_tpl).

   Provenance, as recorded by the rotation-constant search tool:
     Output from skein_rot_search: (srs64_B64-X1000)
     Random seed = 1. BlockSize = 128 bits. sampleCnt = 1024. rounds = 8, minHW_or=57
     Start: Tue Mar 1 10:07:48 2011
     rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format

   Reported minimum Hamming weights:
      4 rounds: minHW =  4 [  4  4  4  4 ]
      5 rounds: minHW =  8 [  8  8  8  8 ]
      6 rounds: minHW = 16 [ 16 16 16 16 ]
      7 rounds: minHW = 32 [ 32 32 32 32 ]
      8 rounds: minHW = 64 [ 64 64 64 64 ]
      9 rounds: minHW = 64 [ 64 64 64 64 ]
     10 rounds: minHW = 64 [ 64 64 64 64 ]
     11 rounds: minHW = 64 [ 64 64 64 64 ] */
enum r123_enum_threefry64x2 {
    R_64x2_0_0 = 16,
    R_64x2_1_0 = 42,
    R_64x2_2_0 = 12,
    R_64x2_3_0 = 31,
    R_64x2_4_0 = 16,
    R_64x2_5_0 = 32,
    R_64x2_6_0 = 24,
    R_64x2_7_0 = 21
};
111
/* Rotation amounts for the 4x32 variant, named R_32x4_<d>_<j>: <d> is the
   round index modulo 8, <j> selects the first (0) or second (1) rotation
   performed in that round by _threefry4x_tpl.

   Provenance, as recorded by the rotation-constant search tool:
     Output from skein_rot_search: (srs-B128-X5000.out)
     Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
     Start: Mon Aug 24 22:41:36 2009
     ...
     rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format

   Reported minimum Hamming weights:
      4 rounds: minHW =  3 [  3  3  3  3 ]
      5 rounds: minHW =  7 [  7  7  7  7 ]
      6 rounds: minHW = 12 [ 13 12 13 12 ]
      7 rounds: minHW = 22 [ 22 23 22 23 ]
      8 rounds: minHW = 31 [ 31 31 31 31 ]
      9 rounds: minHW = 32 [ 32 32 32 32 ]
     10 rounds: minHW = 32 [ 32 32 32 32 ]
     11 rounds: minHW = 32 [ 32 32 32 32 ] */
enum r123_enum_threefry32x4 {
    R_32x4_0_0 = 10,
    R_32x4_0_1 = 26,
    R_32x4_1_0 = 11,
    R_32x4_1_1 = 21,
    R_32x4_2_0 = 13,
    R_32x4_2_1 = 27,
    R_32x4_3_0 = 23,
    R_32x4_3_1 = 5,
    R_32x4_4_0 = 6,
    R_32x4_4_1 = 20,
    R_32x4_5_0 = 17,
    R_32x4_5_1 = 11,
    R_32x4_6_0 = 25,
    R_32x4_6_1 = 10,
    R_32x4_7_0 = 18,
    R_32x4_7_1 = 20
};
145
/* Rotation amounts for the 2x32 variant, named R_32x2_<d>_0 where <d> is
   the round index modulo 8 (see _threefry2x_tpl).

   Provenance, as recorded by the rotation-constant search tool:
     Output from skein_rot_search (srs32x2-X5000.out)
     Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
     Start: Tue Jul 12 11:11:33 2011
     rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. blkSize= 64].format

   Reported minimum Hamming weights:
      4 rounds: minHW =  4 [  4  4  4  4 ]
      5 rounds: minHW =  6 [  6  8  6  8 ]
      6 rounds: minHW =  9 [  9 12  9 12 ]
      7 rounds: minHW = 16 [ 16 24 16 24 ]
      8 rounds: minHW = 32 [ 32 32 32 32 ]
      9 rounds: minHW = 32 [ 32 32 32 32 ]
     10 rounds: minHW = 32 [ 32 32 32 32 ]
     11 rounds: minHW = 32 [ 32 32 32 32 ] */
enum r123_enum_threefry32x2 {
    R_32x2_0_0 = 13,
    R_32x2_1_0 = 15,
    R_32x2_2_0 = 26,
    R_32x2_3_0 = 6,
    R_32x2_4_0 = 17,
    R_32x2_5_0 = 29,
    R_32x2_6_0 = 16,
    R_32x2_7_0 = 24
};
169
/* Word counts 2 and 4.  NOTE(review): presumably the number of W-bit words
   in a 2xW and a 4xW block; the code visible in this file does not
   reference these constants outside of comments - confirm before removing. */
enum r123_enum_threefry_wcnt {
    WCNT2 = 2,
    WCNT4 = 4
};
171
#if R123_USE_64BIT
/* Rotate the 64-bit value x left by N bit positions.
   Both shift counts are reduced modulo 64, so N == 0 (and any N >= 64)
   never produces a shift by the full type width. */
R123_CUDA_DEVICE R123_STATIC_INLINE uint64_t RotL_64(uint64_t x, unsigned int N) {
    return (x << (N & 63)) | (x >> ((64 - N) & 63));
}
#endif
176
178R123_CUDA_DEVICE R123_STATIC_INLINE uint32_t RotL_32(uint32_t x, unsigned int N) { return (x << (N & 31)) | (x >> ((32 - N) & 31)); }
179
/* Build a 64-bit constant from its high and low 32-bit halves. */
#define SKEIN_MK_64(hi32, lo32) ((lo32) + (((uint64_t)(hi32)) << 32))

/* Key-schedule parity constants: the _tpl macros XOR all key words into
   SKEIN_KS_PARITY<W> to obtain the extra key-schedule word. */
#define SKEIN_KS_PARITY64 SKEIN_MK_64(0x1BD11BDA, 0xA9FC1A22)
#define SKEIN_KS_PARITY32 0x1BD11BDA
183
/* Default round counts used by threefryNxW(c, k).
   Each value may be overridden by defining the macro before this header
   is included; otherwise every variant defaults to 20 rounds. */
#ifndef THREEFRY2x32_DEFAULT_ROUNDS
#define THREEFRY2x32_DEFAULT_ROUNDS 20
#endif

#ifndef THREEFRY2x64_DEFAULT_ROUNDS
#define THREEFRY2x64_DEFAULT_ROUNDS 20
#endif

#ifndef THREEFRY4x32_DEFAULT_ROUNDS
#define THREEFRY4x32_DEFAULT_ROUNDS 20
#endif

#ifndef THREEFRY4x64_DEFAULT_ROUNDS
#define THREEFRY4x64_DEFAULT_ROUNDS 20
#endif
201
/* ---- private helpers for _threefry2x_tpl -------------------------------
   They expand inside the body of threefry2x<W>_R and refer to its locals
   (Nrounds, X0, X1, ks0..ks2).  They must remain defined at every point
   where _threefry2x_tpl(W) is expanded, so they are not #undef'ed here. */

/* Mixing round number n (0-based): runs only when more than n rounds were
   requested.  rc (0..7) selects the rotation constant R_<W>x2_<rc>_0. */
#define R123_THREEFRY2X_ROUND_(W, n, rc) \
    if(Nrounds > (n)) { \
        X0 += X1; \
        X1 = RotL_##W(X1, R_##W##x2_##rc##_0); \
        X1 ^= X0; \
    }

/* Key injection number r, performed after round n (same guard as round n):
   adds key-schedule words ka, kb to X0, X1 and the counter r to X1. */
#define R123_THREEFRY2X_INJECT_(n, ka, kb, r) \
    if(Nrounds > (n)) { \
        X0 += ka; \
        X1 += kb; \
        X1 += r; \
    }

/* Four consecutive rounds n0..n0+3 using rotation constants c0..c3,
   followed by key injection r. */
#define R123_THREEFRY2X_QUAD_(W, n0, c0, c1, c2, c3, ka, kb, r) \
    R123_THREEFRY2X_ROUND_(W, (n0), c0) \
    R123_THREEFRY2X_ROUND_(W, (n0) + 1, c1) \
    R123_THREEFRY2X_ROUND_(W, (n0) + 2, c2) \
    R123_THREEFRY2X_ROUND_(W, (n0) + 3, c3) \
    R123_THREEFRY2X_INJECT_((n0) + 3, ka, kb, r)

/* _threefry2x_tpl(W): generate the Threefry-2xW API for word width W.
   Expanding it defines:
     threefry2x<W>_ctr_t, threefry2x<W>_key_t, threefry2x<W>_ukey_t
         - all typedefs of struct r123array2x<W>;
     threefry2x<W>keyinit(uk)
         - returns the user key unchanged;
     threefry2x<W>_R(Nrounds, in, k)
         - adds the key to the counter, then applies up to Nrounds mixing
           rounds (at most 32, checked with R123_ASSERT).  The rotation
           constants repeat every 8 rounds; after every 4th round a key
           injection adds two of the three key-schedule words
           (ks2 = parity ^ k.v[0] ^ k.v[1]) and the injection number.
           Returns the resulting two-word block;
     threefry2x<W>_rounds
         - enum constant equal to THREEFRY2x<W>_DEFAULT_ROUNDS;
     threefry2x<W>(in, k)
         - threefry2x<W>_R with the default round count.
   The rounds are fully unrolled behind "Nrounds > n" tests. */
#define _threefry2x_tpl(W) \
    typedef struct r123array2x##W threefry2x##W##_ctr_t; \
    typedef struct r123array2x##W threefry2x##W##_key_t; \
    typedef struct r123array2x##W threefry2x##W##_ukey_t; \
    R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_key_t threefry2x##W##keyinit(threefry2x##W##_ukey_t uk) { return uk; } \
    R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
    R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k) { \
        uint##W##_t X0, X1; \
        uint##W##_t ks0, ks1, ks2; \
        R123_ASSERT(Nrounds <= 32); \
        /* Key schedule and initial key addition. */ \
        ks2 = SKEIN_KS_PARITY##W; \
        ks0 = k.v[0]; \
        X0 = in.v[0] + ks0; \
        ks2 ^= ks0; \
        \
        ks1 = k.v[1]; \
        X1 = in.v[1] + ks1; \
        ks2 ^= ks1; \
        \
        /*                      first   rot. consts  injected   inject */ \
        /*                      round                key words  number */ \
        R123_THREEFRY2X_QUAD_(W,  0,    0, 1, 2, 3,  ks1, ks2,  1) \
        R123_THREEFRY2X_QUAD_(W,  4,    4, 5, 6, 7,  ks2, ks0,  2) \
        R123_THREEFRY2X_QUAD_(W,  8,    0, 1, 2, 3,  ks0, ks1,  3) \
        R123_THREEFRY2X_QUAD_(W, 12,    4, 5, 6, 7,  ks1, ks2,  4) \
        R123_THREEFRY2X_QUAD_(W, 16,    0, 1, 2, 3,  ks2, ks0,  5) \
        R123_THREEFRY2X_QUAD_(W, 20,    4, 5, 6, 7,  ks0, ks1,  6) \
        R123_THREEFRY2X_QUAD_(W, 24,    0, 1, 2, 3,  ks1, ks2,  7) \
        R123_THREEFRY2X_QUAD_(W, 28,    4, 5, 6, 7,  ks2, ks0,  8) \
        \
        threefry2x##W##_ctr_t ret = {{X0, X1}}; \
        return ret; \
    } \
    \
    enum r123_enum_threefry2x##W{threefry2x##W##_rounds = THREEFRY2x##W##_DEFAULT_ROUNDS}; \
    R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
    R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k) { return threefry2x##W##_R(threefry2x##W##_rounds, in, k); }
436
/* ---- private helpers for _threefry4x_tpl -------------------------------
   They expand inside the body of threefry4x<W>_R and refer to its locals
   (Nrounds, X0..X3, ks0..ks4).  They must remain defined at every point
   where _threefry4x_tpl(W) is expanded, so they are not #undef'ed here. */

/* Even-numbered round n (0-based): mixes the pairs (X0,X1) and (X2,X3).
   Runs only when more than n rounds were requested.  rc (0..7) selects the
   rotation constants R_<W>x4_<rc>_0 and R_<W>x4_<rc>_1. */
#define R123_THREEFRY4X_ROUND_EVEN_(W, n, rc) \
    if(Nrounds > (n)) { \
        X0 += X1; \
        X1 = RotL_##W(X1, R_##W##x4_##rc##_0); \
        X1 ^= X0; \
        X2 += X3; \
        X3 = RotL_##W(X3, R_##W##x4_##rc##_1); \
        X3 ^= X2; \
    }

/* Odd-numbered round n: same as above but with the partners swapped,
   i.e. mixes the pairs (X0,X3) and (X2,X1). */
#define R123_THREEFRY4X_ROUND_ODD_(W, n, rc) \
    if(Nrounds > (n)) { \
        X0 += X3; \
        X3 = RotL_##W(X3, R_##W##x4_##rc##_0); \
        X3 ^= X0; \
        X2 += X1; \
        X1 = RotL_##W(X1, R_##W##x4_##rc##_1); \
        X1 ^= X2; \
    }

/* Key injection number r, performed after round n (same guard as round n):
   adds key-schedule words ka..kd to X0..X3 and the counter r to X3. */
#define R123_THREEFRY4X_INJECT_(n, ka, kb, kc, kd, r) \
    if(Nrounds > (n)) { \
        X0 += ka; \
        X1 += kb; \
        X2 += kc; \
        X3 += kd; \
        X3 += r; \
    }

/* Four consecutive rounds n0..n0+3 (n0 is a multiple of 4, so the rounds
   alternate even/odd) using rotation-constant rows c0..c3, followed by
   key injection r. */
#define R123_THREEFRY4X_QUAD_(W, n0, c0, c1, c2, c3, ka, kb, kc, kd, r) \
    R123_THREEFRY4X_ROUND_EVEN_(W, (n0), c0) \
    R123_THREEFRY4X_ROUND_ODD_(W, (n0) + 1, c1) \
    R123_THREEFRY4X_ROUND_EVEN_(W, (n0) + 2, c2) \
    R123_THREEFRY4X_ROUND_ODD_(W, (n0) + 3, c3) \
    R123_THREEFRY4X_INJECT_((n0) + 3, ka, kb, kc, kd, r)

/* _threefry4x_tpl(W): generate the Threefry-4xW API for word width W.
   Expanding it defines:
     threefry4x<W>_ctr_t, threefry4x<W>_key_t, threefry4x<W>_ukey_t
         - all typedefs of struct r123array4x<W>;
     threefry4x<W>keyinit(uk)
         - returns the user key unchanged;
     threefry4x<W>_R(Nrounds, in, k)
         - adds the key to the counter, then applies up to Nrounds mixing
           rounds (at most 72, checked with R123_ASSERT).  The rotation
           constants repeat every 8 rounds; after every 4th round, key
           injection r adds key-schedule words ks[(r+i) mod 5] to X<i>
           (ks4 = parity ^ k.v[0] ^ k.v[1] ^ k.v[2] ^ k.v[3]) and adds r
           to X3.  Returns the resulting four-word block;
     threefry4x<W>_rounds
         - enum constant equal to THREEFRY4x<W>_DEFAULT_ROUNDS;
     threefry4x<W>(in, k)
         - threefry4x<W>_R with the default round count.
   The rounds are fully unrolled behind "Nrounds > n" tests. */
#define _threefry4x_tpl(W) \
    typedef struct r123array4x##W threefry4x##W##_ctr_t; \
    typedef struct r123array4x##W threefry4x##W##_key_t; \
    typedef struct r123array4x##W threefry4x##W##_ukey_t; \
    R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_key_t threefry4x##W##keyinit(threefry4x##W##_ukey_t uk) { return uk; } \
    R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
    R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k) { \
        uint##W##_t X0, X1, X2, X3; \
        uint##W##_t ks0, ks1, ks2, ks3, ks4; \
        R123_ASSERT(Nrounds <= 72); \
        /* Key schedule and initial key addition. */ \
        ks4 = SKEIN_KS_PARITY##W; \
        ks0 = k.v[0]; \
        X0 = in.v[0] + ks0; \
        ks4 ^= ks0; \
        \
        ks1 = k.v[1]; \
        X1 = in.v[1] + ks1; \
        ks4 ^= ks1; \
        \
        ks2 = k.v[2]; \
        X2 = in.v[2] + ks2; \
        ks4 ^= ks2; \
        \
        ks3 = k.v[3]; \
        X3 = in.v[3] + ks3; \
        ks4 ^= ks3; \
        \
        /*                      first   rot. rows    injected key words   inject */ \
        /*                      round                                     number */ \
        R123_THREEFRY4X_QUAD_(W,  0,    0, 1, 2, 3,  ks1, ks2, ks3, ks4,   1) \
        R123_THREEFRY4X_QUAD_(W,  4,    4, 5, 6, 7,  ks2, ks3, ks4, ks0,   2) \
        R123_THREEFRY4X_QUAD_(W,  8,    0, 1, 2, 3,  ks3, ks4, ks0, ks1,   3) \
        R123_THREEFRY4X_QUAD_(W, 12,    4, 5, 6, 7,  ks4, ks0, ks1, ks2,   4) \
        R123_THREEFRY4X_QUAD_(W, 16,    0, 1, 2, 3,  ks0, ks1, ks2, ks3,   5) \
        R123_THREEFRY4X_QUAD_(W, 20,    4, 5, 6, 7,  ks1, ks2, ks3, ks4,   6) \
        R123_THREEFRY4X_QUAD_(W, 24,    0, 1, 2, 3,  ks2, ks3, ks4, ks0,   7) \
        R123_THREEFRY4X_QUAD_(W, 28,    4, 5, 6, 7,  ks3, ks4, ks0, ks1,   8) \
        R123_THREEFRY4X_QUAD_(W, 32,    0, 1, 2, 3,  ks4, ks0, ks1, ks2,   9) \
        R123_THREEFRY4X_QUAD_(W, 36,    4, 5, 6, 7,  ks0, ks1, ks2, ks3,  10) \
        R123_THREEFRY4X_QUAD_(W, 40,    0, 1, 2, 3,  ks1, ks2, ks3, ks4,  11) \
        R123_THREEFRY4X_QUAD_(W, 44,    4, 5, 6, 7,  ks2, ks3, ks4, ks0,  12) \
        R123_THREEFRY4X_QUAD_(W, 48,    0, 1, 2, 3,  ks3, ks4, ks0, ks1,  13) \
        R123_THREEFRY4X_QUAD_(W, 52,    4, 5, 6, 7,  ks4, ks0, ks1, ks2,  14) \
        R123_THREEFRY4X_QUAD_(W, 56,    0, 1, 2, 3,  ks0, ks1, ks2, ks3,  15) \
        R123_THREEFRY4X_QUAD_(W, 60,    4, 5, 6, 7,  ks1, ks2, ks3, ks4,  16) \
        R123_THREEFRY4X_QUAD_(W, 64,    0, 1, 2, 3,  ks2, ks3, ks4, ks0,  17) \
        R123_THREEFRY4X_QUAD_(W, 68,    4, 5, 6, 7,  ks3, ks4, ks0, ks1,  18) \
        \
        threefry4x##W##_ctr_t ret = {{X0, X1, X2, X3}}; \
        return ret; \
    } \
    \
    enum r123_enum_threefry4x##W{threefry4x##W##_rounds = THREEFRY4x##W##_DEFAULT_ROUNDS}; \
    R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
    R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k) { return threefry4x##W##_R(threefry4x##W##_rounds, in, k); }
1210
1211#if R123_USE_64BIT
1213#endif
1215
/* Function-like macros that shadow the static inline threefryNxW functions
   generated by the _tpl macros: a call threefryNxW(c, k) expands directly
   to threefryNxW_R(threefryNxW_rounds, c, k).
   gcc4.5 and 4.6 seem to optimize a macro-ized threefryNxW better
   than a static inline function.  Why? */
#define threefry2x32(c, k) threefry2x32_R(threefry2x32_rounds, c, k)
#define threefry4x32(c, k) threefry4x32_R(threefry4x32_rounds, c, k)
#define threefry2x64(c, k) threefry2x64_R(threefry2x64_rounds, c, k)
#define threefry4x64(c, k) threefry4x64_R(threefry4x64_rounds, c, k)
1222
1223#if defined(__cplusplus)
1224#define _threefryNxWclass_tpl(NxW) \
1225 namespace r123 { \
1226 template<unsigned int ROUNDS> struct Threefry##NxW##_R { \
1227 typedef threefry##NxW##_ctr_t ctr_type; \
1228 typedef threefry##NxW##_key_t key_type; \
1229 typedef threefry##NxW##_key_t ukey_type; \
1230 static const R123_METAL_CONSTANT_ADDRESS_SPACE unsigned int rounds = ROUNDS; \
1231 inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key)) { \
1232 R123_STATIC_ASSERT(ROUNDS <= 72, "threefry is only unrolled up to 72 rounds\n"); \
1233 return threefry##NxW##_R(ROUNDS, ctr, key); \
1234 } \
1235 }; \
1236 typedef Threefry##NxW##_R<threefry##NxW##_rounds> Threefry##NxW; \
1237 } // namespace r123
1238
1239 _threefryNxWclass_tpl(2x32) _threefryNxWclass_tpl(4x32)
1240#if R123_USE_64BIT
1241 _threefryNxWclass_tpl(2x64) _threefryNxWclass_tpl(4x64)
1242#endif
1243
1244/* The _tpl macros don't quite work to do string-pasting inside comments,
1245 so we just write out the boilerplate documentation four times... */
1246
1343#endif
1344
1345#endif
#define R123_CUDA_DEVICE
#define R123_STATIC_INLINE
#define R123_FORCE_INLINE(decl)
uint uint32_t
ulong uint64_t
#define _threefry4x_tpl(W)
Definition: threefry.h:437
#define _threefry2x_tpl(W)
Definition: threefry.h:202