32#ifndef _threefry_dot_h_
33#define _threefry_dot_h_
/*
 * Enumerations used by the Threefry templates.
 *
 * Only the opening lines of the four r123_enum_threefry{64x4,64x2,32x4,32x2}
 * enums are visible in this excerpt; their enumerators are not shown.
 * NOTE(review): presumably these hold the R_<W>x<N>_<i>_<j> rotation
 * constants passed to RotL_<W>() in the round templates further down --
 * confirm against the complete header.
 */
66enum r123_enum_threefry64x4 {
87enum r123_enum_threefry64x2 {
112enum r123_enum_threefry32x4 {
146enum r123_enum_threefry32x2 {
/* Named word counts: WCNT2 is 2 and WCNT4 is 4. No use of them is visible in this excerpt. */
170enum r123_enum_threefry_wcnt { WCNT2 = 2, WCNT4 = 4 };
/*
 * SKEIN_MK_64(hi32, lo32): build a 64-bit value by widening hi32 to uint64_t,
 * shifting it into the upper 32 bits, and adding lo32. Each argument is
 * parenthesized and expanded exactly once.
 */
180#define SKEIN_MK_64(hi32, lo32) ((lo32) + (((uint64_t)(hi32)) << 32))
/*
 * Key-schedule parity constants. The round templates select one of them by
 * token pasting (SKEIN_KS_PARITY##W) to seed the last key-schedule word.
 * The 64-bit constant is 0x1BD11BDAA9FC1A22; the 32-bit constant equals the
 * upper half of the 64-bit one.
 */
181#define SKEIN_KS_PARITY64 SKEIN_MK_64(0x1BD11BDA, 0xA9FC1A22)
182#define SKEIN_KS_PARITY32 0x1BD11BDA
/*
 * Default round counts for the four Threefry variants (2x32, 2x64, 4x32,
 * 4x64). Each defaults to 20 and is only defined here when the macro has not
 * already been defined, so a build can override it before this point.
 * The values feed the threefry<N>x<W>_rounds enum constants declared by the
 * round templates.
 * NOTE(review): the #endif that should close each #ifndef is not visible in
 * this excerpt.
 */
186#ifndef THREEFRY2x32_DEFAULT_ROUNDS
187#define THREEFRY2x32_DEFAULT_ROUNDS 20
190#ifndef THREEFRY2x64_DEFAULT_ROUNDS
191#define THREEFRY2x64_DEFAULT_ROUNDS 20
194#ifndef THREEFRY4x32_DEFAULT_ROUNDS
195#define THREEFRY4x32_DEFAULT_ROUNDS 20
198#ifndef THREEFRY4x64_DEFAULT_ROUNDS
199#define THREEFRY4x64_DEFAULT_ROUNDS 20
/*
 * _threefry2x_tpl(W): generates the two-word Threefry API for word width W
 * (W is pasted into uint##W##_t, RotL_##W and SKEIN_KS_PARITY##W).
 *
 * The expansion provides:
 *   - threefry2x<W>_ctr_t, _key_t and _ukey_t, all typedefs of
 *     struct r123array2x<W>;
 *   - threefry2x<W>keyinit(uk): returns the user key unchanged;
 *   - threefry2x<W>_R(Nrounds, in, k): the round function. It asserts
 *     Nrounds <= 32, seeds ks2 with the parity constant, adds key-schedule
 *     words to the counter words, and rotates X1 by the constants
 *     R_<W>x2_0_0 .. R_<W>x2_7_0, cycling through those eight in order.
 *     The result is packed as {{X0, X1}};
 *   - enum constant threefry2x<W>_rounds = THREEFRY2x<W>_DEFAULT_ROUNDS;
 *   - threefry2x<W>(in, k): calls threefry2x<W>_R with the default rounds.
 *
 * Each function is forward-declared through R123_FORCE_INLINE(...) and then
 * defined.
 *
 * NOTE(review): this excerpt is missing lines of the body (the ks0/ks1
 * assignments, the statements between rotations, any per-round Nrounds
 * checks, and the return/closing brace of _R are not visible). Consult the
 * complete header before changing anything here.
 */
202#define _threefry2x_tpl(W) \
203 typedef struct r123array2x##W threefry2x##W##_ctr_t; \
204 typedef struct r123array2x##W threefry2x##W##_key_t; \
205 typedef struct r123array2x##W threefry2x##W##_ukey_t; \
206 R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_key_t threefry2x##W##keyinit(threefry2x##W##_ukey_t uk) { return uk; } \
207 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
208 R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k) { \
209 uint##W##_t X0, X1; \
210 uint##W##_t ks0, ks1, ks2; \
211 R123_ASSERT(Nrounds <= 32); \
212 ks2 = SKEIN_KS_PARITY##W; \
214 X0 = in.v[0] + ks0; \
218 X1 = in.v[1] + ks1; \
223 X1 = RotL_##W(X1, R_##W##x2_0_0); \
228 X1 = RotL_##W(X1, R_##W##x2_1_0); \
233 X1 = RotL_##W(X1, R_##W##x2_2_0); \
238 X1 = RotL_##W(X1, R_##W##x2_3_0); \
249 X1 = RotL_##W(X1, R_##W##x2_4_0); \
254 X1 = RotL_##W(X1, R_##W##x2_5_0); \
259 X1 = RotL_##W(X1, R_##W##x2_6_0); \
264 X1 = RotL_##W(X1, R_##W##x2_7_0); \
275 X1 = RotL_##W(X1, R_##W##x2_0_0); \
280 X1 = RotL_##W(X1, R_##W##x2_1_0); \
285 X1 = RotL_##W(X1, R_##W##x2_2_0); \
290 X1 = RotL_##W(X1, R_##W##x2_3_0); \
301 X1 = RotL_##W(X1, R_##W##x2_4_0); \
306 X1 = RotL_##W(X1, R_##W##x2_5_0); \
311 X1 = RotL_##W(X1, R_##W##x2_6_0); \
316 X1 = RotL_##W(X1, R_##W##x2_7_0); \
327 X1 = RotL_##W(X1, R_##W##x2_0_0); \
332 X1 = RotL_##W(X1, R_##W##x2_1_0); \
337 X1 = RotL_##W(X1, R_##W##x2_2_0); \
342 X1 = RotL_##W(X1, R_##W##x2_3_0); \
353 X1 = RotL_##W(X1, R_##W##x2_4_0); \
358 X1 = RotL_##W(X1, R_##W##x2_5_0); \
363 X1 = RotL_##W(X1, R_##W##x2_6_0); \
368 X1 = RotL_##W(X1, R_##W##x2_7_0); \
379 X1 = RotL_##W(X1, R_##W##x2_0_0); \
384 X1 = RotL_##W(X1, R_##W##x2_1_0); \
389 X1 = RotL_##W(X1, R_##W##x2_2_0); \
394 X1 = RotL_##W(X1, R_##W##x2_3_0); \
405 X1 = RotL_##W(X1, R_##W##x2_4_0); \
410 X1 = RotL_##W(X1, R_##W##x2_5_0); \
415 X1 = RotL_##W(X1, R_##W##x2_6_0); \
420 X1 = RotL_##W(X1, R_##W##x2_7_0); \
429 threefry2x##W##_ctr_t ret = {{X0, X1}}; \
433 enum r123_enum_threefry2x##W{threefry2x##W##_rounds = THREEFRY2x##W##_DEFAULT_ROUNDS}; \
434 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \
435 R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k) { return threefry2x##W##_R(threefry2x##W##_rounds, in, k); }
/*
 * _threefry4x_tpl(W): generates the four-word Threefry API for word width W
 * (W is pasted into uint##W##_t, RotL_##W and SKEIN_KS_PARITY##W).
 *
 * The expansion provides:
 *   - threefry4x<W>_ctr_t, _key_t and _ukey_t, all typedefs of
 *     struct r123array4x<W>;
 *   - threefry4x<W>keyinit(uk): returns the user key unchanged;
 *   - threefry4x<W>_R(Nrounds, in, k): the round function. It asserts
 *     Nrounds <= 72, seeds ks4 with the parity constant, and adds
 *     key-schedule words ks0..ks3 to the four counter words. Each round then
 *     rotates X1 and X3 with the constant pair R_<W>x4_<r>_0 / R_<W>x4_<r>_1,
 *     where r cycles 0..7. The word that takes the _0 constant alternates
 *     from round to round (X1 first, then X3). The result is packed as
 *     {{X0, X1, X2, X3}};
 *   - enum constant threefry4x<W>_rounds = THREEFRY4x<W>_DEFAULT_ROUNDS;
 *   - threefry4x<W>(in, k): calls threefry4x<W>_R with the default rounds.
 *
 * From round 54 onward the excerpt shows the per-round guards
 * `if(Nrounds > r)`, which skip later rounds when fewer are requested.
 * After every fourth round the same threshold appears a second time
 * (e.g. `> 55` twice); NOTE(review): presumably the second guard wraps a
 * key-injection step whose body is not visible here -- confirm.
 *
 * NOTE(review): this excerpt is missing lines of the body (the ks0..ks3
 * assignments, the statements between rotations, the guards for the earlier
 * rounds, closing braces, and the return of _R are not visible). Consult the
 * complete header before changing anything here.
 */
437#define _threefry4x_tpl(W) \
438 typedef struct r123array4x##W threefry4x##W##_ctr_t; \
439 typedef struct r123array4x##W threefry4x##W##_key_t; \
440 typedef struct r123array4x##W threefry4x##W##_ukey_t; \
441 R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_key_t threefry4x##W##keyinit(threefry4x##W##_ukey_t uk) { return uk; } \
442 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
443 R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k) { \
444 uint##W##_t X0, X1, X2, X3; \
445 uint##W##_t ks0, ks1, ks2, ks3, ks4; \
446 R123_ASSERT(Nrounds <= 72); \
447 ks4 = SKEIN_KS_PARITY##W; \
449 X0 = in.v[0] + ks0; \
453 X1 = in.v[1] + ks1; \
457 X2 = in.v[2] + ks2; \
461 X3 = in.v[3] + ks3; \
466 X1 = RotL_##W(X1, R_##W##x4_0_0); \
469 X3 = RotL_##W(X3, R_##W##x4_0_1); \
474 X3 = RotL_##W(X3, R_##W##x4_1_0); \
477 X1 = RotL_##W(X1, R_##W##x4_1_1); \
482 X1 = RotL_##W(X1, R_##W##x4_2_0); \
485 X3 = RotL_##W(X3, R_##W##x4_2_1); \
490 X3 = RotL_##W(X3, R_##W##x4_3_0); \
493 X1 = RotL_##W(X1, R_##W##x4_3_1); \
507 X1 = RotL_##W(X1, R_##W##x4_4_0); \
510 X3 = RotL_##W(X3, R_##W##x4_4_1); \
515 X3 = RotL_##W(X3, R_##W##x4_5_0); \
518 X1 = RotL_##W(X1, R_##W##x4_5_1); \
523 X1 = RotL_##W(X1, R_##W##x4_6_0); \
526 X3 = RotL_##W(X3, R_##W##x4_6_1); \
531 X3 = RotL_##W(X3, R_##W##x4_7_0); \
534 X1 = RotL_##W(X1, R_##W##x4_7_1); \
548 X1 = RotL_##W(X1, R_##W##x4_0_0); \
551 X3 = RotL_##W(X3, R_##W##x4_0_1); \
556 X3 = RotL_##W(X3, R_##W##x4_1_0); \
559 X1 = RotL_##W(X1, R_##W##x4_1_1); \
564 X1 = RotL_##W(X1, R_##W##x4_2_0); \
567 X3 = RotL_##W(X3, R_##W##x4_2_1); \
572 X3 = RotL_##W(X3, R_##W##x4_3_0); \
575 X1 = RotL_##W(X1, R_##W##x4_3_1); \
589 X1 = RotL_##W(X1, R_##W##x4_4_0); \
592 X3 = RotL_##W(X3, R_##W##x4_4_1); \
597 X3 = RotL_##W(X3, R_##W##x4_5_0); \
600 X1 = RotL_##W(X1, R_##W##x4_5_1); \
605 X1 = RotL_##W(X1, R_##W##x4_6_0); \
608 X3 = RotL_##W(X3, R_##W##x4_6_1); \
613 X3 = RotL_##W(X3, R_##W##x4_7_0); \
616 X1 = RotL_##W(X1, R_##W##x4_7_1); \
630 X1 = RotL_##W(X1, R_##W##x4_0_0); \
633 X3 = RotL_##W(X3, R_##W##x4_0_1); \
638 X3 = RotL_##W(X3, R_##W##x4_1_0); \
641 X1 = RotL_##W(X1, R_##W##x4_1_1); \
646 X1 = RotL_##W(X1, R_##W##x4_2_0); \
649 X3 = RotL_##W(X3, R_##W##x4_2_1); \
654 X3 = RotL_##W(X3, R_##W##x4_3_0); \
657 X1 = RotL_##W(X1, R_##W##x4_3_1); \
671 X1 = RotL_##W(X1, R_##W##x4_4_0); \
674 X3 = RotL_##W(X3, R_##W##x4_4_1); \
679 X3 = RotL_##W(X3, R_##W##x4_5_0); \
682 X1 = RotL_##W(X1, R_##W##x4_5_1); \
687 X1 = RotL_##W(X1, R_##W##x4_6_0); \
690 X3 = RotL_##W(X3, R_##W##x4_6_1); \
695 X3 = RotL_##W(X3, R_##W##x4_7_0); \
698 X1 = RotL_##W(X1, R_##W##x4_7_1); \
712 X1 = RotL_##W(X1, R_##W##x4_0_0); \
715 X3 = RotL_##W(X3, R_##W##x4_0_1); \
720 X3 = RotL_##W(X3, R_##W##x4_1_0); \
723 X1 = RotL_##W(X1, R_##W##x4_1_1); \
728 X1 = RotL_##W(X1, R_##W##x4_2_0); \
731 X3 = RotL_##W(X3, R_##W##x4_2_1); \
736 X3 = RotL_##W(X3, R_##W##x4_3_0); \
739 X1 = RotL_##W(X1, R_##W##x4_3_1); \
753 X1 = RotL_##W(X1, R_##W##x4_4_0); \
756 X3 = RotL_##W(X3, R_##W##x4_4_1); \
761 X3 = RotL_##W(X3, R_##W##x4_5_0); \
764 X1 = RotL_##W(X1, R_##W##x4_5_1); \
769 X1 = RotL_##W(X1, R_##W##x4_6_0); \
772 X3 = RotL_##W(X3, R_##W##x4_6_1); \
777 X3 = RotL_##W(X3, R_##W##x4_7_0); \
780 X1 = RotL_##W(X1, R_##W##x4_7_1); \
794 X1 = RotL_##W(X1, R_##W##x4_0_0); \
797 X3 = RotL_##W(X3, R_##W##x4_0_1); \
802 X3 = RotL_##W(X3, R_##W##x4_1_0); \
805 X1 = RotL_##W(X1, R_##W##x4_1_1); \
810 X1 = RotL_##W(X1, R_##W##x4_2_0); \
813 X3 = RotL_##W(X3, R_##W##x4_2_1); \
818 X3 = RotL_##W(X3, R_##W##x4_3_0); \
821 X1 = RotL_##W(X1, R_##W##x4_3_1); \
835 X1 = RotL_##W(X1, R_##W##x4_4_0); \
838 X3 = RotL_##W(X3, R_##W##x4_4_1); \
843 X3 = RotL_##W(X3, R_##W##x4_5_0); \
846 X1 = RotL_##W(X1, R_##W##x4_5_1); \
851 X1 = RotL_##W(X1, R_##W##x4_6_0); \
854 X3 = RotL_##W(X3, R_##W##x4_6_1); \
859 X3 = RotL_##W(X3, R_##W##x4_7_0); \
862 X1 = RotL_##W(X1, R_##W##x4_7_1); \
876 X1 = RotL_##W(X1, R_##W##x4_0_0); \
879 X3 = RotL_##W(X3, R_##W##x4_0_1); \
884 X3 = RotL_##W(X3, R_##W##x4_1_0); \
887 X1 = RotL_##W(X1, R_##W##x4_1_1); \
892 X1 = RotL_##W(X1, R_##W##x4_2_0); \
895 X3 = RotL_##W(X3, R_##W##x4_2_1); \
900 X3 = RotL_##W(X3, R_##W##x4_3_0); \
903 X1 = RotL_##W(X1, R_##W##x4_3_1); \
917 X1 = RotL_##W(X1, R_##W##x4_4_0); \
920 X3 = RotL_##W(X3, R_##W##x4_4_1); \
925 X3 = RotL_##W(X3, R_##W##x4_5_0); \
928 X1 = RotL_##W(X1, R_##W##x4_5_1); \
933 X1 = RotL_##W(X1, R_##W##x4_6_0); \
936 X3 = RotL_##W(X3, R_##W##x4_6_1); \
941 X3 = RotL_##W(X3, R_##W##x4_7_0); \
944 X1 = RotL_##W(X1, R_##W##x4_7_1); \
958 X1 = RotL_##W(X1, R_##W##x4_0_0); \
961 X3 = RotL_##W(X3, R_##W##x4_0_1); \
966 X3 = RotL_##W(X3, R_##W##x4_1_0); \
969 X1 = RotL_##W(X1, R_##W##x4_1_1); \
974 X1 = RotL_##W(X1, R_##W##x4_2_0); \
977 X3 = RotL_##W(X3, R_##W##x4_2_1); \
982 X3 = RotL_##W(X3, R_##W##x4_3_0); \
985 X1 = RotL_##W(X1, R_##W##x4_3_1); \
999 X1 = RotL_##W(X1, R_##W##x4_4_0); \
1002 X3 = RotL_##W(X3, R_##W##x4_4_1); \
1005 if(Nrounds > 53) { \
1007 X3 = RotL_##W(X3, R_##W##x4_5_0); \
1010 X1 = RotL_##W(X1, R_##W##x4_5_1); \
1013 if(Nrounds > 54) { \
1015 X1 = RotL_##W(X1, R_##W##x4_6_0); \
1018 X3 = RotL_##W(X3, R_##W##x4_6_1); \
1021 if(Nrounds > 55) { \
1023 X3 = RotL_##W(X3, R_##W##x4_7_0); \
1026 X1 = RotL_##W(X1, R_##W##x4_7_1); \
1029 if(Nrounds > 55) { \
1038 if(Nrounds > 56) { \
1040 X1 = RotL_##W(X1, R_##W##x4_0_0); \
1043 X3 = RotL_##W(X3, R_##W##x4_0_1); \
1046 if(Nrounds > 57) { \
1048 X3 = RotL_##W(X3, R_##W##x4_1_0); \
1051 X1 = RotL_##W(X1, R_##W##x4_1_1); \
1054 if(Nrounds > 58) { \
1056 X1 = RotL_##W(X1, R_##W##x4_2_0); \
1059 X3 = RotL_##W(X3, R_##W##x4_2_1); \
1062 if(Nrounds > 59) { \
1064 X3 = RotL_##W(X3, R_##W##x4_3_0); \
1067 X1 = RotL_##W(X1, R_##W##x4_3_1); \
1070 if(Nrounds > 59) { \
1079 if(Nrounds > 60) { \
1081 X1 = RotL_##W(X1, R_##W##x4_4_0); \
1084 X3 = RotL_##W(X3, R_##W##x4_4_1); \
1087 if(Nrounds > 61) { \
1089 X3 = RotL_##W(X3, R_##W##x4_5_0); \
1092 X1 = RotL_##W(X1, R_##W##x4_5_1); \
1095 if(Nrounds > 62) { \
1097 X1 = RotL_##W(X1, R_##W##x4_6_0); \
1100 X3 = RotL_##W(X3, R_##W##x4_6_1); \
1103 if(Nrounds > 63) { \
1105 X3 = RotL_##W(X3, R_##W##x4_7_0); \
1108 X1 = RotL_##W(X1, R_##W##x4_7_1); \
1111 if(Nrounds > 63) { \
1120 if(Nrounds > 64) { \
1122 X1 = RotL_##W(X1, R_##W##x4_0_0); \
1125 X3 = RotL_##W(X3, R_##W##x4_0_1); \
1128 if(Nrounds > 65) { \
1130 X3 = RotL_##W(X3, R_##W##x4_1_0); \
1133 X1 = RotL_##W(X1, R_##W##x4_1_1); \
1136 if(Nrounds > 66) { \
1138 X1 = RotL_##W(X1, R_##W##x4_2_0); \
1141 X3 = RotL_##W(X3, R_##W##x4_2_1); \
1144 if(Nrounds > 67) { \
1146 X3 = RotL_##W(X3, R_##W##x4_3_0); \
1149 X1 = RotL_##W(X1, R_##W##x4_3_1); \
1152 if(Nrounds > 67) { \
1161 if(Nrounds > 68) { \
1163 X1 = RotL_##W(X1, R_##W##x4_4_0); \
1166 X3 = RotL_##W(X3, R_##W##x4_4_1); \
1169 if(Nrounds > 69) { \
1171 X3 = RotL_##W(X3, R_##W##x4_5_0); \
1174 X1 = RotL_##W(X1, R_##W##x4_5_1); \
1177 if(Nrounds > 70) { \
1179 X1 = RotL_##W(X1, R_##W##x4_6_0); \
1182 X3 = RotL_##W(X3, R_##W##x4_6_1); \
1185 if(Nrounds > 71) { \
1187 X3 = RotL_##W(X3, R_##W##x4_7_0); \
1190 X1 = RotL_##W(X1, R_##W##x4_7_1); \
1193 if(Nrounds > 71) { \
1202 threefry4x##W##_ctr_t ret = {{X0, X1, X2, X3}}; \
1207 enum r123_enum_threefry4x##W{threefry4x##W##_rounds = THREEFRY4x##W##_DEFAULT_ROUNDS}; \
1208 R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \
1209 R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k) { return threefry4x##W##_R(threefry4x##W##_rounds, in, k); }
/*
 * Function-like macro forms of the default-round entry points. Each expands
 * to the corresponding threefry<N>x<W>_R call with the threefry<N>x<W>_rounds
 * enum constant as the round count; c is the counter and k is the key.
 *
 * These names match the inline wrapper functions that the _threefry2x_tpl /
 * _threefry4x_tpl expansions define, so after this point a call spelled
 * threefry2x32(c, k) (and likewise for the other three) goes through the
 * macro instead.
 * NOTE(review): the reason for shadowing the functions is not stated in this
 * excerpt -- confirm against the complete header.
 */
1218#define threefry2x32(c, k) threefry2x32_R(threefry2x32_rounds, c, k)
1219#define threefry4x32(c, k) threefry4x32_R(threefry4x32_rounds, c, k)
1220#define threefry2x64(c, k) threefry2x64_R(threefry2x64_rounds, c, k)
1221#define threefry4x64(c, k) threefry4x64_R(threefry4x64_rounds, c, k)
/*
 * C++-only functor wrappers.
 *
 * _threefryNxWclass_tpl(NxW) declares the class template
 * Threefry<NxW>_R<ROUNDS> with:
 *   - ctr_type  = threefry<NxW>_ctr_t
 *   - key_type  = threefry<NxW>_key_t
 *   - ukey_type = threefry<NxW>_key_t (typedef'd from _key_t, not _ukey_t;
 *     the round templates make both aliases of the same r123array struct)
 *   - rounds    = ROUNDS, as a static const member
 *   - operator()(ctr, key): statically asserts ROUNDS <= 72, then returns
 *     threefry<NxW>_R(ROUNDS, ctr, key)
 * It also typedefs Threefry<NxW> as the instantiation that uses the default
 * round count threefry<NxW>_rounds.
 *
 * The macro is expanded for 2x32, 4x32, 2x64 and 4x64.
 *
 * NOTE(review): the static assert permits up to 72 rounds for every variant,
 * but the two-word C function asserts Nrounds <= 32 at run time. For the 2x
 * classes, a ROUNDS value in 33..72 passes the compile-time check and is only
 * rejected by R123_ASSERT -- confirm whether that is intended.
 * NOTE(review): the closing braces of operator() and the struct, and the
 * #endif for the __cplusplus guard, are not visible in this excerpt.
 */
1223#if defined(__cplusplus)
1224#define _threefryNxWclass_tpl(NxW) \
1226 template<unsigned int ROUNDS> struct Threefry##NxW##_R { \
1227 typedef threefry##NxW##_ctr_t ctr_type; \
1228 typedef threefry##NxW##_key_t key_type; \
1229 typedef threefry##NxW##_key_t ukey_type; \
1230 static const R123_METAL_CONSTANT_ADDRESS_SPACE unsigned int rounds = ROUNDS; \
1231 inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key)) { \
1232 R123_STATIC_ASSERT(ROUNDS <= 72, "threefry is only unrolled up to 72 rounds\n"); \
1233 return threefry##NxW##_R(ROUNDS, ctr, key); \
1236 typedef Threefry##NxW##_R<threefry##NxW##_rounds> Threefry##NxW; \
1239 _threefryNxWclass_tpl(2x32) _threefryNxWclass_tpl(4x32)
1241 _threefryNxWclass_tpl(2x64) _threefryNxWclass_tpl(4x64)
/*
 * Empty stub definitions: each of these macros expands to nothing.
 * With R123_FORCE_INLINE(decl) empty, a declaration wrapped in it disappears
 * from the expansion.
 *
 * NOTE(review): _threefry2x_tpl and _threefry4x_tpl are also defined with
 * full bodies earlier in this file. Redefining a macro with a different
 * replacement list and no intervening #undef is diagnosed by C compilers.
 * Presumably these stubs sit inside a conditional branch (for example a
 * documentation-only build) whose #if/#endif is not visible in this
 * excerpt -- confirm against the complete header.
 */
#define R123_STATIC_INLINE
#define R123_FORCE_INLINE(decl)
#define _threefry4x_tpl(W)
#define _threefry2x_tpl(W)