/*
 * Copyright (c) 2013, Alexey Degtyarev <[email protected]>.
 * Implementation fixed based on php-stribog:
 * Copyright (c) 2013 Vladimir Kolesnikov.
 * SPDX-License-Identifier: BSD-2-Clause AND MIT
 * Copyright (c) 2021 Vitaly Chikunov <[email protected]>.
 * All rights reserved.
 *
 * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0+
 */

#include "gosthash2012.h"
#ifdef __GOST3411_HAS_MMX__

#include <mmintrin.h>

#define XLPS XLPS32

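/* XOR two 512-bit blocks word by word: z = x ^ y. */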
#define X(x, y, z) { \
    z->QWORD[0] = x->QWORD[0] ^ y->QWORD[0]; \
    z->QWORD[1] = x->QWORD[1] ^ y->QWORD[1]; \
    z->QWORD[2] = x->QWORD[2] ^ y->QWORD[2]; \
    z->QWORD[3] = x->QWORD[3] ^ y->QWORD[3]; \
    z->QWORD[4] = x->QWORD[4] ^ y->QWORD[4]; \
    z->QWORD[5] = x->QWORD[5] ^ y->QWORD[5]; \
    z->QWORD[6] = x->QWORD[6] ^ y->QWORD[6]; \
    z->QWORD[7] = x->QWORD[7] ^ y->QWORD[7]; \
}

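/*
 * Load the 512-bit blocks x and y into eight 64-bit MMX registers,
 * XORed pairwise (the X transform performed in registers).
 */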
#define XLOAD(x, y, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7) { \
    const __m64 *px = (const __m64 *) &x[0]; \
    const __m64 *py = (const __m64 *) &y[0]; \
    mm0 = _mm_xor_si64(px[0], py[0]); \
    mm1 = _mm_xor_si64(px[1], py[1]); \
    mm2 = _mm_xor_si64(px[2], py[2]); \
    mm3 = _mm_xor_si64(px[3], py[3]); \
    mm4 = _mm_xor_si64(px[4], py[4]); \
    mm5 = _mm_xor_si64(px[5], py[5]); \
    mm6 = _mm_xor_si64(px[6], py[6]); \
    mm7 = _mm_xor_si64(px[7], py[7]); \
}

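/* Write the eight MMX registers back into the uint512_u pointed to by P. */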
#define STORE(P, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7) { \
    unsigned long long *__m64p = &P->QWORD[0]; \
    __m64p[0] = (unsigned long long)(mm0); \
    __m64p[1] = (unsigned long long)(mm1); \
    __m64p[2] = (unsigned long long)(mm2); \
    __m64p[3] = (unsigned long long)(mm3); \
    __m64p[4] = (unsigned long long)(mm4); \
    __m64p[5] = (unsigned long long)(mm5); \
    __m64p[6] = (unsigned long long)(mm6); \
    __m64p[7] = (unsigned long long)(mm7); \
}

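/*
 * Transpose the 8x8 byte matrix held in mm0..mm7 using byte and dword
 * unpacks; this realizes the P (byte permutation) step of the LPS
 * transform, so the table lookups below can read bytes sequentially.
 */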
#define TRANSPOSE(mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7) { \
    __m64 tm0, tm1, tm2, tm3, tm4, tm5, tm6, tm7; \
    tm0 = _mm_unpacklo_pi8(mm0, mm2); \
    tm1 = _mm_unpackhi_pi8(mm0, mm2); \
    tm2 = _mm_unpacklo_pi8(mm1, mm3); \
    tm3 = _mm_unpackhi_pi8(mm1, mm3); \
    tm4 = _mm_unpacklo_pi8(mm4, mm6); \
    tm5 = _mm_unpackhi_pi8(mm4, mm6); \
    tm6 = _mm_unpacklo_pi8(mm5, mm7); \
    tm7 = _mm_unpackhi_pi8(mm5, mm7); \
    \
    mm0 = _mm_unpacklo_pi8(tm0, tm2); \
    mm1 = _mm_unpackhi_pi8(tm0, tm2); \
    mm2 = _mm_unpacklo_pi8(tm1, tm3); \
    mm3 = _mm_unpackhi_pi8(tm1, tm3); \
    mm4 = _mm_unpacklo_pi8(tm4, tm6); \
    mm5 = _mm_unpackhi_pi8(tm4, tm6); \
    mm6 = _mm_unpacklo_pi8(tm5, tm7); \
    mm7 = _mm_unpackhi_pi8(tm5, tm7); \
    \
    tm2 = _mm_unpacklo_pi32(mm1, mm5); \
    tm3 = _mm_unpackhi_pi32(mm1, mm5); \
    tm0 = _mm_unpacklo_pi32(mm0, mm4); \
    tm1 = _mm_unpackhi_pi32(mm0, mm4); \
    mm4 = _mm_unpacklo_pi32(mm2, mm6); \
    mm5 = _mm_unpackhi_pi32(mm2, mm6); \
    mm6 = _mm_unpacklo_pi32(mm3, mm7); \
    mm7 = _mm_unpackhi_pi32(mm3, mm7); \
    mm0 = tm0; \
    mm1 = tm1; \
    mm2 = tm2; \
    mm3 = tm3; \
}

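/* Combined X and P: z = transpose(x ^ y), computed entirely in MMX registers. */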
#define XTRANSPOSE(x, y, z) { \
    __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; \
    XLOAD(x, y, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7); \
    TRANSPOSE(mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7); \
    STORE(z, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7); \
}

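/*
 * XLPS32: the full X-S-P-L transform using 64-bit MMX operations.
 * After XTRANSPOSE, each output word is the XOR of eight Ax[][] table
 * entries, which precombine the S-box substitution (S) with the linear
 * transform (L).
 */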
#define XLPS32(x, y, data) { \
    unsigned int xi; \
    unsigned char *p; \
    ALIGN(16) union uint512_u buf; \
    XTRANSPOSE(x, y, (&buf)); \
    p = (unsigned char *) &buf; \
    for (xi = 0; xi < 8; xi++) \
    { \
        __m64 mm0 = (__m64)(Ax[0][*(p++)]); \
        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[1][*(p++)])); \
        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[2][*(p++)])); \
        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[3][*(p++)])); \
        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[4][*(p++)])); \
        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[5][*(p++)])); \
        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[6][*(p++)])); \
        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[7][*(p++)])); \
        data->QWORD[xi] = (unsigned long long) mm0; \
    } \
}

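/*
 * One round of E(): derive the next round key Ki = LPS(Ki ^ C[i]) and
 * apply it to the state: data = LPS(Ki ^ data).
 */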
#define ROUND(i, Ki, data) { \
    XLPS(Ki, (&C[i]), Ki); \
    XLPS(Ki, data, data); \
}

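/*
 * MMX variant of the compression function g_N(h, m):
 *   g = E(LPS(h ^ N), m) ^ h ^ m
 * _mm_empty() (EMMS) is issued at the end so the x87 FPU can be used
 * again after the MMX registers have been touched.
 */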
void g_mmx(union uint512_u *h, const union uint512_u * RESTRICT N,
           const union uint512_u * RESTRICT m)
{
    union uint512_u Ki, data;
    unsigned int i;

    XLPS(h, N, (&data));

    /* Starting E() */
    Ki = data;
    XLPS((&Ki), ((const union uint512_u *) &m[0]), (&data));

    for (i = 0; i < 11; i++)
        ROUND(i, (&Ki), (&data));

    XLPS((&Ki), (&C[11]), (&Ki));
    X((&Ki), (&data), (&data));
    /* E() done */

    X((&data), h, (&data));
    X((&data), m, h);
    _mm_empty();
}
#endif /* __GOST3411_HAS_MMX__ */