Skip to content

Commit 0f1ec96

Browse files
committed
gosthash2012: Import and merge MMX implementations
Merged and fixed two MMX implementations. For example, [1] uses SSE2 register types (`__m128i'), and [2] uses `_mm_cvtsi64_m64', which GCC's `mmintrin.h' defines only for `__x86_64__', but we need MMX specifically for IA-32, since x86_64 already has SSE2 in its baseline. Link: https://github.com/adegtyarev/streebog Link: https://github.com/sjinks/php-stribog Signed-off-by: Vitaly Chikunov <[email protected]>
1 parent 767c693 commit 0f1ec96

File tree

3 files changed

+151
-0
lines changed

3 files changed

+151
-0
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ set(GOST_HASH_2012_SOURCE_FILES
127127
gosthash2012_const.h
128128
gosthash2012_precalc.h
129129
gosthash2012_ref.c
130+
gosthash2012_mmx.c
130131
gosthash2012_sse2.c
131132
gosthash2012_sse41.c
132133
)

gosthash2012.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#define __GOST3411_HAS_REF__
1515

1616
#if defined __x86_64__ || defined __i386__
17+
# define __GOST3411_HAS_MMX__
1718
# define __GOST3411_HAS_SSE2__
1819
# define __GOST3411_HAS_SSE41__
1920
#elif defined __SSE2__
@@ -42,6 +43,7 @@
4243
* be disabled with -mno-sse2.
4344
*/
4445
# undef __GOST3411_HAS_REF__
46+
# undef __GOST3411_HAS_MMX__
4547
# endif
4648
#endif
4749

@@ -108,6 +110,11 @@ _internal
108110
void g_ref(union uint512_u *h, const union uint512_u * RESTRICT N,
109111
const union uint512_u * RESTRICT m);
110112
#endif
113+
#ifdef __GOST3411_HAS_MMX__
114+
_internal _target("mmx")
115+
void g_mmx(union uint512_u *h, const union uint512_u * RESTRICT N,
116+
const union uint512_u * RESTRICT m);
117+
#endif
111118
#ifdef __GOST3411_HAS_SSE2__
112119
_internal _target("sse2")
113120
void g_sse2(union uint512_u *h, const union uint512_u * RESTRICT N,

gosthash2012_mmx.c

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
/*
2+
* Copyright (c) 2013, Alexey Degtyarev <[email protected]>.
3+
* Implementation fixed based on php-stribog:
4+
* Copyright (c) 2013 Vladimir Kolesnikov.
5+
* SPDX-License-Identifier: BSD-2-Clause AND MIT
6+
* Copyright (c) 2021 Vitaly Chikunov <[email protected]>.
7+
* All rights reserved.
8+
*
9+
* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0+
10+
*/
11+
12+
#include "gosthash2012.h"
13+
#ifdef __GOST3411_HAS_MMX__
14+
15+
#include <mmintrin.h>
16+
17+
/* Bind the generic XLPS name (used by ROUND and g_mmx below) to XLPS32. */
#define XLPS XLPS32
18+
19+
/*
 * X(x, y, z) - XOR two 512-bit blocks word by word: z = x ^ y.
 *
 * x, y and z are pointers to objects exposing a QWORD[8] member.  z may be
 * the same object as x or y: each word is read before the matching word is
 * written.
 *
 * The body is wrapped in do { } while (0) so that the invocation together
 * with its trailing semicolon forms exactly one statement (safe as an
 * unbraced if/else or loop body), and every argument is parenthesized so
 * that expressions such as &blk expand correctly.  Each argument is
 * evaluated eight times; pass expressions without side effects.
 */
#define X(x, y, z) do { \
    (z)->QWORD[0] = (x)->QWORD[0] ^ (y)->QWORD[0]; \
    (z)->QWORD[1] = (x)->QWORD[1] ^ (y)->QWORD[1]; \
    (z)->QWORD[2] = (x)->QWORD[2] ^ (y)->QWORD[2]; \
    (z)->QWORD[3] = (x)->QWORD[3] ^ (y)->QWORD[3]; \
    (z)->QWORD[4] = (x)->QWORD[4] ^ (y)->QWORD[4]; \
    (z)->QWORD[5] = (x)->QWORD[5] ^ (y)->QWORD[5]; \
    (z)->QWORD[6] = (x)->QWORD[6] ^ (y)->QWORD[6]; \
    (z)->QWORD[7] = (x)->QWORD[7] ^ (y)->QWORD[7]; \
} while (0)
29+
30+
/*
 * XLOAD(x, y, mm0..mm7) - load x ^ y into eight __m64 variables.
 *
 * x and y are indexable with [0]; the 64 bytes starting at &x[0] and &y[0]
 * are read as eight __m64 values each, XORed pairwise with _mm_xor_si64,
 * and assigned to the lvalues mm0..mm7 (mmK receives word K).
 *
 * Wrapped in do { } while (0) so the invocation is a single statement, with
 * parenthesized arguments and macro-specific temporary names so that the
 * temporaries cannot capture an argument that happens to be called px/py.
 * Uses MMX state: the caller is responsible for _mm_empty() afterwards.
 */
#define XLOAD(x, y, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7) do { \
    const __m64 *xload_px_ = (const __m64 *) &(x)[0]; \
    const __m64 *xload_py_ = (const __m64 *) &(y)[0]; \
    (mm0) = _mm_xor_si64(xload_px_[0], xload_py_[0]); \
    (mm1) = _mm_xor_si64(xload_px_[1], xload_py_[1]); \
    (mm2) = _mm_xor_si64(xload_px_[2], xload_py_[2]); \
    (mm3) = _mm_xor_si64(xload_px_[3], xload_py_[3]); \
    (mm4) = _mm_xor_si64(xload_px_[4], xload_py_[4]); \
    (mm5) = _mm_xor_si64(xload_px_[5], xload_py_[5]); \
    (mm6) = _mm_xor_si64(xload_px_[6], xload_py_[6]); \
    (mm7) = _mm_xor_si64(xload_px_[7], xload_py_[7]); \
} while (0)
42+
43+
/*
 * STORE(P, mm0..mm7) - write eight __m64 values to P->QWORD[0..7].
 *
 * P is a pointer to an object with a QWORD[8] member of 64-bit words; mmK
 * is stored to word K.  The __m64 -> integer conversion is done with a
 * plain cast (a same-size vector/integer cast, as the original code does)
 * rather than an intrinsic.
 *
 * Wrapped in do { } while (0) so the invocation is a single statement; P is
 * parenthesized, and the temporary no longer uses a reserved "__" name.
 */
#define STORE(P, mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7) do { \
    unsigned long long *store_dst_ = &(P)->QWORD[0]; \
    store_dst_[0] = (unsigned long long)(mm0); \
    store_dst_[1] = (unsigned long long)(mm1); \
    store_dst_[2] = (unsigned long long)(mm2); \
    store_dst_[3] = (unsigned long long)(mm3); \
    store_dst_[4] = (unsigned long long)(mm4); \
    store_dst_[5] = (unsigned long long)(mm5); \
    store_dst_[6] = (unsigned long long)(mm6); \
    store_dst_[7] = (unsigned long long)(mm7); \
} while (0)
54+
55+
/*
 * TRANSPOSE(mm0..mm7) - in-place 8x8 byte-matrix transpose.
 *
 * Treats the eight __m64 lvalues as the rows of an 8x8 byte matrix (byte 0
 * being the least significant byte of each word) and replaces them with the
 * rows of the transposed matrix: on exit, byte r of mmC equals byte C of
 * the original mmR.  Done in three interleave stages (two byte-wise, one
 * dword-wise).
 *
 * Wrapped in do { } while (0) so the invocation is a single statement.
 * The temporaries are named tm0..tm7; do not pass arguments with those
 * names.  Uses MMX state: the caller must issue _mm_empty() afterwards.
 */
#define TRANSPOSE(mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7) do { \
    __m64 tm0, tm1, tm2, tm3, tm4, tm5, tm6, tm7; \
    /* Stage 1: interleave bytes of rows two apart. */ \
    tm0 = _mm_unpacklo_pi8(mm0, mm2); \
    tm1 = _mm_unpackhi_pi8(mm0, mm2); \
    tm2 = _mm_unpacklo_pi8(mm1, mm3); \
    tm3 = _mm_unpackhi_pi8(mm1, mm3); \
    tm4 = _mm_unpacklo_pi8(mm4, mm6); \
    tm5 = _mm_unpackhi_pi8(mm4, mm6); \
    tm6 = _mm_unpacklo_pi8(mm5, mm7); \
    tm7 = _mm_unpackhi_pi8(mm5, mm7); \
    \
    /* Stage 2: interleave again; each word now holds two 4-byte column halves. */ \
    (mm0) = _mm_unpacklo_pi8(tm0, tm2); \
    (mm1) = _mm_unpackhi_pi8(tm0, tm2); \
    (mm2) = _mm_unpacklo_pi8(tm1, tm3); \
    (mm3) = _mm_unpackhi_pi8(tm1, tm3); \
    (mm4) = _mm_unpacklo_pi8(tm4, tm6); \
    (mm5) = _mm_unpackhi_pi8(tm4, tm6); \
    (mm6) = _mm_unpacklo_pi8(tm5, tm7); \
    (mm7) = _mm_unpackhi_pi8(tm5, tm7); \
    \
    /* Stage 3: join upper-row and lower-row halves into full columns. */ \
    tm2 = _mm_unpacklo_pi32(mm1, mm5); \
    tm3 = _mm_unpackhi_pi32(mm1, mm5); \
    tm0 = _mm_unpacklo_pi32(mm0, mm4); \
    tm1 = _mm_unpackhi_pi32(mm0, mm4); \
    (mm4) = _mm_unpacklo_pi32(mm2, mm6); \
    (mm5) = _mm_unpackhi_pi32(mm2, mm6); \
    (mm6) = _mm_unpacklo_pi32(mm3, mm7); \
    (mm7) = _mm_unpackhi_pi32(mm3, mm7); \
    (mm0) = tm0; \
    (mm1) = tm1; \
    (mm2) = tm2; \
    (mm3) = tm3; \
} while (0)
88+
89+
/*
 * XTRANSPOSE(x, y, z) - z = transpose(x ^ y).
 *
 * Loads x ^ y into eight __m64 temporaries (XLOAD), transposes them as an
 * 8x8 byte matrix (TRANSPOSE) and stores the result through z (STORE).
 * x and y are indexable 64-byte inputs; z points to an object with a
 * QWORD[8] member.
 *
 * Wrapped in do { } while (0) so the invocation is a single statement, and
 * the arguments are forwarded parenthesized so the helper macros see them
 * as single primary expressions.
 */
#define XTRANSPOSE(x, y, z) do { \
    __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; \
    XLOAD((x), (y), mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7); \
    TRANSPOSE(mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7); \
    STORE((z), mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7); \
} while (0)
95+
/*
 * XLPS32(x, y, data) - combined XOR + table-lookup step for 32-bit x86/MMX.
 *
 * 1. buf = transpose(x ^ y), treating the 64 bytes as an 8x8 byte matrix.
 * 2. For each of the eight 8-byte rows of buf, output word xi is the XOR of
 *    Ax[k][row byte k] for k = 0..7, accumulated in an MMX register.
 *
 * x and y are 64-byte inputs; data points to an object with a QWORD[8]
 * member.  data may be the same object as x or y, because both inputs are
 * fully consumed into buf before the first word of data is written (ROUND
 * relies on this).
 *
 * Ax is defined outside this file view; presumably the precomputed lookup
 * table from gosthash2012_precalc.h -- confirm.
 *
 * Wrapped in do { } while (0) so the invocation is a single statement, with
 * data parenthesized.  The temporaries are named xi, p, buf and mm0; do not
 * pass arguments that use those names.  Leaves MMX state live: the caller
 * must issue _mm_empty() before returning to floating-point code.
 */
#define XLPS32(x, y, data) do { \
    unsigned int xi; \
    unsigned char *p; \
    ALIGN(16) union uint512_u buf; \
    XTRANSPOSE((x), (y), (&buf)); \
    p = (unsigned char *) &buf; \
    for (xi = 0; xi < 8; xi++) \
    { \
        __m64 mm0 = (__m64)(Ax[0][*(p++)]); \
        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[1][*(p++)])); \
        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[2][*(p++)])); \
        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[3][*(p++)])); \
        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[4][*(p++)])); \
        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[5][*(p++)])); \
        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[6][*(p++)])); \
        mm0 = _mm_xor_si64(mm0, (__m64)(Ax[7][*(p++)])); \
        (data)->QWORD[xi] = (unsigned long long) mm0; \
    } \
} while (0)
114+
115+
/*
 * ROUND(i, Ki, data) - one round of the E() loop.
 *
 * First advances the round key in place, Ki = XLPS(Ki, C[i]), then applies
 * the new key to the state, data = XLPS(Ki, data).  Ki and data are
 * pointers to 512-bit blocks; i indexes the constant table C, which is
 * defined outside this file view.
 *
 * Wrapped in do { } while (0) so that the invocation plus its trailing
 * semicolon is exactly one statement; g_mmx uses this macro as the unbraced
 * body of a for loop.  Arguments are evaluated more than once.
 */
#define ROUND(i, Ki, data) do { \
    XLPS((Ki), (&C[(i)]), (Ki)); \
    XLPS((Ki), (data), (data)); \
} while (0)
119+
120+
/*
 * g_mmx - MMX implementation of the compression step.
 *
 * Updates h in place with
 *
 *     h = E(XLPS(h, N), m) ^ h ^ m
 *
 * where E() starts from key K = XLPS(h, N) and state XLPS(K, m), runs
 * eleven ROUND() iterations with constants C[0..10], derives one last key
 * from C[11] and XORs it into the state.
 *
 * h: chaining value, read and overwritten with the result.
 * N: block that is XORed with h to form the initial key (read only).
 * m: message block (read only).
 *
 * The XLPS macro leaves MMX state live, so _mm_empty() is issued before
 * returning.
 */
void g_mmx(union uint512_u *h, const union uint512_u * RESTRICT N,
           const union uint512_u * RESTRICT m)
{
    union uint512_u key, state;
    unsigned int rnd = 0;

    /* Initial key: K = XLPS(h, N). */
    XLPS(h, N, (&key));

    /* E(): the first step mixes the message block into the state. */
    XLPS((&key), m, (&state));

    while (rnd < 11) {
        ROUND(rnd, (&key), (&state));
        rnd++;
    }

    /* Final key from C[11], XORed into the state; E() is done after this. */
    XLPS((&key), (&C[11]), (&key));
    X((&key), (&state), (&state));

    /* Feed-forward: h = E(...) ^ h ^ m. */
    X((&state), h, (&state));
    X((&state), m, h);

    /* Release the MMX register state before returning. */
    _mm_empty();
}
143+
#endif /* __GOST3411_HAS_MMX__ */

0 commit comments

Comments
 (0)