11#if defined(__x86_64__) || defined(__amd64__)
/*
 * Transform: hand-written x86-64 inline-assembly routine.
 *
 * Parameters (all three are handed to the asm statement as read-write
 * register operands, so the assembly is free to advance/consume them):
 *   s      - pointer to an array of 32-bit words (operand %0)
 *   chunk  - pointer to the input bytes (operand %1)
 *   blocks - count supplied by the caller (operand %2)
 *
 * NOTE(review): the K256 table and the shift counts used below are consistent
 * with a SHA-256 compression function -- confirm against the complete file.
 * NOTE(review): as it stands here, the text has no opening brace, no asm
 * statement head and no closing tokens, and instructions are missing between
 * the visible ones; the comments below describe only what is visible.
 */
15void Transform(uint32_t*
s,
const unsigned char* chunk,
size_t blocks)
16#if defined(__clang__) && !defined(__OPTIMIZE__)
/* Unoptimized clang builds only: exclude this function from
 * AddressSanitizer instrumentation. */
21 __attribute__((no_sanitize(
"address")))
/* 64 32-bit constants, 16-byte aligned. The values correspond to the SHA-256
 * round-constant table. Passed to the asm as memory input operand %17. */
24 static const uint32_t K256
alignas(16) [] = {
25 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
26 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
27 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
28 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
29 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
30 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
31 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
32 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
33 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
34 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
35 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
36 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
37 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
38 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
39 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
40 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
/* 16-byte shuffle-control constants, passed to the asm as memory input
 * operands %18, %19 and %20.
 * NOTE(review): presumably loaded into xmm12, xmm10 and xmm11 respectively
 * for the pshufb instructions below; those loads are not visible here. */
42 static const uint32_t FLIP_MASK
alignas(16) [] = {0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f};
43 static const uint32_t SHUF_00BA
alignas(16) [] = {0x03020100, 0x0b0a0908, 0xffffffff, 0xffffffff};
44 static const uint32_t SHUF_DC00
alignas(16) [] = {0xffffffff, 0xffffffff, 0x03020100, 0x0b0a0908};
/* 32-bit scratch values; each is a write-only ("=r") register operand of
 * the asm statement. */
45 uint32_t a, b, c, d, f,
g, h, y0, y1, y2;
/* Read-write memory ("+m") operands of the asm statement. */
47 uint64_t inp_end, inp;
/* 16-byte-aligned buffer of four 32-bit words, also a "+m" operand. */
48 uint32_t xfer
alignas(16) [4];
/* Input load: 16-byte unaligned loads from chunk (%1) at offsets 0x10, 0x20
 * and 0x30 into xmm5..xmm7, each followed by a byte shuffle with the mask
 * held in xmm12. xmm4 is shuffled the same way; its load is not visible. */
70 "pshufb %%xmm12,%%xmm4;"
71 "movdqu 0x10(%1),%%xmm5;"
72 "pshufb %%xmm12,%%xmm5;"
73 "movdqu 0x20(%1),%%xmm6;"
74 "pshufb %%xmm12,%%xmm6;"
75 "movdqu 0x30(%1),%%xmm7;"
76 "pshufb %%xmm12,%%xmm7;"
/* Group 1 of 4: xmm9 = 16 bytes at offset 0x0 from %13 (tbl) + xmm4, then a
 * new 4-word value is built from xmm4..xmm7 via palignr/shift/xor steps and
 * written back into xmm4. xmm0-xmm3 and xmm8 are temporaries.
 * NOTE(review): the shift counts match SHA-256 message-schedule sigma
 * functions, but part of the sequence is missing here -- confirm. */
81 "movdqa 0x0(%13),%%xmm9;"
82 "paddd %%xmm4,%%xmm9;"
84 "movdqa %%xmm7,%%xmm0;"
88 "palignr $0x4,%%xmm6,%%xmm0;"
93 "movdqa %%xmm5,%%xmm1;"
96 "paddd %%xmm4,%%xmm0;"
100 "palignr $0x4,%%xmm4,%%xmm1;"
104 "movdqa %%xmm1,%%xmm2;"
108 "movdqa %%xmm1,%%xmm3;"
112 "pslld $0x19,%%xmm1;"
122 "movdqa %%xmm3,%%xmm2;"
125 "movdqa %%xmm3,%%xmm8;"
134 "psrld $0x12,%%xmm2;"
139 "pxor %%xmm3,%%xmm1;"
146 "pxor %%xmm2,%%xmm1;"
150 "pxor %%xmm8,%%xmm1;"
154 "pshufd $0xfa,%%xmm7,%%xmm2;"
157 "paddd %%xmm1,%%xmm0;"
160 "movdqa %%xmm2,%%xmm3;"
164 "movdqa %%xmm2,%%xmm8;"
170 "psrlq $0x11,%%xmm2;"
172 "psrlq $0x13,%%xmm3;"
180 "pxor %%xmm3,%%xmm2;"
184 "pxor %%xmm2,%%xmm8;"
188 "pshufb %%xmm10,%%xmm8;"
192 "paddd %%xmm8,%%xmm0;"
195 "pshufd $0x50,%%xmm0,%%xmm2;"
198 "movdqa %%xmm2,%%xmm3;"
202 "movdqa %%xmm2,%%xmm4;"
207 "psrlq $0x11,%%xmm2;"
210 "psrlq $0x13,%%xmm3;"
218 "pxor %%xmm3,%%xmm2;"
222 "pxor %%xmm2,%%xmm4;"
226 "pshufb %%xmm11,%%xmm4;"
230 "paddd %%xmm0,%%xmm4;"
/* Group 2 of 4: same instruction pattern with the xmm4..xmm7 roles rotated
 * by one; table offset 0x10, xmm9 = table + xmm5, result written to xmm5. */
235 "movdqa 0x10(%13),%%xmm9;"
236 "paddd %%xmm5,%%xmm9;"
238 "movdqa %%xmm4,%%xmm0;"
242 "palignr $0x4,%%xmm7,%%xmm0;"
247 "movdqa %%xmm6,%%xmm1;"
250 "paddd %%xmm5,%%xmm0;"
254 "palignr $0x4,%%xmm5,%%xmm1;"
258 "movdqa %%xmm1,%%xmm2;"
262 "movdqa %%xmm1,%%xmm3;"
266 "pslld $0x19,%%xmm1;"
276 "movdqa %%xmm3,%%xmm2;"
279 "movdqa %%xmm3,%%xmm8;"
288 "psrld $0x12,%%xmm2;"
293 "pxor %%xmm3,%%xmm1;"
300 "pxor %%xmm2,%%xmm1;"
304 "pxor %%xmm8,%%xmm1;"
308 "pshufd $0xfa,%%xmm4,%%xmm2;"
311 "paddd %%xmm1,%%xmm0;"
314 "movdqa %%xmm2,%%xmm3;"
318 "movdqa %%xmm2,%%xmm8;"
324 "psrlq $0x11,%%xmm2;"
326 "psrlq $0x13,%%xmm3;"
334 "pxor %%xmm3,%%xmm2;"
338 "pxor %%xmm2,%%xmm8;"
342 "pshufb %%xmm10,%%xmm8;"
346 "paddd %%xmm8,%%xmm0;"
349 "pshufd $0x50,%%xmm0,%%xmm2;"
352 "movdqa %%xmm2,%%xmm3;"
356 "movdqa %%xmm2,%%xmm5;"
361 "psrlq $0x11,%%xmm2;"
364 "psrlq $0x13,%%xmm3;"
372 "pxor %%xmm3,%%xmm2;"
376 "pxor %%xmm2,%%xmm5;"
380 "pshufb %%xmm11,%%xmm5;"
384 "paddd %%xmm0,%%xmm5;"
/* Group 3 of 4: table offset 0x20, xmm9 = table + xmm6, result written to
 * xmm6. */
389 "movdqa 0x20(%13),%%xmm9;"
390 "paddd %%xmm6,%%xmm9;"
392 "movdqa %%xmm5,%%xmm0;"
396 "palignr $0x4,%%xmm4,%%xmm0;"
401 "movdqa %%xmm7,%%xmm1;"
404 "paddd %%xmm6,%%xmm0;"
408 "palignr $0x4,%%xmm6,%%xmm1;"
412 "movdqa %%xmm1,%%xmm2;"
416 "movdqa %%xmm1,%%xmm3;"
420 "pslld $0x19,%%xmm1;"
430 "movdqa %%xmm3,%%xmm2;"
433 "movdqa %%xmm3,%%xmm8;"
442 "psrld $0x12,%%xmm2;"
447 "pxor %%xmm3,%%xmm1;"
454 "pxor %%xmm2,%%xmm1;"
458 "pxor %%xmm8,%%xmm1;"
462 "pshufd $0xfa,%%xmm5,%%xmm2;"
465 "paddd %%xmm1,%%xmm0;"
468 "movdqa %%xmm2,%%xmm3;"
472 "movdqa %%xmm2,%%xmm8;"
478 "psrlq $0x11,%%xmm2;"
480 "psrlq $0x13,%%xmm3;"
488 "pxor %%xmm3,%%xmm2;"
492 "pxor %%xmm2,%%xmm8;"
496 "pshufb %%xmm10,%%xmm8;"
500 "paddd %%xmm8,%%xmm0;"
503 "pshufd $0x50,%%xmm0,%%xmm2;"
506 "movdqa %%xmm2,%%xmm3;"
510 "movdqa %%xmm2,%%xmm6;"
515 "psrlq $0x11,%%xmm2;"
518 "psrlq $0x13,%%xmm3;"
526 "pxor %%xmm3,%%xmm2;"
530 "pxor %%xmm2,%%xmm6;"
534 "pshufb %%xmm11,%%xmm6;"
538 "paddd %%xmm0,%%xmm6;"
/* Group 4 of 4: table offset 0x30, xmm9 = table + xmm7, result written to
 * xmm7. */
543 "movdqa 0x30(%13),%%xmm9;"
544 "paddd %%xmm7,%%xmm9;"
547 "movdqa %%xmm6,%%xmm0;"
551 "palignr $0x4,%%xmm5,%%xmm0;"
556 "movdqa %%xmm4,%%xmm1;"
559 "paddd %%xmm7,%%xmm0;"
563 "palignr $0x4,%%xmm7,%%xmm1;"
567 "movdqa %%xmm1,%%xmm2;"
571 "movdqa %%xmm1,%%xmm3;"
575 "pslld $0x19,%%xmm1;"
585 "movdqa %%xmm3,%%xmm2;"
588 "movdqa %%xmm3,%%xmm8;"
597 "psrld $0x12,%%xmm2;"
602 "pxor %%xmm3,%%xmm1;"
609 "pxor %%xmm2,%%xmm1;"
613 "pxor %%xmm8,%%xmm1;"
617 "pshufd $0xfa,%%xmm6,%%xmm2;"
620 "paddd %%xmm1,%%xmm0;"
623 "movdqa %%xmm2,%%xmm3;"
627 "movdqa %%xmm2,%%xmm8;"
633 "psrlq $0x11,%%xmm2;"
635 "psrlq $0x13,%%xmm3;"
643 "pxor %%xmm3,%%xmm2;"
647 "pxor %%xmm2,%%xmm8;"
651 "pshufb %%xmm10,%%xmm8;"
655 "paddd %%xmm8,%%xmm0;"
658 "pshufd $0x50,%%xmm0,%%xmm2;"
661 "movdqa %%xmm2,%%xmm3;"
665 "movdqa %%xmm2,%%xmm7;"
670 "psrlq $0x11,%%xmm2;"
673 "psrlq $0x13,%%xmm3;"
681 "pxor %%xmm3,%%xmm2;"
685 "pxor %%xmm2,%%xmm7;"
689 "pshufb %%xmm11,%%xmm7;"
693 "paddd %%xmm0,%%xmm7;"
/* Here the table words at offsets 0x0 and 0x10 from %13 are added directly
 * into xmm4 and xmm5 (no xmm9 copy), and afterwards xmm6/xmm7 are copied
 * into xmm4/xmm5. The instructions between these steps are not visible. */
703 "paddd 0x0(%13),%%xmm4;"
817 "paddd 0x10(%13),%%xmm5;"
932 "movdqa %%xmm6,%%xmm4;"
933 "movdqa %%xmm7,%%xmm5;"
/* Output operands, numbered in order of appearance:
 *   %0 s, %1 chunk, %2 blocks          - read-write registers
 *   %3..%12 a, b, c, d, f, g, h, y0, y1, y2 - write-only registers
 *   %13 tbl                             - write-only register; used above as
 *                                         the base address for table loads
 *   %14 inp_end, %15 inp, %16 xfer     - read-write memory
 * NOTE(review): no declaration of `tbl` is visible in this text. */
959 :
"+r"(
s),
"+r"(chunk),
"+r"(blocks),
"=r"(a),
"=r"(b),
"=r"(c),
"=r"(d),
"=r"(f),
"=r"(
g),
"=r"(h),
"=r"(y0),
"=r"(y1),
"=r"(y2),
"=r"(tbl),
"+m"(inp_end),
"+m"(inp),
"+m"(xfer)
/* Input operands: %17 K256, %18 FLIP_MASK, %19 SHUF_00BA, %20 SHUF_DC00,
 * all passed as memory references. */
960 :
"m"(K256),
"m"(FLIP_MASK),
"m"(SHUF_00BA),
"m"(SHUF_DC00)
/* Clobbers: condition codes, arbitrary memory, and xmm0 through xmm12. */
961 :
"cc",
"memory",
"xmm0",
"xmm1",
"xmm2",
"xmm3",
"xmm4",
"xmm5",
"xmm6",
"xmm7",
"xmm8",
"xmm9",
"xmm10",
"xmm11",
"xmm12"
void Transform(uint32_t *s, const unsigned char *chunk, size_t blocks)