Bitcoin Core 31.99.0
P2P Digital Currency
sha256_sse4.cpp
Go to the documentation of this file.
1// Copyright (c) 2017-present The Bitcoin Core developers
2// Distributed under the MIT software license, see the accompanying
3// file COPYING or http://www.opensource.org/licenses/mit-license.php.
4//
5// This is a translation to GCC extended asm syntax from YASM code by Intel
6// (available at the bottom of this file).
7
8#if defined(__x86_64__) || defined(__amd64__)
9
10#include <cstdint>
11#include <cstdlib>
12
13namespace sha256_sse4
14{
15/*
16Both Clang and GCC fail with ASan on this inline assembly:
17- Clang: compile failure with -O0 or -O2 + -fcf-protection under ASan.
18 See https://github.com/llvm/llvm-project/issues/92182
19 and https://github.com/bitcoin/bitcoin/issues/31913.
20- GCC: runtime SEGV during SHA256AutoDetect()'s self-test under ASan,
21 regardless of optimization level.
22 See https://github.com/bitcoin/bitcoin/issues/34881.
23*/
24#if defined(__SANITIZE_ADDRESS__)
25 __attribute__((no_sanitize("address")))
26#elif defined(__clang__)
27#if __has_feature(address_sanitizer) // fallback can be removed once support for Clang 21 is dropped
28 __attribute__((no_sanitize("address")))
29#endif
30#endif
/**
 * Apply the SHA-256 block transform to `blocks` consecutive 64-byte blocks
 * read from `chunk`, folding the result into the eight 32-bit state words
 * at `s`.
 *
 * @param s      Eight 32-bit state words. Loaded once before the first block;
 *               after every block the working variables are added to the
 *               stored words and written back.
 * @param chunk  Input bytes, 64 consumed per block. Read with unaligned loads
 *               (movdqu).
 * @param blocks Number of 64-byte blocks. When zero the asm jumps straight to
 *               its end label and `s` is not written.
 *
 * The whole body is a single GCC extended-asm statement. Operand numbers used
 * in the template:
 *   %0  = s
 *   %1  = chunk (saved to `inp` and reused as a loop counter within a block)
 *   %2  = blocks (turned into a byte count, then an end pointer that is saved
 *         to `inp_end`; afterwards its 32-bit form %k2 holds working variable e)
 *   %3..%6  = a, b, c, d
 *   %7..%9  = f, g, h
 *   %10..%12 = y0, y1, y2 (per-round scratch)
 *   %13 = tbl (cursor into K256)
 *   %14 = inp_end, %15 = inp, %16 = xfer (memory scratch)
 *   %17 = K256, %18 = FLIP_MASK, %19 = SHUF_00BA, %20 = SHUF_DC00
 * xmm0..xmm12 are used and listed as clobbers.
 */
31void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
32{
 // SHA-256 round constants; read 16 bytes at a time through tbl.
33 static const uint32_t K256 alignas(16) [] = {
34 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
35 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
36 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
37 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
38 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
39 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
40 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
41 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
42 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
43 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
44 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
45 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
46 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
47 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
48 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
49 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
50 };
 // pshufb mask applied to each 16 input bytes right after loading them
 // (byte-swaps every dword, per COPY_XMM_AND_BSWAP in the YASM source below).
51 static const uint32_t FLIP_MASK alignas(16) [] = {0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f};
 // pshufb masks used while building new message words; the YASM source below
 // describes them as "xBxA -> 00BA" and "xDxC -> DC00".
52 static const uint32_t SHUF_00BA alignas(16) [] = {0x03020100, 0x0b0a0908, 0xffffffff, 0xffffffff};
53 static const uint32_t SHUF_DC00 alignas(16) [] = {0xffffffff, 0xffffffff, 0x03020100, 0x0b0a0908};
 // Register outputs of the asm: working variables a..h (e has no variable of
 // its own, it lives in the register of `blocks`) and scratch y0..y2.
54 uint32_t a, b, c, d, f, g, h, y0, y1, y2;
 // Register output: cursor into K256.
55 uint64_t tbl;
 // Memory operands: saved end-of-input pointer and saved current input pointer.
56 uint64_t inp_end, inp;
 // Memory operand: four (constant + message word) sums consumed by the next four rounds.
57 uint32_t xfer alignas(16) [4];
58
59 __asm__ __volatile__(
 // blocks *= 64 (byte count); skip everything when the result is zero.
60 "shl $0x6,%2;"
61 "je Ldone_hash_%=;"
 // blocks = chunk + byte count, i.e. the end-of-input pointer; keep it in inp_end.
62 "add %1,%2;"
63 "mov %2,%14;"
 // Load the eight state words into a, b, c, d, e (%k2), f, g, h.
64 "mov (%0),%3;"
65 "mov 0x4(%0),%4;"
66 "mov 0x8(%0),%5;"
67 "mov 0xc(%0),%6;"
68 "mov 0x10(%0),%k2;"
69 "mov 0x14(%0),%7;"
70 "mov 0x18(%0),%8;"
71 "mov 0x1c(%0),%9;"
 // xmm12 = FLIP_MASK, xmm10 = SHUF_00BA, xmm11 = SHUF_DC00.
72 "movdqa %18,%%xmm12;"
73 "movdqa %19,%%xmm10;"
74 "movdqa %20,%%xmm11;"
75
 // Per-block loop: reset tbl to the start of K256, load the 64 input bytes
 // into xmm4..xmm7 (shuffled with FLIP_MASK), save chunk in inp and reuse
 // its register as a counter of 3 for Lloop1.
76 "Lloop0_%=:"
77 "lea %17,%13;"
78 "movdqu (%1),%%xmm4;"
79 "pshufb %%xmm12,%%xmm4;"
80 "movdqu 0x10(%1),%%xmm5;"
81 "pshufb %%xmm12,%%xmm5;"
82 "movdqu 0x20(%1),%%xmm6;"
83 "pshufb %%xmm12,%%xmm6;"
84 "movdqu 0x30(%1),%%xmm7;"
85 "pshufb %%xmm12,%%xmm7;"
86 "mov %1,%15;"
87 "mov $3,%1;"
88
 // Runs 3 times. Each pass is four groups of four rounds (16 rounds); every
 // group also derives four new message words (FOUR_ROUNDS_AND_SCHED in the
 // YASM source below). tbl advances by 0x40 per pass.
 // Group 1: xfer = K256[tbl + 0x00] + xmm4; new words end up in xmm4.
89 "Lloop1_%=:"
90 "movdqa 0x0(%13),%%xmm9;"
91 "paddd %%xmm4,%%xmm9;"
92 "movdqa %%xmm9,%16;"
93 "movdqa %%xmm7,%%xmm0;"
94 "mov %k2,%10;"
95 "ror $0xe,%10;"
96 "mov %3,%11;"
97 "palignr $0x4,%%xmm6,%%xmm0;"
98 "ror $0x9,%11;"
99 "xor %k2,%10;"
100 "mov %7,%12;"
101 "ror $0x5,%10;"
102 "movdqa %%xmm5,%%xmm1;"
103 "xor %3,%11;"
104 "xor %8,%12;"
105 "paddd %%xmm4,%%xmm0;"
106 "xor %k2,%10;"
107 "and %k2,%12;"
108 "ror $0xb,%11;"
109 "palignr $0x4,%%xmm4,%%xmm1;"
110 "xor %3,%11;"
111 "ror $0x6,%10;"
112 "xor %8,%12;"
113 "movdqa %%xmm1,%%xmm2;"
114 "ror $0x2,%11;"
115 "add %10,%12;"
116 "add %16,%12;"
117 "movdqa %%xmm1,%%xmm3;"
118 "mov %3,%10;"
119 "add %12,%9;"
120 "mov %3,%12;"
121 "pslld $0x19,%%xmm1;"
122 "or %5,%10;"
123 "add %9,%6;"
124 "and %5,%12;"
125 "psrld $0x7,%%xmm2;"
126 "and %4,%10;"
127 "add %11,%9;"
128 "por %%xmm2,%%xmm1;"
129 "or %12,%10;"
130 "add %10,%9;"
131 "movdqa %%xmm3,%%xmm2;"
132 "mov %6,%10;"
133 "mov %9,%11;"
134 "movdqa %%xmm3,%%xmm8;"
135 "ror $0xe,%10;"
136 "xor %6,%10;"
137 "mov %k2,%12;"
138 "ror $0x9,%11;"
139 "pslld $0xe,%%xmm3;"
140 "xor %9,%11;"
141 "ror $0x5,%10;"
142 "xor %7,%12;"
143 "psrld $0x12,%%xmm2;"
144 "ror $0xb,%11;"
145 "xor %6,%10;"
146 "and %6,%12;"
147 "ror $0x6,%10;"
148 "pxor %%xmm3,%%xmm1;"
149 "xor %9,%11;"
150 "xor %7,%12;"
151 "psrld $0x3,%%xmm8;"
152 "add %10,%12;"
153 "add 4+%16,%12;"
154 "ror $0x2,%11;"
155 "pxor %%xmm2,%%xmm1;"
156 "mov %9,%10;"
157 "add %12,%8;"
158 "mov %9,%12;"
159 "pxor %%xmm8,%%xmm1;"
160 "or %4,%10;"
161 "add %8,%5;"
162 "and %4,%12;"
163 "pshufd $0xfa,%%xmm7,%%xmm2;"
164 "and %3,%10;"
165 "add %11,%8;"
166 "paddd %%xmm1,%%xmm0;"
167 "or %12,%10;"
168 "add %10,%8;"
169 "movdqa %%xmm2,%%xmm3;"
170 "mov %5,%10;"
171 "mov %8,%11;"
172 "ror $0xe,%10;"
173 "movdqa %%xmm2,%%xmm8;"
174 "xor %5,%10;"
175 "ror $0x9,%11;"
176 "mov %6,%12;"
177 "xor %8,%11;"
178 "ror $0x5,%10;"
179 "psrlq $0x11,%%xmm2;"
180 "xor %k2,%12;"
181 "psrlq $0x13,%%xmm3;"
182 "xor %5,%10;"
183 "and %5,%12;"
184 "psrld $0xa,%%xmm8;"
185 "ror $0xb,%11;"
186 "xor %8,%11;"
187 "xor %k2,%12;"
188 "ror $0x6,%10;"
189 "pxor %%xmm3,%%xmm2;"
190 "add %10,%12;"
191 "ror $0x2,%11;"
192 "add 8+%16,%12;"
193 "pxor %%xmm2,%%xmm8;"
194 "mov %8,%10;"
195 "add %12,%7;"
196 "mov %8,%12;"
197 "pshufb %%xmm10,%%xmm8;"
198 "or %3,%10;"
199 "add %7,%4;"
200 "and %3,%12;"
201 "paddd %%xmm8,%%xmm0;"
202 "and %9,%10;"
203 "add %11,%7;"
204 "pshufd $0x50,%%xmm0,%%xmm2;"
205 "or %12,%10;"
206 "add %10,%7;"
207 "movdqa %%xmm2,%%xmm3;"
208 "mov %4,%10;"
209 "ror $0xe,%10;"
210 "mov %7,%11;"
211 "movdqa %%xmm2,%%xmm4;"
212 "ror $0x9,%11;"
213 "xor %4,%10;"
214 "mov %5,%12;"
215 "ror $0x5,%10;"
216 "psrlq $0x11,%%xmm2;"
217 "xor %7,%11;"
218 "xor %6,%12;"
219 "psrlq $0x13,%%xmm3;"
220 "xor %4,%10;"
221 "and %4,%12;"
222 "ror $0xb,%11;"
223 "psrld $0xa,%%xmm4;"
224 "xor %7,%11;"
225 "ror $0x6,%10;"
226 "xor %6,%12;"
227 "pxor %%xmm3,%%xmm2;"
228 "ror $0x2,%11;"
229 "add %10,%12;"
230 "add 12+%16,%12;"
231 "pxor %%xmm2,%%xmm4;"
232 "mov %7,%10;"
233 "add %12,%k2;"
234 "mov %7,%12;"
235 "pshufb %%xmm11,%%xmm4;"
236 "or %9,%10;"
237 "add %k2,%3;"
238 "and %9,%12;"
239 "paddd %%xmm0,%%xmm4;"
240 "and %8,%10;"
241 "add %11,%k2;"
242 "or %12,%10;"
243 "add %10,%k2;"
 // Group 2: xfer = K256[tbl + 0x10] + xmm5; new words end up in xmm5.
244 "movdqa 0x10(%13),%%xmm9;"
245 "paddd %%xmm5,%%xmm9;"
246 "movdqa %%xmm9,%16;"
247 "movdqa %%xmm4,%%xmm0;"
248 "mov %3,%10;"
249 "ror $0xe,%10;"
250 "mov %k2,%11;"
251 "palignr $0x4,%%xmm7,%%xmm0;"
252 "ror $0x9,%11;"
253 "xor %3,%10;"
254 "mov %4,%12;"
255 "ror $0x5,%10;"
256 "movdqa %%xmm6,%%xmm1;"
257 "xor %k2,%11;"
258 "xor %5,%12;"
259 "paddd %%xmm5,%%xmm0;"
260 "xor %3,%10;"
261 "and %3,%12;"
262 "ror $0xb,%11;"
263 "palignr $0x4,%%xmm5,%%xmm1;"
264 "xor %k2,%11;"
265 "ror $0x6,%10;"
266 "xor %5,%12;"
267 "movdqa %%xmm1,%%xmm2;"
268 "ror $0x2,%11;"
269 "add %10,%12;"
270 "add %16,%12;"
271 "movdqa %%xmm1,%%xmm3;"
272 "mov %k2,%10;"
273 "add %12,%6;"
274 "mov %k2,%12;"
275 "pslld $0x19,%%xmm1;"
276 "or %8,%10;"
277 "add %6,%9;"
278 "and %8,%12;"
279 "psrld $0x7,%%xmm2;"
280 "and %7,%10;"
281 "add %11,%6;"
282 "por %%xmm2,%%xmm1;"
283 "or %12,%10;"
284 "add %10,%6;"
285 "movdqa %%xmm3,%%xmm2;"
286 "mov %9,%10;"
287 "mov %6,%11;"
288 "movdqa %%xmm3,%%xmm8;"
289 "ror $0xe,%10;"
290 "xor %9,%10;"
291 "mov %3,%12;"
292 "ror $0x9,%11;"
293 "pslld $0xe,%%xmm3;"
294 "xor %6,%11;"
295 "ror $0x5,%10;"
296 "xor %4,%12;"
297 "psrld $0x12,%%xmm2;"
298 "ror $0xb,%11;"
299 "xor %9,%10;"
300 "and %9,%12;"
301 "ror $0x6,%10;"
302 "pxor %%xmm3,%%xmm1;"
303 "xor %6,%11;"
304 "xor %4,%12;"
305 "psrld $0x3,%%xmm8;"
306 "add %10,%12;"
307 "add 4+%16,%12;"
308 "ror $0x2,%11;"
309 "pxor %%xmm2,%%xmm1;"
310 "mov %6,%10;"
311 "add %12,%5;"
312 "mov %6,%12;"
313 "pxor %%xmm8,%%xmm1;"
314 "or %7,%10;"
315 "add %5,%8;"
316 "and %7,%12;"
317 "pshufd $0xfa,%%xmm4,%%xmm2;"
318 "and %k2,%10;"
319 "add %11,%5;"
320 "paddd %%xmm1,%%xmm0;"
321 "or %12,%10;"
322 "add %10,%5;"
323 "movdqa %%xmm2,%%xmm3;"
324 "mov %8,%10;"
325 "mov %5,%11;"
326 "ror $0xe,%10;"
327 "movdqa %%xmm2,%%xmm8;"
328 "xor %8,%10;"
329 "ror $0x9,%11;"
330 "mov %9,%12;"
331 "xor %5,%11;"
332 "ror $0x5,%10;"
333 "psrlq $0x11,%%xmm2;"
334 "xor %3,%12;"
335 "psrlq $0x13,%%xmm3;"
336 "xor %8,%10;"
337 "and %8,%12;"
338 "psrld $0xa,%%xmm8;"
339 "ror $0xb,%11;"
340 "xor %5,%11;"
341 "xor %3,%12;"
342 "ror $0x6,%10;"
343 "pxor %%xmm3,%%xmm2;"
344 "add %10,%12;"
345 "ror $0x2,%11;"
346 "add 8+%16,%12;"
347 "pxor %%xmm2,%%xmm8;"
348 "mov %5,%10;"
349 "add %12,%4;"
350 "mov %5,%12;"
351 "pshufb %%xmm10,%%xmm8;"
352 "or %k2,%10;"
353 "add %4,%7;"
354 "and %k2,%12;"
355 "paddd %%xmm8,%%xmm0;"
356 "and %6,%10;"
357 "add %11,%4;"
358 "pshufd $0x50,%%xmm0,%%xmm2;"
359 "or %12,%10;"
360 "add %10,%4;"
361 "movdqa %%xmm2,%%xmm3;"
362 "mov %7,%10;"
363 "ror $0xe,%10;"
364 "mov %4,%11;"
365 "movdqa %%xmm2,%%xmm5;"
366 "ror $0x9,%11;"
367 "xor %7,%10;"
368 "mov %8,%12;"
369 "ror $0x5,%10;"
370 "psrlq $0x11,%%xmm2;"
371 "xor %4,%11;"
372 "xor %9,%12;"
373 "psrlq $0x13,%%xmm3;"
374 "xor %7,%10;"
375 "and %7,%12;"
376 "ror $0xb,%11;"
377 "psrld $0xa,%%xmm5;"
378 "xor %4,%11;"
379 "ror $0x6,%10;"
380 "xor %9,%12;"
381 "pxor %%xmm3,%%xmm2;"
382 "ror $0x2,%11;"
383 "add %10,%12;"
384 "add 12+%16,%12;"
385 "pxor %%xmm2,%%xmm5;"
386 "mov %4,%10;"
387 "add %12,%3;"
388 "mov %4,%12;"
389 "pshufb %%xmm11,%%xmm5;"
390 "or %6,%10;"
391 "add %3,%k2;"
392 "and %6,%12;"
393 "paddd %%xmm0,%%xmm5;"
394 "and %5,%10;"
395 "add %11,%3;"
396 "or %12,%10;"
397 "add %10,%3;"
 // Group 3: xfer = K256[tbl + 0x20] + xmm6; new words end up in xmm6.
398 "movdqa 0x20(%13),%%xmm9;"
399 "paddd %%xmm6,%%xmm9;"
400 "movdqa %%xmm9,%16;"
401 "movdqa %%xmm5,%%xmm0;"
402 "mov %k2,%10;"
403 "ror $0xe,%10;"
404 "mov %3,%11;"
405 "palignr $0x4,%%xmm4,%%xmm0;"
406 "ror $0x9,%11;"
407 "xor %k2,%10;"
408 "mov %7,%12;"
409 "ror $0x5,%10;"
410 "movdqa %%xmm7,%%xmm1;"
411 "xor %3,%11;"
412 "xor %8,%12;"
413 "paddd %%xmm6,%%xmm0;"
414 "xor %k2,%10;"
415 "and %k2,%12;"
416 "ror $0xb,%11;"
417 "palignr $0x4,%%xmm6,%%xmm1;"
418 "xor %3,%11;"
419 "ror $0x6,%10;"
420 "xor %8,%12;"
421 "movdqa %%xmm1,%%xmm2;"
422 "ror $0x2,%11;"
423 "add %10,%12;"
424 "add %16,%12;"
425 "movdqa %%xmm1,%%xmm3;"
426 "mov %3,%10;"
427 "add %12,%9;"
428 "mov %3,%12;"
429 "pslld $0x19,%%xmm1;"
430 "or %5,%10;"
431 "add %9,%6;"
432 "and %5,%12;"
433 "psrld $0x7,%%xmm2;"
434 "and %4,%10;"
435 "add %11,%9;"
436 "por %%xmm2,%%xmm1;"
437 "or %12,%10;"
438 "add %10,%9;"
439 "movdqa %%xmm3,%%xmm2;"
440 "mov %6,%10;"
441 "mov %9,%11;"
442 "movdqa %%xmm3,%%xmm8;"
443 "ror $0xe,%10;"
444 "xor %6,%10;"
445 "mov %k2,%12;"
446 "ror $0x9,%11;"
447 "pslld $0xe,%%xmm3;"
448 "xor %9,%11;"
449 "ror $0x5,%10;"
450 "xor %7,%12;"
451 "psrld $0x12,%%xmm2;"
452 "ror $0xb,%11;"
453 "xor %6,%10;"
454 "and %6,%12;"
455 "ror $0x6,%10;"
456 "pxor %%xmm3,%%xmm1;"
457 "xor %9,%11;"
458 "xor %7,%12;"
459 "psrld $0x3,%%xmm8;"
460 "add %10,%12;"
461 "add 4+%16,%12;"
462 "ror $0x2,%11;"
463 "pxor %%xmm2,%%xmm1;"
464 "mov %9,%10;"
465 "add %12,%8;"
466 "mov %9,%12;"
467 "pxor %%xmm8,%%xmm1;"
468 "or %4,%10;"
469 "add %8,%5;"
470 "and %4,%12;"
471 "pshufd $0xfa,%%xmm5,%%xmm2;"
472 "and %3,%10;"
473 "add %11,%8;"
474 "paddd %%xmm1,%%xmm0;"
475 "or %12,%10;"
476 "add %10,%8;"
477 "movdqa %%xmm2,%%xmm3;"
478 "mov %5,%10;"
479 "mov %8,%11;"
480 "ror $0xe,%10;"
481 "movdqa %%xmm2,%%xmm8;"
482 "xor %5,%10;"
483 "ror $0x9,%11;"
484 "mov %6,%12;"
485 "xor %8,%11;"
486 "ror $0x5,%10;"
487 "psrlq $0x11,%%xmm2;"
488 "xor %k2,%12;"
489 "psrlq $0x13,%%xmm3;"
490 "xor %5,%10;"
491 "and %5,%12;"
492 "psrld $0xa,%%xmm8;"
493 "ror $0xb,%11;"
494 "xor %8,%11;"
495 "xor %k2,%12;"
496 "ror $0x6,%10;"
497 "pxor %%xmm3,%%xmm2;"
498 "add %10,%12;"
499 "ror $0x2,%11;"
500 "add 8+%16,%12;"
501 "pxor %%xmm2,%%xmm8;"
502 "mov %8,%10;"
503 "add %12,%7;"
504 "mov %8,%12;"
505 "pshufb %%xmm10,%%xmm8;"
506 "or %3,%10;"
507 "add %7,%4;"
508 "and %3,%12;"
509 "paddd %%xmm8,%%xmm0;"
510 "and %9,%10;"
511 "add %11,%7;"
512 "pshufd $0x50,%%xmm0,%%xmm2;"
513 "or %12,%10;"
514 "add %10,%7;"
515 "movdqa %%xmm2,%%xmm3;"
516 "mov %4,%10;"
517 "ror $0xe,%10;"
518 "mov %7,%11;"
519 "movdqa %%xmm2,%%xmm6;"
520 "ror $0x9,%11;"
521 "xor %4,%10;"
522 "mov %5,%12;"
523 "ror $0x5,%10;"
524 "psrlq $0x11,%%xmm2;"
525 "xor %7,%11;"
526 "xor %6,%12;"
527 "psrlq $0x13,%%xmm3;"
528 "xor %4,%10;"
529 "and %4,%12;"
530 "ror $0xb,%11;"
531 "psrld $0xa,%%xmm6;"
532 "xor %7,%11;"
533 "ror $0x6,%10;"
534 "xor %6,%12;"
535 "pxor %%xmm3,%%xmm2;"
536 "ror $0x2,%11;"
537 "add %10,%12;"
538 "add 12+%16,%12;"
539 "pxor %%xmm2,%%xmm6;"
540 "mov %7,%10;"
541 "add %12,%k2;"
542 "mov %7,%12;"
543 "pshufb %%xmm11,%%xmm6;"
544 "or %9,%10;"
545 "add %k2,%3;"
546 "and %9,%12;"
547 "paddd %%xmm0,%%xmm6;"
548 "and %8,%10;"
549 "add %11,%k2;"
550 "or %12,%10;"
551 "add %10,%k2;"
 // Group 4: xfer = K256[tbl + 0x30] + xmm7; new words end up in xmm7.
 // tbl is advanced by 0x40 here for the next pass.
552 "movdqa 0x30(%13),%%xmm9;"
553 "paddd %%xmm7,%%xmm9;"
554 "movdqa %%xmm9,%16;"
555 "add $0x40,%13;"
556 "movdqa %%xmm6,%%xmm0;"
557 "mov %3,%10;"
558 "ror $0xe,%10;"
559 "mov %k2,%11;"
560 "palignr $0x4,%%xmm5,%%xmm0;"
561 "ror $0x9,%11;"
562 "xor %3,%10;"
563 "mov %4,%12;"
564 "ror $0x5,%10;"
565 "movdqa %%xmm4,%%xmm1;"
566 "xor %k2,%11;"
567 "xor %5,%12;"
568 "paddd %%xmm7,%%xmm0;"
569 "xor %3,%10;"
570 "and %3,%12;"
571 "ror $0xb,%11;"
572 "palignr $0x4,%%xmm7,%%xmm1;"
573 "xor %k2,%11;"
574 "ror $0x6,%10;"
575 "xor %5,%12;"
576 "movdqa %%xmm1,%%xmm2;"
577 "ror $0x2,%11;"
578 "add %10,%12;"
579 "add %16,%12;"
580 "movdqa %%xmm1,%%xmm3;"
581 "mov %k2,%10;"
582 "add %12,%6;"
583 "mov %k2,%12;"
584 "pslld $0x19,%%xmm1;"
585 "or %8,%10;"
586 "add %6,%9;"
587 "and %8,%12;"
588 "psrld $0x7,%%xmm2;"
589 "and %7,%10;"
590 "add %11,%6;"
591 "por %%xmm2,%%xmm1;"
592 "or %12,%10;"
593 "add %10,%6;"
594 "movdqa %%xmm3,%%xmm2;"
595 "mov %9,%10;"
596 "mov %6,%11;"
597 "movdqa %%xmm3,%%xmm8;"
598 "ror $0xe,%10;"
599 "xor %9,%10;"
600 "mov %3,%12;"
601 "ror $0x9,%11;"
602 "pslld $0xe,%%xmm3;"
603 "xor %6,%11;"
604 "ror $0x5,%10;"
605 "xor %4,%12;"
606 "psrld $0x12,%%xmm2;"
607 "ror $0xb,%11;"
608 "xor %9,%10;"
609 "and %9,%12;"
610 "ror $0x6,%10;"
611 "pxor %%xmm3,%%xmm1;"
612 "xor %6,%11;"
613 "xor %4,%12;"
614 "psrld $0x3,%%xmm8;"
615 "add %10,%12;"
616 "add 4+%16,%12;"
617 "ror $0x2,%11;"
618 "pxor %%xmm2,%%xmm1;"
619 "mov %6,%10;"
620 "add %12,%5;"
621 "mov %6,%12;"
622 "pxor %%xmm8,%%xmm1;"
623 "or %7,%10;"
624 "add %5,%8;"
625 "and %7,%12;"
626 "pshufd $0xfa,%%xmm6,%%xmm2;"
627 "and %k2,%10;"
628 "add %11,%5;"
629 "paddd %%xmm1,%%xmm0;"
630 "or %12,%10;"
631 "add %10,%5;"
632 "movdqa %%xmm2,%%xmm3;"
633 "mov %8,%10;"
634 "mov %5,%11;"
635 "ror $0xe,%10;"
636 "movdqa %%xmm2,%%xmm8;"
637 "xor %8,%10;"
638 "ror $0x9,%11;"
639 "mov %9,%12;"
640 "xor %5,%11;"
641 "ror $0x5,%10;"
642 "psrlq $0x11,%%xmm2;"
643 "xor %3,%12;"
644 "psrlq $0x13,%%xmm3;"
645 "xor %8,%10;"
646 "and %8,%12;"
647 "psrld $0xa,%%xmm8;"
648 "ror $0xb,%11;"
649 "xor %5,%11;"
650 "xor %3,%12;"
651 "ror $0x6,%10;"
652 "pxor %%xmm3,%%xmm2;"
653 "add %10,%12;"
654 "ror $0x2,%11;"
655 "add 8+%16,%12;"
656 "pxor %%xmm2,%%xmm8;"
657 "mov %5,%10;"
658 "add %12,%4;"
659 "mov %5,%12;"
660 "pshufb %%xmm10,%%xmm8;"
661 "or %k2,%10;"
662 "add %4,%7;"
663 "and %k2,%12;"
664 "paddd %%xmm8,%%xmm0;"
665 "and %6,%10;"
666 "add %11,%4;"
667 "pshufd $0x50,%%xmm0,%%xmm2;"
668 "or %12,%10;"
669 "add %10,%4;"
670 "movdqa %%xmm2,%%xmm3;"
671 "mov %7,%10;"
672 "ror $0xe,%10;"
673 "mov %4,%11;"
674 "movdqa %%xmm2,%%xmm7;"
675 "ror $0x9,%11;"
676 "xor %7,%10;"
677 "mov %8,%12;"
678 "ror $0x5,%10;"
679 "psrlq $0x11,%%xmm2;"
680 "xor %4,%11;"
681 "xor %9,%12;"
682 "psrlq $0x13,%%xmm3;"
683 "xor %7,%10;"
684 "and %7,%12;"
685 "ror $0xb,%11;"
686 "psrld $0xa,%%xmm7;"
687 "xor %4,%11;"
688 "ror $0x6,%10;"
689 "xor %9,%12;"
690 "pxor %%xmm3,%%xmm2;"
691 "ror $0x2,%11;"
692 "add %10,%12;"
693 "add 12+%16,%12;"
694 "pxor %%xmm2,%%xmm7;"
695 "mov %4,%10;"
696 "add %12,%3;"
697 "mov %4,%12;"
698 "pshufb %%xmm11,%%xmm7;"
699 "or %6,%10;"
700 "add %3,%k2;"
701 "and %6,%12;"
702 "paddd %%xmm0,%%xmm7;"
703 "and %5,%10;"
704 "add %11,%3;"
705 "or %12,%10;"
706 "add %10,%3;"
 // End of one Lloop1 pass: decrement the counter held in chunk's register.
 // After the third pass, set the counter to 2 for Lloop2.
707 "sub $0x1,%1;"
708 "jne Lloop1_%=;"
709 "mov $0x2,%1;"
710
 // Runs 2 times. Each pass is two groups of four rounds (8 rounds) with no
 // further message scheduling: xfer = K256[tbl + 0x00] + xmm4, then
 // xfer = K256[tbl + 0x10] + xmm5. tbl advances by 0x20 per pass and
 // xmm6/xmm7 are copied into xmm4/xmm5 at the end of the pass.
 // Together with Lloop1 this gives 3*16 + 2*8 = 64 rounds per block.
711 "Lloop2_%=:"
712 "paddd 0x0(%13),%%xmm4;"
713 "movdqa %%xmm4,%16;"
714 "mov %k2,%10;"
715 "ror $0xe,%10;"
716 "mov %3,%11;"
717 "xor %k2,%10;"
718 "ror $0x9,%11;"
719 "mov %7,%12;"
720 "xor %3,%11;"
721 "ror $0x5,%10;"
722 "xor %8,%12;"
723 "xor %k2,%10;"
724 "ror $0xb,%11;"
725 "and %k2,%12;"
726 "xor %3,%11;"
727 "ror $0x6,%10;"
728 "xor %8,%12;"
729 "add %10,%12;"
730 "ror $0x2,%11;"
731 "add %16,%12;"
732 "mov %3,%10;"
733 "add %12,%9;"
734 "mov %3,%12;"
735 "or %5,%10;"
736 "add %9,%6;"
737 "and %5,%12;"
738 "and %4,%10;"
739 "add %11,%9;"
740 "or %12,%10;"
741 "add %10,%9;"
742 "mov %6,%10;"
743 "ror $0xe,%10;"
744 "mov %9,%11;"
745 "xor %6,%10;"
746 "ror $0x9,%11;"
747 "mov %k2,%12;"
748 "xor %9,%11;"
749 "ror $0x5,%10;"
750 "xor %7,%12;"
751 "xor %6,%10;"
752 "ror $0xb,%11;"
753 "and %6,%12;"
754 "xor %9,%11;"
755 "ror $0x6,%10;"
756 "xor %7,%12;"
757 "add %10,%12;"
758 "ror $0x2,%11;"
759 "add 4+%16,%12;"
760 "mov %9,%10;"
761 "add %12,%8;"
762 "mov %9,%12;"
763 "or %4,%10;"
764 "add %8,%5;"
765 "and %4,%12;"
766 "and %3,%10;"
767 "add %11,%8;"
768 "or %12,%10;"
769 "add %10,%8;"
770 "mov %5,%10;"
771 "ror $0xe,%10;"
772 "mov %8,%11;"
773 "xor %5,%10;"
774 "ror $0x9,%11;"
775 "mov %6,%12;"
776 "xor %8,%11;"
777 "ror $0x5,%10;"
778 "xor %k2,%12;"
779 "xor %5,%10;"
780 "ror $0xb,%11;"
781 "and %5,%12;"
782 "xor %8,%11;"
783 "ror $0x6,%10;"
784 "xor %k2,%12;"
785 "add %10,%12;"
786 "ror $0x2,%11;"
787 "add 8+%16,%12;"
788 "mov %8,%10;"
789 "add %12,%7;"
790 "mov %8,%12;"
791 "or %3,%10;"
792 "add %7,%4;"
793 "and %3,%12;"
794 "and %9,%10;"
795 "add %11,%7;"
796 "or %12,%10;"
797 "add %10,%7;"
798 "mov %4,%10;"
799 "ror $0xe,%10;"
800 "mov %7,%11;"
801 "xor %4,%10;"
802 "ror $0x9,%11;"
803 "mov %5,%12;"
804 "xor %7,%11;"
805 "ror $0x5,%10;"
806 "xor %6,%12;"
807 "xor %4,%10;"
808 "ror $0xb,%11;"
809 "and %4,%12;"
810 "xor %7,%11;"
811 "ror $0x6,%10;"
812 "xor %6,%12;"
813 "add %10,%12;"
814 "ror $0x2,%11;"
815 "add 12+%16,%12;"
816 "mov %7,%10;"
817 "add %12,%k2;"
818 "mov %7,%12;"
819 "or %9,%10;"
820 "add %k2,%3;"
821 "and %9,%12;"
822 "and %8,%10;"
823 "add %11,%k2;"
824 "or %12,%10;"
825 "add %10,%k2;"
826 "paddd 0x10(%13),%%xmm5;"
827 "movdqa %%xmm5,%16;"
828 "add $0x20,%13;"
829 "mov %3,%10;"
830 "ror $0xe,%10;"
831 "mov %k2,%11;"
832 "xor %3,%10;"
833 "ror $0x9,%11;"
834 "mov %4,%12;"
835 "xor %k2,%11;"
836 "ror $0x5,%10;"
837 "xor %5,%12;"
838 "xor %3,%10;"
839 "ror $0xb,%11;"
840 "and %3,%12;"
841 "xor %k2,%11;"
842 "ror $0x6,%10;"
843 "xor %5,%12;"
844 "add %10,%12;"
845 "ror $0x2,%11;"
846 "add %16,%12;"
847 "mov %k2,%10;"
848 "add %12,%6;"
849 "mov %k2,%12;"
850 "or %8,%10;"
851 "add %6,%9;"
852 "and %8,%12;"
853 "and %7,%10;"
854 "add %11,%6;"
855 "or %12,%10;"
856 "add %10,%6;"
857 "mov %9,%10;"
858 "ror $0xe,%10;"
859 "mov %6,%11;"
860 "xor %9,%10;"
861 "ror $0x9,%11;"
862 "mov %3,%12;"
863 "xor %6,%11;"
864 "ror $0x5,%10;"
865 "xor %4,%12;"
866 "xor %9,%10;"
867 "ror $0xb,%11;"
868 "and %9,%12;"
869 "xor %6,%11;"
870 "ror $0x6,%10;"
871 "xor %4,%12;"
872 "add %10,%12;"
873 "ror $0x2,%11;"
874 "add 4+%16,%12;"
875 "mov %6,%10;"
876 "add %12,%5;"
877 "mov %6,%12;"
878 "or %7,%10;"
879 "add %5,%8;"
880 "and %7,%12;"
881 "and %k2,%10;"
882 "add %11,%5;"
883 "or %12,%10;"
884 "add %10,%5;"
885 "mov %8,%10;"
886 "ror $0xe,%10;"
887 "mov %5,%11;"
888 "xor %8,%10;"
889 "ror $0x9,%11;"
890 "mov %9,%12;"
891 "xor %5,%11;"
892 "ror $0x5,%10;"
893 "xor %3,%12;"
894 "xor %8,%10;"
895 "ror $0xb,%11;"
896 "and %8,%12;"
897 "xor %5,%11;"
898 "ror $0x6,%10;"
899 "xor %3,%12;"
900 "add %10,%12;"
901 "ror $0x2,%11;"
902 "add 8+%16,%12;"
903 "mov %5,%10;"
904 "add %12,%4;"
905 "mov %5,%12;"
906 "or %k2,%10;"
907 "add %4,%7;"
908 "and %k2,%12;"
909 "and %6,%10;"
910 "add %11,%4;"
911 "or %12,%10;"
912 "add %10,%4;"
913 "mov %7,%10;"
914 "ror $0xe,%10;"
915 "mov %4,%11;"
916 "xor %7,%10;"
917 "ror $0x9,%11;"
918 "mov %8,%12;"
919 "xor %4,%11;"
920 "ror $0x5,%10;"
921 "xor %9,%12;"
922 "xor %7,%10;"
923 "ror $0xb,%11;"
924 "and %7,%12;"
925 "xor %4,%11;"
926 "ror $0x6,%10;"
927 "xor %9,%12;"
928 "add %10,%12;"
929 "ror $0x2,%11;"
930 "add 12+%16,%12;"
931 "mov %4,%10;"
932 "add %12,%3;"
933 "mov %4,%12;"
934 "or %6,%10;"
935 "add %3,%k2;"
936 "and %6,%12;"
937 "and %5,%10;"
938 "add %11,%3;"
939 "or %12,%10;"
940 "add %10,%3;"
941 "movdqa %%xmm6,%%xmm4;"
942 "movdqa %%xmm7,%%xmm5;"
943 "sub $0x1,%1;"
944 "jne Lloop2_%=;"
 // Add the stored state words into the working variables and write the sums
 // back to s; the registers then carry the state into the next block.
945 "add (%0),%3;"
946 "mov %3,(%0);"
947 "add 0x4(%0),%4;"
948 "mov %4,0x4(%0);"
949 "add 0x8(%0),%5;"
950 "mov %5,0x8(%0);"
951 "add 0xc(%0),%6;"
952 "mov %6,0xc(%0);"
953 "add 0x10(%0),%k2;"
954 "mov %k2,0x10(%0);"
955 "add 0x14(%0),%7;"
956 "mov %7,0x14(%0);"
957 "add 0x18(%0),%8;"
958 "mov %8,0x18(%0);"
959 "add 0x1c(%0),%9;"
960 "mov %9,0x1c(%0);"
 // Restore the input pointer from inp, step past the 64 bytes just consumed
 // and continue with the next block until the pointer equals inp_end.
961 "mov %15,%1;"
962 "add $0x40,%1;"
963 "cmp %14,%1;"
964 "jne Lloop0_%=;"
965
966 "Ldone_hash_%=:"
967
 // Outputs (%0..%16), inputs (%17..%20), clobbers. e has no operand of its
 // own: it is kept in %k2, the 32-bit view of the register holding blocks.
 // NOTE(review): the register outputs are plain "=r" rather than early-clobber
 // "=&r" although the template writes them before the "m" inputs are last
 // read; confirm against GCC's constraint-modifier rules that no output can
 // share a register with a memory operand's address.
968 : "+r"(s), "+r"(chunk), "+r"(blocks), "=r"(a), "=r"(b), "=r"(c), "=r"(d), /* e = blocks (%k2) */ "=r"(f), "=r"(g), "=r"(h), "=r"(y0), "=r"(y1), "=r"(y2), "=r"(tbl), "+m"(inp_end), "+m"(inp), "+m"(xfer)
969 : "m"(K256), "m"(FLIP_MASK), "m"(SHUF_00BA), "m"(SHUF_DC00)
970 : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12"
971 );
972}
973}
974
975/*
976;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
977; Copyright (c) 2012, Intel Corporation
978;
979; All rights reserved.
980;
981; Redistribution and use in source and binary forms, with or without
982; modification, are permitted provided that the following conditions are
983; met:
984;
985; * Redistributions of source code must retain the above copyright
986; notice, this list of conditions and the following disclaimer.
987;
988; * Redistributions in binary form must reproduce the above copyright
989; notice, this list of conditions and the following disclaimer in the
990; documentation and/or other materials provided with the
991; distribution.
992;
993; * Neither the name of the Intel Corporation nor the names of its
994; contributors may be used to endorse or promote products derived from
995; this software without specific prior written permission.
996;
997;
998; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
999; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1000; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
1001; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
1002; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
1003; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
1004; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
1005; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
1006; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
1007; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
1008; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1009;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1010;
1011; Example YASM command lines:
1012; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm
1013; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm
1014;
1015;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1016;
1017; This code is described in an Intel White-Paper:
1018; "Fast SHA-256 Implementations on Intel Architecture Processors"
1019;
1020; To find it, surf to https://www.intel.com/p/en_US/embedded
1021; and search for that title.
1022; The paper is expected to be released roughly at the end of April, 2012
1023;
1024;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1025; This code schedules 1 block at a time, with 4 lanes per block
1026;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1027
1028%define MOVDQ movdqu ;; assume buffers not aligned
1029
1030;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
1031
1032; addm [mem], reg
1033; Add reg to mem using reg-mem add and store
1034%macro addm 2
1035 add %2, %1
1036 mov %1, %2
1037%endm
1038
1039;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1040
1041; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
1042; Load xmm with mem and byte swap each dword
1043%macro COPY_XMM_AND_BSWAP 3
1044 MOVDQ %1, %2
1045 pshufb %1, %3
1046%endmacro
1047
1048;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1049
1050%define X0 xmm4
1051%define X1 xmm5
1052%define X2 xmm6
1053%define X3 xmm7
1054
1055%define XTMP0 xmm0
1056%define XTMP1 xmm1
1057%define XTMP2 xmm2
1058%define XTMP3 xmm3
1059%define XTMP4 xmm8
1060%define XFER xmm9
1061
1062%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
1063%define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00
1064%define BYTE_FLIP_MASK xmm12
1065
1066%ifdef LINUX
1067%define NUM_BLKS rdx ; 3rd arg
1068%define CTX rsi ; 2nd arg
1069%define INP rdi ; 1st arg
1070
1071%define SRND rdi ; clobbers INP
1072%define c ecx
1073%define d r8d
1074%define e edx
1075%else
1076%define NUM_BLKS r8 ; 3rd arg
1077%define CTX rdx ; 2nd arg
1078%define INP rcx ; 1st arg
1079
1080%define SRND rcx ; clobbers INP
1081%define c edi
1082%define d esi
1083%define e r8d
1084
1085%endif
1086%define TBL rbp
1087%define a eax
1088%define b ebx
1089
1090%define f r9d
1091%define g r10d
1092%define h r11d
1093
1094%define y0 r13d
1095%define y1 r14d
1096%define y2 r15d
1097
1098
1099
1100_INP_END_SIZE equ 8
1101_INP_SIZE equ 8
1102_XFER_SIZE equ 8
1103%ifdef LINUX
1104_XMM_SAVE_SIZE equ 0
1105%else
1106_XMM_SAVE_SIZE equ 7*16
1107%endif
1108; STACK_SIZE plus pushes must be an odd multiple of 8
1109_ALIGN_SIZE equ 8
1110
1111_INP_END equ 0
1112_INP equ _INP_END + _INP_END_SIZE
1113_XFER equ _INP + _INP_SIZE
1114_XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE
1115STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE
1116
1117; rotate_Xs
1118; Rotate values of symbols X0...X3
1119%macro rotate_Xs 0
1120%xdefine X_ X0
1121%xdefine X0 X1
1122%xdefine X1 X2
1123%xdefine X2 X3
1124%xdefine X3 X_
1125%endm
1126
1127; ROTATE_ARGS
1128; Rotate values of symbols a...h
1129%macro ROTATE_ARGS 0
1130%xdefine TMP_ h
1131%xdefine h g
1132%xdefine g f
1133%xdefine f e
1134%xdefine e d
1135%xdefine d c
1136%xdefine c b
1137%xdefine b a
1138%xdefine a TMP_
1139%endm
1140
1141%macro FOUR_ROUNDS_AND_SCHED 0
1142 ;; compute s0 four at a time and s1 two at a time
1143 ;; compute W[-16] + W[-7] 4 at a time
1144 movdqa XTMP0, X3
1145 mov y0, e ; y0 = e
1146 ror y0, (25-11) ; y0 = e >> (25-11)
1147 mov y1, a ; y1 = a
1148 palignr XTMP0, X2, 4 ; XTMP0 = W[-7]
1149 ror y1, (22-13) ; y1 = a >> (22-13)
1150 xor y0, e ; y0 = e ^ (e >> (25-11))
1151 mov y2, f ; y2 = f
1152 ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1153 movdqa XTMP1, X1
1154 xor y1, a ; y1 = a ^ (a >> (22-13)
1155 xor y2, g ; y2 = f^g
1156 paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
1157 xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1158 and y2, e ; y2 = (f^g)&e
1159 ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1160 ;; compute s0
1161 palignr XTMP1, X0, 4 ; XTMP1 = W[-15]
1162 xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1163 ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
1164 xor y2, g ; y2 = CH = ((f^g)&e)^g
1165 movdqa XTMP2, XTMP1 ; XTMP2 = W[-15]
1166 ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1167 add y2, y0 ; y2 = S1 + CH
1168 add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
1169 movdqa XTMP3, XTMP1 ; XTMP3 = W[-15]
1170 mov y0, a ; y0 = a
1171 add h, y2 ; h = h + S1 + CH + k + w
1172 mov y2, a ; y2 = a
1173 pslld XTMP1, (32-7)
1174 or y0, c ; y0 = a|c
1175 add d, h ; d = d + h + S1 + CH + k + w
1176 and y2, c ; y2 = a&c
1177 psrld XTMP2, 7
1178 and y0, b ; y0 = (a|c)&b
1179 add h, y1 ; h = h + S1 + CH + k + w + S0
1180 por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7
1181 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
1182 add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1183
1184ROTATE_ARGS
1185 movdqa XTMP2, XTMP3 ; XTMP2 = W[-15]
1186 mov y0, e ; y0 = e
1187 mov y1, a ; y1 = a
1188 movdqa XTMP4, XTMP3 ; XTMP4 = W[-15]
1189 ror y0, (25-11) ; y0 = e >> (25-11)
1190 xor y0, e ; y0 = e ^ (e >> (25-11))
1191 mov y2, f ; y2 = f
1192 ror y1, (22-13) ; y1 = a >> (22-13)
1193 pslld XTMP3, (32-18)
1194 xor y1, a ; y1 = a ^ (a >> (22-13)
1195 ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1196 xor y2, g ; y2 = f^g
1197 psrld XTMP2, 18
1198 ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1199 xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1200 and y2, e ; y2 = (f^g)&e
1201 ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
1202 pxor XTMP1, XTMP3
1203 xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1204 xor y2, g ; y2 = CH = ((f^g)&e)^g
1205 psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3
1206 add y2, y0 ; y2 = S1 + CH
1207 add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
1208 ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1209 pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
1210 mov y0, a ; y0 = a
1211 add h, y2 ; h = h + S1 + CH + k + w
1212 mov y2, a ; y2 = a
1213 pxor XTMP1, XTMP4 ; XTMP1 = s0
1214 or y0, c ; y0 = a|c
1215 add d, h ; d = d + h + S1 + CH + k + w
1216 and y2, c ; y2 = a&c
1217 ;; compute low s1
1218 pshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
1219 and y0, b ; y0 = (a|c)&b
1220 add h, y1 ; h = h + S1 + CH + k + w + S0
1221 paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
1222 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
1223 add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1224
1225ROTATE_ARGS
1226 movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
1227 mov y0, e ; y0 = e
1228 mov y1, a ; y1 = a
1229 ror y0, (25-11) ; y0 = e >> (25-11)
1230 movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
1231 xor y0, e ; y0 = e ^ (e >> (25-11))
1232 ror y1, (22-13) ; y1 = a >> (22-13)
1233 mov y2, f ; y2 = f
1234 xor y1, a ; y1 = a ^ (a >> (22-13)
1235 ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1236 psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA}
1237 xor y2, g ; y2 = f^g
1238 psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA}
1239 xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1240 and y2, e ; y2 = (f^g)&e
1241 psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
1242 ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1243 xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1244 xor y2, g ; y2 = CH = ((f^g)&e)^g
1245 ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
1246 pxor XTMP2, XTMP3
1247 add y2, y0 ; y2 = S1 + CH
1248 ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1249 add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
1250 pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
1251 mov y0, a ; y0 = a
1252 add h, y2 ; h = h + S1 + CH + k + w
1253 mov y2, a ; y2 = a
1254 pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
1255 or y0, c ; y0 = a|c
1256 add d, h ; d = d + h + S1 + CH + k + w
1257 and y2, c ; y2 = a&c
1258 paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
1259 and y0, b ; y0 = (a|c)&b
1260 add h, y1 ; h = h + S1 + CH + k + w + S0
1261 ;; compute high s1
1262 pshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
1263 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
1264 add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1265
1266ROTATE_ARGS
1267 movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
1268 mov y0, e ; y0 = e
1269 ror y0, (25-11) ; y0 = e >> (25-11)
1270 mov y1, a ; y1 = a
1271 movdqa X0, XTMP2 ; X0 = W[-2] {DDCC}
1272 ror y1, (22-13) ; y1 = a >> (22-13)
1273 xor y0, e ; y0 = e ^ (e >> (25-11))
1274 mov y2, f ; y2 = f
1275 ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1276 psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC}
1277 xor y1, a ; y1 = a ^ (a >> (22-13)
1278 xor y2, g ; y2 = f^g
1279 psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC}
1280 xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1281 and y2, e ; y2 = (f^g)&e
1282 ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1283 psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC}
1284 xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1285 ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1286 xor y2, g ; y2 = CH = ((f^g)&e)^g
1287 pxor XTMP2, XTMP3
1288 ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1289 add y2, y0 ; y2 = S1 + CH
1290 add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
1291 pxor X0, XTMP2 ; X0 = s1 {xDxC}
1292 mov y0, a ; y0 = a
1293 add h, y2 ; h = h + S1 + CH + k + w
1294 mov y2, a ; y2 = a
1295 pshufb X0, SHUF_DC00 ; X0 = s1 {DC00}
1296 or y0, c ; y0 = a|c
1297 add d, h ; d = d + h + S1 + CH + k + w
1298 and y2, c ; y2 = a&c
1299 paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
1300 and y0, b ; y0 = (a|c)&b
1301 add h, y1 ; h = h + S1 + CH + k + w + S0
1302 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
1303 add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1304
1305ROTATE_ARGS
1306rotate_Xs
1307%endm
1308
1309;; input is [rsp + _XFER + %1 * 4]
;; DO_ROUND performs a single round without any message scheduling. The macro
;; argument %1 picks which dword of the XFER stack slot supplies the k + w term.
;; It overwrites the scratch registers y0, y1 and y2, updates d and h, and ends
;; with ROTATE_ARGS (defined outside this section; presumably it renames a..h
;; so the next round sees the rotated working variables -- confirm there).
;; Notation: ">>" in the per-line comments denotes the rotation performed by
;; `ror`, not a logical shift.
1310%macro DO_ROUND 1
1311 mov y0, e ; y0 = e
1312 ror y0, (25-11) ; y0 = e >> (25-11)
1313 mov y1, a ; y1 = a
1314 xor y0, e ; y0 = e ^ (e >> (25-11))
1315 ror y1, (22-13) ; y1 = a >> (22-13)
1316 mov y2, f ; y2 = f
1317 xor y1, a ; y1 = a ^ (a >> (22-13))
1318 ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1319 xor y2, g ; y2 = f^g
1320 xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1321 ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1322 and y2, e ; y2 = (f^g)&e
1323 xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1324 ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
1325 xor y2, g ; y2 = CH = ((f^g)&e)^g
1326 add y2, y0 ; y2 = S1 + CH
1327 ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1328 add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
1329 mov y0, a ; y0 = a
1330 add h, y2 ; h = h + S1 + CH + k + w
1331 mov y2, a ; y2 = a
1332 or y0, c ; y0 = a|c
1333 add d, h ; d = d + h + S1 + CH + k + w
1334 and y2, c ; y2 = a&c
1335 and y0, b ; y0 = (a|c)&b
1336 add h, y1 ; h = h + S1 + CH + k + w + S0
1337 or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
1338 add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1339 ROTATE_ARGS
1340%endm
1341
1342;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1343;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1344;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
1345;; arg 1 : pointer to input data
1346;; arg 2 : pointer to digest
1347;; arg 3 : Num blocks
;; The body refers to the three arguments as INP, CTX and NUM_BLKS; these and
;; a..h, y0..y2, X0..X3, XFER, TBL, SRND, the mask registers and the stack
;; offsets (_XFER, _INP, _INP_END, _XMM_SAVE, STACK_SIZE) are defined outside
;; this section.
;; Each block is 64 bytes (NUM_BLKS is shifted left by 6). The eight digest
;; dwords at CTX are read before the first block and updated through addm
;; after every block. If num_blks is 0 the digest is not touched.
1348section .text
1349global sha256_sse4
1350align 32
1351sha256_sse4:
     ;; prologue: save the registers that the epilogue restores
1352 push rbx
1353%ifndef LINUX
     ;; non-LINUX builds additionally preserve rsi/rdi here and xmm6-xmm12
     ;; below (presumably callee-saved under that platform's ABI -- confirm)
1354 push rsi
1355 push rdi
1356%endif
1357 push rbp
1358 push r13
1359 push r14
1360 push r15
1361
     ;; reserve the local frame that holds the _XFER/_INP/_INP_END slots
1362 sub rsp,STACK_SIZE
1363%ifndef LINUX
1364 movdqa [rsp + _XMM_SAVE + 0*16],xmm6
1365 movdqa [rsp + _XMM_SAVE + 1*16],xmm7
1366 movdqa [rsp + _XMM_SAVE + 2*16],xmm8
1367 movdqa [rsp + _XMM_SAVE + 3*16],xmm9
1368 movdqa [rsp + _XMM_SAVE + 4*16],xmm10
1369 movdqa [rsp + _XMM_SAVE + 5*16],xmm11
1370 movdqa [rsp + _XMM_SAVE + 6*16],xmm12
1371%endif
1372
1373 shl NUM_BLKS, 6 ; convert to bytes
     ;; zero result from the shift means there is nothing to hash: go straight
     ;; to the epilogue, which undoes the frame set up above
1374 jz done_hash
1375 add NUM_BLKS, INP ; pointer to end of data
1376 mov [rsp + _INP_END], NUM_BLKS
1377
1378 ;; load initial digest
1379 mov a,[4*0 + CTX]
1380 mov b,[4*1 + CTX]
1381 mov c,[4*2 + CTX]
1382 mov d,[4*3 + CTX]
1383 mov e,[4*4 + CTX]
1384 mov f,[4*5 + CTX]
1385 mov g,[4*6 + CTX]
1386 mov h,[4*7 + CTX]
1387
     ;; load the three shuffle masks once; they are reused for every block
1388 movdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
1389 movdqa SHUF_00BA, [_SHUF_00BA wrt rip]
1390 movdqa SHUF_DC00, [_SHUF_DC00 wrt rip]
1391
     ;; loop0: one iteration per 64-byte block
1392loop0:
     ;; restart at the beginning of the constant table for each block
1393 lea TBL,[K256 wrt rip]
1394
1395 ;; byte swap first 16 dwords
1396 COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
1397 COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
1398 COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
1399 COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
1400
     ;; keep the block pointer on the stack; it is reloaded after the rounds
     ;; (presumably because INP's register is reused meanwhile -- confirm
     ;; against the register alias definitions)
1401 mov [rsp + _INP], INP
1402
1403 ;; schedule 48 input dwords, by doing 3 rounds of 16 each
1404 mov SRND, 3
1405align 16
     ;; loop1: each step stages (table row + X0) in the _XFER slot and then
     ;; runs FOUR_ROUNDS_AND_SCHED, which reads that slot. The macro ends with
     ;; rotate_Xs, presumably so that X0 names the next four message dwords
     ;; on every step.
1406loop1:
1407 movdqa XFER, [TBL + 0*16]
1408 paddd XFER, X0
1409 movdqa [rsp + _XFER], XFER
1410 FOUR_ROUNDS_AND_SCHED
1411
1412 movdqa XFER, [TBL + 1*16]
1413 paddd XFER, X0
1414 movdqa [rsp + _XFER], XFER
1415 FOUR_ROUNDS_AND_SCHED
1416
1417 movdqa XFER, [TBL + 2*16]
1418 paddd XFER, X0
1419 movdqa [rsp + _XFER], XFER
1420 FOUR_ROUNDS_AND_SCHED
1421
1422 movdqa XFER, [TBL + 3*16]
1423 paddd XFER, X0
1424 movdqa [rsp + _XFER], XFER
     ;; advance to the next four table rows; row 3 was already loaded above
1425 add TBL, 4*16
1426 FOUR_ROUNDS_AND_SCHED
1427
1428 sub SRND, 1
1429 jne loop1
1430
     ;; loop2: the remaining 16 rounds need no further scheduling. Each of the
     ;; two iterations runs 8 rounds from X0 and X1; the table rows are added
     ;; into X0/X1 in place because the message dwords are not needed again.
1431 mov SRND, 2
1432loop2:
1433 paddd X0, [TBL + 0*16]
1434 movdqa [rsp + _XFER], X0
1435 DO_ROUND 0
1436 DO_ROUND 1
1437 DO_ROUND 2
1438 DO_ROUND 3
1439 paddd X1, [TBL + 1*16]
1440 movdqa [rsp + _XFER], X1
1441 add TBL, 2*16
1442 DO_ROUND 0
1443 DO_ROUND 1
1444 DO_ROUND 2
1445 DO_ROUND 3
1446
     ;; second iteration consumes what is currently in X2 and X3
1447 movdqa X0, X2
1448 movdqa X1, X3
1449
1450 sub SRND, 1
1451 jne loop2
1452
     ;; fold the working variables back into the digest. addm is defined
     ;; elsewhere; presumably it adds the memory dword into the register and
     ;; stores the sum back, leaving a..h ready for the next block -- confirm.
1453 addm [4*0 + CTX],a
1454 addm [4*1 + CTX],b
1455 addm [4*2 + CTX],c
1456 addm [4*3 + CTX],d
1457 addm [4*4 + CTX],e
1458 addm [4*5 + CTX],f
1459 addm [4*6 + CTX],g
1460 addm [4*7 + CTX],h
1461
     ;; step to the next 64-byte block and stop once the end pointer is reached
1462 mov INP, [rsp + _INP]
1463 add INP, 64
1464 cmp INP, [rsp + _INP_END]
1465 jne loop0
1466
     ;; epilogue: mirror of the prologue, in reverse order
1467done_hash:
1468%ifndef LINUX
1469 movdqa xmm6,[rsp + _XMM_SAVE + 0*16]
1470 movdqa xmm7,[rsp + _XMM_SAVE + 1*16]
1471 movdqa xmm8,[rsp + _XMM_SAVE + 2*16]
1472 movdqa xmm9,[rsp + _XMM_SAVE + 3*16]
1473 movdqa xmm10,[rsp + _XMM_SAVE + 4*16]
1474 movdqa xmm11,[rsp + _XMM_SAVE + 5*16]
1475 movdqa xmm12,[rsp + _XMM_SAVE + 6*16]
1476%endif
1477
1478 add rsp, STACK_SIZE
1479
1480 pop r15
1481 pop r14
1482 pop r13
1483 pop rbp
1484%ifndef LINUX
1485 pop rdi
1486 pop rsi
1487%endif
1488 pop rbx
1489
1490 ret
1491
1492
1493section .data
1494align 64
; K256: table of 64 dword constants, laid out as 16 rows of 4 dwords so that
; one 16-byte row can be fetched per movdqa/paddd. The hashing loop walks it
; through TBL and adds each row to four message dwords (the "k + w" term).
1495K256:
1496 dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
1497 dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
1498 dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
1499 dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
1500 dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
1501 dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
1502 dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
1503 dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
1504 dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
1505 dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
1506 dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
1507 dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
1508 dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
1509 dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
1510 dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
1511 dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
1512
; byte-selector mask that reverses the byte order inside each of the four
; dwords (selector bytes, lowest first: 3,2,1,0, 7,6,5,4, ...). It is loaded
; into BYTE_FLIP_MASK and handed to COPY_XMM_AND_BSWAP (defined elsewhere;
; presumably applied with pshufb -- confirm).
1513PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
1514
1515; shuffle xBxA -> 00BA
; pshufb mask: the low 8 bytes pick source dwords 0 and 2; the 0xFF selectors
; make pshufb write zeros into the upper 8 bytes.
1516_SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
1517
1518; shuffle xDxC -> DC00
; pshufb mask: the 0xFF selectors zero the low 8 bytes; the upper 8 bytes pick
; source dwords 0 and 2.
1519_SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
1520*/
1521
1522#endif
void Transform(uint32_t *s, const unsigned char *chunk, size_t blocks)