sha256_sse4.cpp
1// Copyright (c) 2017-2022 The Bitcoin Core developers
2// Distributed under the MIT software license, see the accompanying
3// file COPYING or http://www.opensource.org/licenses/mit-license.php.
4//
5// This is a translation to GCC extended asm syntax from YASM code by Intel
6// (available at the bottom of this file).
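//
// Usage sketch (illustrative only, not part of the original file): Transform
// compresses whole 64-byte blocks into the state `s`; padding and
// finalization are the caller's responsibility (handled by the higher-level
// SHA-256 code in Bitcoin Core, which selects this routine at runtime when
// SSE4 is available). Starting from the standard SHA-256 initial state:
//
//   uint32_t state[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
//                        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
//   unsigned char block[64] = {/* one fully padded 64-byte block */};
//   sha256_sse4::Transform(state, block, 1); // state now holds the updated midstate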
7
8#include <cstdlib>
9#include <stdint.h>
10
11#if defined(__x86_64__) || defined(__amd64__)
12
13namespace sha256_sse4
14{
15void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
16#if defined(__clang__) && !defined(__OPTIMIZE__)
17 /*
18 clang is unable to compile this with -O0 and -fsanitize=address.
19 See upstream bug: https://github.com/llvm/llvm-project/issues/92182
20 */
21 __attribute__((no_sanitize("address")))
22#endif
23{
24 static const uint32_t K256 alignas(16) [] = {
25 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
26 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
27 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
28 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
29 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
30 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
31 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
32 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
33 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
34 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
35 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
36 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
37 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
38 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
39 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
40 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
41 };
42 static const uint32_t FLIP_MASK alignas(16) [] = {0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f};
43 static const uint32_t SHUF_00BA alignas(16) [] = {0x03020100, 0x0b0a0908, 0xffffffff, 0xffffffff};
44 static const uint32_t SHUF_DC00 alignas(16) [] = {0xffffffff, 0xffffffff, 0x03020100, 0x0b0a0908};
45 uint32_t a, b, c, d, f, g, h, y0, y1, y2;
46 uint64_t tbl;
47 uint64_t inp_end, inp;
48 uint32_t xfer alignas(16) [4];
49
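    // Operand map for the asm statement below (taken from its constraint lists):
    //   %0 = s, %1 = chunk (later reused as the inner-loop counter), %2 = blocks
    //   (its low 32 bits, %k2, carry the state word 'e'), %3..%9 = a,b,c,d,f,g,h,
    //   %10..%12 = y0,y1,y2, %13 = tbl, %14 = inp_end, %15 = inp, %16 = xfer,
    //   %17 = K256, %18 = FLIP_MASK, %19 = SHUF_00BA, %20 = SHUF_DC00.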
50 __asm__ __volatile__(
51 "shl $0x6,%2;"
52 "je Ldone_hash_%=;"
53 "add %1,%2;"
54 "mov %2,%14;"
55 "mov (%0),%3;"
56 "mov 0x4(%0),%4;"
57 "mov 0x8(%0),%5;"
58 "mov 0xc(%0),%6;"
59 "mov 0x10(%0),%k2;"
60 "mov 0x14(%0),%7;"
61 "mov 0x18(%0),%8;"
62 "mov 0x1c(%0),%9;"
63 "movdqa %18,%%xmm12;"
64 "movdqa %19,%%xmm10;"
65 "movdqa %20,%%xmm11;"
66
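        // Lloop0: outer loop, one iteration per 64-byte block; loads the 16
        // message words into xmm4..xmm7 and byte-swaps them via FLIP_MASK.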
67 "Lloop0_%=:"
68 "lea %17,%13;"
69 "movdqu (%1),%%xmm4;"
70 "pshufb %%xmm12,%%xmm4;"
71 "movdqu 0x10(%1),%%xmm5;"
72 "pshufb %%xmm12,%%xmm5;"
73 "movdqu 0x20(%1),%%xmm6;"
74 "pshufb %%xmm12,%%xmm6;"
75 "movdqu 0x30(%1),%%xmm7;"
76 "pshufb %%xmm12,%%xmm7;"
77 "mov %1,%15;"
78 "mov $3,%1;"
79
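        // Lloop1: three iterations of 16 rounds each (rounds 0-47), computing
        // the message schedule on the fly; %1 serves as the iteration counter.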
80 "Lloop1_%=:"
81 "movdqa 0x0(%13),%%xmm9;"
82 "paddd %%xmm4,%%xmm9;"
83 "movdqa %%xmm9,%16;"
84 "movdqa %%xmm7,%%xmm0;"
85 "mov %k2,%10;"
86 "ror $0xe,%10;"
87 "mov %3,%11;"
88 "palignr $0x4,%%xmm6,%%xmm0;"
89 "ror $0x9,%11;"
90 "xor %k2,%10;"
91 "mov %7,%12;"
92 "ror $0x5,%10;"
93 "movdqa %%xmm5,%%xmm1;"
94 "xor %3,%11;"
95 "xor %8,%12;"
96 "paddd %%xmm4,%%xmm0;"
97 "xor %k2,%10;"
98 "and %k2,%12;"
99 "ror $0xb,%11;"
100 "palignr $0x4,%%xmm4,%%xmm1;"
101 "xor %3,%11;"
102 "ror $0x6,%10;"
103 "xor %8,%12;"
104 "movdqa %%xmm1,%%xmm2;"
105 "ror $0x2,%11;"
106 "add %10,%12;"
107 "add %16,%12;"
108 "movdqa %%xmm1,%%xmm3;"
109 "mov %3,%10;"
110 "add %12,%9;"
111 "mov %3,%12;"
112 "pslld $0x19,%%xmm1;"
113 "or %5,%10;"
114 "add %9,%6;"
115 "and %5,%12;"
116 "psrld $0x7,%%xmm2;"
117 "and %4,%10;"
118 "add %11,%9;"
119 "por %%xmm2,%%xmm1;"
120 "or %12,%10;"
121 "add %10,%9;"
122 "movdqa %%xmm3,%%xmm2;"
123 "mov %6,%10;"
124 "mov %9,%11;"
125 "movdqa %%xmm3,%%xmm8;"
126 "ror $0xe,%10;"
127 "xor %6,%10;"
128 "mov %k2,%12;"
129 "ror $0x9,%11;"
130 "pslld $0xe,%%xmm3;"
131 "xor %9,%11;"
132 "ror $0x5,%10;"
133 "xor %7,%12;"
134 "psrld $0x12,%%xmm2;"
135 "ror $0xb,%11;"
136 "xor %6,%10;"
137 "and %6,%12;"
138 "ror $0x6,%10;"
139 "pxor %%xmm3,%%xmm1;"
140 "xor %9,%11;"
141 "xor %7,%12;"
142 "psrld $0x3,%%xmm8;"
143 "add %10,%12;"
144 "add 4+%16,%12;"
145 "ror $0x2,%11;"
146 "pxor %%xmm2,%%xmm1;"
147 "mov %9,%10;"
148 "add %12,%8;"
149 "mov %9,%12;"
150 "pxor %%xmm8,%%xmm1;"
151 "or %4,%10;"
152 "add %8,%5;"
153 "and %4,%12;"
154 "pshufd $0xfa,%%xmm7,%%xmm2;"
155 "and %3,%10;"
156 "add %11,%8;"
157 "paddd %%xmm1,%%xmm0;"
158 "or %12,%10;"
159 "add %10,%8;"
160 "movdqa %%xmm2,%%xmm3;"
161 "mov %5,%10;"
162 "mov %8,%11;"
163 "ror $0xe,%10;"
164 "movdqa %%xmm2,%%xmm8;"
165 "xor %5,%10;"
166 "ror $0x9,%11;"
167 "mov %6,%12;"
168 "xor %8,%11;"
169 "ror $0x5,%10;"
170 "psrlq $0x11,%%xmm2;"
171 "xor %k2,%12;"
172 "psrlq $0x13,%%xmm3;"
173 "xor %5,%10;"
174 "and %5,%12;"
175 "psrld $0xa,%%xmm8;"
176 "ror $0xb,%11;"
177 "xor %8,%11;"
178 "xor %k2,%12;"
179 "ror $0x6,%10;"
180 "pxor %%xmm3,%%xmm2;"
181 "add %10,%12;"
182 "ror $0x2,%11;"
183 "add 8+%16,%12;"
184 "pxor %%xmm2,%%xmm8;"
185 "mov %8,%10;"
186 "add %12,%7;"
187 "mov %8,%12;"
188 "pshufb %%xmm10,%%xmm8;"
189 "or %3,%10;"
190 "add %7,%4;"
191 "and %3,%12;"
192 "paddd %%xmm8,%%xmm0;"
193 "and %9,%10;"
194 "add %11,%7;"
195 "pshufd $0x50,%%xmm0,%%xmm2;"
196 "or %12,%10;"
197 "add %10,%7;"
198 "movdqa %%xmm2,%%xmm3;"
199 "mov %4,%10;"
200 "ror $0xe,%10;"
201 "mov %7,%11;"
202 "movdqa %%xmm2,%%xmm4;"
203 "ror $0x9,%11;"
204 "xor %4,%10;"
205 "mov %5,%12;"
206 "ror $0x5,%10;"
207 "psrlq $0x11,%%xmm2;"
208 "xor %7,%11;"
209 "xor %6,%12;"
210 "psrlq $0x13,%%xmm3;"
211 "xor %4,%10;"
212 "and %4,%12;"
213 "ror $0xb,%11;"
214 "psrld $0xa,%%xmm4;"
215 "xor %7,%11;"
216 "ror $0x6,%10;"
217 "xor %6,%12;"
218 "pxor %%xmm3,%%xmm2;"
219 "ror $0x2,%11;"
220 "add %10,%12;"
221 "add 12+%16,%12;"
222 "pxor %%xmm2,%%xmm4;"
223 "mov %7,%10;"
224 "add %12,%k2;"
225 "mov %7,%12;"
226 "pshufb %%xmm11,%%xmm4;"
227 "or %9,%10;"
228 "add %k2,%3;"
229 "and %9,%12;"
230 "paddd %%xmm0,%%xmm4;"
231 "and %8,%10;"
232 "add %11,%k2;"
233 "or %12,%10;"
234 "add %10,%k2;"
235 "movdqa 0x10(%13),%%xmm9;"
236 "paddd %%xmm5,%%xmm9;"
237 "movdqa %%xmm9,%16;"
238 "movdqa %%xmm4,%%xmm0;"
239 "mov %3,%10;"
240 "ror $0xe,%10;"
241 "mov %k2,%11;"
242 "palignr $0x4,%%xmm7,%%xmm0;"
243 "ror $0x9,%11;"
244 "xor %3,%10;"
245 "mov %4,%12;"
246 "ror $0x5,%10;"
247 "movdqa %%xmm6,%%xmm1;"
248 "xor %k2,%11;"
249 "xor %5,%12;"
250 "paddd %%xmm5,%%xmm0;"
251 "xor %3,%10;"
252 "and %3,%12;"
253 "ror $0xb,%11;"
254 "palignr $0x4,%%xmm5,%%xmm1;"
255 "xor %k2,%11;"
256 "ror $0x6,%10;"
257 "xor %5,%12;"
258 "movdqa %%xmm1,%%xmm2;"
259 "ror $0x2,%11;"
260 "add %10,%12;"
261 "add %16,%12;"
262 "movdqa %%xmm1,%%xmm3;"
263 "mov %k2,%10;"
264 "add %12,%6;"
265 "mov %k2,%12;"
266 "pslld $0x19,%%xmm1;"
267 "or %8,%10;"
268 "add %6,%9;"
269 "and %8,%12;"
270 "psrld $0x7,%%xmm2;"
271 "and %7,%10;"
272 "add %11,%6;"
273 "por %%xmm2,%%xmm1;"
274 "or %12,%10;"
275 "add %10,%6;"
276 "movdqa %%xmm3,%%xmm2;"
277 "mov %9,%10;"
278 "mov %6,%11;"
279 "movdqa %%xmm3,%%xmm8;"
280 "ror $0xe,%10;"
281 "xor %9,%10;"
282 "mov %3,%12;"
283 "ror $0x9,%11;"
284 "pslld $0xe,%%xmm3;"
285 "xor %6,%11;"
286 "ror $0x5,%10;"
287 "xor %4,%12;"
288 "psrld $0x12,%%xmm2;"
289 "ror $0xb,%11;"
290 "xor %9,%10;"
291 "and %9,%12;"
292 "ror $0x6,%10;"
293 "pxor %%xmm3,%%xmm1;"
294 "xor %6,%11;"
295 "xor %4,%12;"
296 "psrld $0x3,%%xmm8;"
297 "add %10,%12;"
298 "add 4+%16,%12;"
299 "ror $0x2,%11;"
300 "pxor %%xmm2,%%xmm1;"
301 "mov %6,%10;"
302 "add %12,%5;"
303 "mov %6,%12;"
304 "pxor %%xmm8,%%xmm1;"
305 "or %7,%10;"
306 "add %5,%8;"
307 "and %7,%12;"
308 "pshufd $0xfa,%%xmm4,%%xmm2;"
309 "and %k2,%10;"
310 "add %11,%5;"
311 "paddd %%xmm1,%%xmm0;"
312 "or %12,%10;"
313 "add %10,%5;"
314 "movdqa %%xmm2,%%xmm3;"
315 "mov %8,%10;"
316 "mov %5,%11;"
317 "ror $0xe,%10;"
318 "movdqa %%xmm2,%%xmm8;"
319 "xor %8,%10;"
320 "ror $0x9,%11;"
321 "mov %9,%12;"
322 "xor %5,%11;"
323 "ror $0x5,%10;"
324 "psrlq $0x11,%%xmm2;"
325 "xor %3,%12;"
326 "psrlq $0x13,%%xmm3;"
327 "xor %8,%10;"
328 "and %8,%12;"
329 "psrld $0xa,%%xmm8;"
330 "ror $0xb,%11;"
331 "xor %5,%11;"
332 "xor %3,%12;"
333 "ror $0x6,%10;"
334 "pxor %%xmm3,%%xmm2;"
335 "add %10,%12;"
336 "ror $0x2,%11;"
337 "add 8+%16,%12;"
338 "pxor %%xmm2,%%xmm8;"
339 "mov %5,%10;"
340 "add %12,%4;"
341 "mov %5,%12;"
342 "pshufb %%xmm10,%%xmm8;"
343 "or %k2,%10;"
344 "add %4,%7;"
345 "and %k2,%12;"
346 "paddd %%xmm8,%%xmm0;"
347 "and %6,%10;"
348 "add %11,%4;"
349 "pshufd $0x50,%%xmm0,%%xmm2;"
350 "or %12,%10;"
351 "add %10,%4;"
352 "movdqa %%xmm2,%%xmm3;"
353 "mov %7,%10;"
354 "ror $0xe,%10;"
355 "mov %4,%11;"
356 "movdqa %%xmm2,%%xmm5;"
357 "ror $0x9,%11;"
358 "xor %7,%10;"
359 "mov %8,%12;"
360 "ror $0x5,%10;"
361 "psrlq $0x11,%%xmm2;"
362 "xor %4,%11;"
363 "xor %9,%12;"
364 "psrlq $0x13,%%xmm3;"
365 "xor %7,%10;"
366 "and %7,%12;"
367 "ror $0xb,%11;"
368 "psrld $0xa,%%xmm5;"
369 "xor %4,%11;"
370 "ror $0x6,%10;"
371 "xor %9,%12;"
372 "pxor %%xmm3,%%xmm2;"
373 "ror $0x2,%11;"
374 "add %10,%12;"
375 "add 12+%16,%12;"
376 "pxor %%xmm2,%%xmm5;"
377 "mov %4,%10;"
378 "add %12,%3;"
379 "mov %4,%12;"
380 "pshufb %%xmm11,%%xmm5;"
381 "or %6,%10;"
382 "add %3,%k2;"
383 "and %6,%12;"
384 "paddd %%xmm0,%%xmm5;"
385 "and %5,%10;"
386 "add %11,%3;"
387 "or %12,%10;"
388 "add %10,%3;"
389 "movdqa 0x20(%13),%%xmm9;"
390 "paddd %%xmm6,%%xmm9;"
391 "movdqa %%xmm9,%16;"
392 "movdqa %%xmm5,%%xmm0;"
393 "mov %k2,%10;"
394 "ror $0xe,%10;"
395 "mov %3,%11;"
396 "palignr $0x4,%%xmm4,%%xmm0;"
397 "ror $0x9,%11;"
398 "xor %k2,%10;"
399 "mov %7,%12;"
400 "ror $0x5,%10;"
401 "movdqa %%xmm7,%%xmm1;"
402 "xor %3,%11;"
403 "xor %8,%12;"
404 "paddd %%xmm6,%%xmm0;"
405 "xor %k2,%10;"
406 "and %k2,%12;"
407 "ror $0xb,%11;"
408 "palignr $0x4,%%xmm6,%%xmm1;"
409 "xor %3,%11;"
410 "ror $0x6,%10;"
411 "xor %8,%12;"
412 "movdqa %%xmm1,%%xmm2;"
413 "ror $0x2,%11;"
414 "add %10,%12;"
415 "add %16,%12;"
416 "movdqa %%xmm1,%%xmm3;"
417 "mov %3,%10;"
418 "add %12,%9;"
419 "mov %3,%12;"
420 "pslld $0x19,%%xmm1;"
421 "or %5,%10;"
422 "add %9,%6;"
423 "and %5,%12;"
424 "psrld $0x7,%%xmm2;"
425 "and %4,%10;"
426 "add %11,%9;"
427 "por %%xmm2,%%xmm1;"
428 "or %12,%10;"
429 "add %10,%9;"
430 "movdqa %%xmm3,%%xmm2;"
431 "mov %6,%10;"
432 "mov %9,%11;"
433 "movdqa %%xmm3,%%xmm8;"
434 "ror $0xe,%10;"
435 "xor %6,%10;"
436 "mov %k2,%12;"
437 "ror $0x9,%11;"
438 "pslld $0xe,%%xmm3;"
439 "xor %9,%11;"
440 "ror $0x5,%10;"
441 "xor %7,%12;"
442 "psrld $0x12,%%xmm2;"
443 "ror $0xb,%11;"
444 "xor %6,%10;"
445 "and %6,%12;"
446 "ror $0x6,%10;"
447 "pxor %%xmm3,%%xmm1;"
448 "xor %9,%11;"
449 "xor %7,%12;"
450 "psrld $0x3,%%xmm8;"
451 "add %10,%12;"
452 "add 4+%16,%12;"
453 "ror $0x2,%11;"
454 "pxor %%xmm2,%%xmm1;"
455 "mov %9,%10;"
456 "add %12,%8;"
457 "mov %9,%12;"
458 "pxor %%xmm8,%%xmm1;"
459 "or %4,%10;"
460 "add %8,%5;"
461 "and %4,%12;"
462 "pshufd $0xfa,%%xmm5,%%xmm2;"
463 "and %3,%10;"
464 "add %11,%8;"
465 "paddd %%xmm1,%%xmm0;"
466 "or %12,%10;"
467 "add %10,%8;"
468 "movdqa %%xmm2,%%xmm3;"
469 "mov %5,%10;"
470 "mov %8,%11;"
471 "ror $0xe,%10;"
472 "movdqa %%xmm2,%%xmm8;"
473 "xor %5,%10;"
474 "ror $0x9,%11;"
475 "mov %6,%12;"
476 "xor %8,%11;"
477 "ror $0x5,%10;"
478 "psrlq $0x11,%%xmm2;"
479 "xor %k2,%12;"
480 "psrlq $0x13,%%xmm3;"
481 "xor %5,%10;"
482 "and %5,%12;"
483 "psrld $0xa,%%xmm8;"
484 "ror $0xb,%11;"
485 "xor %8,%11;"
486 "xor %k2,%12;"
487 "ror $0x6,%10;"
488 "pxor %%xmm3,%%xmm2;"
489 "add %10,%12;"
490 "ror $0x2,%11;"
491 "add 8+%16,%12;"
492 "pxor %%xmm2,%%xmm8;"
493 "mov %8,%10;"
494 "add %12,%7;"
495 "mov %8,%12;"
496 "pshufb %%xmm10,%%xmm8;"
497 "or %3,%10;"
498 "add %7,%4;"
499 "and %3,%12;"
500 "paddd %%xmm8,%%xmm0;"
501 "and %9,%10;"
502 "add %11,%7;"
503 "pshufd $0x50,%%xmm0,%%xmm2;"
504 "or %12,%10;"
505 "add %10,%7;"
506 "movdqa %%xmm2,%%xmm3;"
507 "mov %4,%10;"
508 "ror $0xe,%10;"
509 "mov %7,%11;"
510 "movdqa %%xmm2,%%xmm6;"
511 "ror $0x9,%11;"
512 "xor %4,%10;"
513 "mov %5,%12;"
514 "ror $0x5,%10;"
515 "psrlq $0x11,%%xmm2;"
516 "xor %7,%11;"
517 "xor %6,%12;"
518 "psrlq $0x13,%%xmm3;"
519 "xor %4,%10;"
520 "and %4,%12;"
521 "ror $0xb,%11;"
522 "psrld $0xa,%%xmm6;"
523 "xor %7,%11;"
524 "ror $0x6,%10;"
525 "xor %6,%12;"
526 "pxor %%xmm3,%%xmm2;"
527 "ror $0x2,%11;"
528 "add %10,%12;"
529 "add 12+%16,%12;"
530 "pxor %%xmm2,%%xmm6;"
531 "mov %7,%10;"
532 "add %12,%k2;"
533 "mov %7,%12;"
534 "pshufb %%xmm11,%%xmm6;"
535 "or %9,%10;"
536 "add %k2,%3;"
537 "and %9,%12;"
538 "paddd %%xmm0,%%xmm6;"
539 "and %8,%10;"
540 "add %11,%k2;"
541 "or %12,%10;"
542 "add %10,%k2;"
543 "movdqa 0x30(%13),%%xmm9;"
544 "paddd %%xmm7,%%xmm9;"
545 "movdqa %%xmm9,%16;"
546 "add $0x40,%13;"
547 "movdqa %%xmm6,%%xmm0;"
548 "mov %3,%10;"
549 "ror $0xe,%10;"
550 "mov %k2,%11;"
551 "palignr $0x4,%%xmm5,%%xmm0;"
552 "ror $0x9,%11;"
553 "xor %3,%10;"
554 "mov %4,%12;"
555 "ror $0x5,%10;"
556 "movdqa %%xmm4,%%xmm1;"
557 "xor %k2,%11;"
558 "xor %5,%12;"
559 "paddd %%xmm7,%%xmm0;"
560 "xor %3,%10;"
561 "and %3,%12;"
562 "ror $0xb,%11;"
563 "palignr $0x4,%%xmm7,%%xmm1;"
564 "xor %k2,%11;"
565 "ror $0x6,%10;"
566 "xor %5,%12;"
567 "movdqa %%xmm1,%%xmm2;"
568 "ror $0x2,%11;"
569 "add %10,%12;"
570 "add %16,%12;"
571 "movdqa %%xmm1,%%xmm3;"
572 "mov %k2,%10;"
573 "add %12,%6;"
574 "mov %k2,%12;"
575 "pslld $0x19,%%xmm1;"
576 "or %8,%10;"
577 "add %6,%9;"
578 "and %8,%12;"
579 "psrld $0x7,%%xmm2;"
580 "and %7,%10;"
581 "add %11,%6;"
582 "por %%xmm2,%%xmm1;"
583 "or %12,%10;"
584 "add %10,%6;"
585 "movdqa %%xmm3,%%xmm2;"
586 "mov %9,%10;"
587 "mov %6,%11;"
588 "movdqa %%xmm3,%%xmm8;"
589 "ror $0xe,%10;"
590 "xor %9,%10;"
591 "mov %3,%12;"
592 "ror $0x9,%11;"
593 "pslld $0xe,%%xmm3;"
594 "xor %6,%11;"
595 "ror $0x5,%10;"
596 "xor %4,%12;"
597 "psrld $0x12,%%xmm2;"
598 "ror $0xb,%11;"
599 "xor %9,%10;"
600 "and %9,%12;"
601 "ror $0x6,%10;"
602 "pxor %%xmm3,%%xmm1;"
603 "xor %6,%11;"
604 "xor %4,%12;"
605 "psrld $0x3,%%xmm8;"
606 "add %10,%12;"
607 "add 4+%16,%12;"
608 "ror $0x2,%11;"
609 "pxor %%xmm2,%%xmm1;"
610 "mov %6,%10;"
611 "add %12,%5;"
612 "mov %6,%12;"
613 "pxor %%xmm8,%%xmm1;"
614 "or %7,%10;"
615 "add %5,%8;"
616 "and %7,%12;"
617 "pshufd $0xfa,%%xmm6,%%xmm2;"
618 "and %k2,%10;"
619 "add %11,%5;"
620 "paddd %%xmm1,%%xmm0;"
621 "or %12,%10;"
622 "add %10,%5;"
623 "movdqa %%xmm2,%%xmm3;"
624 "mov %8,%10;"
625 "mov %5,%11;"
626 "ror $0xe,%10;"
627 "movdqa %%xmm2,%%xmm8;"
628 "xor %8,%10;"
629 "ror $0x9,%11;"
630 "mov %9,%12;"
631 "xor %5,%11;"
632 "ror $0x5,%10;"
633 "psrlq $0x11,%%xmm2;"
634 "xor %3,%12;"
635 "psrlq $0x13,%%xmm3;"
636 "xor %8,%10;"
637 "and %8,%12;"
638 "psrld $0xa,%%xmm8;"
639 "ror $0xb,%11;"
640 "xor %5,%11;"
641 "xor %3,%12;"
642 "ror $0x6,%10;"
643 "pxor %%xmm3,%%xmm2;"
644 "add %10,%12;"
645 "ror $0x2,%11;"
646 "add 8+%16,%12;"
647 "pxor %%xmm2,%%xmm8;"
648 "mov %5,%10;"
649 "add %12,%4;"
650 "mov %5,%12;"
651 "pshufb %%xmm10,%%xmm8;"
652 "or %k2,%10;"
653 "add %4,%7;"
654 "and %k2,%12;"
655 "paddd %%xmm8,%%xmm0;"
656 "and %6,%10;"
657 "add %11,%4;"
658 "pshufd $0x50,%%xmm0,%%xmm2;"
659 "or %12,%10;"
660 "add %10,%4;"
661 "movdqa %%xmm2,%%xmm3;"
662 "mov %7,%10;"
663 "ror $0xe,%10;"
664 "mov %4,%11;"
665 "movdqa %%xmm2,%%xmm7;"
666 "ror $0x9,%11;"
667 "xor %7,%10;"
668 "mov %8,%12;"
669 "ror $0x5,%10;"
670 "psrlq $0x11,%%xmm2;"
671 "xor %4,%11;"
672 "xor %9,%12;"
673 "psrlq $0x13,%%xmm3;"
674 "xor %7,%10;"
675 "and %7,%12;"
676 "ror $0xb,%11;"
677 "psrld $0xa,%%xmm7;"
678 "xor %4,%11;"
679 "ror $0x6,%10;"
680 "xor %9,%12;"
681 "pxor %%xmm3,%%xmm2;"
682 "ror $0x2,%11;"
683 "add %10,%12;"
684 "add 12+%16,%12;"
685 "pxor %%xmm2,%%xmm7;"
686 "mov %4,%10;"
687 "add %12,%3;"
688 "mov %4,%12;"
689 "pshufb %%xmm11,%%xmm7;"
690 "or %6,%10;"
691 "add %3,%k2;"
692 "and %6,%12;"
693 "paddd %%xmm0,%%xmm7;"
694 "and %5,%10;"
695 "add %11,%3;"
696 "or %12,%10;"
697 "add %10,%3;"
698 "sub $0x1,%1;"
699 "jne Lloop1_%=;"
700 "mov $0x2,%1;"
701
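        // Lloop2: two iterations of 8 rounds each (rounds 48-63); the message
        // schedule is already complete, so only the round function remains.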
702 "Lloop2_%=:"
703 "paddd 0x0(%13),%%xmm4;"
704 "movdqa %%xmm4,%16;"
705 "mov %k2,%10;"
706 "ror $0xe,%10;"
707 "mov %3,%11;"
708 "xor %k2,%10;"
709 "ror $0x9,%11;"
710 "mov %7,%12;"
711 "xor %3,%11;"
712 "ror $0x5,%10;"
713 "xor %8,%12;"
714 "xor %k2,%10;"
715 "ror $0xb,%11;"
716 "and %k2,%12;"
717 "xor %3,%11;"
718 "ror $0x6,%10;"
719 "xor %8,%12;"
720 "add %10,%12;"
721 "ror $0x2,%11;"
722 "add %16,%12;"
723 "mov %3,%10;"
724 "add %12,%9;"
725 "mov %3,%12;"
726 "or %5,%10;"
727 "add %9,%6;"
728 "and %5,%12;"
729 "and %4,%10;"
730 "add %11,%9;"
731 "or %12,%10;"
732 "add %10,%9;"
733 "mov %6,%10;"
734 "ror $0xe,%10;"
735 "mov %9,%11;"
736 "xor %6,%10;"
737 "ror $0x9,%11;"
738 "mov %k2,%12;"
739 "xor %9,%11;"
740 "ror $0x5,%10;"
741 "xor %7,%12;"
742 "xor %6,%10;"
743 "ror $0xb,%11;"
744 "and %6,%12;"
745 "xor %9,%11;"
746 "ror $0x6,%10;"
747 "xor %7,%12;"
748 "add %10,%12;"
749 "ror $0x2,%11;"
750 "add 4+%16,%12;"
751 "mov %9,%10;"
752 "add %12,%8;"
753 "mov %9,%12;"
754 "or %4,%10;"
755 "add %8,%5;"
756 "and %4,%12;"
757 "and %3,%10;"
758 "add %11,%8;"
759 "or %12,%10;"
760 "add %10,%8;"
761 "mov %5,%10;"
762 "ror $0xe,%10;"
763 "mov %8,%11;"
764 "xor %5,%10;"
765 "ror $0x9,%11;"
766 "mov %6,%12;"
767 "xor %8,%11;"
768 "ror $0x5,%10;"
769 "xor %k2,%12;"
770 "xor %5,%10;"
771 "ror $0xb,%11;"
772 "and %5,%12;"
773 "xor %8,%11;"
774 "ror $0x6,%10;"
775 "xor %k2,%12;"
776 "add %10,%12;"
777 "ror $0x2,%11;"
778 "add 8+%16,%12;"
779 "mov %8,%10;"
780 "add %12,%7;"
781 "mov %8,%12;"
782 "or %3,%10;"
783 "add %7,%4;"
784 "and %3,%12;"
785 "and %9,%10;"
786 "add %11,%7;"
787 "or %12,%10;"
788 "add %10,%7;"
789 "mov %4,%10;"
790 "ror $0xe,%10;"
791 "mov %7,%11;"
792 "xor %4,%10;"
793 "ror $0x9,%11;"
794 "mov %5,%12;"
795 "xor %7,%11;"
796 "ror $0x5,%10;"
797 "xor %6,%12;"
798 "xor %4,%10;"
799 "ror $0xb,%11;"
800 "and %4,%12;"
801 "xor %7,%11;"
802 "ror $0x6,%10;"
803 "xor %6,%12;"
804 "add %10,%12;"
805 "ror $0x2,%11;"
806 "add 12+%16,%12;"
807 "mov %7,%10;"
808 "add %12,%k2;"
809 "mov %7,%12;"
810 "or %9,%10;"
811 "add %k2,%3;"
812 "and %9,%12;"
813 "and %8,%10;"
814 "add %11,%k2;"
815 "or %12,%10;"
816 "add %10,%k2;"
817 "paddd 0x10(%13),%%xmm5;"
818 "movdqa %%xmm5,%16;"
819 "add $0x20,%13;"
820 "mov %3,%10;"
821 "ror $0xe,%10;"
822 "mov %k2,%11;"
823 "xor %3,%10;"
824 "ror $0x9,%11;"
825 "mov %4,%12;"
826 "xor %k2,%11;"
827 "ror $0x5,%10;"
828 "xor %5,%12;"
829 "xor %3,%10;"
830 "ror $0xb,%11;"
831 "and %3,%12;"
832 "xor %k2,%11;"
833 "ror $0x6,%10;"
834 "xor %5,%12;"
835 "add %10,%12;"
836 "ror $0x2,%11;"
837 "add %16,%12;"
838 "mov %k2,%10;"
839 "add %12,%6;"
840 "mov %k2,%12;"
841 "or %8,%10;"
842 "add %6,%9;"
843 "and %8,%12;"
844 "and %7,%10;"
845 "add %11,%6;"
846 "or %12,%10;"
847 "add %10,%6;"
848 "mov %9,%10;"
849 "ror $0xe,%10;"
850 "mov %6,%11;"
851 "xor %9,%10;"
852 "ror $0x9,%11;"
853 "mov %3,%12;"
854 "xor %6,%11;"
855 "ror $0x5,%10;"
856 "xor %4,%12;"
857 "xor %9,%10;"
858 "ror $0xb,%11;"
859 "and %9,%12;"
860 "xor %6,%11;"
861 "ror $0x6,%10;"
862 "xor %4,%12;"
863 "add %10,%12;"
864 "ror $0x2,%11;"
865 "add 4+%16,%12;"
866 "mov %6,%10;"
867 "add %12,%5;"
868 "mov %6,%12;"
869 "or %7,%10;"
870 "add %5,%8;"
871 "and %7,%12;"
872 "and %k2,%10;"
873 "add %11,%5;"
874 "or %12,%10;"
875 "add %10,%5;"
876 "mov %8,%10;"
877 "ror $0xe,%10;"
878 "mov %5,%11;"
879 "xor %8,%10;"
880 "ror $0x9,%11;"
881 "mov %9,%12;"
882 "xor %5,%11;"
883 "ror $0x5,%10;"
884 "xor %3,%12;"
885 "xor %8,%10;"
886 "ror $0xb,%11;"
887 "and %8,%12;"
888 "xor %5,%11;"
889 "ror $0x6,%10;"
890 "xor %3,%12;"
891 "add %10,%12;"
892 "ror $0x2,%11;"
893 "add 8+%16,%12;"
894 "mov %5,%10;"
895 "add %12,%4;"
896 "mov %5,%12;"
897 "or %k2,%10;"
898 "add %4,%7;"
899 "and %k2,%12;"
900 "and %6,%10;"
901 "add %11,%4;"
902 "or %12,%10;"
903 "add %10,%4;"
904 "mov %7,%10;"
905 "ror $0xe,%10;"
906 "mov %4,%11;"
907 "xor %7,%10;"
908 "ror $0x9,%11;"
909 "mov %8,%12;"
910 "xor %4,%11;"
911 "ror $0x5,%10;"
912 "xor %9,%12;"
913 "xor %7,%10;"
914 "ror $0xb,%11;"
915 "and %7,%12;"
916 "xor %4,%11;"
917 "ror $0x6,%10;"
918 "xor %9,%12;"
919 "add %10,%12;"
920 "ror $0x2,%11;"
921 "add 12+%16,%12;"
922 "mov %4,%10;"
923 "add %12,%3;"
924 "mov %4,%12;"
925 "or %6,%10;"
926 "add %3,%k2;"
927 "and %6,%12;"
928 "and %5,%10;"
929 "add %11,%3;"
930 "or %12,%10;"
931 "add %10,%3;"
932 "movdqa %%xmm6,%%xmm4;"
933 "movdqa %%xmm7,%%xmm5;"
934 "sub $0x1,%1;"
935 "jne Lloop2_%=;"
936 "add (%0),%3;"
937 "mov %3,(%0);"
938 "add 0x4(%0),%4;"
939 "mov %4,0x4(%0);"
940 "add 0x8(%0),%5;"
941 "mov %5,0x8(%0);"
942 "add 0xc(%0),%6;"
943 "mov %6,0xc(%0);"
944 "add 0x10(%0),%k2;"
945 "mov %k2,0x10(%0);"
946 "add 0x14(%0),%7;"
947 "mov %7,0x14(%0);"
948 "add 0x18(%0),%8;"
949 "mov %8,0x18(%0);"
950 "add 0x1c(%0),%9;"
951 "mov %9,0x1c(%0);"
952 "mov %15,%1;"
953 "add $0x40,%1;"
954 "cmp %14,%1;"
955 "jne Lloop0_%=;"
956
957 "Ldone_hash_%=:"
958
 959 : "+r"(s), "+r"(chunk), "+r"(blocks), "=r"(a), "=r"(b), "=r"(c), "=r"(d), /* e is carried in %k2 (the 'blocks' register), so it has no slot here */ "=r"(f), "=r"(g), "=r"(h), "=r"(y0), "=r"(y1), "=r"(y2), "=r"(tbl), "+m"(inp_end), "+m"(inp), "+m"(xfer)
960 : "m"(K256), "m"(FLIP_MASK), "m"(SHUF_00BA), "m"(SHUF_DC00)
961 : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12"
962 );
963}
964}
965
966/*
967;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
968; Copyright (c) 2012, Intel Corporation
969;
970; All rights reserved.
971;
972; Redistribution and use in source and binary forms, with or without
973; modification, are permitted provided that the following conditions are
974; met:
975;
976; * Redistributions of source code must retain the above copyright
977; notice, this list of conditions and the following disclaimer.
978;
979; * Redistributions in binary form must reproduce the above copyright
980; notice, this list of conditions and the following disclaimer in the
981; documentation and/or other materials provided with the
982; distribution.
983;
984; * Neither the name of the Intel Corporation nor the names of its
985; contributors may be used to endorse or promote products derived from
986; this software without specific prior written permission.
987;
988;
989; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
990; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
991; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
992; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
993; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
994; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
995; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
996; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
997; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
998; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
999; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1000;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1001;
1002; Example YASM command lines:
1003; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm
1004; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm
1005;
1006;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1007;
1008; This code is described in an Intel White-Paper:
1009; "Fast SHA-256 Implementations on Intel Architecture Processors"
1010;
1011; To find it, surf to https://www.intel.com/p/en_US/embedded
1012; and search for that title.
1013; The paper is expected to be released roughly at the end of April, 2012
1014;
1015;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1016; This code schedules 1 blocks at a time, with 4 lanes per block
1017;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1018
1019%define MOVDQ movdqu ;; assume buffers not aligned
1020
1021;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
1022
1023; addm [mem], reg
1024; Add reg to mem using reg-mem add and store
1025%macro addm 2
1026 add %2, %1
1027 mov %1, %2
1028%endm
1029
1030;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1031
1032; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
1033; Load xmm with mem and byte swap each dword
1034%macro COPY_XMM_AND_BSWAP 3
1035 MOVDQ %1, %2
1036 pshufb %1, %3
1037%endmacro
1038
1039;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1040
1041%define X0 xmm4
1042%define X1 xmm5
1043%define X2 xmm6
1044%define X3 xmm7
1045
1046%define XTMP0 xmm0
1047%define XTMP1 xmm1
1048%define XTMP2 xmm2
1049%define XTMP3 xmm3
1050%define XTMP4 xmm8
1051%define XFER xmm9
1052
1053%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
1054%define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00
1055%define BYTE_FLIP_MASK xmm12
1056
1057%ifdef LINUX
1058%define NUM_BLKS rdx ; 3rd arg
1059%define CTX rsi ; 2nd arg
1060%define INP rdi ; 1st arg
1061
1062%define SRND rdi ; clobbers INP
1063%define c ecx
1064%define d r8d
1065%define e edx
1066%else
1067%define NUM_BLKS r8 ; 3rd arg
1068%define CTX rdx ; 2nd arg
1069%define INP rcx ; 1st arg
1070
1071%define SRND rcx ; clobbers INP
1072%define c edi
1073%define d esi
1074%define e r8d
1075
1076%endif
1077%define TBL rbp
1078%define a eax
1079%define b ebx
1080
1081%define f r9d
1082%define g r10d
1083%define h r11d
1084
1085%define y0 r13d
1086%define y1 r14d
1087%define y2 r15d
1088
1089
1090
1091_INP_END_SIZE equ 8
1092_INP_SIZE equ 8
1093_XFER_SIZE equ 8
1094%ifdef LINUX
1095_XMM_SAVE_SIZE equ 0
1096%else
1097_XMM_SAVE_SIZE equ 7*16
1098%endif
1099; STACK_SIZE plus pushes must be an odd multiple of 8
1100_ALIGN_SIZE equ 8
1101
1102_INP_END equ 0
1103_INP equ _INP_END + _INP_END_SIZE
1104_XFER equ _INP + _INP_SIZE
1105_XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE
1106STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE
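; A worked check of the alignment note above (values taken from the
; definitions in this listing): on Linux, STACK_SIZE = 8+8+8+8 = 32 and the
; prologue pushes 5 registers (40 bytes), so 32+40 = 72 = 9*8, an odd
; multiple of 8; on Windows, STACK_SIZE = 32+7*16 = 144 and 7 pushes add 56,
; giving 200 = 25*8. Together with the 8-byte return address this keeps rsp
; 16-byte aligned for the movdqa saves and loads below.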
1107
1108; rotate_Xs
1109; Rotate values of symbols X0...X3
1110%macro rotate_Xs 0
1111%xdefine X_ X0
1112%xdefine X0 X1
1113%xdefine X1 X2
1114%xdefine X2 X3
1115%xdefine X3 X_
1116%endm
1117
1118; ROTATE_ARGS
1119; Rotate values of symbols a...h
1120%macro ROTATE_ARGS 0
1121%xdefine TMP_ h
1122%xdefine h g
1123%xdefine g f
1124%xdefine f e
1125%xdefine e d
1126%xdefine d c
1127%xdefine c b
1128%xdefine b a
1129%xdefine a TMP_
1130%endm
1131
1132%macro FOUR_ROUNDS_AND_SCHED 0
1133 ;; compute s0 four at a time and s1 two at a time
1134 ;; compute W[-16] + W[-7] 4 at a time
1135 movdqa XTMP0, X3
1136 mov y0, e ; y0 = e
1137 ror y0, (25-11) ; y0 = e >> (25-11)
1138 mov y1, a ; y1 = a
1139 palignr XTMP0, X2, 4 ; XTMP0 = W[-7]
1140 ror y1, (22-13) ; y1 = a >> (22-13)
1141 xor y0, e ; y0 = e ^ (e >> (25-11))
1142 mov y2, f ; y2 = f
1143 ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1144 movdqa XTMP1, X1
1145 xor y1, a ; y1 = a ^ (a >> (22-13)
1146 xor y2, g ; y2 = f^g
1147 paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
1148 xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1149 and y2, e ; y2 = (f^g)&e
1150 ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1151 ;; compute s0
1152 palignr XTMP1, X0, 4 ; XTMP1 = W[-15]
1153 xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1154 ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1155 xor y2, g ; y2 = CH = ((f^g)&e)^g
1156 movdqa XTMP2, XTMP1 ; XTMP2 = W[-15]
1157 ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1158 add y2, y0 ; y2 = S1 + CH
1159 add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
1160 movdqa XTMP3, XTMP1 ; XTMP3 = W[-15]
1161 mov y0, a ; y0 = a
1162 add h, y2 ; h = h + S1 + CH + k + w
1163 mov y2, a ; y2 = a
1164 pslld XTMP1, (32-7)
1165 or y0, c ; y0 = a|c
1166 add d, h ; d = d + h + S1 + CH + k + w
1167 and y2, c ; y2 = a&c
1168 psrld XTMP2, 7
1169 and y0, b ; y0 = (a|c)&b
1170 add h, y1 ; h = h + S1 + CH + k + w + S0
1171 por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7
1172 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
1173 add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1174
1175ROTATE_ARGS
1176 movdqa XTMP2, XTMP3 ; XTMP2 = W[-15]
1177 mov y0, e ; y0 = e
1178 mov y1, a ; y1 = a
1179 movdqa XTMP4, XTMP3 ; XTMP4 = W[-15]
1180 ror y0, (25-11) ; y0 = e >> (25-11)
1181 xor y0, e ; y0 = e ^ (e >> (25-11))
1182 mov y2, f ; y2 = f
1183 ror y1, (22-13) ; y1 = a >> (22-13)
1184 pslld XTMP3, (32-18)
1185 xor y1, a ; y1 = a ^ (a >> (22-13)
1186 ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1187 xor y2, g ; y2 = f^g
1188 psrld XTMP2, 18
1189 ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1190 xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1191 and y2, e ; y2 = (f^g)&e
1192 ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1193 pxor XTMP1, XTMP3
1194 xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1195 xor y2, g ; y2 = CH = ((f^g)&e)^g
1196 psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3
1197 add y2, y0 ; y2 = S1 + CH
1198 add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
1199 ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1200 pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
1201 mov y0, a ; y0 = a
1202 add h, y2 ; h = h + S1 + CH + k + w
1203 mov y2, a ; y2 = a
1204 pxor XTMP1, XTMP4 ; XTMP1 = s0
1205 or y0, c ; y0 = a|c
1206 add d, h ; d = d + h + S1 + CH + k + w
1207 and y2, c ; y2 = a&c
1208 ;; compute low s1
1209 pshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
1210 and y0, b ; y0 = (a|c)&b
1211 add h, y1 ; h = h + S1 + CH + k + w + S0
1212 paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
1213 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
1214 add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1215
1216ROTATE_ARGS
1217 movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
1218 mov y0, e ; y0 = e
1219 mov y1, a ; y1 = a
1220 ror y0, (25-11) ; y0 = e >> (25-11)
1221 movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
1222 xor y0, e ; y0 = e ^ (e >> (25-11))
1223 ror y1, (22-13) ; y1 = a >> (22-13)
1224 mov y2, f ; y2 = f
1225 xor y1, a ; y1 = a ^ (a >> (22-13)
1226 ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1227 psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA}
1228 xor y2, g ; y2 = f^g
1229 psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA}
1230 xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1231 and y2, e ; y2 = (f^g)&e
1232 psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
1233 ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1234 xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1235 xor y2, g ; y2 = CH = ((f^g)&e)^g
1236 ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1237 pxor XTMP2, XTMP3
1238 add y2, y0 ; y2 = S1 + CH
1239 ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1240 add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
1241 pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
1242 mov y0, a ; y0 = a
1243 add h, y2 ; h = h + S1 + CH + k + w
1244 mov y2, a ; y2 = a
1245 pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
1246 or y0, c ; y0 = a|c
1247 add d, h ; d = d + h + S1 + CH + k + w
1248 and y2, c ; y2 = a&c
1249 paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
1250 and y0, b ; y0 = (a|c)&b
1251 add h, y1 ; h = h + S1 + CH + k + w + S0
1252 ;; compute high s1
1253 pshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
1254 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
1255 add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1256
1257ROTATE_ARGS
1258 movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
1259 mov y0, e ; y0 = e
1260 ror y0, (25-11) ; y0 = e >> (25-11)
1261 mov y1, a ; y1 = a
1262 movdqa X0, XTMP2 ; X0 = W[-2] {DDCC}
1263 ror y1, (22-13) ; y1 = a >> (22-13)
1264 xor y0, e ; y0 = e ^ (e >> (25-11))
1265 mov y2, f ; y2 = f
1266 ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1267 psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC}
1268 xor y1, a ; y1 = a ^ (a >> (22-13)
1269 xor y2, g ; y2 = f^g
1270 psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC}
1271 xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1272 and y2, e ; y2 = (f^g)&e
1273 ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1274 psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC}
1275 xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1276 ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1277 xor y2, g ; y2 = CH = ((f^g)&e)^g
1278 pxor XTMP2, XTMP3
1279 ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1280 add y2, y0 ; y2 = S1 + CH
1281 add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
1282 pxor X0, XTMP2 ; X0 = s1 {xDxC}
1283 mov y0, a ; y0 = a
1284 add h, y2 ; h = h + S1 + CH + k + w
1285 mov y2, a ; y2 = a
1286 pshufb X0, SHUF_DC00 ; X0 = s1 {DC00}
1287 or y0, c ; y0 = a|c
1288 add d, h ; d = d + h + S1 + CH + k + w
1289 and y2, c ; y2 = a&c
1290 paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
1291 and y0, b ; y0 = (a|c)&b
1292 add h, y1 ; h = h + S1 + CH + k + w + S0
1293 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
1294 add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1295
1296ROTATE_ARGS
1297rotate_Xs
1298%endm
1299
1300;; input is [rsp + _XFER + %1 * 4]
1301%macro DO_ROUND 1
1302 mov y0, e ; y0 = e
1303 ror y0, (25-11) ; y0 = e >> (25-11)
1304 mov y1, a ; y1 = a
1305 xor y0, e ; y0 = e ^ (e >> (25-11))
1306 ror y1, (22-13) ; y1 = a >> (22-13)
1307 mov y2, f ; y2 = f
1308 xor y1, a ; y1 = a ^ (a >> (22-13)
1309 ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1310 xor y2, g ; y2 = f^g
1311 xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1312 ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1313 and y2, e ; y2 = (f^g)&e
1314 xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1315 ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1316 xor y2, g ; y2 = CH = ((f^g)&e)^g
1317 add y2, y0 ; y2 = S1 + CH
1318 ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1319 add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
1320 mov y0, a ; y0 = a
1321 add h, y2 ; h = h + S1 + CH + k + w
1322 mov y2, a ; y2 = a
1323 or y0, c ; y0 = a|c
1324 add d, h ; d = d + h + S1 + CH + k + w
1325 and y2, c ; y2 = a&c
1326 and y0, b ; y0 = (a|c)&b
1327 add h, y1 ; h = h + S1 + CH + k + w + S0
1328 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
1329 add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1330 ROTATE_ARGS
1331%endm
1332
1333;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1334;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1335;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
1336;; arg 1 : pointer to input data
1337;; arg 2 : pointer to digest
1338;; arg 3 : Num blocks
1339section .text
1340global sha256_sse4
1341align 32
1342sha256_sse4:
1343 push rbx
1344%ifndef LINUX
1345 push rsi
1346 push rdi
1347%endif
1348 push rbp
1349 push r13
1350 push r14
1351 push r15
1352
1353 sub rsp,STACK_SIZE
1354%ifndef LINUX
1355 movdqa [rsp + _XMM_SAVE + 0*16],xmm6
1356 movdqa [rsp + _XMM_SAVE + 1*16],xmm7
1357 movdqa [rsp + _XMM_SAVE + 2*16],xmm8
1358 movdqa [rsp + _XMM_SAVE + 3*16],xmm9
1359 movdqa [rsp + _XMM_SAVE + 4*16],xmm10
1360 movdqa [rsp + _XMM_SAVE + 5*16],xmm11
1361 movdqa [rsp + _XMM_SAVE + 6*16],xmm12
1362%endif
1363
1364 shl NUM_BLKS, 6 ; convert to bytes
1365 jz done_hash
1366 add NUM_BLKS, INP ; pointer to end of data
1367 mov [rsp + _INP_END], NUM_BLKS
1368
1369 ;; load initial digest
1370 mov a,[4*0 + CTX]
1371 mov b,[4*1 + CTX]
1372 mov c,[4*2 + CTX]
1373 mov d,[4*3 + CTX]
1374 mov e,[4*4 + CTX]
1375 mov f,[4*5 + CTX]
1376 mov g,[4*6 + CTX]
1377 mov h,[4*7 + CTX]
1378
1379 movdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
1380 movdqa SHUF_00BA, [_SHUF_00BA wrt rip]
1381 movdqa SHUF_DC00, [_SHUF_DC00 wrt rip]
1382
1383loop0:
1384 lea TBL,[K256 wrt rip]
1385
1386 ;; byte swap first 16 dwords
1387 COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
1388 COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
1389 COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
1390 COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
1391
1392 mov [rsp + _INP], INP
1393
1394 ;; schedule 48 input dwords, by doing 3 rounds of 16 each
1395 mov SRND, 3
1396align 16
1397loop1:
1398 movdqa XFER, [TBL + 0*16]
1399 paddd XFER, X0
1400 movdqa [rsp + _XFER], XFER
1401 FOUR_ROUNDS_AND_SCHED
1402
1403 movdqa XFER, [TBL + 1*16]
1404 paddd XFER, X0
1405 movdqa [rsp + _XFER], XFER
1406 FOUR_ROUNDS_AND_SCHED
1407
1408 movdqa XFER, [TBL + 2*16]
1409 paddd XFER, X0
1410 movdqa [rsp + _XFER], XFER
1411 FOUR_ROUNDS_AND_SCHED
1412
1413 movdqa XFER, [TBL + 3*16]
1414 paddd XFER, X0
1415 movdqa [rsp + _XFER], XFER
1416 add TBL, 4*16
1417 FOUR_ROUNDS_AND_SCHED
1418
1419 sub SRND, 1
1420 jne loop1
1421
1422 mov SRND, 2
1423loop2:
1424 paddd X0, [TBL + 0*16]
1425 movdqa [rsp + _XFER], X0
1426 DO_ROUND 0
1427 DO_ROUND 1
1428 DO_ROUND 2
1429 DO_ROUND 3
1430 paddd X1, [TBL + 1*16]
1431 movdqa [rsp + _XFER], X1
1432 add TBL, 2*16
1433 DO_ROUND 0
1434 DO_ROUND 1
1435 DO_ROUND 2
1436 DO_ROUND 3
1437
1438 movdqa X0, X2
1439 movdqa X1, X3
1440
1441 sub SRND, 1
1442 jne loop2
1443
1444 addm [4*0 + CTX],a
1445 addm [4*1 + CTX],b
1446 addm [4*2 + CTX],c
1447 addm [4*3 + CTX],d
1448 addm [4*4 + CTX],e
1449 addm [4*5 + CTX],f
1450 addm [4*6 + CTX],g
1451 addm [4*7 + CTX],h
1452
1453 mov INP, [rsp + _INP]
1454 add INP, 64
1455 cmp INP, [rsp + _INP_END]
1456 jne loop0
1457
1458done_hash:
1459%ifndef LINUX
1460 movdqa xmm6,[rsp + _XMM_SAVE + 0*16]
1461 movdqa xmm7,[rsp + _XMM_SAVE + 1*16]
1462 movdqa xmm8,[rsp + _XMM_SAVE + 2*16]
1463 movdqa xmm9,[rsp + _XMM_SAVE + 3*16]
1464 movdqa xmm10,[rsp + _XMM_SAVE + 4*16]
1465 movdqa xmm11,[rsp + _XMM_SAVE + 5*16]
1466 movdqa xmm12,[rsp + _XMM_SAVE + 6*16]
1467%endif
1468
1469 add rsp, STACK_SIZE
1470
1471 pop r15
1472 pop r14
1473 pop r13
1474 pop rbp
1475%ifndef LINUX
1476 pop rdi
1477 pop rsi
1478%endif
1479 pop rbx
1480
1481 ret
1482
1483
1484section .data
1485align 64
1486K256:
1487 dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
1488 dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
1489 dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
1490 dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
1491 dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
1492 dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
1493 dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
1494 dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
1495 dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
1496 dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
1497 dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
1498 dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
1499 dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
1500 dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
1501 dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
1502 dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
1503
1504PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
1505
1506; shuffle xBxA -> 00BA
1507_SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
1508
1509; shuffle xDxC -> DC00
1510_SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
1511*/
1512
1513#endif