Bitcoin Core 29.99.0
P2P Digital Currency
sha256_sse4.cpp
Go to the documentation of this file.
1// Copyright (c) 2017-2022 The Bitcoin Core developers
2// Distributed under the MIT software license, see the accompanying
3// file COPYING or http://www.opensource.org/licenses/mit-license.php.
4//
5// This is a translation to GCC extended asm syntax from YASM code by Intel
6// (available at the bottom of this file).
7
8#include <cstdlib>
9#include <stdint.h>
10
11#if defined(__x86_64__) || defined(__amd64__)
12
13namespace sha256_sse4
14{
/**
 * SHA-256 block transform, translated to GCC extended asm from Intel's YASM
 * implementation (reproduced in the comment at the bottom of this file).
 *
 * @param s      Pointer to the 8-word (uint32_t[8]) SHA-256 state; loaded at
 *               entry and updated in place with the result.
 * @param chunk  Input data, processed as consecutive 64-byte blocks.
 * @param blocks Number of 64-byte blocks to process. If zero, the asm jumps
 *               straight to Ldone_hash and the state is left untouched.
 *
 * Operand map used throughout the asm below:
 *   %0  = s (state pointer)
 *   %1  = chunk (also reused as the round-group counter inside the loops,
 *         after the original value is saved to %15/inp)
 *   %2  = blocks (scaled into an end pointer; its 32-bit low half %k2 is
 *         then reused to hold working variable 'e')
 *   %3..%9   = working variables a, b, c, d, f, g, h
 *   %10..%12 = scratch y0, y1, y2
 *   %13 = tbl (pointer into K256)
 *   %14 = inp_end (memory), %15 = inp (memory, saved chunk pointer)
 *   %16 = xfer (memory, 16-byte spill slot holding the current K+W words)
 *   %17 = K256, %18 = FLIP_MASK, %19 = SHUF_00BA, %20 = SHUF_DC00
 */
15void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
16#if defined(__clang__)
17 /*
18 clang is unable to compile this with -O0 and -fsanitize=address.
19 See upstream bug: https://github.com/llvm/llvm-project/issues/92182.
20 This also fails to compile with -O2, -fcf-protection & -fsanitize=address.
21 See https://github.com/bitcoin/bitcoin/issues/31913.
22 */
23#if __has_feature(address_sanitizer)
24 __attribute__((no_sanitize("address")))
25#endif
26#endif
27{
    // SHA-256 round constants K[0..63] (FIPS 180-4), 16-byte aligned so the
    // asm can use movdqa/paddd directly on four constants at a time.
28 static const uint32_t K256 alignas(16) [] = {
29 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
30 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
31 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
32 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
33 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
34 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
35 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
36 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
37 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
38 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
39 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
40 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
41 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
42 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
43 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
44 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
45 };
    // pshufb control masks: FLIP_MASK byte-swaps each 32-bit word
    // (big-endian message load); SHUF_00BA packs the xBxA s1 result into the
    // low lanes, SHUF_DC00 packs the xDxC s1 result into the high lanes.
46 static const uint32_t FLIP_MASK alignas(16) [] = {0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f};
47 static const uint32_t SHUF_00BA alignas(16) [] = {0x03020100, 0x0b0a0908, 0xffffffff, 0xffffffff};
48 static const uint32_t SHUF_DC00 alignas(16) [] = {0xffffffff, 0xffffffff, 0x03020100, 0x0b0a0908};
    // Working variables and spill slots handed to the asm as operands.
    // Note there is no 'e' here: it lives in the low half (%k2) of the
    // register that initially holds 'blocks'.
49 uint32_t a, b, c, d, f, g, h, y0, y1, y2;
50 uint64_t tbl;
51 uint64_t inp_end, inp;
52 uint32_t xfer alignas(16) [4];
53
54 __asm__ __volatile__(
        /* Prologue: blocks *= 64; if zero, nothing to do. Turn blocks into
           an end pointer (inp_end), load the 8 state words into a..h (with e
           in %k2), and load the three pshufb masks into xmm10..xmm12. */
55 "shl $0x6,%2;"
56 "je Ldone_hash_%=;"
57 "add %1,%2;"
58 "mov %2,%14;"
59 "mov (%0),%3;"
60 "mov 0x4(%0),%4;"
61 "mov 0x8(%0),%5;"
62 "mov 0xc(%0),%6;"
63 "mov 0x10(%0),%k2;"
64 "mov 0x14(%0),%7;"
65 "mov 0x18(%0),%8;"
66 "mov 0x1c(%0),%9;"
67 "movdqa %18,%%xmm12;"
68 "movdqa %19,%%xmm10;"
69 "movdqa %20,%%xmm11;"
70
        /* Outer loop: one iteration per 64-byte input block. Reset tbl to
           K256, load the 16 message words into xmm4..xmm7 with a byte swap,
           save the input pointer, and set the Lloop1 counter to 3. */
71 "Lloop0_%=:"
72 "lea %17,%13;"
73 "movdqu (%1),%%xmm4;"
74 "pshufb %%xmm12,%%xmm4;"
75 "movdqu 0x10(%1),%%xmm5;"
76 "pshufb %%xmm12,%%xmm5;"
77 "movdqu 0x20(%1),%%xmm6;"
78 "pshufb %%xmm12,%%xmm6;"
79 "movdqu 0x30(%1),%%xmm7;"
80 "pshufb %%xmm12,%%xmm7;"
81 "mov %1,%15;"
82 "mov $3,%1;"
83
        /* Rounds 0-47: three iterations of 16 rounds each. Each group of
           four rounds interleaves the scalar round computation with the
           vector message schedule (s0/s1) for four future W words; K+W for
           the current four rounds is staged through the xfer slot (%16). */
84 "Lloop1_%=:"
85 "movdqa 0x0(%13),%%xmm9;"
86 "paddd %%xmm4,%%xmm9;"
87 "movdqa %%xmm9,%16;"
88 "movdqa %%xmm7,%%xmm0;"
89 "mov %k2,%10;"
90 "ror $0xe,%10;"
91 "mov %3,%11;"
92 "palignr $0x4,%%xmm6,%%xmm0;"
93 "ror $0x9,%11;"
94 "xor %k2,%10;"
95 "mov %7,%12;"
96 "ror $0x5,%10;"
97 "movdqa %%xmm5,%%xmm1;"
98 "xor %3,%11;"
99 "xor %8,%12;"
100 "paddd %%xmm4,%%xmm0;"
101 "xor %k2,%10;"
102 "and %k2,%12;"
103 "ror $0xb,%11;"
104 "palignr $0x4,%%xmm4,%%xmm1;"
105 "xor %3,%11;"
106 "ror $0x6,%10;"
107 "xor %8,%12;"
108 "movdqa %%xmm1,%%xmm2;"
109 "ror $0x2,%11;"
110 "add %10,%12;"
111 "add %16,%12;"
112 "movdqa %%xmm1,%%xmm3;"
113 "mov %3,%10;"
114 "add %12,%9;"
115 "mov %3,%12;"
116 "pslld $0x19,%%xmm1;"
117 "or %5,%10;"
118 "add %9,%6;"
119 "and %5,%12;"
120 "psrld $0x7,%%xmm2;"
121 "and %4,%10;"
122 "add %11,%9;"
123 "por %%xmm2,%%xmm1;"
124 "or %12,%10;"
125 "add %10,%9;"
126 "movdqa %%xmm3,%%xmm2;"
127 "mov %6,%10;"
128 "mov %9,%11;"
129 "movdqa %%xmm3,%%xmm8;"
130 "ror $0xe,%10;"
131 "xor %6,%10;"
132 "mov %k2,%12;"
133 "ror $0x9,%11;"
134 "pslld $0xe,%%xmm3;"
135 "xor %9,%11;"
136 "ror $0x5,%10;"
137 "xor %7,%12;"
138 "psrld $0x12,%%xmm2;"
139 "ror $0xb,%11;"
140 "xor %6,%10;"
141 "and %6,%12;"
142 "ror $0x6,%10;"
143 "pxor %%xmm3,%%xmm1;"
144 "xor %9,%11;"
145 "xor %7,%12;"
146 "psrld $0x3,%%xmm8;"
147 "add %10,%12;"
148 "add 4+%16,%12;"
149 "ror $0x2,%11;"
150 "pxor %%xmm2,%%xmm1;"
151 "mov %9,%10;"
152 "add %12,%8;"
153 "mov %9,%12;"
154 "pxor %%xmm8,%%xmm1;"
155 "or %4,%10;"
156 "add %8,%5;"
157 "and %4,%12;"
158 "pshufd $0xfa,%%xmm7,%%xmm2;"
159 "and %3,%10;"
160 "add %11,%8;"
161 "paddd %%xmm1,%%xmm0;"
162 "or %12,%10;"
163 "add %10,%8;"
164 "movdqa %%xmm2,%%xmm3;"
165 "mov %5,%10;"
166 "mov %8,%11;"
167 "ror $0xe,%10;"
168 "movdqa %%xmm2,%%xmm8;"
169 "xor %5,%10;"
170 "ror $0x9,%11;"
171 "mov %6,%12;"
172 "xor %8,%11;"
173 "ror $0x5,%10;"
174 "psrlq $0x11,%%xmm2;"
175 "xor %k2,%12;"
176 "psrlq $0x13,%%xmm3;"
177 "xor %5,%10;"
178 "and %5,%12;"
179 "psrld $0xa,%%xmm8;"
180 "ror $0xb,%11;"
181 "xor %8,%11;"
182 "xor %k2,%12;"
183 "ror $0x6,%10;"
184 "pxor %%xmm3,%%xmm2;"
185 "add %10,%12;"
186 "ror $0x2,%11;"
187 "add 8+%16,%12;"
188 "pxor %%xmm2,%%xmm8;"
189 "mov %8,%10;"
190 "add %12,%7;"
191 "mov %8,%12;"
192 "pshufb %%xmm10,%%xmm8;"
193 "or %3,%10;"
194 "add %7,%4;"
195 "and %3,%12;"
196 "paddd %%xmm8,%%xmm0;"
197 "and %9,%10;"
198 "add %11,%7;"
199 "pshufd $0x50,%%xmm0,%%xmm2;"
200 "or %12,%10;"
201 "add %10,%7;"
202 "movdqa %%xmm2,%%xmm3;"
203 "mov %4,%10;"
204 "ror $0xe,%10;"
205 "mov %7,%11;"
206 "movdqa %%xmm2,%%xmm4;"
207 "ror $0x9,%11;"
208 "xor %4,%10;"
209 "mov %5,%12;"
210 "ror $0x5,%10;"
211 "psrlq $0x11,%%xmm2;"
212 "xor %7,%11;"
213 "xor %6,%12;"
214 "psrlq $0x13,%%xmm3;"
215 "xor %4,%10;"
216 "and %4,%12;"
217 "ror $0xb,%11;"
218 "psrld $0xa,%%xmm4;"
219 "xor %7,%11;"
220 "ror $0x6,%10;"
221 "xor %6,%12;"
222 "pxor %%xmm3,%%xmm2;"
223 "ror $0x2,%11;"
224 "add %10,%12;"
225 "add 12+%16,%12;"
226 "pxor %%xmm2,%%xmm4;"
227 "mov %7,%10;"
228 "add %12,%k2;"
229 "mov %7,%12;"
230 "pshufb %%xmm11,%%xmm4;"
231 "or %9,%10;"
232 "add %k2,%3;"
233 "and %9,%12;"
234 "paddd %%xmm0,%%xmm4;"
235 "and %8,%10;"
236 "add %11,%k2;"
237 "or %12,%10;"
238 "add %10,%k2;"
        /* Second group of four rounds (same pattern, next K constants,
           schedule result lands in xmm5). */
239 "movdqa 0x10(%13),%%xmm9;"
240 "paddd %%xmm5,%%xmm9;"
241 "movdqa %%xmm9,%16;"
242 "movdqa %%xmm4,%%xmm0;"
243 "mov %3,%10;"
244 "ror $0xe,%10;"
245 "mov %k2,%11;"
246 "palignr $0x4,%%xmm7,%%xmm0;"
247 "ror $0x9,%11;"
248 "xor %3,%10;"
249 "mov %4,%12;"
250 "ror $0x5,%10;"
251 "movdqa %%xmm6,%%xmm1;"
252 "xor %k2,%11;"
253 "xor %5,%12;"
254 "paddd %%xmm5,%%xmm0;"
255 "xor %3,%10;"
256 "and %3,%12;"
257 "ror $0xb,%11;"
258 "palignr $0x4,%%xmm5,%%xmm1;"
259 "xor %k2,%11;"
260 "ror $0x6,%10;"
261 "xor %5,%12;"
262 "movdqa %%xmm1,%%xmm2;"
263 "ror $0x2,%11;"
264 "add %10,%12;"
265 "add %16,%12;"
266 "movdqa %%xmm1,%%xmm3;"
267 "mov %k2,%10;"
268 "add %12,%6;"
269 "mov %k2,%12;"
270 "pslld $0x19,%%xmm1;"
271 "or %8,%10;"
272 "add %6,%9;"
273 "and %8,%12;"
274 "psrld $0x7,%%xmm2;"
275 "and %7,%10;"
276 "add %11,%6;"
277 "por %%xmm2,%%xmm1;"
278 "or %12,%10;"
279 "add %10,%6;"
280 "movdqa %%xmm3,%%xmm2;"
281 "mov %9,%10;"
282 "mov %6,%11;"
283 "movdqa %%xmm3,%%xmm8;"
284 "ror $0xe,%10;"
285 "xor %9,%10;"
286 "mov %3,%12;"
287 "ror $0x9,%11;"
288 "pslld $0xe,%%xmm3;"
289 "xor %6,%11;"
290 "ror $0x5,%10;"
291 "xor %4,%12;"
292 "psrld $0x12,%%xmm2;"
293 "ror $0xb,%11;"
294 "xor %9,%10;"
295 "and %9,%12;"
296 "ror $0x6,%10;"
297 "pxor %%xmm3,%%xmm1;"
298 "xor %6,%11;"
299 "xor %4,%12;"
300 "psrld $0x3,%%xmm8;"
301 "add %10,%12;"
302 "add 4+%16,%12;"
303 "ror $0x2,%11;"
304 "pxor %%xmm2,%%xmm1;"
305 "mov %6,%10;"
306 "add %12,%5;"
307 "mov %6,%12;"
308 "pxor %%xmm8,%%xmm1;"
309 "or %7,%10;"
310 "add %5,%8;"
311 "and %7,%12;"
312 "pshufd $0xfa,%%xmm4,%%xmm2;"
313 "and %k2,%10;"
314 "add %11,%5;"
315 "paddd %%xmm1,%%xmm0;"
316 "or %12,%10;"
317 "add %10,%5;"
318 "movdqa %%xmm2,%%xmm3;"
319 "mov %8,%10;"
320 "mov %5,%11;"
321 "ror $0xe,%10;"
322 "movdqa %%xmm2,%%xmm8;"
323 "xor %8,%10;"
324 "ror $0x9,%11;"
325 "mov %9,%12;"
326 "xor %5,%11;"
327 "ror $0x5,%10;"
328 "psrlq $0x11,%%xmm2;"
329 "xor %3,%12;"
330 "psrlq $0x13,%%xmm3;"
331 "xor %8,%10;"
332 "and %8,%12;"
333 "psrld $0xa,%%xmm8;"
334 "ror $0xb,%11;"
335 "xor %5,%11;"
336 "xor %3,%12;"
337 "ror $0x6,%10;"
338 "pxor %%xmm3,%%xmm2;"
339 "add %10,%12;"
340 "ror $0x2,%11;"
341 "add 8+%16,%12;"
342 "pxor %%xmm2,%%xmm8;"
343 "mov %5,%10;"
344 "add %12,%4;"
345 "mov %5,%12;"
346 "pshufb %%xmm10,%%xmm8;"
347 "or %k2,%10;"
348 "add %4,%7;"
349 "and %k2,%12;"
350 "paddd %%xmm8,%%xmm0;"
351 "and %6,%10;"
352 "add %11,%4;"
353 "pshufd $0x50,%%xmm0,%%xmm2;"
354 "or %12,%10;"
355 "add %10,%4;"
356 "movdqa %%xmm2,%%xmm3;"
357 "mov %7,%10;"
358 "ror $0xe,%10;"
359 "mov %4,%11;"
360 "movdqa %%xmm2,%%xmm5;"
361 "ror $0x9,%11;"
362 "xor %7,%10;"
363 "mov %8,%12;"
364 "ror $0x5,%10;"
365 "psrlq $0x11,%%xmm2;"
366 "xor %4,%11;"
367 "xor %9,%12;"
368 "psrlq $0x13,%%xmm3;"
369 "xor %7,%10;"
370 "and %7,%12;"
371 "ror $0xb,%11;"
372 "psrld $0xa,%%xmm5;"
373 "xor %4,%11;"
374 "ror $0x6,%10;"
375 "xor %9,%12;"
376 "pxor %%xmm3,%%xmm2;"
377 "ror $0x2,%11;"
378 "add %10,%12;"
379 "add 12+%16,%12;"
380 "pxor %%xmm2,%%xmm5;"
381 "mov %4,%10;"
382 "add %12,%3;"
383 "mov %4,%12;"
384 "pshufb %%xmm11,%%xmm5;"
385 "or %6,%10;"
386 "add %3,%k2;"
387 "and %6,%12;"
388 "paddd %%xmm0,%%xmm5;"
389 "and %5,%10;"
390 "add %11,%3;"
391 "or %12,%10;"
392 "add %10,%3;"
        /* Third group of four rounds (schedule result lands in xmm6). */
393 "movdqa 0x20(%13),%%xmm9;"
394 "paddd %%xmm6,%%xmm9;"
395 "movdqa %%xmm9,%16;"
396 "movdqa %%xmm5,%%xmm0;"
397 "mov %k2,%10;"
398 "ror $0xe,%10;"
399 "mov %3,%11;"
400 "palignr $0x4,%%xmm4,%%xmm0;"
401 "ror $0x9,%11;"
402 "xor %k2,%10;"
403 "mov %7,%12;"
404 "ror $0x5,%10;"
405 "movdqa %%xmm7,%%xmm1;"
406 "xor %3,%11;"
407 "xor %8,%12;"
408 "paddd %%xmm6,%%xmm0;"
409 "xor %k2,%10;"
410 "and %k2,%12;"
411 "ror $0xb,%11;"
412 "palignr $0x4,%%xmm6,%%xmm1;"
413 "xor %3,%11;"
414 "ror $0x6,%10;"
415 "xor %8,%12;"
416 "movdqa %%xmm1,%%xmm2;"
417 "ror $0x2,%11;"
418 "add %10,%12;"
419 "add %16,%12;"
420 "movdqa %%xmm1,%%xmm3;"
421 "mov %3,%10;"
422 "add %12,%9;"
423 "mov %3,%12;"
424 "pslld $0x19,%%xmm1;"
425 "or %5,%10;"
426 "add %9,%6;"
427 "and %5,%12;"
428 "psrld $0x7,%%xmm2;"
429 "and %4,%10;"
430 "add %11,%9;"
431 "por %%xmm2,%%xmm1;"
432 "or %12,%10;"
433 "add %10,%9;"
434 "movdqa %%xmm3,%%xmm2;"
435 "mov %6,%10;"
436 "mov %9,%11;"
437 "movdqa %%xmm3,%%xmm8;"
438 "ror $0xe,%10;"
439 "xor %6,%10;"
440 "mov %k2,%12;"
441 "ror $0x9,%11;"
442 "pslld $0xe,%%xmm3;"
443 "xor %9,%11;"
444 "ror $0x5,%10;"
445 "xor %7,%12;"
446 "psrld $0x12,%%xmm2;"
447 "ror $0xb,%11;"
448 "xor %6,%10;"
449 "and %6,%12;"
450 "ror $0x6,%10;"
451 "pxor %%xmm3,%%xmm1;"
452 "xor %9,%11;"
453 "xor %7,%12;"
454 "psrld $0x3,%%xmm8;"
455 "add %10,%12;"
456 "add 4+%16,%12;"
457 "ror $0x2,%11;"
458 "pxor %%xmm2,%%xmm1;"
459 "mov %9,%10;"
460 "add %12,%8;"
461 "mov %9,%12;"
462 "pxor %%xmm8,%%xmm1;"
463 "or %4,%10;"
464 "add %8,%5;"
465 "and %4,%12;"
466 "pshufd $0xfa,%%xmm5,%%xmm2;"
467 "and %3,%10;"
468 "add %11,%8;"
469 "paddd %%xmm1,%%xmm0;"
470 "or %12,%10;"
471 "add %10,%8;"
472 "movdqa %%xmm2,%%xmm3;"
473 "mov %5,%10;"
474 "mov %8,%11;"
475 "ror $0xe,%10;"
476 "movdqa %%xmm2,%%xmm8;"
477 "xor %5,%10;"
478 "ror $0x9,%11;"
479 "mov %6,%12;"
480 "xor %8,%11;"
481 "ror $0x5,%10;"
482 "psrlq $0x11,%%xmm2;"
483 "xor %k2,%12;"
484 "psrlq $0x13,%%xmm3;"
485 "xor %5,%10;"
486 "and %5,%12;"
487 "psrld $0xa,%%xmm8;"
488 "ror $0xb,%11;"
489 "xor %8,%11;"
490 "xor %k2,%12;"
491 "ror $0x6,%10;"
492 "pxor %%xmm3,%%xmm2;"
493 "add %10,%12;"
494 "ror $0x2,%11;"
495 "add 8+%16,%12;"
496 "pxor %%xmm2,%%xmm8;"
497 "mov %8,%10;"
498 "add %12,%7;"
499 "mov %8,%12;"
500 "pshufb %%xmm10,%%xmm8;"
501 "or %3,%10;"
502 "add %7,%4;"
503 "and %3,%12;"
504 "paddd %%xmm8,%%xmm0;"
505 "and %9,%10;"
506 "add %11,%7;"
507 "pshufd $0x50,%%xmm0,%%xmm2;"
508 "or %12,%10;"
509 "add %10,%7;"
510 "movdqa %%xmm2,%%xmm3;"
511 "mov %4,%10;"
512 "ror $0xe,%10;"
513 "mov %7,%11;"
514 "movdqa %%xmm2,%%xmm6;"
515 "ror $0x9,%11;"
516 "xor %4,%10;"
517 "mov %5,%12;"
518 "ror $0x5,%10;"
519 "psrlq $0x11,%%xmm2;"
520 "xor %7,%11;"
521 "xor %6,%12;"
522 "psrlq $0x13,%%xmm3;"
523 "xor %4,%10;"
524 "and %4,%12;"
525 "ror $0xb,%11;"
526 "psrld $0xa,%%xmm6;"
527 "xor %7,%11;"
528 "ror $0x6,%10;"
529 "xor %6,%12;"
530 "pxor %%xmm3,%%xmm2;"
531 "ror $0x2,%11;"
532 "add %10,%12;"
533 "add 12+%16,%12;"
534 "pxor %%xmm2,%%xmm6;"
535 "mov %7,%10;"
536 "add %12,%k2;"
537 "mov %7,%12;"
538 "pshufb %%xmm11,%%xmm6;"
539 "or %9,%10;"
540 "add %k2,%3;"
541 "and %9,%12;"
542 "paddd %%xmm0,%%xmm6;"
543 "and %8,%10;"
544 "add %11,%k2;"
545 "or %12,%10;"
546 "add %10,%k2;"
        /* Fourth group of four rounds (schedule result lands in xmm7);
           tbl advances by 0x40 = 16 constants per Lloop1 iteration. */
547 "movdqa 0x30(%13),%%xmm9;"
548 "paddd %%xmm7,%%xmm9;"
549 "movdqa %%xmm9,%16;"
550 "add $0x40,%13;"
551 "movdqa %%xmm6,%%xmm0;"
552 "mov %3,%10;"
553 "ror $0xe,%10;"
554 "mov %k2,%11;"
555 "palignr $0x4,%%xmm5,%%xmm0;"
556 "ror $0x9,%11;"
557 "xor %3,%10;"
558 "mov %4,%12;"
559 "ror $0x5,%10;"
560 "movdqa %%xmm4,%%xmm1;"
561 "xor %k2,%11;"
562 "xor %5,%12;"
563 "paddd %%xmm7,%%xmm0;"
564 "xor %3,%10;"
565 "and %3,%12;"
566 "ror $0xb,%11;"
567 "palignr $0x4,%%xmm7,%%xmm1;"
568 "xor %k2,%11;"
569 "ror $0x6,%10;"
570 "xor %5,%12;"
571 "movdqa %%xmm1,%%xmm2;"
572 "ror $0x2,%11;"
573 "add %10,%12;"
574 "add %16,%12;"
575 "movdqa %%xmm1,%%xmm3;"
576 "mov %k2,%10;"
577 "add %12,%6;"
578 "mov %k2,%12;"
579 "pslld $0x19,%%xmm1;"
580 "or %8,%10;"
581 "add %6,%9;"
582 "and %8,%12;"
583 "psrld $0x7,%%xmm2;"
584 "and %7,%10;"
585 "add %11,%6;"
586 "por %%xmm2,%%xmm1;"
587 "or %12,%10;"
588 "add %10,%6;"
589 "movdqa %%xmm3,%%xmm2;"
590 "mov %9,%10;"
591 "mov %6,%11;"
592 "movdqa %%xmm3,%%xmm8;"
593 "ror $0xe,%10;"
594 "xor %9,%10;"
595 "mov %3,%12;"
596 "ror $0x9,%11;"
597 "pslld $0xe,%%xmm3;"
598 "xor %6,%11;"
599 "ror $0x5,%10;"
600 "xor %4,%12;"
601 "psrld $0x12,%%xmm2;"
602 "ror $0xb,%11;"
603 "xor %9,%10;"
604 "and %9,%12;"
605 "ror $0x6,%10;"
606 "pxor %%xmm3,%%xmm1;"
607 "xor %6,%11;"
608 "xor %4,%12;"
609 "psrld $0x3,%%xmm8;"
610 "add %10,%12;"
611 "add 4+%16,%12;"
612 "ror $0x2,%11;"
613 "pxor %%xmm2,%%xmm1;"
614 "mov %6,%10;"
615 "add %12,%5;"
616 "mov %6,%12;"
617 "pxor %%xmm8,%%xmm1;"
618 "or %7,%10;"
619 "add %5,%8;"
620 "and %7,%12;"
621 "pshufd $0xfa,%%xmm6,%%xmm2;"
622 "and %k2,%10;"
623 "add %11,%5;"
624 "paddd %%xmm1,%%xmm0;"
625 "or %12,%10;"
626 "add %10,%5;"
627 "movdqa %%xmm2,%%xmm3;"
628 "mov %8,%10;"
629 "mov %5,%11;"
630 "ror $0xe,%10;"
631 "movdqa %%xmm2,%%xmm8;"
632 "xor %8,%10;"
633 "ror $0x9,%11;"
634 "mov %9,%12;"
635 "xor %5,%11;"
636 "ror $0x5,%10;"
637 "psrlq $0x11,%%xmm2;"
638 "xor %3,%12;"
639 "psrlq $0x13,%%xmm3;"
640 "xor %8,%10;"
641 "and %8,%12;"
642 "psrld $0xa,%%xmm8;"
643 "ror $0xb,%11;"
644 "xor %5,%11;"
645 "xor %3,%12;"
646 "ror $0x6,%10;"
647 "pxor %%xmm3,%%xmm2;"
648 "add %10,%12;"
649 "ror $0x2,%11;"
650 "add 8+%16,%12;"
651 "pxor %%xmm2,%%xmm8;"
652 "mov %5,%10;"
653 "add %12,%4;"
654 "mov %5,%12;"
655 "pshufb %%xmm10,%%xmm8;"
656 "or %k2,%10;"
657 "add %4,%7;"
658 "and %k2,%12;"
659 "paddd %%xmm8,%%xmm0;"
660 "and %6,%10;"
661 "add %11,%4;"
662 "pshufd $0x50,%%xmm0,%%xmm2;"
663 "or %12,%10;"
664 "add %10,%4;"
665 "movdqa %%xmm2,%%xmm3;"
666 "mov %7,%10;"
667 "ror $0xe,%10;"
668 "mov %4,%11;"
669 "movdqa %%xmm2,%%xmm7;"
670 "ror $0x9,%11;"
671 "xor %7,%10;"
672 "mov %8,%12;"
673 "ror $0x5,%10;"
674 "psrlq $0x11,%%xmm2;"
675 "xor %4,%11;"
676 "xor %9,%12;"
677 "psrlq $0x13,%%xmm3;"
678 "xor %7,%10;"
679 "and %7,%12;"
680 "ror $0xb,%11;"
681 "psrld $0xa,%%xmm7;"
682 "xor %4,%11;"
683 "ror $0x6,%10;"
684 "xor %9,%12;"
685 "pxor %%xmm3,%%xmm2;"
686 "ror $0x2,%11;"
687 "add %10,%12;"
688 "add 12+%16,%12;"
689 "pxor %%xmm2,%%xmm7;"
690 "mov %4,%10;"
691 "add %12,%3;"
692 "mov %4,%12;"
693 "pshufb %%xmm11,%%xmm7;"
694 "or %6,%10;"
695 "add %3,%k2;"
696 "and %6,%12;"
697 "paddd %%xmm0,%%xmm7;"
698 "and %5,%10;"
699 "add %11,%3;"
700 "or %12,%10;"
701 "add %10,%3;"
702 "sub $0x1,%1;"
703 "jne Lloop1_%=;"
704 "mov $0x2,%1;"
705
        /* Rounds 48-63: two iterations of 8 rounds each. No further message
           schedule is needed; xmm4..xmm7 already hold the final W words, so
           these are pure scalar rounds fed from the xfer slot. */
706 "Lloop2_%=:"
707 "paddd 0x0(%13),%%xmm4;"
708 "movdqa %%xmm4,%16;"
709 "mov %k2,%10;"
710 "ror $0xe,%10;"
711 "mov %3,%11;"
712 "xor %k2,%10;"
713 "ror $0x9,%11;"
714 "mov %7,%12;"
715 "xor %3,%11;"
716 "ror $0x5,%10;"
717 "xor %8,%12;"
718 "xor %k2,%10;"
719 "ror $0xb,%11;"
720 "and %k2,%12;"
721 "xor %3,%11;"
722 "ror $0x6,%10;"
723 "xor %8,%12;"
724 "add %10,%12;"
725 "ror $0x2,%11;"
726 "add %16,%12;"
727 "mov %3,%10;"
728 "add %12,%9;"
729 "mov %3,%12;"
730 "or %5,%10;"
731 "add %9,%6;"
732 "and %5,%12;"
733 "and %4,%10;"
734 "add %11,%9;"
735 "or %12,%10;"
736 "add %10,%9;"
737 "mov %6,%10;"
738 "ror $0xe,%10;"
739 "mov %9,%11;"
740 "xor %6,%10;"
741 "ror $0x9,%11;"
742 "mov %k2,%12;"
743 "xor %9,%11;"
744 "ror $0x5,%10;"
745 "xor %7,%12;"
746 "xor %6,%10;"
747 "ror $0xb,%11;"
748 "and %6,%12;"
749 "xor %9,%11;"
750 "ror $0x6,%10;"
751 "xor %7,%12;"
752 "add %10,%12;"
753 "ror $0x2,%11;"
754 "add 4+%16,%12;"
755 "mov %9,%10;"
756 "add %12,%8;"
757 "mov %9,%12;"
758 "or %4,%10;"
759 "add %8,%5;"
760 "and %4,%12;"
761 "and %3,%10;"
762 "add %11,%8;"
763 "or %12,%10;"
764 "add %10,%8;"
765 "mov %5,%10;"
766 "ror $0xe,%10;"
767 "mov %8,%11;"
768 "xor %5,%10;"
769 "ror $0x9,%11;"
770 "mov %6,%12;"
771 "xor %8,%11;"
772 "ror $0x5,%10;"
773 "xor %k2,%12;"
774 "xor %5,%10;"
775 "ror $0xb,%11;"
776 "and %5,%12;"
777 "xor %8,%11;"
778 "ror $0x6,%10;"
779 "xor %k2,%12;"
780 "add %10,%12;"
781 "ror $0x2,%11;"
782 "add 8+%16,%12;"
783 "mov %8,%10;"
784 "add %12,%7;"
785 "mov %8,%12;"
786 "or %3,%10;"
787 "add %7,%4;"
788 "and %3,%12;"
789 "and %9,%10;"
790 "add %11,%7;"
791 "or %12,%10;"
792 "add %10,%7;"
793 "mov %4,%10;"
794 "ror $0xe,%10;"
795 "mov %7,%11;"
796 "xor %4,%10;"
797 "ror $0x9,%11;"
798 "mov %5,%12;"
799 "xor %7,%11;"
800 "ror $0x5,%10;"
801 "xor %6,%12;"
802 "xor %4,%10;"
803 "ror $0xb,%11;"
804 "and %4,%12;"
805 "xor %7,%11;"
806 "ror $0x6,%10;"
807 "xor %6,%12;"
808 "add %10,%12;"
809 "ror $0x2,%11;"
810 "add 12+%16,%12;"
811 "mov %7,%10;"
812 "add %12,%k2;"
813 "mov %7,%12;"
814 "or %9,%10;"
815 "add %k2,%3;"
816 "and %9,%12;"
817 "and %8,%10;"
818 "add %11,%k2;"
819 "or %12,%10;"
820 "add %10,%k2;"
        /* Next four rounds of this iteration; tbl advances by 0x20 = 8
           constants per Lloop2 iteration. */
821 "paddd 0x10(%13),%%xmm5;"
822 "movdqa %%xmm5,%16;"
823 "add $0x20,%13;"
824 "mov %3,%10;"
825 "ror $0xe,%10;"
826 "mov %k2,%11;"
827 "xor %3,%10;"
828 "ror $0x9,%11;"
829 "mov %4,%12;"
830 "xor %k2,%11;"
831 "ror $0x5,%10;"
832 "xor %5,%12;"
833 "xor %3,%10;"
834 "ror $0xb,%11;"
835 "and %3,%12;"
836 "xor %k2,%11;"
837 "ror $0x6,%10;"
838 "xor %5,%12;"
839 "add %10,%12;"
840 "ror $0x2,%11;"
841 "add %16,%12;"
842 "mov %k2,%10;"
843 "add %12,%6;"
844 "mov %k2,%12;"
845 "or %8,%10;"
846 "add %6,%9;"
847 "and %8,%12;"
848 "and %7,%10;"
849 "add %11,%6;"
850 "or %12,%10;"
851 "add %10,%6;"
852 "mov %9,%10;"
853 "ror $0xe,%10;"
854 "mov %6,%11;"
855 "xor %9,%10;"
856 "ror $0x9,%11;"
857 "mov %3,%12;"
858 "xor %6,%11;"
859 "ror $0x5,%10;"
860 "xor %4,%12;"
861 "xor %9,%10;"
862 "ror $0xb,%11;"
863 "and %9,%12;"
864 "xor %6,%11;"
865 "ror $0x6,%10;"
866 "xor %4,%12;"
867 "add %10,%12;"
868 "ror $0x2,%11;"
869 "add 4+%16,%12;"
870 "mov %6,%10;"
871 "add %12,%5;"
872 "mov %6,%12;"
873 "or %7,%10;"
874 "add %5,%8;"
875 "and %7,%12;"
876 "and %k2,%10;"
877 "add %11,%5;"
878 "or %12,%10;"
879 "add %10,%5;"
880 "mov %8,%10;"
881 "ror $0xe,%10;"
882 "mov %5,%11;"
883 "xor %8,%10;"
884 "ror $0x9,%11;"
885 "mov %9,%12;"
886 "xor %5,%11;"
887 "ror $0x5,%10;"
888 "xor %3,%12;"
889 "xor %8,%10;"
890 "ror $0xb,%11;"
891 "and %8,%12;"
892 "xor %5,%11;"
893 "ror $0x6,%10;"
894 "xor %3,%12;"
895 "add %10,%12;"
896 "ror $0x2,%11;"
897 "add 8+%16,%12;"
898 "mov %5,%10;"
899 "add %12,%4;"
900 "mov %5,%12;"
901 "or %k2,%10;"
902 "add %4,%7;"
903 "and %k2,%12;"
904 "and %6,%10;"
905 "add %11,%4;"
906 "or %12,%10;"
907 "add %10,%4;"
908 "mov %7,%10;"
909 "ror $0xe,%10;"
910 "mov %4,%11;"
911 "xor %7,%10;"
912 "ror $0x9,%11;"
913 "mov %8,%12;"
914 "xor %4,%11;"
915 "ror $0x5,%10;"
916 "xor %9,%12;"
917 "xor %7,%10;"
918 "ror $0xb,%11;"
919 "and %7,%12;"
920 "xor %4,%11;"
921 "ror $0x6,%10;"
922 "xor %9,%12;"
923 "add %10,%12;"
924 "ror $0x2,%11;"
925 "add 12+%16,%12;"
926 "mov %4,%10;"
927 "add %12,%3;"
928 "mov %4,%12;"
929 "or %6,%10;"
930 "add %3,%k2;"
931 "and %6,%12;"
932 "and %5,%10;"
933 "add %11,%3;"
934 "or %12,%10;"
935 "add %10,%3;"
936 "movdqa %%xmm6,%%xmm4;"
937 "movdqa %%xmm7,%%xmm5;"
938 "sub $0x1,%1;"
939 "jne Lloop2_%=;"
        /* Feed-forward: add the working variables back into s[0..7] and
           store the updated state. */
940 "add (%0),%3;"
941 "mov %3,(%0);"
942 "add 0x4(%0),%4;"
943 "mov %4,0x4(%0);"
944 "add 0x8(%0),%5;"
945 "mov %5,0x8(%0);"
946 "add 0xc(%0),%6;"
947 "mov %6,0xc(%0);"
948 "add 0x10(%0),%k2;"
949 "mov %k2,0x10(%0);"
950 "add 0x14(%0),%7;"
951 "mov %7,0x14(%0);"
952 "add 0x18(%0),%8;"
953 "mov %8,0x18(%0);"
954 "add 0x1c(%0),%9;"
955 "mov %9,0x1c(%0);"
        /* Restore the input pointer, advance it by one 64-byte block, and
           loop until it reaches inp_end. */
956 "mov %15,%1;"
957 "add $0x40,%1;"
958 "cmp %14,%1;"
959 "jne Lloop0_%=;"
960
961 "Ldone_hash_%=:"
962
963 : "+r"(s), "+r"(chunk), "+r"(blocks), "=r"(a), "=r"(b), "=r"(c), "=r"(d), /* e = chunk */ "=r"(f), "=r"(g), "=r"(h), "=r"(y0), "=r"(y1), "=r"(y2), "=r"(tbl), "+m"(inp_end), "+m"(inp), "+m"(xfer)
964 : "m"(K256), "m"(FLIP_MASK), "m"(SHUF_00BA), "m"(SHUF_DC00)
965 : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12"
966 );
967}
968}
969
970/*
971;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
972; Copyright (c) 2012, Intel Corporation
973;
974; All rights reserved.
975;
976; Redistribution and use in source and binary forms, with or without
977; modification, are permitted provided that the following conditions are
978; met:
979;
980; * Redistributions of source code must retain the above copyright
981; notice, this list of conditions and the following disclaimer.
982;
983; * Redistributions in binary form must reproduce the above copyright
984; notice, this list of conditions and the following disclaimer in the
985; documentation and/or other materials provided with the
986; distribution.
987;
988; * Neither the name of the Intel Corporation nor the names of its
989; contributors may be used to endorse or promote products derived from
990; this software without specific prior written permission.
991;
992;
993; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
994; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
995; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
996; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
997; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
998; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
999; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
1000; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
1001; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
1002; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
1003; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1004;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1005;
1006; Example YASM command lines:
1007; Windows: yasm -Xvc -f x64 -rnasm -pnasm -o sha256_sse4.obj -g cv8 sha256_sse4.asm
1008; Linux: yasm -f x64 -f elf64 -X gnu -g dwarf2 -D LINUX -o sha256_sse4.o sha256_sse4.asm
1009;
1010;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1011;
1012; This code is described in an Intel White-Paper:
1013; "Fast SHA-256 Implementations on Intel Architecture Processors"
1014;
1015; To find it, surf to https://www.intel.com/p/en_US/embedded
1016; and search for that title.
1017; The paper is expected to be released roughly at the end of April, 2012
1018;
1019;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1020; This code schedules 1 blocks at a time, with 4 lanes per block
1021;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1022
1023%define MOVDQ movdqu ;; assume buffers not aligned
1024
1025;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
1026
1027; addm [mem], reg
1028; Add reg to mem using reg-mem add and store
1029%macro addm 2
1030 add %2, %1
1031 mov %1, %2
1032%endm
1033
1034;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1035
1036; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
1037; Load xmm with mem and byte swap each dword
1038%macro COPY_XMM_AND_BSWAP 3
1039 MOVDQ %1, %2
1040 pshufb %1, %3
1041%endmacro
1042
1043;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1044
1045%define X0 xmm4
1046%define X1 xmm5
1047%define X2 xmm6
1048%define X3 xmm7
1049
1050%define XTMP0 xmm0
1051%define XTMP1 xmm1
1052%define XTMP2 xmm2
1053%define XTMP3 xmm3
1054%define XTMP4 xmm8
1055%define XFER xmm9
1056
1057%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
1058%define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00
1059%define BYTE_FLIP_MASK xmm12
1060
1061%ifdef LINUX
1062%define NUM_BLKS rdx ; 3rd arg
1063%define CTX rsi ; 2nd arg
1064%define INP rdi ; 1st arg
1065
1066%define SRND rdi ; clobbers INP
1067%define c ecx
1068%define d r8d
1069%define e edx
1070%else
1071%define NUM_BLKS r8 ; 3rd arg
1072%define CTX rdx ; 2nd arg
1073%define INP rcx ; 1st arg
1074
1075%define SRND rcx ; clobbers INP
1076%define c edi
1077%define d esi
1078%define e r8d
1079
1080%endif
1081%define TBL rbp
1082%define a eax
1083%define b ebx
1084
1085%define f r9d
1086%define g r10d
1087%define h r11d
1088
1089%define y0 r13d
1090%define y1 r14d
1091%define y2 r15d
1092
1093
1094
1095_INP_END_SIZE equ 8
1096_INP_SIZE equ 8
1097_XFER_SIZE equ 8
1098%ifdef LINUX
1099_XMM_SAVE_SIZE equ 0
1100%else
1101_XMM_SAVE_SIZE equ 7*16
1102%endif
1103; STACK_SIZE plus pushes must be an odd multiple of 8
1104_ALIGN_SIZE equ 8
1105
1106_INP_END equ 0
1107_INP equ _INP_END + _INP_END_SIZE
1108_XFER equ _INP + _INP_SIZE
1109_XMM_SAVE equ _XFER + _XFER_SIZE + _ALIGN_SIZE
1110STACK_SIZE equ _XMM_SAVE + _XMM_SAVE_SIZE
1111
1112; rotate_Xs
1113; Rotate values of symbols X0...X3
1114%macro rotate_Xs 0
1115%xdefine X_ X0
1116%xdefine X0 X1
1117%xdefine X1 X2
1118%xdefine X2 X3
1119%xdefine X3 X_
1120%endm
1121
1122; ROTATE_ARGS
1123; Rotate values of symbols a...h
1124%macro ROTATE_ARGS 0
1125%xdefine TMP_ h
1126%xdefine h g
1127%xdefine g f
1128%xdefine f e
1129%xdefine e d
1130%xdefine d c
1131%xdefine c b
1132%xdefine b a
1133%xdefine a TMP_
1134%endm
1135
1136%macro FOUR_ROUNDS_AND_SCHED 0
1137 ;; compute s0 four at a time and s1 two at a time
1138 ;; compute W[-16] + W[-7] 4 at a time
1139 movdqa XTMP0, X3
1140 mov y0, e ; y0 = e
1141 ror y0, (25-11) ; y0 = e >> (25-11)
1142 mov y1, a ; y1 = a
1143 palignr XTMP0, X2, 4 ; XTMP0 = W[-7]
1144 ror y1, (22-13) ; y1 = a >> (22-13)
1145 xor y0, e ; y0 = e ^ (e >> (25-11))
1146 mov y2, f ; y2 = f
1147 ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1148 movdqa XTMP1, X1
1149 xor y1, a ; y1 = a ^ (a >> (22-13)
1150 xor y2, g ; y2 = f^g
1151 paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
1152 xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1153 and y2, e ; y2 = (f^g)&e
1154 ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1155 ;; compute s0
1156 palignr XTMP1, X0, 4 ; XTMP1 = W[-15]
1157 xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1158 ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1159 xor y2, g ; y2 = CH = ((f^g)&e)^g
1160 movdqa XTMP2, XTMP1 ; XTMP2 = W[-15]
1161 ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1162 add y2, y0 ; y2 = S1 + CH
1163 add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
1164 movdqa XTMP3, XTMP1 ; XTMP3 = W[-15]
1165 mov y0, a ; y0 = a
1166 add h, y2 ; h = h + S1 + CH + k + w
1167 mov y2, a ; y2 = a
1168 pslld XTMP1, (32-7)
1169 or y0, c ; y0 = a|c
1170 add d, h ; d = d + h + S1 + CH + k + w
1171 and y2, c ; y2 = a&c
1172 psrld XTMP2, 7
1173 and y0, b ; y0 = (a|c)&b
1174 add h, y1 ; h = h + S1 + CH + k + w + S0
1175 por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7
1176 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
1177 add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1178
1179ROTATE_ARGS
1180 movdqa XTMP2, XTMP3 ; XTMP2 = W[-15]
1181 mov y0, e ; y0 = e
1182 mov y1, a ; y1 = a
1183 movdqa XTMP4, XTMP3 ; XTMP4 = W[-15]
1184 ror y0, (25-11) ; y0 = e >> (25-11)
1185 xor y0, e ; y0 = e ^ (e >> (25-11))
1186 mov y2, f ; y2 = f
1187 ror y1, (22-13) ; y1 = a >> (22-13)
1188 pslld XTMP3, (32-18)
1189 xor y1, a ; y1 = a ^ (a >> (22-13)
1190 ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1191 xor y2, g ; y2 = f^g
1192 psrld XTMP2, 18
1193 ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1194 xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1195 and y2, e ; y2 = (f^g)&e
1196 ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1197 pxor XTMP1, XTMP3
1198 xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1199 xor y2, g ; y2 = CH = ((f^g)&e)^g
1200 psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3
1201 add y2, y0 ; y2 = S1 + CH
1202 add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
1203 ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1204 pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
1205 mov y0, a ; y0 = a
1206 add h, y2 ; h = h + S1 + CH + k + w
1207 mov y2, a ; y2 = a
1208 pxor XTMP1, XTMP4 ; XTMP1 = s0
1209 or y0, c ; y0 = a|c
1210 add d, h ; d = d + h + S1 + CH + k + w
1211 and y2, c ; y2 = a&c
1212 ;; compute low s1
1213 pshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
1214 and y0, b ; y0 = (a|c)&b
1215 add h, y1 ; h = h + S1 + CH + k + w + S0
1216 paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
1217 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
1218 add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1219
1220ROTATE_ARGS
1221 movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
1222 mov y0, e ; y0 = e
1223 mov y1, a ; y1 = a
1224 ror y0, (25-11) ; y0 = e >> (25-11)
1225 movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
1226 xor y0, e ; y0 = e ^ (e >> (25-11))
1227 ror y1, (22-13) ; y1 = a >> (22-13)
1228 mov y2, f ; y2 = f
1229 xor y1, a ; y1 = a ^ (a >> (22-13)
1230 ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1231 psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA}
1232 xor y2, g ; y2 = f^g
1233 psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA}
1234 xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1235 and y2, e ; y2 = (f^g)&e
1236 psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
1237 ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1238 xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1239 xor y2, g ; y2 = CH = ((f^g)&e)^g
1240 ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1241 pxor XTMP2, XTMP3
1242 add y2, y0 ; y2 = S1 + CH
1243 ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1244 add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
1245 pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
1246 mov y0, a ; y0 = a
1247 add h, y2 ; h = h + S1 + CH + k + w
1248 mov y2, a ; y2 = a
1249 pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
1250 or y0, c ; y0 = a|c
1251 add d, h ; d = d + h + S1 + CH + k + w
1252 and y2, c ; y2 = a&c
1253 paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
1254 and y0, b ; y0 = (a|c)&b
1255 add h, y1 ; h = h + S1 + CH + k + w + S0
1256 ;; compute high s1
1257 pshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
1258 or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
1259 add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1260
1261ROTATE_ARGS
1262 movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
1263 mov y0, e ; y0 = e
1264 ror y0, (25-11) ; y0 = e >> (25-11)
1265 mov y1, a ; y1 = a
1266 movdqa X0, XTMP2 ; X0 = W[-2] {DDCC}
1267 ror y1, (22-13) ; y1 = a >> (22-13)
1268 xor y0, e ; y0 = e ^ (e >> (25-11))
1269 mov y2, f ; y2 = f
1270 ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1271 psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC}
1272 xor y1, a ; y1 = a ^ (a >> (22-13)
1273 xor y2, g ; y2 = f^g
1274 psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC}
1275 xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1276 and y2, e ; y2 = (f^g)&e
1277 ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1278 psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC}
1279 xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1280 ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
1281 xor y2, g ; y2 = CH = ((f^g)&e)^g
1282 pxor XTMP2, XTMP3
1283 ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1284 add y2, y0 ; y2 = S1 + CH
1285 add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
1286 pxor X0, XTMP2 ; X0 = s1 {xDxC}
1287 mov y0, a ; y0 = a
1288 add h, y2 ; h = h + S1 + CH + k + w
1289 mov y2, a ; y2 = a
1290 pshufb X0, SHUF_DC00 ; X0 = s1 {DC00}
1291 or y0, c ; y0 = a|c
1292 add d, h ; d = d + h + S1 + CH + k + w
1293 and y2, c ; y2 = a&c
1294 paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
1295 and y0, b ; y0 = (a|c)&b
1296 add h, y1 ; h = h + S1 + CH + k + w + S0
1297	    or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
1298 add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1299
1300ROTATE_ARGS
1301rotate_Xs
1302%endm
1303
1304;; input is [rsp + _XFER + %1 * 4]
1305%macro DO_ROUND 1
1306 mov y0, e ; y0 = e
1307 ror y0, (25-11) ; y0 = e >> (25-11)
1308 mov y1, a ; y1 = a
1309 xor y0, e ; y0 = e ^ (e >> (25-11))
1310 ror y1, (22-13) ; y1 = a >> (22-13)
1311 mov y2, f ; y2 = f
1312	    xor	y1, a		; y1 = a ^ (a >> (22-13))
1313 ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
1314 xor y2, g ; y2 = f^g
1315 xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
1316 ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
1317 and y2, e ; y2 = (f^g)&e
1318 xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
1319	    ror	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
1320 xor y2, g ; y2 = CH = ((f^g)&e)^g
1321 add y2, y0 ; y2 = S1 + CH
1322 ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
1323 add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
1324 mov y0, a ; y0 = a
1325 add h, y2 ; h = h + S1 + CH + k + w
1326 mov y2, a ; y2 = a
1327 or y0, c ; y0 = a|c
1328 add d, h ; d = d + h + S1 + CH + k + w
1329 and y2, c ; y2 = a&c
1330 and y0, b ; y0 = (a|c)&b
1331 add h, y1 ; h = h + S1 + CH + k + w + S0
1332	    or	y0, y2		; y0 = MAJ = ((a|c)&b)|(a&c)
1333 add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
1334 ROTATE_ARGS
1335%endm
1336
1337;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1338;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1339;; void sha256_sse4(void *input_data, UINT32 digest[8], UINT64 num_blks)
1340;; arg 1 : pointer to input data
1341;; arg 2 : pointer to digest
1342;; arg 3 : Num blocks
1343section .text
1344global sha256_sse4
1345align 32
1346sha256_sse4:
1347 push rbx
1348%ifndef LINUX
1349 push rsi
1350 push rdi
1351%endif
1352 push rbp
1353 push r13
1354 push r14
1355 push r15
1356
1357 sub rsp,STACK_SIZE
1358%ifndef LINUX
1359 movdqa [rsp + _XMM_SAVE + 0*16],xmm6
1360 movdqa [rsp + _XMM_SAVE + 1*16],xmm7
1361 movdqa [rsp + _XMM_SAVE + 2*16],xmm8
1362 movdqa [rsp + _XMM_SAVE + 3*16],xmm9
1363 movdqa [rsp + _XMM_SAVE + 4*16],xmm10
1364 movdqa [rsp + _XMM_SAVE + 5*16],xmm11
1365 movdqa [rsp + _XMM_SAVE + 6*16],xmm12
1366%endif
1367
1368 shl NUM_BLKS, 6 ; convert to bytes
1369 jz done_hash
1370 add NUM_BLKS, INP ; pointer to end of data
1371 mov [rsp + _INP_END], NUM_BLKS
1372
1373 ;; load initial digest
1374 mov a,[4*0 + CTX]
1375 mov b,[4*1 + CTX]
1376 mov c,[4*2 + CTX]
1377 mov d,[4*3 + CTX]
1378 mov e,[4*4 + CTX]
1379 mov f,[4*5 + CTX]
1380 mov g,[4*6 + CTX]
1381 mov h,[4*7 + CTX]
1382
1383 movdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK wrt rip]
1384 movdqa SHUF_00BA, [_SHUF_00BA wrt rip]
1385 movdqa SHUF_DC00, [_SHUF_DC00 wrt rip]
1386
1387loop0:
1388 lea TBL,[K256 wrt rip]
1389
1390 ;; byte swap first 16 dwords
1391 COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
1392 COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
1393 COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
1394 COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
1395
1396 mov [rsp + _INP], INP
1397
1398 ;; schedule 48 input dwords, by doing 3 rounds of 16 each
1399 mov SRND, 3
1400align 16
1401loop1:
1402 movdqa XFER, [TBL + 0*16]
1403 paddd XFER, X0
1404 movdqa [rsp + _XFER], XFER
1405 FOUR_ROUNDS_AND_SCHED
1406
1407 movdqa XFER, [TBL + 1*16]
1408 paddd XFER, X0
1409 movdqa [rsp + _XFER], XFER
1410 FOUR_ROUNDS_AND_SCHED
1411
1412 movdqa XFER, [TBL + 2*16]
1413 paddd XFER, X0
1414 movdqa [rsp + _XFER], XFER
1415 FOUR_ROUNDS_AND_SCHED
1416
1417 movdqa XFER, [TBL + 3*16]
1418 paddd XFER, X0
1419 movdqa [rsp + _XFER], XFER
1420 add TBL, 4*16
1421 FOUR_ROUNDS_AND_SCHED
1422
1423 sub SRND, 1
1424 jne loop1
1425
1426 mov SRND, 2
1427loop2:
1428 paddd X0, [TBL + 0*16]
1429 movdqa [rsp + _XFER], X0
1430 DO_ROUND 0
1431 DO_ROUND 1
1432 DO_ROUND 2
1433 DO_ROUND 3
1434 paddd X1, [TBL + 1*16]
1435 movdqa [rsp + _XFER], X1
1436 add TBL, 2*16
1437 DO_ROUND 0
1438 DO_ROUND 1
1439 DO_ROUND 2
1440 DO_ROUND 3
1441
1442 movdqa X0, X2
1443 movdqa X1, X3
1444
1445 sub SRND, 1
1446 jne loop2
1447
1448 addm [4*0 + CTX],a
1449 addm [4*1 + CTX],b
1450 addm [4*2 + CTX],c
1451 addm [4*3 + CTX],d
1452 addm [4*4 + CTX],e
1453 addm [4*5 + CTX],f
1454 addm [4*6 + CTX],g
1455 addm [4*7 + CTX],h
1456
1457 mov INP, [rsp + _INP]
1458 add INP, 64
1459 cmp INP, [rsp + _INP_END]
1460 jne loop0
1461
1462done_hash:
1463%ifndef LINUX
1464 movdqa xmm6,[rsp + _XMM_SAVE + 0*16]
1465 movdqa xmm7,[rsp + _XMM_SAVE + 1*16]
1466 movdqa xmm8,[rsp + _XMM_SAVE + 2*16]
1467 movdqa xmm9,[rsp + _XMM_SAVE + 3*16]
1468 movdqa xmm10,[rsp + _XMM_SAVE + 4*16]
1469 movdqa xmm11,[rsp + _XMM_SAVE + 5*16]
1470 movdqa xmm12,[rsp + _XMM_SAVE + 6*16]
1471%endif
1472
1473 add rsp, STACK_SIZE
1474
1475 pop r15
1476 pop r14
1477 pop r13
1478 pop rbp
1479%ifndef LINUX
1480 pop rdi
1481 pop rsi
1482%endif
1483 pop rbx
1484
1485 ret
1486
1487
1488section .data
1489align 64
1490K256:
1491 dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
1492 dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
1493 dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
1494 dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
1495 dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
1496 dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
1497 dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
1498 dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
1499 dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
1500 dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
1501 dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
1502 dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
1503 dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
1504 dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
1505 dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
1506 dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
1507
1508PSHUFFLE_BYTE_FLIP_MASK: ddq 0x0c0d0e0f08090a0b0405060700010203
1509
1510; shuffle xBxA -> 00BA
1511_SHUF_00BA: ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
1512
1513; shuffle xDxC -> DC00
1514_SHUF_DC00: ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
1515*/
1516
1517#endif
void Transform(uint32_t *s, const unsigned char *chunk, size_t blocks)