Bitcoin Core 30.99.0
P2P Digital Currency
sha256_arm_shani.cpp
Go to the documentation of this file.
1// Copyright (c) 2022 The Bitcoin Core developers
2// Distributed under the MIT software license, see the accompanying
3// file COPYING or http://www.opensource.org/licenses/mit-license.php.
4//
5// Based on https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-arm.c,
6// Written and placed in public domain by Jeffrey Walton.
7// Based on code from ARM, and by Johannes Schneiders, Skip Hovsmith and
8// Barry O'Rourke for the mbedTLS project.
9// Variant specialized for 64-byte inputs added by Pieter Wuille.
10
11#ifdef ENABLE_ARM_SHANI
12
13#include <array>
14#include <cstdint>
15#include <cstddef>
16#include <arm_neon.h>
17
namespace {
/** SHA-256 round constants K_t (FIPS 180-4, section 4.2.2): the first 32 bits
 *  of the fractional parts of the cube roots of the first 64 primes.
 *  Aligned to uint32x4_t so that each group of four constants can be loaded
 *  with an aligned vld1q_u32 in the quad-round code below. */
alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> K =
{
    0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
    0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
    0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
    0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
    0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
    0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
    0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
    0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
    0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
    0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
    0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
    0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
    0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
    0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
    0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
    0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
};
}
39
namespace sha256_arm_shani {
/** Apply `blocks` SHA-256 compressions to the 8-word midstate `s`, reading
 *  consecutive 64-byte message blocks from `chunk`, using the ARMv8 SHA-2
 *  crypto extensions. The state is kept as two 128-bit halves (s[0..3] and
 *  s[4..7]), the split the vsha256h/vsha256h2 instructions operate on. */
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
{
    // Running state halves, kept live across all blocks.
    uint32x4_t state0 = vld1q_u32(&s[0]);
    uint32x4_t state1 = vld1q_u32(&s[4]);

    // Four rounds of compression consuming message-word group w0, combined
    // with the message schedule extension that computes the next value of w0
    // from w0..w3. `k` points at the four round constants for this group.
    // The w0+k sum is taken before w0 is extended, matching the round order.
    auto quad_sched = [&](uint32x4_t& w0, uint32x4_t w1, uint32x4_t w2, uint32x4_t w3, const uint32_t* k) {
        const uint32x4_t wk = vaddq_u32(w0, vld1q_u32(k));
        const uint32x4_t prev0 = state0;
        w0 = vsha256su1q_u32(vsha256su0q_u32(w0, w1), w2, w3);
        state0 = vsha256hq_u32(state0, state1, wk);
        state1 = vsha256h2q_u32(state1, prev0, wk);
    };
    // Four rounds of compression only; used for rounds 49-64, where no
    // further schedule extension is required.
    auto quad = [&](uint32x4_t w, const uint32_t* k) {
        const uint32x4_t wk = vaddq_u32(w, vld1q_u32(k));
        const uint32x4_t prev0 = state0;
        state0 = vsha256hq_u32(state0, state1, wk);
        state1 = vsha256h2q_u32(state1, prev0, wk);
    };

    while (blocks--) {
        // Remember the input state for the feed-forward addition below.
        const uint32x4_t save0 = state0;
        const uint32x4_t save1 = state1;

        // Load the 64-byte block, byteswapping each 32-bit word to big endian.
        uint32x4_t m0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 0)));
        uint32x4_t m1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 16)));
        uint32x4_t m2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 32)));
        uint32x4_t m3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 48)));
        chunk += 64;

        quad_sched(m0, m1, m2, m3, &K[0]);   // rounds 1-4
        quad_sched(m1, m2, m3, m0, &K[4]);   // rounds 5-8
        quad_sched(m2, m3, m0, m1, &K[8]);   // rounds 9-12
        quad_sched(m3, m0, m1, m2, &K[12]);  // rounds 13-16
        quad_sched(m0, m1, m2, m3, &K[16]);  // rounds 17-20
        quad_sched(m1, m2, m3, m0, &K[20]);  // rounds 21-24
        quad_sched(m2, m3, m0, m1, &K[24]);  // rounds 25-28
        quad_sched(m3, m0, m1, m2, &K[28]);  // rounds 29-32
        quad_sched(m0, m1, m2, m3, &K[32]);  // rounds 33-36
        quad_sched(m1, m2, m3, m0, &K[36]);  // rounds 37-40
        quad_sched(m2, m3, m0, m1, &K[40]);  // rounds 41-44
        quad_sched(m3, m0, m1, m2, &K[44]);  // rounds 45-48
        quad(m0, &K[48]);                    // rounds 49-52
        quad(m1, &K[52]);                    // rounds 53-56
        quad(m2, &K[56]);                    // rounds 57-60
        quad(m3, &K[60]);                    // rounds 61-64

        // Feed-forward: add the saved input state.
        state0 = vaddq_u32(state0, save0);
        state1 = vaddq_u32(state1, save1);
    }

    // Save final state
    vst1q_u32(&s[0], state0);
    vst1q_u32(&s[4], state1);
}
}
198
namespace sha256d64_arm_shani {
/** Compute the double-SHA256 of two independent 64-byte inputs (at `input`
 *  and `input + 64`), writing the two 32-byte results to `output` and
 *  `output + 32`. The two lanes (A and B) are interleaved instruction-by-
 *  instruction to hide the latency of the SHA-2 crypto instructions.
 *
 *  Three compression-function applications are performed per lane:
 *    1. the 64-byte input block,
 *    2. the fixed padding block (whose message+constant sums are fully
 *       precomputed in MIDS, since the message is constant),
 *    3. the finalizing compression of the 32-byte intermediate hash plus
 *       padding (partially precomputed in FINS/FINAL). */
void Transform_2way(unsigned char* output, const unsigned char* input)
{
    /* Initial state. */
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> INIT = {
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
    };

    /* Precomputed message schedule for the 2nd transform. */
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> MIDS = {
        0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374,
        0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254,
        0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa,
        0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7,
        0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0,
        0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd,
        0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16,
        0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537,
        0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37,
        0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7,
        0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890,
        0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c,
        0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76
    };

    /* A few precomputed message schedule values for the 3rd transform. */
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 12> FINS = {
        0x5807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
        0x80000000, 0x00000000, 0x00000000, 0x00000000,
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf274
    };

    /* Padding processed in the 3rd transform (byteswapped). */
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> FINAL = {0x80000000, 0, 0, 0, 0, 0, 0, 0x100};

    // Running state halves for both lanes.
    uint32x4_t s0a, s0b, s1a, s1b;
    // Message schedule registers for both lanes.
    uint32x4_t m0a, m0b, m1a, m1b, m2a, m2b, m3a, m3b;

    // Four rounds of compression on both lanes; `k` points at four round
    // constants that still need adding to the current message words wa/wb.
    auto QuadRound = [&](uint32x4_t wa, uint32x4_t wb, const uint32_t* k) {
        const uint32x4_t kv = vld1q_u32(k);
        const uint32x4_t wka = vaddq_u32(wa, kv);
        const uint32x4_t wkb = vaddq_u32(wb, kv);
        const uint32x4_t prev_a = s0a;
        const uint32x4_t prev_b = s0b;
        s0a = vsha256hq_u32(s0a, s1a, wka);
        s0b = vsha256hq_u32(s0b, s1b, wkb);
        s1a = vsha256h2q_u32(s1a, prev_a, wka);
        s1b = vsha256h2q_u32(s1b, prev_b, wkb);
    };
    // Four rounds of compression on both lanes where the message+constant
    // sums `wk` are fully precomputed (identical for both lanes).
    auto QuadRoundPre = [&](const uint32_t* wk) {
        const uint32x4_t t = vld1q_u32(wk);
        const uint32x4_t prev_a = s0a;
        const uint32x4_t prev_b = s0b;
        s0a = vsha256hq_u32(s0a, s1a, t);
        s0b = vsha256hq_u32(s0b, s1b, t);
        s1a = vsha256h2q_u32(s1a, prev_a, t);
        s1b = vsha256h2q_u32(s1b, prev_b, t);
    };
    // Message schedule extension for both lanes: compute the next value of
    // message group w0 from the current w0..w3.
    auto Extend = [](uint32x4_t& w0a, uint32x4_t& w0b, uint32x4_t w1a, uint32x4_t w1b,
                     uint32x4_t w2a, uint32x4_t w2b, uint32x4_t w3a, uint32x4_t w3b) {
        w0a = vsha256su1q_u32(vsha256su0q_u32(w0a, w1a), w2a, w3a);
        w0b = vsha256su1q_u32(vsha256su0q_u32(w0b, w1b), w2b, w3b);
    };

    const uint32x4_t init0 = vld1q_u32(&INIT[0]);
    const uint32x4_t init1 = vld1q_u32(&INIT[4]);

    // Transform 1: Load state
    s0a = init0;
    s0b = init0;
    s1a = init1;
    s1b = init1;

    // Transform 1: Load and convert input chunks to Big Endian
    m0a = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 0)));
    m1a = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 16)));
    m2a = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 32)));
    m3a = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 48)));
    m0b = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 64)));
    m1b = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 80)));
    m2b = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 96)));
    m3b = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 112)));

    // Transform 1: rounds 1-48, with message schedule extension.
    QuadRound(m0a, m0b, &K[0]);  Extend(m0a, m0b, m1a, m1b, m2a, m2b, m3a, m3b);
    QuadRound(m1a, m1b, &K[4]);  Extend(m1a, m1b, m2a, m2b, m3a, m3b, m0a, m0b);
    QuadRound(m2a, m2b, &K[8]);  Extend(m2a, m2b, m3a, m3b, m0a, m0b, m1a, m1b);
    QuadRound(m3a, m3b, &K[12]); Extend(m3a, m3b, m0a, m0b, m1a, m1b, m2a, m2b);
    QuadRound(m0a, m0b, &K[16]); Extend(m0a, m0b, m1a, m1b, m2a, m2b, m3a, m3b);
    QuadRound(m1a, m1b, &K[20]); Extend(m1a, m1b, m2a, m2b, m3a, m3b, m0a, m0b);
    QuadRound(m2a, m2b, &K[24]); Extend(m2a, m2b, m3a, m3b, m0a, m0b, m1a, m1b);
    QuadRound(m3a, m3b, &K[28]); Extend(m3a, m3b, m0a, m0b, m1a, m1b, m2a, m2b);
    QuadRound(m0a, m0b, &K[32]); Extend(m0a, m0b, m1a, m1b, m2a, m2b, m3a, m3b);
    QuadRound(m1a, m1b, &K[36]); Extend(m1a, m1b, m2a, m2b, m3a, m3b, m0a, m0b);
    QuadRound(m2a, m2b, &K[40]); Extend(m2a, m2b, m3a, m3b, m0a, m0b, m1a, m1b);
    QuadRound(m3a, m3b, &K[44]); Extend(m3a, m3b, m0a, m0b, m1a, m1b, m2a, m2b);
    // Transform 1: rounds 49-64, no further extension needed.
    QuadRound(m0a, m0b, &K[48]);
    QuadRound(m1a, m1b, &K[52]);
    QuadRound(m2a, m2b, &K[56]);
    QuadRound(m3a, m3b, &K[60]);

    // Transform 1: feed forward the initial state.
    s0a = vaddq_u32(s0a, init0);
    s0b = vaddq_u32(s0b, init0);
    s1a = vaddq_u32(s1a, init1);
    s1b = vaddq_u32(s1b, init1);

    // Transform 2: save state for its feed-forward.
    const uint32x4_t save0a = s0a;
    const uint32x4_t save0b = s0b;
    const uint32x4_t save1a = s1a;
    const uint32x4_t save1b = s1b;

    // Transform 2: all 64 rounds use the fully precomputed schedule MIDS
    // (the message is the constant padding block, so w+k is known).
    for (int i = 0; i < 64; i += 4) {
        QuadRoundPre(&MIDS[i]);
    }

    // Transform 2: feed forward.
    s0a = vaddq_u32(s0a, save0a);
    s0b = vaddq_u32(s0b, save0b);
    s1a = vaddq_u32(s1a, save1a);
    s1b = vaddq_u32(s1b, save1b);

    // Transform 3: the message is the previous 32-byte result followed by
    // the fixed padding words in FINAL.
    m0a = s0a;
    m0b = s0b;
    m1a = s1a;
    m1b = s1b;
    m2a = vld1q_u32(&FINAL[0]);
    m2b = m2a;
    m3a = vld1q_u32(&FINAL[4]);
    m3b = m3a;

    // Transform 3: restart from the initial state.
    s0a = init0;
    s0b = init0;
    s1a = init1;
    s1b = init1;

    // Transform 3: rounds 1-8.
    QuadRound(m0a, m0b, &K[0]); Extend(m0a, m0b, m1a, m1b, m2a, m2b, m3a, m3b);
    QuadRound(m1a, m1b, &K[4]); Extend(m1a, m1b, m2a, m2b, m3a, m3b, m0a, m0b);
    // Transform 3: rounds 9-12. The w+k sums are the precomputed FINS[0..3]
    // (w2 is constant padding); only the su1 half of w2's extension depends
    // on the data, with its constant su0 part precomputed in FINS[4..7].
    QuadRoundPre(&FINS[0]);
    m2a = vsha256su1q_u32(vld1q_u32(&FINS[4]), m0a, m1a);
    m2b = vsha256su1q_u32(vld1q_u32(&FINS[4]), m0b, m1b);
    // Transform 3: rounds 13-16. The w+k sums are the precomputed FINS[8..11];
    // w3 still needs a normal extension.
    QuadRoundPre(&FINS[8]);
    Extend(m3a, m3b, m0a, m0b, m1a, m1b, m2a, m2b);
    // Transform 3: rounds 17-48.
    QuadRound(m0a, m0b, &K[16]); Extend(m0a, m0b, m1a, m1b, m2a, m2b, m3a, m3b);
    QuadRound(m1a, m1b, &K[20]); Extend(m1a, m1b, m2a, m2b, m3a, m3b, m0a, m0b);
    QuadRound(m2a, m2b, &K[24]); Extend(m2a, m2b, m3a, m3b, m0a, m0b, m1a, m1b);
    QuadRound(m3a, m3b, &K[28]); Extend(m3a, m3b, m0a, m0b, m1a, m1b, m2a, m2b);
    QuadRound(m0a, m0b, &K[32]); Extend(m0a, m0b, m1a, m1b, m2a, m2b, m3a, m3b);
    QuadRound(m1a, m1b, &K[36]); Extend(m1a, m1b, m2a, m2b, m3a, m3b, m0a, m0b);
    QuadRound(m2a, m2b, &K[40]); Extend(m2a, m2b, m3a, m3b, m0a, m0b, m1a, m1b);
    QuadRound(m3a, m3b, &K[44]); Extend(m3a, m3b, m0a, m0b, m1a, m1b, m2a, m2b);
    // Transform 3: rounds 49-64.
    QuadRound(m0a, m0b, &K[48]);
    QuadRound(m1a, m1b, &K[52]);
    QuadRound(m2a, m2b, &K[56]);
    QuadRound(m3a, m3b, &K[60]);

    // Transform 3: feed forward the initial state.
    s0a = vaddq_u32(s0a, init0);
    s0b = vaddq_u32(s0b, init0);
    s1a = vaddq_u32(s1a, init1);
    s1b = vaddq_u32(s1b, init1);

    // Store result, byteswapping back to big endian.
    vst1q_u8(output, vrev32q_u8(vreinterpretq_u8_u32(s0a)));
    vst1q_u8(output + 16, vrev32q_u8(vreinterpretq_u8_u32(s1a)));
    vst1q_u8(output + 32, vrev32q_u8(vreinterpretq_u8_u32(s0b)));
    vst1q_u8(output + 48, vrev32q_u8(vreinterpretq_u8_u32(s1b)));
}
}
897
898#endif
void Transform(uint32_t *s, const unsigned char *chunk, size_t blocks)
void Transform_2way(unsigned char *out, const unsigned char *in)