Bitcoin Core 28.99.0
P2P Digital Currency
sha256_arm_shani.cpp
Go to the documentation of this file.
1// Copyright (c) 2022 The Bitcoin Core developers
2// Distributed under the MIT software license, see the accompanying
3// file COPYING or http://www.opensource.org/licenses/mit-license.php.
4//
5// Based on https://github.com/noloader/SHA-Intrinsics/blob/master/sha256-arm.c,
6// Written and placed in public domain by Jeffrey Walton.
7// Based on code from ARM, and by Johannes Schneiders, Skip Hovsmith and
8// Barry O'Rourke for the mbedTLS project.
9// Variant specialized for 64-byte inputs added by Pieter Wuille.
10
11#ifdef ENABLE_ARM_SHANI
12
13#include <array>
14#include <cstdint>
15#include <cstddef>
16#include <arm_acle.h>
17#include <arm_neon.h>
18
namespace {
/** SHA-256 round constants K[0..63] (FIPS 180-4, section 4.2.2): the first 32
 *  bits of the fractional parts of the cube roots of the first 64 primes.
 *  Declared with uint32x4_t alignment so groups of four constants can be
 *  loaded directly with vld1q_u32 (e.g. &K[0], &K[4], ...) in the round code
 *  below. */
alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> K =
{
    0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
    0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
    0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
    0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
    0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
    0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
    0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
    0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
    0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
    0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
    0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
    0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
    0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
    0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
    0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
    0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
};
}
40
namespace sha256_arm_shani {
/** Run the SHA-256 compression function over `blocks` consecutive 64-byte
 *  message blocks using the ARMv8 SHA-2 crypto extensions
 *  (vsha256hq/vsha256h2q for the rounds, vsha256su0q/vsha256su1q for the
 *  message schedule).
 *
 *  @param s      SHA-256 state: 8 uint32_t words, read and updated in place.
 *                Held in two vectors packed as the SHA instructions expect
 *                (STATE0 = "ABEF", STATE1 = "CDGH", per the save-variable
 *                names below).
 *  @param chunk  Pointer to `blocks` * 64 bytes of big-endian message data.
 *  @param blocks Number of 64-byte blocks to process.
 */
void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
{
    uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
    uint32x4_t MSG0, MSG1, MSG2, MSG3;
    uint32x4_t TMP0, TMP2;

    // Load state
    STATE0 = vld1q_u32(&s[0]);
    STATE1 = vld1q_u32(&s[4]);

    while (blocks--)
    {
        // Save state, so it can be added back in after the 64 rounds
        // (Davies-Meyer feed-forward).
        ABEF_SAVE = STATE0;
        CDGH_SAVE = STATE1;

        // Load and convert input chunk to Big Endian
        MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 0)));
        MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 16)));
        MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 32)));
        MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 48)));
        chunk += 64;

        // Original implementation preloaded message and constant addition which was 1-3% slower.
        // Now included as first step in quad round code saving one Q Neon register
        // "TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));"

        // Each "quad round" below performs 4 SHA-256 rounds: add the round
        // constants to one message vector, run the two halves of the round
        // function (vsha256hq/vsha256h2q), and — for rounds 1-48 — extend the
        // message schedule for 16 rounds ahead (vsha256su0q/vsha256su1q).

        // Rounds 1-4
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0]));
        TMP2 = STATE0;
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 5-8
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[4]));
        TMP2 = STATE0;
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 9-12
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[8]));
        TMP2 = STATE0;
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 13-16
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[12]));
        TMP2 = STATE0;
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // Rounds 17-20
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[16]));
        TMP2 = STATE0;
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 21-24
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[20]));
        TMP2 = STATE0;
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 25-28
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[24]));
        TMP2 = STATE0;
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 29-32
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[28]));
        TMP2 = STATE0;
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // Rounds 33-36
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[32]));
        TMP2 = STATE0;
        MSG0 = vsha256su0q_u32(MSG0, MSG1);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);

        // Rounds 37-40
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[36]));
        TMP2 = STATE0;
        MSG1 = vsha256su0q_u32(MSG1, MSG2);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);

        // Rounds 41-44
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[40]));
        TMP2 = STATE0;
        MSG2 = vsha256su0q_u32(MSG2, MSG3);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);

        // Rounds 45-48
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[44]));
        TMP2 = STATE0;
        MSG3 = vsha256su0q_u32(MSG3, MSG0);
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
        MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);

        // For the final 16 rounds no further message-schedule extension is
        // needed, so only the round function is executed.

        // Rounds 49-52
        TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[48]));
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Rounds 53-56
        TMP0 = vaddq_u32(MSG1, vld1q_u32(&K[52]));
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Rounds 57-60
        TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[56]));
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Rounds 61-64
        TMP0 = vaddq_u32(MSG3, vld1q_u32(&K[60]));
        TMP2 = STATE0;
        STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
        STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);

        // Update state
        STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
        STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
    }

    // Save final state
    vst1q_u32(&s[0], STATE0);
    vst1q_u32(&s[4], STATE1);
}
}
199
namespace sha256d64_arm_shani {
/** Compute double-SHA256 (SHA256(SHA256(x))) of two independent 64-byte
 *  inputs in parallel, using the ARMv8 SHA-2 crypto extensions.
 *
 *  The two lanes are interleaved throughout: every register suffixed 'A'
 *  belongs to the first 64-byte input (input[0..63]) and every register
 *  suffixed 'B' to the second (input[64..127]), letting the two hashes share
 *  each constant load and hide instruction latencies.
 *
 *  Structure (per lane, three compression-function invocations):
 *   - Transform 1: compress the 64-byte data block from IV.
 *   - Transform 2: compress the padding block of the 64-byte message. Its
 *     message schedule is input-independent, so it is precomputed in MIDS
 *     (round constants already folded in).
 *   - Transform 3: hash the 32-byte intermediate digest plus padding from IV
 *     again; FINS holds the few schedule+constant values that are fixed for
 *     this block, FINAL the padding words themselves (0x80... and the 0x100 =
 *     256-bit length).
 *
 *  @param output 64 bytes: the two 32-byte double-SHA256 results,
 *                lane A first, lane B second.
 *  @param input  128 bytes: two independent 64-byte messages.
 */
void Transform_2way(unsigned char* output, const unsigned char* input)
{
    /* Initial state (SHA-256 IV). */
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> INIT = {
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
    };

    /* Precomputed message schedule for the 2nd transform (with the round
       constants K already added in, so rounds need no extra addition). */
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 64> MIDS = {
        0xc28a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf374,
        0x649b69c1, 0xf0fe4786, 0x0fe1edc6, 0x240cf254,
        0x4fe9346f, 0x6cc984be, 0x61b9411e, 0x16f988fa,
        0xf2c65152, 0xa88e5a6d, 0xb019fc65, 0xb9d99ec7,
        0x9a1231c3, 0xe70eeaa0, 0xfdb1232b, 0xc7353eb0,
        0x3069bad5, 0xcb976d5f, 0x5a0f118f, 0xdc1eeefd,
        0x0a35b689, 0xde0b7a04, 0x58f4ca9d, 0xe15d5b16,
        0x007f3e86, 0x37088980, 0xa507ea32, 0x6fab9537,
        0x17406110, 0x0d8cd6f1, 0xcdaa3b6d, 0xc0bbbe37,
        0x83613bda, 0xdb48a363, 0x0b02e931, 0x6fd15ca7,
        0x521afaca, 0x31338431, 0x6ed41a95, 0x6d437890,
        0xc39c91f2, 0x9eccabbd, 0xb5c9a0e6, 0x532fb63c,
        0xd2c741c6, 0x07237ea3, 0xa4954b68, 0x4c191d76
    };

    /* A few precomputed message schedule values for the 3rd transform
       (schedule words that are constant because they come from the fixed
       padding, with round constants folded in). */
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 12> FINS = {
        0x5807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
        0x80000000, 0x00000000, 0x00000000, 0x00000000,
        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf274
    };

    /* Padding processed in the 3rd transform (byteswapped): the 0x80 marker
       byte and the 256-bit (0x100) message length. */
    alignas(uint32x4_t) static constexpr std::array<uint32_t, 8> FINAL = {0x80000000, 0, 0, 0, 0, 0, 0, 0x100};

    uint32x4_t STATE0A, STATE0B, STATE1A, STATE1B, ABEF_SAVEA, ABEF_SAVEB, CDGH_SAVEA, CDGH_SAVEB;
    uint32x4_t MSG0A, MSG0B, MSG1A, MSG1B, MSG2A, MSG2B, MSG3A, MSG3B;
    uint32x4_t TMP0A, TMP0B, TMP2A, TMP2B, TMP;

    // Transform 1: Load state
    STATE0A = vld1q_u32(&INIT[0]);
    STATE0B = STATE0A;
    STATE1A = vld1q_u32(&INIT[4]);
    STATE1B = STATE1A;

    // Transform 1: Load and convert input chunk to Big Endian
    MSG0A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 0)));
    MSG1A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 16)));
    MSG2A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 32)));
    MSG3A = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 48)));
    MSG0B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 64)));
    MSG1B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 80)));
    MSG2B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 96)));
    MSG3B = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(input + 112)));

    // Transform 1: Rounds 1-4
    TMP = vld1q_u32(&K[0]);
    TMP0A = vaddq_u32(MSG0A, TMP);
    TMP0B = vaddq_u32(MSG0B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);

    // Transform 1: Rounds 5-8
    TMP = vld1q_u32(&K[4]);
    TMP0A = vaddq_u32(MSG1A, TMP);
    TMP0B = vaddq_u32(MSG1B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);

    // Transform 1: Rounds 9-12
    TMP = vld1q_u32(&K[8]);
    TMP0A = vaddq_u32(MSG2A, TMP);
    TMP0B = vaddq_u32(MSG2B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);

    // Transform 1: Rounds 13-16
    TMP = vld1q_u32(&K[12]);
    TMP0A = vaddq_u32(MSG3A, TMP);
    TMP0B = vaddq_u32(MSG3B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);

    // Transform 1: Rounds 17-20
    TMP = vld1q_u32(&K[16]);
    TMP0A = vaddq_u32(MSG0A, TMP);
    TMP0B = vaddq_u32(MSG0B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);

    // Transform 1: Rounds 21-24
    TMP = vld1q_u32(&K[20]);
    TMP0A = vaddq_u32(MSG1A, TMP);
    TMP0B = vaddq_u32(MSG1B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);

    // Transform 1: Rounds 25-28
    TMP = vld1q_u32(&K[24]);
    TMP0A = vaddq_u32(MSG2A, TMP);
    TMP0B = vaddq_u32(MSG2B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);

    // Transform 1: Rounds 29-32
    TMP = vld1q_u32(&K[28]);
    TMP0A = vaddq_u32(MSG3A, TMP);
    TMP0B = vaddq_u32(MSG3B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);

    // Transform 1: Rounds 33-36
    TMP = vld1q_u32(&K[32]);
    TMP0A = vaddq_u32(MSG0A, TMP);
    TMP0B = vaddq_u32(MSG0B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);

    // Transform 1: Rounds 37-40
    TMP = vld1q_u32(&K[36]);
    TMP0A = vaddq_u32(MSG1A, TMP);
    TMP0B = vaddq_u32(MSG1B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);

    // Transform 1: Rounds 41-44
    TMP = vld1q_u32(&K[40]);
    TMP0A = vaddq_u32(MSG2A, TMP);
    TMP0B = vaddq_u32(MSG2B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);

    // Transform 1: Rounds 45-48
    TMP = vld1q_u32(&K[44]);
    TMP0A = vaddq_u32(MSG3A, TMP);
    TMP0B = vaddq_u32(MSG3B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);

    // Transform 1: Rounds 49-52 (no more schedule extension needed from here on)
    TMP = vld1q_u32(&K[48]);
    TMP0A = vaddq_u32(MSG0A, TMP);
    TMP0B = vaddq_u32(MSG0B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);

    // Transform 1: Rounds 53-56
    TMP = vld1q_u32(&K[52]);
    TMP0A = vaddq_u32(MSG1A, TMP);
    TMP0B = vaddq_u32(MSG1B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);

    // Transform 1: Rounds 57-60
    TMP = vld1q_u32(&K[56]);
    TMP0A = vaddq_u32(MSG2A, TMP);
    TMP0B = vaddq_u32(MSG2B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);

    // Transform 1: Rounds 61-64
    TMP = vld1q_u32(&K[60]);
    TMP0A = vaddq_u32(MSG3A, TMP);
    TMP0B = vaddq_u32(MSG3B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);

    // Transform 1: Update state (add back the IV; feed-forward)
    TMP = vld1q_u32(&INIT[0]);
    STATE0A = vaddq_u32(STATE0A, TMP);
    STATE0B = vaddq_u32(STATE0B, TMP);
    TMP = vld1q_u32(&INIT[4]);
    STATE1A = vaddq_u32(STATE1A, TMP);
    STATE1B = vaddq_u32(STATE1B, TMP);

    // Transform 2: Save state
    ABEF_SAVEA = STATE0A;
    ABEF_SAVEB = STATE0B;
    CDGH_SAVEA = STATE1A;
    CDGH_SAVEB = STATE1B;

    // Transform 2: processes the fixed padding block; the entire message
    // schedule plus round constants is precomputed in MIDS, so each quad
    // round is just a load and the two round-function instructions.

    // Transform 2: Rounds 1-4
    TMP = vld1q_u32(&MIDS[0]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 5-8
    TMP = vld1q_u32(&MIDS[4]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 9-12
    TMP = vld1q_u32(&MIDS[8]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 13-16
    TMP = vld1q_u32(&MIDS[12]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 17-20
    TMP = vld1q_u32(&MIDS[16]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 21-24
    TMP = vld1q_u32(&MIDS[20]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 25-28
    TMP = vld1q_u32(&MIDS[24]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 29-32
    TMP = vld1q_u32(&MIDS[28]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 33-36
    TMP = vld1q_u32(&MIDS[32]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 37-40
    TMP = vld1q_u32(&MIDS[36]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 41-44
    TMP = vld1q_u32(&MIDS[40]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 45-48
    TMP = vld1q_u32(&MIDS[44]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 49-52
    TMP = vld1q_u32(&MIDS[48]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 53-56
    TMP = vld1q_u32(&MIDS[52]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 57-60
    TMP = vld1q_u32(&MIDS[56]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Rounds 61-64
    TMP = vld1q_u32(&MIDS[60]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);

    // Transform 2: Update state
    STATE0A = vaddq_u32(STATE0A, ABEF_SAVEA);
    STATE0B = vaddq_u32(STATE0B, ABEF_SAVEB);
    STATE1A = vaddq_u32(STATE1A, CDGH_SAVEA);
    STATE1B = vaddq_u32(STATE1B, CDGH_SAVEB);

    // Transform 3: Pad previous output — the 32-byte intermediate digest
    // (still in STATEx) becomes the first half of the message; the second
    // half is the fixed padding from FINAL.
    MSG0A = STATE0A;
    MSG0B = STATE0B;
    MSG1A = STATE1A;
    MSG1B = STATE1B;
    MSG2A = vld1q_u32(&FINAL[0]);
    MSG2B = MSG2A;
    MSG3A = vld1q_u32(&FINAL[4]);
    MSG3B = MSG3A;

    // Transform 3: Load state (restart from the IV)
    STATE0A = vld1q_u32(&INIT[0]);
    STATE0B = STATE0A;
    STATE1A = vld1q_u32(&INIT[4]);
    STATE1B = STATE1A;

    // Transform 3: Rounds 1-4
    TMP = vld1q_u32(&K[0]);
    TMP0A = vaddq_u32(MSG0A, TMP);
    TMP0B = vaddq_u32(MSG0B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);

    // Transform 3: Rounds 5-8
    TMP = vld1q_u32(&K[4]);
    TMP0A = vaddq_u32(MSG1A, TMP);
    TMP0B = vaddq_u32(MSG1B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);

    // Transform 3: Rounds 9-12. These rounds consume the fixed padding words,
    // so the K-added schedule values come precomputed from FINS instead.
    TMP = vld1q_u32(&FINS[0]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG2A = vld1q_u32(&FINS[4]);
    MSG2B = MSG2A;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);

    // Transform 3: Rounds 13-16 (round input likewise precomputed in FINS)
    TMP = vld1q_u32(&FINS[8]);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP);
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);

    // Transform 3: Rounds 17-20
    TMP = vld1q_u32(&K[16]);
    TMP0A = vaddq_u32(MSG0A, TMP);
    TMP0B = vaddq_u32(MSG0B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);

    // Transform 3: Rounds 21-24
    TMP = vld1q_u32(&K[20]);
    TMP0A = vaddq_u32(MSG1A, TMP);
    TMP0B = vaddq_u32(MSG1B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);

    // Transform 3: Rounds 25-28
    TMP = vld1q_u32(&K[24]);
    TMP0A = vaddq_u32(MSG2A, TMP);
    TMP0B = vaddq_u32(MSG2B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);

    // Transform 3: Rounds 29-32
    TMP = vld1q_u32(&K[28]);
    TMP0A = vaddq_u32(MSG3A, TMP);
    TMP0B = vaddq_u32(MSG3B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);

    // Transform 3: Rounds 33-36
    TMP = vld1q_u32(&K[32]);
    TMP0A = vaddq_u32(MSG0A, TMP);
    TMP0B = vaddq_u32(MSG0B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG0A = vsha256su0q_u32(MSG0A, MSG1A);
    MSG0B = vsha256su0q_u32(MSG0B, MSG1B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG0A = vsha256su1q_u32(MSG0A, MSG2A, MSG3A);
    MSG0B = vsha256su1q_u32(MSG0B, MSG2B, MSG3B);

    // Transform 3: Rounds 37-40
    TMP = vld1q_u32(&K[36]);
    TMP0A = vaddq_u32(MSG1A, TMP);
    TMP0B = vaddq_u32(MSG1B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG1A = vsha256su0q_u32(MSG1A, MSG2A);
    MSG1B = vsha256su0q_u32(MSG1B, MSG2B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG1A = vsha256su1q_u32(MSG1A, MSG3A, MSG0A);
    MSG1B = vsha256su1q_u32(MSG1B, MSG3B, MSG0B);

    // Transform 3: Rounds 41-44
    TMP = vld1q_u32(&K[40]);
    TMP0A = vaddq_u32(MSG2A, TMP);
    TMP0B = vaddq_u32(MSG2B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG2A = vsha256su0q_u32(MSG2A, MSG3A);
    MSG2B = vsha256su0q_u32(MSG2B, MSG3B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG2A = vsha256su1q_u32(MSG2A, MSG0A, MSG1A);
    MSG2B = vsha256su1q_u32(MSG2B, MSG0B, MSG1B);

    // Transform 3: Rounds 45-48
    TMP = vld1q_u32(&K[44]);
    TMP0A = vaddq_u32(MSG3A, TMP);
    TMP0B = vaddq_u32(MSG3B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    MSG3A = vsha256su0q_u32(MSG3A, MSG0A);
    MSG3B = vsha256su0q_u32(MSG3B, MSG0B);
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);
    MSG3A = vsha256su1q_u32(MSG3A, MSG1A, MSG2A);
    MSG3B = vsha256su1q_u32(MSG3B, MSG1B, MSG2B);

    // Transform 3: Rounds 49-52 (no more schedule extension needed from here on)
    TMP = vld1q_u32(&K[48]);
    TMP0A = vaddq_u32(MSG0A, TMP);
    TMP0B = vaddq_u32(MSG0B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);

    // Transform 3: Rounds 53-56
    TMP = vld1q_u32(&K[52]);
    TMP0A = vaddq_u32(MSG1A, TMP);
    TMP0B = vaddq_u32(MSG1B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);

    // Transform 3: Rounds 57-60
    TMP = vld1q_u32(&K[56]);
    TMP0A = vaddq_u32(MSG2A, TMP);
    TMP0B = vaddq_u32(MSG2B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);

    // Transform 3: Rounds 61-64
    TMP = vld1q_u32(&K[60]);
    TMP0A = vaddq_u32(MSG3A, TMP);
    TMP0B = vaddq_u32(MSG3B, TMP);
    TMP2A = STATE0A;
    TMP2B = STATE0B;
    STATE0A = vsha256hq_u32(STATE0A, STATE1A, TMP0A);
    STATE0B = vsha256hq_u32(STATE0B, STATE1B, TMP0B);
    STATE1A = vsha256h2q_u32(STATE1A, TMP2A, TMP0A);
    STATE1B = vsha256h2q_u32(STATE1B, TMP2B, TMP0B);

    // Transform 3: Update state (add back the IV; feed-forward)
    TMP = vld1q_u32(&INIT[0]);
    STATE0A = vaddq_u32(STATE0A, TMP);
    STATE0B = vaddq_u32(STATE0B, TMP);
    TMP = vld1q_u32(&INIT[4]);
    STATE1A = vaddq_u32(STATE1A, TMP);
    STATE1B = vaddq_u32(STATE1B, TMP);

    // Store result, byteswapped back to big-endian output order
    vst1q_u8(output, vrev32q_u8(vreinterpretq_u8_u32(STATE0A)));
    vst1q_u8(output + 16, vrev32q_u8(vreinterpretq_u8_u32(STATE1A)));
    vst1q_u8(output + 32, vrev32q_u8(vreinterpretq_u8_u32(STATE0B)));
    vst1q_u8(output + 48, vrev32q_u8(vreinterpretq_u8_u32(STATE1B)));
}
}
898
899#endif
void Transform(uint32_t *s, const unsigned char *chunk, size_t blocks)
void Transform_2way(unsigned char *out, const unsigned char *in)