/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
 *
 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

        .text
        .align          6

ENTRY(chacha20_block_xor_neon)
        // x0: Input state matrix, s
        // x1: 1 data block output, o
        // x2: 1 data block input, i

        //
        // This function encrypts one ChaCha20 block by loading the state matrix
        // in four NEON registers. It performs the matrix operations on four
        // words in parallel, but requires shuffling to rearrange the words
        // after each round.
        //
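        // Register use:
        //   v0-v3  : the four rows of the working state matrix
        //   v8-v11 : copy of the initial state, added back after the rounds
        //   v12    : ROT8 byte-permutation table, used by tbl to rotate by 8
        //
        // For reference, one scalar ChaCha20 quarter-round (RFC 7539) is:
        //   a += b; d ^= a; d = rol32(d, 16);
        //   c += d; b ^= c; b = rol32(b, 12);
        //   a += b; d ^= a; d = rol32(d,  8);
        //   c += d; b ^= c; b = rol32(b,  7);
        //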
        // x0..3 = s0..3
        adr             x3, ROT8
        ld1             {v0.4s-v3.4s}, [x0]
        ld1             {v8.4s-v11.4s}, [x0]
        ld1             {v12.4s}, [x3]

        mov             x3, #10

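        // The loop below runs ten double rounds (20 ChaCha20 rounds): a
        // column round followed by a diagonal round per iteration. Each
        // rotate-left uses a different idiom:
        //   rotl 16    : rev32 on .8h lanes swaps the 16-bit halves of each word
        //   rotl 12, 7 : shl into the destination, then sri to merge in the
        //                bits that were shifted out
        //   rotl  8    : tbl byte permutation using the ROT8 table in v12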
.Ldoubleround:
        // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        add             v0.4s, v0.4s, v1.4s
        eor             v3.16b, v3.16b, v0.16b
        rev32           v3.8h, v3.8h

        // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        add             v2.4s, v2.4s, v3.4s
        eor             v4.16b, v1.16b, v2.16b
        shl             v1.4s, v4.4s, #12
        sri             v1.4s, v4.4s, #20

        // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        add             v0.4s, v0.4s, v1.4s
        eor             v3.16b, v3.16b, v0.16b
        tbl             v3.16b, {v3.16b}, v12.16b

        // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        add             v2.4s, v2.4s, v3.4s
        eor             v4.16b, v1.16b, v2.16b
        shl             v1.4s, v4.4s, #7
        sri             v1.4s, v4.4s, #25

        // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        ext             v1.16b, v1.16b, v1.16b, #4
        // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        ext             v2.16b, v2.16b, v2.16b, #8
        // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        ext             v3.16b, v3.16b, v3.16b, #12

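        // The ext instructions above rotate the words within rows 1, 2 and 3
        // by one, two and three positions respectively, so the next four
        // quarter-rounds operate on the diagonals of the state.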
        // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        add             v0.4s, v0.4s, v1.4s
        eor             v3.16b, v3.16b, v0.16b
        rev32           v3.8h, v3.8h

        // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        add             v2.4s, v2.4s, v3.4s
        eor             v4.16b, v1.16b, v2.16b
        shl             v1.4s, v4.4s, #12
        sri             v1.4s, v4.4s, #20

        // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        add             v0.4s, v0.4s, v1.4s
        eor             v3.16b, v3.16b, v0.16b
        tbl             v3.16b, {v3.16b}, v12.16b

        // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        add             v2.4s, v2.4s, v3.4s
        eor             v4.16b, v1.16b, v2.16b
        shl             v1.4s, v4.4s, #7
        sri             v1.4s, v4.4s, #25

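        // Rotate the rows back to their original word order so the state is
        // in normal layout for the next double round and the final addition.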
        // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        ext             v1.16b, v1.16b, v1.16b, #12
        // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        ext             v2.16b, v2.16b, v2.16b, #8
        // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        ext             v3.16b, v3.16b, v3.16b, #4

        subs            x3, x3, #1
        b.ne            .Ldoubleround

        ld1             {v4.16b-v7.16b}, [x2]

        // o0 = i0 ^ (x0 + s0)
        add             v0.4s, v0.4s, v8.4s
        eor             v0.16b, v0.16b, v4.16b

        // o1 = i1 ^ (x1 + s1)
        add             v1.4s, v1.4s, v9.4s
        eor             v1.16b, v1.16b, v5.16b

        // o2 = i2 ^ (x2 + s2)
        add             v2.4s, v2.4s, v10.4s
        eor             v2.16b, v2.16b, v6.16b

        // o3 = i3 ^ (x3 + s3)
        add             v3.4s, v3.4s, v11.4s
        eor             v3.16b, v3.16b, v7.16b

        st1             {v0.16b-v3.16b}, [x1]

        ret
ENDPROC(chacha20_block_xor_neon)

        .align          6
ENTRY(chacha20_4block_xor_neon)
        // x0: Input state matrix, s
        // x1: 4 data blocks output, o
        // x2: 4 data blocks input, i

        //
        // This function encrypts four consecutive ChaCha20 blocks by loading
        // the state matrix in NEON registers four times. The algorithm performs
        // each operation on the corresponding word of each state matrix, and
        // hence requires no word shuffling. For the final XORing step we
        // transpose the matrix by interleaving 32-bit and then 64-bit words,
        // which allows us to do the XOR in NEON registers.
        //
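        //
        // Data layout: the ld4r loads below replicate state word N across all
        // four lanes of vN, so lane j of v0-v15 holds the state of block j.
        // The four blocks differ only in the counter word, which is set up by
        // adding CTRINC (0, 1, 2, 3) to v12.
        //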
        adr             x3, CTRINC              // ... and ROT8
        ld1             {v30.4s-v31.4s}, [x3]
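        // v30 := CTRINC, v31 := ROT8 (the two tables are stored back to back
        // at the end of this file)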

        // x0..15[0-3] = s0..3[0..3]
        mov             x4, x0
        ld4r            { v0.4s- v3.4s}, [x4], #16
        ld4r            { v4.4s- v7.4s}, [x4], #16
        ld4r            { v8.4s-v11.4s}, [x4], #16
        ld4r            {v12.4s-v15.4s}, [x4]

        // x12 += counter values 0-3
        add             v12.4s, v12.4s, v30.4s

        mov             x3, #10

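        // As in the single-block routine, the rotates use rev32 (16), shl+sri
        // (12 and 7, with v16-v19 as scratch) and tbl with ROT8, now in v31 (8).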
.Ldoubleround4:
        // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
        // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
        // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
        // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
        add             v0.4s, v0.4s, v4.4s
        add             v1.4s, v1.4s, v5.4s
        add             v2.4s, v2.4s, v6.4s
        add             v3.4s, v3.4s, v7.4s

        eor             v12.16b, v12.16b, v0.16b
        eor             v13.16b, v13.16b, v1.16b
        eor             v14.16b, v14.16b, v2.16b
        eor             v15.16b, v15.16b, v3.16b

        rev32           v12.8h, v12.8h
        rev32           v13.8h, v13.8h
        rev32           v14.8h, v14.8h
        rev32           v15.8h, v15.8h

        // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
        // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
        // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
        // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
        add             v8.4s, v8.4s, v12.4s
        add             v9.4s, v9.4s, v13.4s
        add             v10.4s, v10.4s, v14.4s
        add             v11.4s, v11.4s, v15.4s

        eor             v16.16b, v4.16b, v8.16b
        eor             v17.16b, v5.16b, v9.16b
        eor             v18.16b, v6.16b, v10.16b
        eor             v19.16b, v7.16b, v11.16b

        shl             v4.4s, v16.4s, #12
        shl             v5.4s, v17.4s, #12
        shl             v6.4s, v18.4s, #12
        shl             v7.4s, v19.4s, #12

        sri             v4.4s, v16.4s, #20
        sri             v5.4s, v17.4s, #20
        sri             v6.4s, v18.4s, #20
        sri             v7.4s, v19.4s, #20

        // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
        // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
        // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
        // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
        add             v0.4s, v0.4s, v4.4s
        add             v1.4s, v1.4s, v5.4s
        add             v2.4s, v2.4s, v6.4s
        add             v3.4s, v3.4s, v7.4s

        eor             v12.16b, v12.16b, v0.16b
        eor             v13.16b, v13.16b, v1.16b
        eor             v14.16b, v14.16b, v2.16b
        eor             v15.16b, v15.16b, v3.16b

        tbl             v12.16b, {v12.16b}, v31.16b
        tbl             v13.16b, {v13.16b}, v31.16b
        tbl             v14.16b, {v14.16b}, v31.16b
        tbl             v15.16b, {v15.16b}, v31.16b

        // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
        // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
        // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
        // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
        add             v8.4s, v8.4s, v12.4s
        add             v9.4s, v9.4s, v13.4s
        add             v10.4s, v10.4s, v14.4s
        add             v11.4s, v11.4s, v15.4s

        eor             v16.16b, v4.16b, v8.16b
        eor             v17.16b, v5.16b, v9.16b
        eor             v18.16b, v6.16b, v10.16b
        eor             v19.16b, v7.16b, v11.16b

        shl             v4.4s, v16.4s, #7
        shl             v5.4s, v17.4s, #7
        shl             v6.4s, v18.4s, #7
        shl             v7.4s, v19.4s, #7

        sri             v4.4s, v16.4s, #25
        sri             v5.4s, v17.4s, #25
        sri             v6.4s, v18.4s, #25
        sri             v7.4s, v19.4s, #25

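        // Second half of the double round: the same quarter-round, applied to
        // the diagonals (x0, x5, x10, x15), (x1, x6, x11, x12),
        // (x2, x7, x8, x13) and (x3, x4, x9, x14). No shuffling is needed
        // since every word lives in its own register.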
        // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
        // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
        // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
        // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
        add             v0.4s, v0.4s, v5.4s
        add             v1.4s, v1.4s, v6.4s
        add             v2.4s, v2.4s, v7.4s
        add             v3.4s, v3.4s, v4.4s

        eor             v15.16b, v15.16b, v0.16b
        eor             v12.16b, v12.16b, v1.16b
        eor             v13.16b, v13.16b, v2.16b
        eor             v14.16b, v14.16b, v3.16b

        rev32           v15.8h, v15.8h
        rev32           v12.8h, v12.8h
        rev32           v13.8h, v13.8h
        rev32           v14.8h, v14.8h

        // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
        // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
        // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
        // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
        add             v10.4s, v10.4s, v15.4s
        add             v11.4s, v11.4s, v12.4s
        add             v8.4s, v8.4s, v13.4s
        add             v9.4s, v9.4s, v14.4s

        eor             v16.16b, v5.16b, v10.16b
        eor             v17.16b, v6.16b, v11.16b
        eor             v18.16b, v7.16b, v8.16b
        eor             v19.16b, v4.16b, v9.16b

        shl             v5.4s, v16.4s, #12
        shl             v6.4s, v17.4s, #12
        shl             v7.4s, v18.4s, #12
        shl             v4.4s, v19.4s, #12

        sri             v5.4s, v16.4s, #20
        sri             v6.4s, v17.4s, #20
        sri             v7.4s, v18.4s, #20
        sri             v4.4s, v19.4s, #20

        // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
        // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
        // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
        // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
        add             v0.4s, v0.4s, v5.4s
        add             v1.4s, v1.4s, v6.4s
        add             v2.4s, v2.4s, v7.4s
        add             v3.4s, v3.4s, v4.4s

        eor             v15.16b, v15.16b, v0.16b
        eor             v12.16b, v12.16b, v1.16b
        eor             v13.16b, v13.16b, v2.16b
        eor             v14.16b, v14.16b, v3.16b

        tbl             v15.16b, {v15.16b}, v31.16b
        tbl             v12.16b, {v12.16b}, v31.16b
        tbl             v13.16b, {v13.16b}, v31.16b
        tbl             v14.16b, {v14.16b}, v31.16b

        // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
        // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
        // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
        // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
        add             v10.4s, v10.4s, v15.4s
        add             v11.4s, v11.4s, v12.4s
        add             v8.4s, v8.4s, v13.4s
        add             v9.4s, v9.4s, v14.4s

        eor             v16.16b, v5.16b, v10.16b
        eor             v17.16b, v6.16b, v11.16b
        eor             v18.16b, v7.16b, v8.16b
        eor             v19.16b, v4.16b, v9.16b

        shl             v5.4s, v16.4s, #7
        shl             v6.4s, v17.4s, #7
        shl             v7.4s, v18.4s, #7
        shl             v4.4s, v19.4s, #7

        sri             v5.4s, v16.4s, #25
        sri             v6.4s, v17.4s, #25
        sri             v7.4s, v18.4s, #25
        sri             v4.4s, v19.4s, #25

        subs            x3, x3, #1
        b.ne            .Ldoubleround4

        ld4r            {v16.4s-v19.4s}, [x0], #16
        ld4r            {v20.4s-v23.4s}, [x0], #16

        // x12 += counter values 0-3
        add             v12.4s, v12.4s, v30.4s
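        // (the state words reloaded via ld4r do not include the per-block
        // counter offsets, so CTRINC is re-added here, before v30 is
        // clobbered by the final ld4r below)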

        // x0[0-3] += s0[0]
        // x1[0-3] += s0[1]
        // x2[0-3] += s0[2]
        // x3[0-3] += s0[3]
        add             v0.4s, v0.4s, v16.4s
        add             v1.4s, v1.4s, v17.4s
        add             v2.4s, v2.4s, v18.4s
        add             v3.4s, v3.4s, v19.4s

        ld4r            {v24.4s-v27.4s}, [x0], #16
        ld4r            {v28.4s-v31.4s}, [x0]

        // x4[0-3] += s1[0]
        // x5[0-3] += s1[1]
        // x6[0-3] += s1[2]
        // x7[0-3] += s1[3]
        add             v4.4s, v4.4s, v20.4s
        add             v5.4s, v5.4s, v21.4s
        add             v6.4s, v6.4s, v22.4s
        add             v7.4s, v7.4s, v23.4s

        // x8[0-3] += s2[0]
        // x9[0-3] += s2[1]
        // x10[0-3] += s2[2]
        // x11[0-3] += s2[3]
        add             v8.4s, v8.4s, v24.4s
        add             v9.4s, v9.4s, v25.4s
        add             v10.4s, v10.4s, v26.4s
        add             v11.4s, v11.4s, v27.4s

        // x12[0-3] += s3[0]
        // x13[0-3] += s3[1]
        // x14[0-3] += s3[2]
        // x15[0-3] += s3[3]
        add             v12.4s, v12.4s, v28.4s
        add             v13.4s, v13.4s, v29.4s
        add             v14.4s, v14.4s, v30.4s
        add             v15.4s, v15.4s, v31.4s

        // interleave 32-bit words in state n, n+1
        zip1            v16.4s, v0.4s, v1.4s
        zip2            v17.4s, v0.4s, v1.4s
        zip1            v18.4s, v2.4s, v3.4s
        zip2            v19.4s, v2.4s, v3.4s
        zip1            v20.4s, v4.4s, v5.4s
        zip2            v21.4s, v4.4s, v5.4s
        zip1            v22.4s, v6.4s, v7.4s
        zip2            v23.4s, v6.4s, v7.4s
        zip1            v24.4s, v8.4s, v9.4s
        zip2            v25.4s, v8.4s, v9.4s
        zip1            v26.4s, v10.4s, v11.4s
        zip2            v27.4s, v10.4s, v11.4s
        zip1            v28.4s, v12.4s, v13.4s
        zip2            v29.4s, v12.4s, v13.4s
        zip1            v30.4s, v14.4s, v15.4s
        zip2            v31.4s, v14.4s, v15.4s

        // interleave 64-bit words in state n, n+2
        zip1            v0.2d, v16.2d, v18.2d
        zip2            v4.2d, v16.2d, v18.2d
        zip1            v8.2d, v17.2d, v19.2d
        zip2            v12.2d, v17.2d, v19.2d
        ld1             {v16.16b-v19.16b}, [x2], #64

        zip1            v1.2d, v20.2d, v22.2d
        zip2            v5.2d, v20.2d, v22.2d
        zip1            v9.2d, v21.2d, v23.2d
        zip2            v13.2d, v21.2d, v23.2d
        ld1             {v20.16b-v23.16b}, [x2], #64

        zip1            v2.2d, v24.2d, v26.2d
        zip2            v6.2d, v24.2d, v26.2d
        zip1            v10.2d, v25.2d, v27.2d
        zip2            v14.2d, v25.2d, v27.2d
        ld1             {v24.16b-v27.16b}, [x2], #64

        zip1            v3.2d, v28.2d, v30.2d
        zip2            v7.2d, v28.2d, v30.2d
        zip1            v11.2d, v29.2d, v31.2d
        zip2            v15.2d, v29.2d, v31.2d
        ld1             {v28.16b-v31.16b}, [x2]

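        // After the transpose, v0-v3 hold the keystream for block 0, v4-v7
        // for block 1, v8-v11 for block 2 and v12-v15 for block 3, while
        // v16-v31 hold the 256 bytes of input loaded above.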
        // xor with corresponding input, write to output
        eor             v16.16b, v16.16b, v0.16b
        eor             v17.16b, v17.16b, v1.16b
        eor             v18.16b, v18.16b, v2.16b
        eor             v19.16b, v19.16b, v3.16b
        eor             v20.16b, v20.16b, v4.16b
        eor             v21.16b, v21.16b, v5.16b
        st1             {v16.16b-v19.16b}, [x1], #64
        eor             v22.16b, v22.16b, v6.16b
        eor             v23.16b, v23.16b, v7.16b
        eor             v24.16b, v24.16b, v8.16b
        eor             v25.16b, v25.16b, v9.16b
        st1             {v20.16b-v23.16b}, [x1], #64
        eor             v26.16b, v26.16b, v10.16b
        eor             v27.16b, v27.16b, v11.16b
        eor             v28.16b, v28.16b, v12.16b
        st1             {v24.16b-v27.16b}, [x1], #64
        eor             v29.16b, v29.16b, v13.16b
        eor             v30.16b, v30.16b, v14.16b
        eor             v31.16b, v31.16b, v15.16b
        st1             {v28.16b-v31.16b}, [x1]

        ret
ENDPROC(chacha20_4block_xor_neon)

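        // CTRINC: per-lane counter increments for the four parallel blocks.
        // ROT8:   tbl index vector that rotates each 32-bit word left by
        //         8 bits via a byte permutation.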
CTRINC: .word           0, 1, 2, 3
ROT8:   .word           0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f