Without this patch, the chacha block counter is not incremented on neon
rounds, resulting in incorrect calculations and corrupt packets.

This also switches to using `--no-numbered --zero-commit` so that future
diffs are smaller.

Reported-by: Hans Geiblinger <cybrnook2002@yahoo.com>
Reviewed-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Cc: David Bauer <mail@david-bauer.net>
Cc: Petr Štetiar <ynezz@true.cz>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
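For context on the corruption: ChaCha state word 12 is the 32-bit block
counter. A minimal sketch of the invariant at stake, with a hypothetical
helper name rather than the kernel's actual code: the counter must advance
by one for every 64-byte keystream block the NEON path consumes, or the
next call regenerates and reuses the same keystream.

    #include <stdint.h>
    #include <stddef.h>

    /* Hypothetical helper: advance the ChaCha block counter after
     * `bytes` of keystream have been consumed; a partial tail block
     * still uses up one whole 64-byte keystream block. */
    static inline void chacha_advance_counter(uint32_t state[16], size_t bytes)
    {
        state[12] += (uint32_t)((bytes + 63) / 64);
    }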
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 6 Nov 2020 17:39:38 +0100
Subject: [PATCH] crypto: arm64/chacha - simplify tail block handling

commit c4fc6328d6c67690a7e6e03f43a5a976a13120ef upstream.

Based on lessons learnt from optimizing the 32-bit version of this driver,
we can simplify the arm64 version considerably, by reordering the final
two stores when the last block is not a multiple of 64 bytes. This removes
the need to use permutation instructions to calculate the elements that are
clobbered by the final overlapping store, given that the store of the
penultimate block now follows it, and that one carries the correct values
for those elements already.

While at it, simplify the overlapping loads as well, by calculating the
address of the final overlapping load upfront, and switching to this
address for every load that would otherwise extend past the end of the
source buffer.

There is no impact on performance, but the resulting code is substantially
smaller and easier to follow.

Cc: Eric Biggers <ebiggers@google.com>
Cc: "Jason A . Donenfeld" <Jason@zx2c4.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 arch/arm64/crypto/chacha-neon-core.S | 193 ++++++++++-----------------
 1 file changed, 69 insertions(+), 124 deletions(-)

--- a/arch/arm64/crypto/chacha-neon-core.S
+++ b/arch/arm64/crypto/chacha-neon-core.S
@@ -195,7 +195,6 @@ ENTRY(chacha_4block_xor_neon)
 	adr_l		x10, .Lpermute
 	and		x5, x4, #63
 	add		x10, x10, x5
-	add		x11, x10, #64
 
 	//
 	// This function encrypts four consecutive ChaCha blocks by loading
@@ -645,11 +644,11 @@ CPU_BE(	  rev		a15, a15	)
 	zip2		v31.4s, v14.4s, v15.4s
 	  eor		a15, a15, w9
 
-	mov		x3, #64
+	add		x3, x2, x4
+	sub		x3, x3, #128		// start of last block
+
 	subs		x5, x4, #128
-	add		x6, x5, x2
-	csel		x3, x3, xzr, ge
-	csel		x2, x2, x6, ge
+	csel		x2, x2, x3, ge
 
 	// interleave 64-bit words in state n, n+2
 	zip1		v0.2d, v16.2d, v18.2d
@@ -658,13 +657,10 @@ CPU_BE(	  rev		a15, a15	)
 	zip1		v8.2d, v17.2d, v19.2d
 	zip2		v12.2d, v17.2d, v19.2d
 	  stp		a2, a3, [x1, #-56]
-	ld1		{v16.16b-v19.16b}, [x2], x3
 
 	subs		x6, x4, #192
-	ccmp		x3, xzr, #4, lt
-	add		x7, x6, x2
-	csel		x3, x3, xzr, eq
-	csel		x2, x2, x7, eq
+	ld1		{v16.16b-v19.16b}, [x2], #64
+	csel		x2, x2, x3, ge
 
 	zip1		v1.2d, v20.2d, v22.2d
 	zip2		v5.2d, v20.2d, v22.2d
@@ -672,13 +668,10 @@ CPU_BE(	  rev		a15, a15	)
 	zip1		v9.2d, v21.2d, v23.2d
 	zip2		v13.2d, v21.2d, v23.2d
 	  stp		a6, a7, [x1, #-40]
-	ld1		{v20.16b-v23.16b}, [x2], x3
 
 	subs		x7, x4, #256
-	ccmp		x3, xzr, #4, lt
-	add		x8, x7, x2
-	csel		x3, x3, xzr, eq
-	csel		x2, x2, x8, eq
+	ld1		{v20.16b-v23.16b}, [x2], #64
+	csel		x2, x2, x3, ge
 
 	zip1		v2.2d, v24.2d, v26.2d
 	zip2		v6.2d, v24.2d, v26.2d
@@ -686,12 +679,10 @@ CPU_BE(	  rev		a15, a15	)
 	zip1		v10.2d, v25.2d, v27.2d
 	zip2		v14.2d, v25.2d, v27.2d
 	  stp		a10, a11, [x1, #-24]
-	ld1		{v24.16b-v27.16b}, [x2], x3
 
 	subs		x8, x4, #320
-	ccmp		x3, xzr, #4, lt
-	add		x9, x8, x2
-	csel		x2, x2, x9, eq
+	ld1		{v24.16b-v27.16b}, [x2], #64
+	csel		x2, x2, x3, ge
 
 	zip1		v3.2d, v28.2d, v30.2d
 	zip2		v7.2d, v28.2d, v30.2d
@@ -699,151 +690,105 @@ CPU_BE(	  rev		a15, a15	)
 	zip1		v11.2d, v29.2d, v31.2d
 	zip2		v15.2d, v29.2d, v31.2d
 	  stp		a14, a15, [x1, #-8]
+
+	tbnz		x5, #63, .Lt128
 	ld1		{v28.16b-v31.16b}, [x2]
 
 	// xor with corresponding input, write to output
-	tbnz		x5, #63, 0f
 	eor		v16.16b, v16.16b, v0.16b
 	eor		v17.16b, v17.16b, v1.16b
 	eor		v18.16b, v18.16b, v2.16b
 	eor		v19.16b, v19.16b, v3.16b
-	st1		{v16.16b-v19.16b}, [x1], #64
-	cbz		x5, .Lout
 
-	tbnz		x6, #63, 1f
+	tbnz		x6, #63, .Lt192
+
 	eor		v20.16b, v20.16b, v4.16b
 	eor		v21.16b, v21.16b, v5.16b
 	eor		v22.16b, v22.16b, v6.16b
 	eor		v23.16b, v23.16b, v7.16b
-	st1		{v20.16b-v23.16b}, [x1], #64
-	cbz		x6, .Lout
 
-	tbnz		x7, #63, 2f
+	st1		{v16.16b-v19.16b}, [x1], #64
+	tbnz		x7, #63, .Lt256
+
 	eor		v24.16b, v24.16b, v8.16b
 	eor		v25.16b, v25.16b, v9.16b
 	eor		v26.16b, v26.16b, v10.16b
 	eor		v27.16b, v27.16b, v11.16b
-	st1		{v24.16b-v27.16b}, [x1], #64
-	cbz		x7, .Lout
 
-	tbnz		x8, #63, 3f
+	st1		{v20.16b-v23.16b}, [x1], #64
+	tbnz		x8, #63, .Lt320
+
 	eor		v28.16b, v28.16b, v12.16b
 	eor		v29.16b, v29.16b, v13.16b
 	eor		v30.16b, v30.16b, v14.16b
 	eor		v31.16b, v31.16b, v15.16b
+
+	st1		{v24.16b-v27.16b}, [x1], #64
 	st1		{v28.16b-v31.16b}, [x1]
 
 .Lout:	frame_pop
 	ret
 
-	// fewer than 128 bytes of in/output
-0:	ld1		{v8.16b}, [x10]
-	ld1		{v9.16b}, [x11]
-	movi		v10.16b, #16
-	sub		x2, x1, #64
-	add		x1, x1, x5
-	ld1		{v16.16b-v19.16b}, [x2]
-	tbl		v4.16b, {v0.16b-v3.16b}, v8.16b
-	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
-	add		v8.16b, v8.16b, v10.16b
-	add		v9.16b, v9.16b, v10.16b
-	tbl		v5.16b, {v0.16b-v3.16b}, v8.16b
-	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
-	add		v8.16b, v8.16b, v10.16b
-	add		v9.16b, v9.16b, v10.16b
-	tbl		v6.16b, {v0.16b-v3.16b}, v8.16b
-	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
-	add		v8.16b, v8.16b, v10.16b
-	add		v9.16b, v9.16b, v10.16b
-	tbl		v7.16b, {v0.16b-v3.16b}, v8.16b
-	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b
-
-	eor		v20.16b, v20.16b, v4.16b
-	eor		v21.16b, v21.16b, v5.16b
-	eor		v22.16b, v22.16b, v6.16b
-	eor		v23.16b, v23.16b, v7.16b
-	st1		{v20.16b-v23.16b}, [x1]
-	b		.Lout
-
 	// fewer than 192 bytes of in/output
-1:	ld1		{v8.16b}, [x10]
-	ld1		{v9.16b}, [x11]
-	movi		v10.16b, #16
-	add		x1, x1, x6
-	tbl		v0.16b, {v4.16b-v7.16b}, v8.16b
-	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
-	add		v8.16b, v8.16b, v10.16b
-	add		v9.16b, v9.16b, v10.16b
-	tbl		v1.16b, {v4.16b-v7.16b}, v8.16b
-	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
-	add		v8.16b, v8.16b, v10.16b
-	add		v9.16b, v9.16b, v10.16b
-	tbl		v2.16b, {v4.16b-v7.16b}, v8.16b
-	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
-	add		v8.16b, v8.16b, v10.16b
-	add		v9.16b, v9.16b, v10.16b
-	tbl		v3.16b, {v4.16b-v7.16b}, v8.16b
-	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b
-
-	eor		v20.16b, v20.16b, v0.16b
-	eor		v21.16b, v21.16b, v1.16b
-	eor		v22.16b, v22.16b, v2.16b
-	eor		v23.16b, v23.16b, v3.16b
-	st1		{v20.16b-v23.16b}, [x1]
+.Lt192:	cbz		x5, 1f				// exactly 128 bytes?
+	ld1		{v28.16b-v31.16b}, [x10]
+	add		x5, x5, x1
+	tbl		v28.16b, {v4.16b-v7.16b}, v28.16b
+	tbl		v29.16b, {v4.16b-v7.16b}, v29.16b
+	tbl		v30.16b, {v4.16b-v7.16b}, v30.16b
+	tbl		v31.16b, {v4.16b-v7.16b}, v31.16b
+
+0:	eor		v20.16b, v20.16b, v28.16b
+	eor		v21.16b, v21.16b, v29.16b
+	eor		v22.16b, v22.16b, v30.16b
+	eor		v23.16b, v23.16b, v31.16b
+	st1		{v20.16b-v23.16b}, [x5]		// overlapping stores
+1:	st1		{v16.16b-v19.16b}, [x1]
 	b		.Lout
 
+	// fewer than 128 bytes of in/output
+.Lt128:	ld1		{v28.16b-v31.16b}, [x10]
+	add		x5, x5, x1
+	sub		x1, x1, #64
+	tbl		v28.16b, {v0.16b-v3.16b}, v28.16b
+	tbl		v29.16b, {v0.16b-v3.16b}, v29.16b
+	tbl		v30.16b, {v0.16b-v3.16b}, v30.16b
+	tbl		v31.16b, {v0.16b-v3.16b}, v31.16b
+	ld1		{v16.16b-v19.16b}, [x1]		// reload first output block
+	b		0b
+
 	// fewer than 256 bytes of in/output
-2:	ld1		{v4.16b}, [x10]
-	ld1		{v5.16b}, [x11]
-	movi		v6.16b, #16
-	add		x1, x1, x7
+.Lt256:	cbz		x6, 2f				// exactly 192 bytes?
+	ld1		{v4.16b-v7.16b}, [x10]
+	add		x6, x6, x1
 	tbl		v0.16b, {v8.16b-v11.16b}, v4.16b
-	tbx		v24.16b, {v20.16b-v23.16b}, v5.16b
-	add		v4.16b, v4.16b, v6.16b
-	add		v5.16b, v5.16b, v6.16b
-	tbl		v1.16b, {v8.16b-v11.16b}, v4.16b
-	tbx		v25.16b, {v20.16b-v23.16b}, v5.16b
-	add		v4.16b, v4.16b, v6.16b
-	add		v5.16b, v5.16b, v6.16b
-	tbl		v2.16b, {v8.16b-v11.16b}, v4.16b
-	tbx		v26.16b, {v20.16b-v23.16b}, v5.16b
-	add		v4.16b, v4.16b, v6.16b
-	add		v5.16b, v5.16b, v6.16b
-	tbl		v3.16b, {v8.16b-v11.16b}, v4.16b
-	tbx		v27.16b, {v20.16b-v23.16b}, v5.16b
-
-	eor		v24.16b, v24.16b, v0.16b
-	eor		v25.16b, v25.16b, v1.16b
-	eor		v26.16b, v26.16b, v2.16b
-	eor		v27.16b, v27.16b, v3.16b
-	st1		{v24.16b-v27.16b}, [x1]
+	tbl		v1.16b, {v8.16b-v11.16b}, v5.16b
+	tbl		v2.16b, {v8.16b-v11.16b}, v6.16b
+	tbl		v3.16b, {v8.16b-v11.16b}, v7.16b
+
+	eor		v28.16b, v28.16b, v0.16b
+	eor		v29.16b, v29.16b, v1.16b
+	eor		v30.16b, v30.16b, v2.16b
+	eor		v31.16b, v31.16b, v3.16b
+	st1		{v28.16b-v31.16b}, [x6]		// overlapping stores
+2:	st1		{v20.16b-v23.16b}, [x1]
 	b		.Lout
 
 	// fewer than 320 bytes of in/output
-3:	ld1		{v4.16b}, [x10]
-	ld1		{v5.16b}, [x11]
-	movi		v6.16b, #16
-	add		x1, x1, x8
+.Lt320:	cbz		x7, 3f				// exactly 256 bytes?
+	ld1		{v4.16b-v7.16b}, [x10]
+	add		x7, x7, x1
 	tbl		v0.16b, {v12.16b-v15.16b}, v4.16b
-	tbx		v28.16b, {v24.16b-v27.16b}, v5.16b
-	add		v4.16b, v4.16b, v6.16b
-	add		v5.16b, v5.16b, v6.16b
-	tbl		v1.16b, {v12.16b-v15.16b}, v4.16b
-	tbx		v29.16b, {v24.16b-v27.16b}, v5.16b
-	add		v4.16b, v4.16b, v6.16b
-	add		v5.16b, v5.16b, v6.16b
-	tbl		v2.16b, {v12.16b-v15.16b}, v4.16b
-	tbx		v30.16b, {v24.16b-v27.16b}, v5.16b
-	add		v4.16b, v4.16b, v6.16b
-	add		v5.16b, v5.16b, v6.16b
-	tbl		v3.16b, {v12.16b-v15.16b}, v4.16b
-	tbx		v31.16b, {v24.16b-v27.16b}, v5.16b
+	tbl		v1.16b, {v12.16b-v15.16b}, v5.16b
+	tbl		v2.16b, {v12.16b-v15.16b}, v6.16b
+	tbl		v3.16b, {v12.16b-v15.16b}, v7.16b
 
 	eor		v28.16b, v28.16b, v0.16b
 	eor		v29.16b, v29.16b, v1.16b
 	eor		v30.16b, v30.16b, v2.16b
 	eor		v31.16b, v31.16b, v3.16b
-	st1		{v28.16b-v31.16b}, [x1]
+	st1		{v28.16b-v31.16b}, [x7]		// overlapping stores
+3:	st1		{v24.16b-v27.16b}, [x1]
 	b		.Lout
 ENDPROC(chacha_4block_xor_neon)
 
@@ -851,7 +796,7 @@ ENDPROC(chacha_4block_xor_neon)
 	.align		L1_CACHE_SHIFT
 .Lpermute:
 	.set		.Li, 0
-	.rept		192
+	.rept		128
 	.byte		(.Li - 64)
 	.set		.Li, .Li + 1
 	.endr
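The tail handling this patch arrives at can be reduced to a short plain-C
sketch. This is an illustration under stated assumptions (a hypothetical
helper, a flat 128-byte keystream array, and a tail of 64 to 128 bytes),
not the NEON code itself; the real code builds the final block with tbl
permutes, so the lanes that spill into the previous block's range hold
stale values until the following store rewrites them.

    #include <stddef.h>
    #include <string.h>

    /* XOR a 64..128-byte tail with keystream ks, writing exactly len
     * bytes to dst, using only full 64-byte loads and stores. */
    static void xor_two_block_tail(unsigned char *dst, const unsigned char *src,
                                   const unsigned char ks[128], size_t len)
    {
        unsigned char tmp[64];
        size_t i;
        /* address of the final overlapping load, computed upfront */
        const unsigned char *last = src + len - 64;

        /* store the final (partial) block first, as one full-width store
         * ending exactly at dst + len; it spills into the previous
         * block's range ... */
        for (i = 0; i < 64; i++)
            tmp[i] = last[i] ^ ks[len - 64 + i];
        memcpy(dst + len - 64, tmp, 64);

        /* ... and the penultimate block's store follows, carrying the
         * correct values for the overlapped bytes */
        for (i = 0; i < 64; i++)
            tmp[i] = src[i] ^ ks[i];
        memcpy(dst, tmp, 64);
    }

This is why the diff can drop the .Lpermute-driven tbx fix-up sequences
and shrink the permute table itself from 192 to 128 entries: ordering the
stores does the repair work that the permutation instructions used to do.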