Mirror of git://git.openwrt.org/openwrt/openwrt.git (synced 2025-11-03 14:34:27 -05:00)

Without this patch, the chacha block counter is not incremented on neon rounds,
resulting in incorrect calculations and corrupt packets. This also switches to
using `--no-numbered --zero-commit` so that future diffs are smaller.

Reported-by: Hans Geiblinger <cybrnook2002@yahoo.com>
Reviewed-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Cc: David Bauer <mail@david-bauer.net>
Cc: Petr Štetiar <ynezz@true.cz>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Sun, 5 Jan 2020 22:40:48 -0500
Subject: [PATCH] crypto: x86/poly1305 - wire up faster implementations for
 kernel

commit d7d7b853566254648df59f7ea27ea05952a6cfa8 upstream.

These x86_64 vectorized implementations support AVX, AVX-2, and AVX512F.
The AVX-512F implementation is disabled on Skylake, due to throttling,
but it is quite fast on >= Cannonlake.

On the left is cycle counts on a Core i7 6700HQ using the AVX-2
codepath, comparing this implementation ("new") to the implementation in
the current crypto api ("old"). On the right are benchmarks on a Xeon
Gold 5120 using the AVX-512 codepath. The new implementation is faster
on all benchmarks.

        AVX-2                  AVX-512
      ---------              -----------

    size    old     new      size   old     new
    ----    ----    ----     ----   ----    ----
    0       70      68       0      74      70
    16      92      90       16     96      92
    32      134     104      32     136     106
    48      172     120      48     184     124
    64      218     136      64     218     138
    80      254     158      80     260     160
    96      298     174      96     300     176
    112     342     192      112    342     194
    128     388     212      128    384     212
    144     428     228      144    420     226
    160     466     246      160    464     248
    176     510     264      176    504     264
    192     550     282      192    544     282
    208     594     302      208    582     300
    224     628     316      224    624     318
    240     676     334      240    662     338
    256     716     354      256    708     358
    272     764     374      272    748     372
    288     802     352      288    788     358
    304     420     366      304    422     370
    320     428     360      320    432     364
    336     484     378      336    486     380
    352     426     384      352    434     390
    368     478     400      368    480     408
    384     488     394      384    490     398
    400     542     408      400    542     412
    416     486     416      416    492     426
    432     534     430      432    538     436
    448     544     422      448    546     432
    464     600     438      464    600     448
    480     540     448      480    548     456
    496     594     464      496    594     476
    512     602     456      512    606     470
    528     656     476      528    656     480
    544     600     480      544    606     498
    560     650     494      560    652     512
    576     664     490      576    662     508
    592     714     508      592    716     522
    608     656     514      608    664     538
    624     708     532      624    710     552
    640     716     524      640    720     516
    656     770     536      656    772     526
    672     716     548      672    722     544
    688     770     562      688    768     556
    704     774     552      704    778     556
    720     826     568      720    832     568
    736     768     574      736    780     584
    752     822     592      752    826     600
    768     830     584      768    836     560
    784     884     602      784    888     572
    800     828     610      800    838     588
    816     884     628      816    884     604
    832     888     618      832    894     598
    848     942     632      848    946     612
    864     884     644      864    896     628
    880     936     660      880    942     644
    896     948     652      896    952     608
    912     1000    664      912    1004    616
    928     942     676      928    954     634
    944     994     690      944    1000    646
    960     1002    680      960    1008    646
    976     1054    694      976    1062    658
    992     1002    706      992    1012    674
    1008    1052    720      1008   1058    690

This commit wires in the prior implementation from Andy, and makes the
following changes to be suitable for kernel land.

  - Some cosmetic and structural changes, like renaming labels to
    .Lname, constants, and other Linux conventions, as well as making
    the code easy for us to maintain moving forward.

  - CPU feature checking is done in C by the glue code.

  - We avoid jumping into the middle of functions, to appease objtool,
    and instead parameterize shared code.

  - We maintain frame pointers so that stack traces make sense.

  - We remove the dependency on the perl xlate code, which transforms
    the output into things that assemblers we don't care about use.

Importantly, none of our changes affect the arithmetic or core code, but
just involve the differing environment of kernel space.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
Co-developed-by: Samuel Neves <sneves@dei.uc.pt>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 arch/x86/crypto/.gitignore                    |   1 +
 arch/x86/crypto/Makefile                      |  11 +-
 arch/x86/crypto/poly1305-avx2-x86_64.S        | 390 ----------
 arch/x86/crypto/poly1305-sse2-x86_64.S        | 590 ---------------
 arch/x86/crypto/poly1305-x86_64-cryptogams.pl | 682 ++++++++++--------
 arch/x86/crypto/poly1305_glue.c               | 473 +++++-------
 lib/crypto/Kconfig                            |   2 +-
 7 files changed, 572 insertions(+), 1577 deletions(-)
 create mode 100644 arch/x86/crypto/.gitignore
 delete mode 100644 arch/x86/crypto/poly1305-avx2-x86_64.S
 delete mode 100644 arch/x86/crypto/poly1305-sse2-x86_64.S

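Before the diff itself, it is worth spelling out the dispatch pattern behind the "CPU feature checking is done in C by the glue code" item above. The sketch below is illustrative only: it mimics the idea in plain userspace C using GCC/Clang's `__builtin_cpu_supports()`, whereas the real poly1305_glue.c in this patch uses the kernel's own feature tests and static keys, and the stub function names here are assumptions.

```c
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-ins for the scalar/AVX/AVX2 block routines emitted by the perl
 * script; the real prototypes live in the kernel's glue code, not here. */
typedef void (*poly1305_blocks_fn)(void *ctx, const uint8_t *inp,
				   size_t len, uint32_t padbit);

static void blocks_scalar(void *ctx, const uint8_t *inp, size_t len,
			  uint32_t padbit) { /* portable fallback */ }
static void blocks_avx(void *ctx, const uint8_t *inp, size_t len,
		       uint32_t padbit) { /* AVX path */ }
static void blocks_avx2(void *ctx, const uint8_t *inp, size_t len,
			uint32_t padbit) { /* AVX2 path */ }

static poly1305_blocks_fn poly1305_blocks = blocks_scalar;

/* Pick the widest implementation the running CPU supports, once, from C,
 * instead of branching on CPU features inside the assembly. */
static void poly1305_select_impl(void)
{
	__builtin_cpu_init();
	if (__builtin_cpu_supports("avx2"))
		poly1305_blocks = blocks_avx2;
	else if (__builtin_cpu_supports("avx"))
		poly1305_blocks = blocks_avx;
	/* otherwise keep the scalar fallback */
}

int main(void)
{
	poly1305_select_impl();
	printf("poly1305 blocks dispatch: %s\n",
	       poly1305_blocks == blocks_avx2 ? "avx2" :
	       poly1305_blocks == blocks_avx  ? "avx"  : "scalar");
	return 0;
}
```

The design point is that the selection happens once at initialization in C, so the assembler output itself stays CPU-generation-agnostic and objtool-friendly.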
--- /dev/null
 | 
						|
+++ b/arch/x86/crypto/.gitignore
 | 
						|
@@ -0,0 +1 @@
 | 
						|
+poly1305-x86_64.S
 | 
						|
--- a/arch/x86/crypto/Makefile
 | 
						|
+++ b/arch/x86/crypto/Makefile
 | 
						|
@@ -73,6 +73,10 @@ aegis128-aesni-y := aegis128-aesni-asm.o
 | 
						|
 
 | 
						|
 nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
 | 
						|
 blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
 | 
						|
+poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
 | 
						|
+ifneq ($(CONFIG_CRYPTO_POLY1305_X86_64),)
 | 
						|
+targets += poly1305-x86_64-cryptogams.S
 | 
						|
+endif
 | 
						|
 
 | 
						|
 ifeq ($(avx_supported),yes)
 | 
						|
 	camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
 | 
						|
@@ -101,10 +105,8 @@ aesni-intel-y := aesni-intel_asm.o aesni
 | 
						|
 aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
 | 
						|
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 | 
						|
 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
 | 
						|
-poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o
 | 
						|
 ifeq ($(avx2_supported),yes)
 | 
						|
 sha1-ssse3-y += sha1_avx2_x86_64_asm.o
 | 
						|
-poly1305-x86_64-y += poly1305-avx2-x86_64.o
 | 
						|
 endif
 | 
						|
 ifeq ($(sha1_ni_supported),yes)
 | 
						|
 sha1-ssse3-y += sha1_ni_asm.o
 | 
						|
@@ -118,3 +120,8 @@ sha256-ssse3-y += sha256_ni_asm.o
 | 
						|
 endif
 | 
						|
 sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
 | 
						|
 crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
 | 
						|
+
 | 
						|
+quiet_cmd_perlasm = PERLASM $@
 | 
						|
+      cmd_perlasm = $(PERL) $< > $@
 | 
						|
+$(obj)/%.S: $(src)/%.pl FORCE
 | 
						|
+	$(call if_changed,perlasm)
 | 
						|
--- a/arch/x86/crypto/poly1305-avx2-x86_64.S
 | 
						|
+++ /dev/null
 | 
						|
@@ -1,390 +0,0 @@
 | 
						|
-/* SPDX-License-Identifier: GPL-2.0-or-later */
 | 
						|
-/*
 | 
						|
- * Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions
 | 
						|
- *
 | 
						|
- * Copyright (C) 2015 Martin Willi
 | 
						|
- */
 | 
						|
-
 | 
						|
-#include <linux/linkage.h>
 | 
						|
-
 | 
						|
-.section	.rodata.cst32.ANMASK, "aM", @progbits, 32
 | 
						|
-.align 32
 | 
						|
-ANMASK:	.octa 0x0000000003ffffff0000000003ffffff
 | 
						|
-	.octa 0x0000000003ffffff0000000003ffffff
 | 
						|
-
 | 
						|
-.section	.rodata.cst32.ORMASK, "aM", @progbits, 32
 | 
						|
-.align 32
 | 
						|
-ORMASK:	.octa 0x00000000010000000000000001000000
 | 
						|
-	.octa 0x00000000010000000000000001000000
 | 
						|
-
 | 
						|
-.text
 | 
						|
-
 | 
						|
-#define h0 0x00(%rdi)
 | 
						|
-#define h1 0x04(%rdi)
 | 
						|
-#define h2 0x08(%rdi)
 | 
						|
-#define h3 0x0c(%rdi)
 | 
						|
-#define h4 0x10(%rdi)
 | 
						|
-#define r0 0x00(%rdx)
 | 
						|
-#define r1 0x04(%rdx)
 | 
						|
-#define r2 0x08(%rdx)
 | 
						|
-#define r3 0x0c(%rdx)
 | 
						|
-#define r4 0x10(%rdx)
 | 
						|
-#define u0 0x00(%r8)
 | 
						|
-#define u1 0x04(%r8)
 | 
						|
-#define u2 0x08(%r8)
 | 
						|
-#define u3 0x0c(%r8)
 | 
						|
-#define u4 0x10(%r8)
 | 
						|
-#define w0 0x18(%r8)
 | 
						|
-#define w1 0x1c(%r8)
 | 
						|
-#define w2 0x20(%r8)
 | 
						|
-#define w3 0x24(%r8)
 | 
						|
-#define w4 0x28(%r8)
 | 
						|
-#define y0 0x30(%r8)
 | 
						|
-#define y1 0x34(%r8)
 | 
						|
-#define y2 0x38(%r8)
 | 
						|
-#define y3 0x3c(%r8)
 | 
						|
-#define y4 0x40(%r8)
 | 
						|
-#define m %rsi
 | 
						|
-#define hc0 %ymm0
 | 
						|
-#define hc1 %ymm1
 | 
						|
-#define hc2 %ymm2
 | 
						|
-#define hc3 %ymm3
 | 
						|
-#define hc4 %ymm4
 | 
						|
-#define hc0x %xmm0
 | 
						|
-#define hc1x %xmm1
 | 
						|
-#define hc2x %xmm2
 | 
						|
-#define hc3x %xmm3
 | 
						|
-#define hc4x %xmm4
 | 
						|
-#define t1 %ymm5
 | 
						|
-#define t2 %ymm6
 | 
						|
-#define t1x %xmm5
 | 
						|
-#define t2x %xmm6
 | 
						|
-#define ruwy0 %ymm7
 | 
						|
-#define ruwy1 %ymm8
 | 
						|
-#define ruwy2 %ymm9
 | 
						|
-#define ruwy3 %ymm10
 | 
						|
-#define ruwy4 %ymm11
 | 
						|
-#define ruwy0x %xmm7
 | 
						|
-#define ruwy1x %xmm8
 | 
						|
-#define ruwy2x %xmm9
 | 
						|
-#define ruwy3x %xmm10
 | 
						|
-#define ruwy4x %xmm11
 | 
						|
-#define svxz1 %ymm12
 | 
						|
-#define svxz2 %ymm13
 | 
						|
-#define svxz3 %ymm14
 | 
						|
-#define svxz4 %ymm15
 | 
						|
-#define d0 %r9
 | 
						|
-#define d1 %r10
 | 
						|
-#define d2 %r11
 | 
						|
-#define d3 %r12
 | 
						|
-#define d4 %r13
 | 
						|
-
 | 
						|
-ENTRY(poly1305_4block_avx2)
 | 
						|
-	# %rdi: Accumulator h[5]
 | 
						|
-	# %rsi: 64 byte input block m
 | 
						|
-	# %rdx: Poly1305 key r[5]
 | 
						|
-	# %rcx: Quadblock count
 | 
						|
-	# %r8:  Poly1305 derived key r^2 u[5], r^3 w[5], r^4 y[5],
 | 
						|
-
 | 
						|
-	# This four-block variant uses loop unrolled block processing. It
 | 
						|
-	# requires 4 Poly1305 keys: r, r^2, r^3 and r^4:
 | 
						|
-	# h = (h + m) * r  =>  h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
 | 
						|
-
 | 
						|
-	vzeroupper
 | 
						|
-	push		%rbx
 | 
						|
-	push		%r12
 | 
						|
-	push		%r13
 | 
						|
-
 | 
						|
-	# combine r0,u0,w0,y0
 | 
						|
-	vmovd		y0,ruwy0x
 | 
						|
-	vmovd		w0,t1x
 | 
						|
-	vpunpcklqdq	t1,ruwy0,ruwy0
 | 
						|
-	vmovd		u0,t1x
 | 
						|
-	vmovd		r0,t2x
 | 
						|
-	vpunpcklqdq	t2,t1,t1
 | 
						|
-	vperm2i128	$0x20,t1,ruwy0,ruwy0
 | 
						|
-
 | 
						|
-	# combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5
 | 
						|
-	vmovd		y1,ruwy1x
 | 
						|
-	vmovd		w1,t1x
 | 
						|
-	vpunpcklqdq	t1,ruwy1,ruwy1
 | 
						|
-	vmovd		u1,t1x
 | 
						|
-	vmovd		r1,t2x
 | 
						|
-	vpunpcklqdq	t2,t1,t1
 | 
						|
-	vperm2i128	$0x20,t1,ruwy1,ruwy1
 | 
						|
-	vpslld		$2,ruwy1,svxz1
 | 
						|
-	vpaddd		ruwy1,svxz1,svxz1
 | 
						|
-
 | 
						|
-	# combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5
 | 
						|
-	vmovd		y2,ruwy2x
 | 
						|
-	vmovd		w2,t1x
 | 
						|
-	vpunpcklqdq	t1,ruwy2,ruwy2
 | 
						|
-	vmovd		u2,t1x
 | 
						|
-	vmovd		r2,t2x
 | 
						|
-	vpunpcklqdq	t2,t1,t1
 | 
						|
-	vperm2i128	$0x20,t1,ruwy2,ruwy2
 | 
						|
-	vpslld		$2,ruwy2,svxz2
 | 
						|
-	vpaddd		ruwy2,svxz2,svxz2
 | 
						|
-
 | 
						|
-	# combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5
 | 
						|
-	vmovd		y3,ruwy3x
 | 
						|
-	vmovd		w3,t1x
 | 
						|
-	vpunpcklqdq	t1,ruwy3,ruwy3
 | 
						|
-	vmovd		u3,t1x
 | 
						|
-	vmovd		r3,t2x
 | 
						|
-	vpunpcklqdq	t2,t1,t1
 | 
						|
-	vperm2i128	$0x20,t1,ruwy3,ruwy3
 | 
						|
-	vpslld		$2,ruwy3,svxz3
 | 
						|
-	vpaddd		ruwy3,svxz3,svxz3
 | 
						|
-
 | 
						|
-	# combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5
 | 
						|
-	vmovd		y4,ruwy4x
 | 
						|
-	vmovd		w4,t1x
 | 
						|
-	vpunpcklqdq	t1,ruwy4,ruwy4
 | 
						|
-	vmovd		u4,t1x
 | 
						|
-	vmovd		r4,t2x
 | 
						|
-	vpunpcklqdq	t2,t1,t1
 | 
						|
-	vperm2i128	$0x20,t1,ruwy4,ruwy4
 | 
						|
-	vpslld		$2,ruwy4,svxz4
 | 
						|
-	vpaddd		ruwy4,svxz4,svxz4
 | 
						|
-
 | 
						|
-.Ldoblock4:
 | 
						|
-	# hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff,
 | 
						|
-	#	 m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0]
 | 
						|
-	vmovd		0x00(m),hc0x
 | 
						|
-	vmovd		0x10(m),t1x
 | 
						|
-	vpunpcklqdq	t1,hc0,hc0
 | 
						|
-	vmovd		0x20(m),t1x
 | 
						|
-	vmovd		0x30(m),t2x
 | 
						|
-	vpunpcklqdq	t2,t1,t1
 | 
						|
-	vperm2i128	$0x20,t1,hc0,hc0
 | 
						|
-	vpand		ANMASK(%rip),hc0,hc0
 | 
						|
-	vmovd		h0,t1x
 | 
						|
-	vpaddd		t1,hc0,hc0
 | 
						|
-	# hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff,
 | 
						|
-	#	 (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 6] >> 2) & 0x3ffffff + h1]
 | 
						|
-	vmovd		0x03(m),hc1x
 | 
						|
-	vmovd		0x13(m),t1x
 | 
						|
-	vpunpcklqdq	t1,hc1,hc1
 | 
						|
-	vmovd		0x23(m),t1x
 | 
						|
-	vmovd		0x33(m),t2x
 | 
						|
-	vpunpcklqdq	t2,t1,t1
 | 
						|
-	vperm2i128	$0x20,t1,hc1,hc1
 | 
						|
-	vpsrld		$2,hc1,hc1
 | 
						|
-	vpand		ANMASK(%rip),hc1,hc1
 | 
						|
-	vmovd		h1,t1x
 | 
						|
-	vpaddd		t1,hc1,hc1
 | 
						|
-	# hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff,
 | 
						|
-	#	 (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2]
 | 
						|
-	vmovd		0x06(m),hc2x
 | 
						|
-	vmovd		0x16(m),t1x
 | 
						|
-	vpunpcklqdq	t1,hc2,hc2
 | 
						|
-	vmovd		0x26(m),t1x
 | 
						|
-	vmovd		0x36(m),t2x
 | 
						|
-	vpunpcklqdq	t2,t1,t1
 | 
						|
-	vperm2i128	$0x20,t1,hc2,hc2
 | 
						|
-	vpsrld		$4,hc2,hc2
 | 
						|
-	vpand		ANMASK(%rip),hc2,hc2
 | 
						|
-	vmovd		h2,t1x
 | 
						|
-	vpaddd		t1,hc2,hc2
 | 
						|
-	# hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff,
 | 
						|
-	#	 (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3]
 | 
						|
-	vmovd		0x09(m),hc3x
 | 
						|
-	vmovd		0x19(m),t1x
 | 
						|
-	vpunpcklqdq	t1,hc3,hc3
 | 
						|
-	vmovd		0x29(m),t1x
 | 
						|
-	vmovd		0x39(m),t2x
 | 
						|
-	vpunpcklqdq	t2,t1,t1
 | 
						|
-	vperm2i128	$0x20,t1,hc3,hc3
 | 
						|
-	vpsrld		$6,hc3,hc3
 | 
						|
-	vpand		ANMASK(%rip),hc3,hc3
 | 
						|
-	vmovd		h3,t1x
 | 
						|
-	vpaddd		t1,hc3,hc3
 | 
						|
-	# hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24),
 | 
						|
-	#	 (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4]
 | 
						|
-	vmovd		0x0c(m),hc4x
 | 
						|
-	vmovd		0x1c(m),t1x
 | 
						|
-	vpunpcklqdq	t1,hc4,hc4
 | 
						|
-	vmovd		0x2c(m),t1x
 | 
						|
-	vmovd		0x3c(m),t2x
 | 
						|
-	vpunpcklqdq	t2,t1,t1
 | 
						|
-	vperm2i128	$0x20,t1,hc4,hc4
 | 
						|
-	vpsrld		$8,hc4,hc4
 | 
						|
-	vpor		ORMASK(%rip),hc4,hc4
 | 
						|
-	vmovd		h4,t1x
 | 
						|
-	vpaddd		t1,hc4,hc4
 | 
						|
-
 | 
						|
-	# t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ]
 | 
						|
-	vpmuludq	hc0,ruwy0,t1
 | 
						|
-	# t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ]
 | 
						|
-	vpmuludq	hc1,svxz4,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	# t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ]
 | 
						|
-	vpmuludq	hc2,svxz3,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	# t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ]
 | 
						|
-	vpmuludq	hc3,svxz2,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	# t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ]
 | 
						|
-	vpmuludq	hc4,svxz1,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	# d0 = t1[0] + t1[1] + t[2] + t[3]
 | 
						|
-	vpermq		$0xee,t1,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	vpsrldq		$8,t1,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	vmovq		t1x,d0
 | 
						|
-
 | 
						|
-	# t1 = [ hc0[3] * r1, hc0[2] * u1,hc0[1] * w1, hc0[0] * y1 ]
 | 
						|
-	vpmuludq	hc0,ruwy1,t1
 | 
						|
-	# t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ]
 | 
						|
-	vpmuludq	hc1,ruwy0,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	# t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ]
 | 
						|
-	vpmuludq	hc2,svxz4,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	# t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ]
 | 
						|
-	vpmuludq	hc3,svxz3,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	# t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ]
 | 
						|
-	vpmuludq	hc4,svxz2,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	# d1 = t1[0] + t1[1] + t1[3] + t1[4]
 | 
						|
-	vpermq		$0xee,t1,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	vpsrldq		$8,t1,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	vmovq		t1x,d1
 | 
						|
-
 | 
						|
-	# t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ]
 | 
						|
-	vpmuludq	hc0,ruwy2,t1
 | 
						|
-	# t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ]
 | 
						|
-	vpmuludq	hc1,ruwy1,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	# t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ]
 | 
						|
-	vpmuludq	hc2,ruwy0,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	# t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ]
 | 
						|
-	vpmuludq	hc3,svxz4,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	# t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ]
 | 
						|
-	vpmuludq	hc4,svxz3,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	# d2 = t1[0] + t1[1] + t1[2] + t1[3]
 | 
						|
-	vpermq		$0xee,t1,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	vpsrldq		$8,t1,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	vmovq		t1x,d2
 | 
						|
-
 | 
						|
-	# t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ]
 | 
						|
-	vpmuludq	hc0,ruwy3,t1
 | 
						|
-	# t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ]
 | 
						|
-	vpmuludq	hc1,ruwy2,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	# t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ]
 | 
						|
-	vpmuludq	hc2,ruwy1,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	# t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ]
 | 
						|
-	vpmuludq	hc3,ruwy0,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	# t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ]
 | 
						|
-	vpmuludq	hc4,svxz4,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	# d3 = t1[0] + t1[1] + t1[2] + t1[3]
 | 
						|
-	vpermq		$0xee,t1,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	vpsrldq		$8,t1,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	vmovq		t1x,d3
 | 
						|
-
 | 
						|
-	# t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ]
 | 
						|
-	vpmuludq	hc0,ruwy4,t1
 | 
						|
-	# t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ]
 | 
						|
-	vpmuludq	hc1,ruwy3,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	# t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ]
 | 
						|
-	vpmuludq	hc2,ruwy2,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	# t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ]
 | 
						|
-	vpmuludq	hc3,ruwy1,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	# t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ]
 | 
						|
-	vpmuludq	hc4,ruwy0,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	# d4 = t1[0] + t1[1] + t1[2] + t1[3]
 | 
						|
-	vpermq		$0xee,t1,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	vpsrldq		$8,t1,t2
 | 
						|
-	vpaddq		t2,t1,t1
 | 
						|
-	vmovq		t1x,d4
 | 
						|
-
 | 
						|
-	# Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
 | 
						|
-	# h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
 | 
						|
-	# amount.  Careful: we must not assume the carry bits 'd0 >> 26',
 | 
						|
-	# 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
 | 
						|
-	# integers.  It's true in a single-block implementation, but not here.
 | 
						|
-
 | 
						|
-	# d1 += d0 >> 26
 | 
						|
-	mov		d0,%rax
 | 
						|
-	shr		$26,%rax
 | 
						|
-	add		%rax,d1
 | 
						|
-	# h0 = d0 & 0x3ffffff
 | 
						|
-	mov		d0,%rbx
 | 
						|
-	and		$0x3ffffff,%ebx
 | 
						|
-
 | 
						|
-	# d2 += d1 >> 26
 | 
						|
-	mov		d1,%rax
 | 
						|
-	shr		$26,%rax
 | 
						|
-	add		%rax,d2
 | 
						|
-	# h1 = d1 & 0x3ffffff
 | 
						|
-	mov		d1,%rax
 | 
						|
-	and		$0x3ffffff,%eax
 | 
						|
-	mov		%eax,h1
 | 
						|
-
 | 
						|
-	# d3 += d2 >> 26
 | 
						|
-	mov		d2,%rax
 | 
						|
-	shr		$26,%rax
 | 
						|
-	add		%rax,d3
 | 
						|
-	# h2 = d2 & 0x3ffffff
 | 
						|
-	mov		d2,%rax
 | 
						|
-	and		$0x3ffffff,%eax
 | 
						|
-	mov		%eax,h2
 | 
						|
-
 | 
						|
-	# d4 += d3 >> 26
 | 
						|
-	mov		d3,%rax
 | 
						|
-	shr		$26,%rax
 | 
						|
-	add		%rax,d4
 | 
						|
-	# h3 = d3 & 0x3ffffff
 | 
						|
-	mov		d3,%rax
 | 
						|
-	and		$0x3ffffff,%eax
 | 
						|
-	mov		%eax,h3
 | 
						|
-
 | 
						|
-	# h0 += (d4 >> 26) * 5
 | 
						|
-	mov		d4,%rax
 | 
						|
-	shr		$26,%rax
 | 
						|
-	lea		(%rax,%rax,4),%rax
 | 
						|
-	add		%rax,%rbx
 | 
						|
-	# h4 = d4 & 0x3ffffff
 | 
						|
-	mov		d4,%rax
 | 
						|
-	and		$0x3ffffff,%eax
 | 
						|
-	mov		%eax,h4
 | 
						|
-
 | 
						|
-	# h1 += h0 >> 26
 | 
						|
-	mov		%rbx,%rax
 | 
						|
-	shr		$26,%rax
 | 
						|
-	add		%eax,h1
 | 
						|
-	# h0 = h0 & 0x3ffffff
 | 
						|
-	andl		$0x3ffffff,%ebx
 | 
						|
-	mov		%ebx,h0
 | 
						|
-
 | 
						|
-	add		$0x40,m
 | 
						|
-	dec		%rcx
 | 
						|
-	jnz		.Ldoblock4
 | 
						|
-
 | 
						|
-	vzeroupper
 | 
						|
-	pop		%r13
 | 
						|
-	pop		%r12
 | 
						|
-	pop		%rbx
 | 
						|
-	ret
 | 
						|
-ENDPROC(poly1305_4block_avx2)
 | 
						|
--- a/arch/x86/crypto/poly1305-sse2-x86_64.S
 | 
						|
+++ /dev/null
 | 
						|
@@ -1,590 +0,0 @@
 | 
						|
-/* SPDX-License-Identifier: GPL-2.0-or-later */
 | 
						|
-/*
 | 
						|
- * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
 | 
						|
- *
 | 
						|
- * Copyright (C) 2015 Martin Willi
 | 
						|
- */
 | 
						|
-
 | 
						|
-#include <linux/linkage.h>
 | 
						|
-
 | 
						|
-.section	.rodata.cst16.ANMASK, "aM", @progbits, 16
 | 
						|
-.align 16
 | 
						|
-ANMASK:	.octa 0x0000000003ffffff0000000003ffffff
 | 
						|
-
 | 
						|
-.section	.rodata.cst16.ORMASK, "aM", @progbits, 16
 | 
						|
-.align 16
 | 
						|
-ORMASK:	.octa 0x00000000010000000000000001000000
 | 
						|
-
 | 
						|
-.text
 | 
						|
-
 | 
						|
-#define h0 0x00(%rdi)
 | 
						|
-#define h1 0x04(%rdi)
 | 
						|
-#define h2 0x08(%rdi)
 | 
						|
-#define h3 0x0c(%rdi)
 | 
						|
-#define h4 0x10(%rdi)
 | 
						|
-#define r0 0x00(%rdx)
 | 
						|
-#define r1 0x04(%rdx)
 | 
						|
-#define r2 0x08(%rdx)
 | 
						|
-#define r3 0x0c(%rdx)
 | 
						|
-#define r4 0x10(%rdx)
 | 
						|
-#define s1 0x00(%rsp)
 | 
						|
-#define s2 0x04(%rsp)
 | 
						|
-#define s3 0x08(%rsp)
 | 
						|
-#define s4 0x0c(%rsp)
 | 
						|
-#define m %rsi
 | 
						|
-#define h01 %xmm0
 | 
						|
-#define h23 %xmm1
 | 
						|
-#define h44 %xmm2
 | 
						|
-#define t1 %xmm3
 | 
						|
-#define t2 %xmm4
 | 
						|
-#define t3 %xmm5
 | 
						|
-#define t4 %xmm6
 | 
						|
-#define mask %xmm7
 | 
						|
-#define d0 %r8
 | 
						|
-#define d1 %r9
 | 
						|
-#define d2 %r10
 | 
						|
-#define d3 %r11
 | 
						|
-#define d4 %r12
 | 
						|
-
 | 
						|
-ENTRY(poly1305_block_sse2)
 | 
						|
-	# %rdi: Accumulator h[5]
 | 
						|
-	# %rsi: 16 byte input block m
 | 
						|
-	# %rdx: Poly1305 key r[5]
 | 
						|
-	# %rcx: Block count
 | 
						|
-
 | 
						|
-	# This single block variant tries to improve performance by doing two
 | 
						|
-	# multiplications in parallel using SSE instructions. There is quite
 | 
						|
-	# some quardword packing involved, hence the speedup is marginal.
 | 
						|
-
 | 
						|
-	push		%rbx
 | 
						|
-	push		%r12
 | 
						|
-	sub		$0x10,%rsp
 | 
						|
-
 | 
						|
-	# s1..s4 = r1..r4 * 5
 | 
						|
-	mov		r1,%eax
 | 
						|
-	lea		(%eax,%eax,4),%eax
 | 
						|
-	mov		%eax,s1
 | 
						|
-	mov		r2,%eax
 | 
						|
-	lea		(%eax,%eax,4),%eax
 | 
						|
-	mov		%eax,s2
 | 
						|
-	mov		r3,%eax
 | 
						|
-	lea		(%eax,%eax,4),%eax
 | 
						|
-	mov		%eax,s3
 | 
						|
-	mov		r4,%eax
 | 
						|
-	lea		(%eax,%eax,4),%eax
 | 
						|
-	mov		%eax,s4
 | 
						|
-
 | 
						|
-	movdqa		ANMASK(%rip),mask
 | 
						|
-
 | 
						|
-.Ldoblock:
 | 
						|
-	# h01 = [0, h1, 0, h0]
 | 
						|
-	# h23 = [0, h3, 0, h2]
 | 
						|
-	# h44 = [0, h4, 0, h4]
 | 
						|
-	movd		h0,h01
 | 
						|
-	movd		h1,t1
 | 
						|
-	movd		h2,h23
 | 
						|
-	movd		h3,t2
 | 
						|
-	movd		h4,h44
 | 
						|
-	punpcklqdq	t1,h01
 | 
						|
-	punpcklqdq	t2,h23
 | 
						|
-	punpcklqdq	h44,h44
 | 
						|
-
 | 
						|
-	# h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
 | 
						|
-	movd		0x00(m),t1
 | 
						|
-	movd		0x03(m),t2
 | 
						|
-	psrld		$2,t2
 | 
						|
-	punpcklqdq	t2,t1
 | 
						|
-	pand		mask,t1
 | 
						|
-	paddd		t1,h01
 | 
						|
-	# h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
 | 
						|
-	movd		0x06(m),t1
 | 
						|
-	movd		0x09(m),t2
 | 
						|
-	psrld		$4,t1
 | 
						|
-	psrld		$6,t2
 | 
						|
-	punpcklqdq	t2,t1
 | 
						|
-	pand		mask,t1
 | 
						|
-	paddd		t1,h23
 | 
						|
-	# h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
 | 
						|
-	mov		0x0c(m),%eax
 | 
						|
-	shr		$8,%eax
 | 
						|
-	or		$0x01000000,%eax
 | 
						|
-	movd		%eax,t1
 | 
						|
-	pshufd		$0xc4,t1,t1
 | 
						|
-	paddd		t1,h44
 | 
						|
-
 | 
						|
-	# t1[0] = h0 * r0 + h2 * s3
 | 
						|
-	# t1[1] = h1 * s4 + h3 * s2
 | 
						|
-	movd		r0,t1
 | 
						|
-	movd		s4,t2
 | 
						|
-	punpcklqdq	t2,t1
 | 
						|
-	pmuludq		h01,t1
 | 
						|
-	movd		s3,t2
 | 
						|
-	movd		s2,t3
 | 
						|
-	punpcklqdq	t3,t2
 | 
						|
-	pmuludq		h23,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	# t2[0] = h0 * r1 + h2 * s4
 | 
						|
-	# t2[1] = h1 * r0 + h3 * s3
 | 
						|
-	movd		r1,t2
 | 
						|
-	movd		r0,t3
 | 
						|
-	punpcklqdq	t3,t2
 | 
						|
-	pmuludq		h01,t2
 | 
						|
-	movd		s4,t3
 | 
						|
-	movd		s3,t4
 | 
						|
-	punpcklqdq	t4,t3
 | 
						|
-	pmuludq		h23,t3
 | 
						|
-	paddq		t3,t2
 | 
						|
-	# t3[0] = h4 * s1
 | 
						|
-	# t3[1] = h4 * s2
 | 
						|
-	movd		s1,t3
 | 
						|
-	movd		s2,t4
 | 
						|
-	punpcklqdq	t4,t3
 | 
						|
-	pmuludq		h44,t3
 | 
						|
-	# d0 = t1[0] + t1[1] + t3[0]
 | 
						|
-	# d1 = t2[0] + t2[1] + t3[1]
 | 
						|
-	movdqa		t1,t4
 | 
						|
-	punpcklqdq	t2,t4
 | 
						|
-	punpckhqdq	t2,t1
 | 
						|
-	paddq		t4,t1
 | 
						|
-	paddq		t3,t1
 | 
						|
-	movq		t1,d0
 | 
						|
-	psrldq		$8,t1
 | 
						|
-	movq		t1,d1
 | 
						|
-
 | 
						|
-	# t1[0] = h0 * r2 + h2 * r0
 | 
						|
-	# t1[1] = h1 * r1 + h3 * s4
 | 
						|
-	movd		r2,t1
 | 
						|
-	movd		r1,t2
 | 
						|
-	punpcklqdq 	t2,t1
 | 
						|
-	pmuludq		h01,t1
 | 
						|
-	movd		r0,t2
 | 
						|
-	movd		s4,t3
 | 
						|
-	punpcklqdq	t3,t2
 | 
						|
-	pmuludq		h23,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	# t2[0] = h0 * r3 + h2 * r1
 | 
						|
-	# t2[1] = h1 * r2 + h3 * r0
 | 
						|
-	movd		r3,t2
 | 
						|
-	movd		r2,t3
 | 
						|
-	punpcklqdq	t3,t2
 | 
						|
-	pmuludq		h01,t2
 | 
						|
-	movd		r1,t3
 | 
						|
-	movd		r0,t4
 | 
						|
-	punpcklqdq	t4,t3
 | 
						|
-	pmuludq		h23,t3
 | 
						|
-	paddq		t3,t2
 | 
						|
-	# t3[0] = h4 * s3
 | 
						|
-	# t3[1] = h4 * s4
 | 
						|
-	movd		s3,t3
 | 
						|
-	movd		s4,t4
 | 
						|
-	punpcklqdq	t4,t3
 | 
						|
-	pmuludq		h44,t3
 | 
						|
-	# d2 = t1[0] + t1[1] + t3[0]
 | 
						|
-	# d3 = t2[0] + t2[1] + t3[1]
 | 
						|
-	movdqa		t1,t4
 | 
						|
-	punpcklqdq	t2,t4
 | 
						|
-	punpckhqdq	t2,t1
 | 
						|
-	paddq		t4,t1
 | 
						|
-	paddq		t3,t1
 | 
						|
-	movq		t1,d2
 | 
						|
-	psrldq		$8,t1
 | 
						|
-	movq		t1,d3
 | 
						|
-
 | 
						|
-	# t1[0] = h0 * r4 + h2 * r2
 | 
						|
-	# t1[1] = h1 * r3 + h3 * r1
 | 
						|
-	movd		r4,t1
 | 
						|
-	movd		r3,t2
 | 
						|
-	punpcklqdq	t2,t1
 | 
						|
-	pmuludq		h01,t1
 | 
						|
-	movd		r2,t2
 | 
						|
-	movd		r1,t3
 | 
						|
-	punpcklqdq	t3,t2
 | 
						|
-	pmuludq		h23,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	# t3[0] = h4 * r0
 | 
						|
-	movd		r0,t3
 | 
						|
-	pmuludq		h44,t3
 | 
						|
-	# d4 = t1[0] + t1[1] + t3[0]
 | 
						|
-	movdqa		t1,t4
 | 
						|
-	psrldq		$8,t4
 | 
						|
-	paddq		t4,t1
 | 
						|
-	paddq		t3,t1
 | 
						|
-	movq		t1,d4
 | 
						|
-
 | 
						|
-	# d1 += d0 >> 26
 | 
						|
-	mov		d0,%rax
 | 
						|
-	shr		$26,%rax
 | 
						|
-	add		%rax,d1
 | 
						|
-	# h0 = d0 & 0x3ffffff
 | 
						|
-	mov		d0,%rbx
 | 
						|
-	and		$0x3ffffff,%ebx
 | 
						|
-
 | 
						|
-	# d2 += d1 >> 26
 | 
						|
-	mov		d1,%rax
 | 
						|
-	shr		$26,%rax
 | 
						|
-	add		%rax,d2
 | 
						|
-	# h1 = d1 & 0x3ffffff
 | 
						|
-	mov		d1,%rax
 | 
						|
-	and		$0x3ffffff,%eax
 | 
						|
-	mov		%eax,h1
 | 
						|
-
 | 
						|
-	# d3 += d2 >> 26
 | 
						|
-	mov		d2,%rax
 | 
						|
-	shr		$26,%rax
 | 
						|
-	add		%rax,d3
 | 
						|
-	# h2 = d2 & 0x3ffffff
 | 
						|
-	mov		d2,%rax
 | 
						|
-	and		$0x3ffffff,%eax
 | 
						|
-	mov		%eax,h2
 | 
						|
-
 | 
						|
-	# d4 += d3 >> 26
 | 
						|
-	mov		d3,%rax
 | 
						|
-	shr		$26,%rax
 | 
						|
-	add		%rax,d4
 | 
						|
-	# h3 = d3 & 0x3ffffff
 | 
						|
-	mov		d3,%rax
 | 
						|
-	and		$0x3ffffff,%eax
 | 
						|
-	mov		%eax,h3
 | 
						|
-
 | 
						|
-	# h0 += (d4 >> 26) * 5
 | 
						|
-	mov		d4,%rax
 | 
						|
-	shr		$26,%rax
 | 
						|
-	lea		(%rax,%rax,4),%rax
 | 
						|
-	add		%rax,%rbx
 | 
						|
-	# h4 = d4 & 0x3ffffff
 | 
						|
-	mov		d4,%rax
 | 
						|
-	and		$0x3ffffff,%eax
 | 
						|
-	mov		%eax,h4
 | 
						|
-
 | 
						|
-	# h1 += h0 >> 26
 | 
						|
-	mov		%rbx,%rax
 | 
						|
-	shr		$26,%rax
 | 
						|
-	add		%eax,h1
 | 
						|
-	# h0 = h0 & 0x3ffffff
 | 
						|
-	andl		$0x3ffffff,%ebx
 | 
						|
-	mov		%ebx,h0
 | 
						|
-
 | 
						|
-	add		$0x10,m
 | 
						|
-	dec		%rcx
 | 
						|
-	jnz		.Ldoblock
 | 
						|
-
 | 
						|
-	# Zeroing of key material
 | 
						|
-	mov		%rcx,0x00(%rsp)
 | 
						|
-	mov		%rcx,0x08(%rsp)
 | 
						|
-
 | 
						|
-	add		$0x10,%rsp
 | 
						|
-	pop		%r12
 | 
						|
-	pop		%rbx
 | 
						|
-	ret
 | 
						|
-ENDPROC(poly1305_block_sse2)
 | 
						|
-
 | 
						|
-
 | 
						|
-#define u0 0x00(%r8)
 | 
						|
-#define u1 0x04(%r8)
 | 
						|
-#define u2 0x08(%r8)
 | 
						|
-#define u3 0x0c(%r8)
 | 
						|
-#define u4 0x10(%r8)
 | 
						|
-#define hc0 %xmm0
 | 
						|
-#define hc1 %xmm1
 | 
						|
-#define hc2 %xmm2
 | 
						|
-#define hc3 %xmm5
 | 
						|
-#define hc4 %xmm6
 | 
						|
-#define ru0 %xmm7
 | 
						|
-#define ru1 %xmm8
 | 
						|
-#define ru2 %xmm9
 | 
						|
-#define ru3 %xmm10
 | 
						|
-#define ru4 %xmm11
 | 
						|
-#define sv1 %xmm12
 | 
						|
-#define sv2 %xmm13
 | 
						|
-#define sv3 %xmm14
 | 
						|
-#define sv4 %xmm15
 | 
						|
-#undef d0
 | 
						|
-#define d0 %r13
 | 
						|
-
 | 
						|
-ENTRY(poly1305_2block_sse2)
 | 
						|
-	# %rdi: Accumulator h[5]
 | 
						|
-	# %rsi: 16 byte input block m
 | 
						|
-	# %rdx: Poly1305 key r[5]
 | 
						|
-	# %rcx: Doubleblock count
 | 
						|
-	# %r8:  Poly1305 derived key r^2 u[5]
 | 
						|
-
 | 
						|
-	# This two-block variant further improves performance by using loop
 | 
						|
-	# unrolled block processing. This is more straight forward and does
 | 
						|
-	# less byte shuffling, but requires a second Poly1305 key r^2:
 | 
						|
-	# h = (h + m) * r    =>    h = (h + m1) * r^2 + m2 * r
 | 
						|
-
 | 
						|
-	push		%rbx
 | 
						|
-	push		%r12
 | 
						|
-	push		%r13
 | 
						|
-
 | 
						|
-	# combine r0,u0
 | 
						|
-	movd		u0,ru0
 | 
						|
-	movd		r0,t1
 | 
						|
-	punpcklqdq	t1,ru0
 | 
						|
-
 | 
						|
-	# combine r1,u1 and s1=r1*5,v1=u1*5
 | 
						|
-	movd		u1,ru1
 | 
						|
-	movd		r1,t1
 | 
						|
-	punpcklqdq	t1,ru1
 | 
						|
-	movdqa		ru1,sv1
 | 
						|
-	pslld		$2,sv1
 | 
						|
-	paddd		ru1,sv1
 | 
						|
-
 | 
						|
-	# combine r2,u2 and s2=r2*5,v2=u2*5
 | 
						|
-	movd		u2,ru2
 | 
						|
-	movd		r2,t1
 | 
						|
-	punpcklqdq	t1,ru2
 | 
						|
-	movdqa		ru2,sv2
 | 
						|
-	pslld		$2,sv2
 | 
						|
-	paddd		ru2,sv2
 | 
						|
-
 | 
						|
-	# combine r3,u3 and s3=r3*5,v3=u3*5
 | 
						|
-	movd		u3,ru3
 | 
						|
-	movd		r3,t1
 | 
						|
-	punpcklqdq	t1,ru3
 | 
						|
-	movdqa		ru3,sv3
 | 
						|
-	pslld		$2,sv3
 | 
						|
-	paddd		ru3,sv3
 | 
						|
-
 | 
						|
-	# combine r4,u4 and s4=r4*5,v4=u4*5
 | 
						|
-	movd		u4,ru4
 | 
						|
-	movd		r4,t1
 | 
						|
-	punpcklqdq	t1,ru4
 | 
						|
-	movdqa		ru4,sv4
 | 
						|
-	pslld		$2,sv4
 | 
						|
-	paddd		ru4,sv4
 | 
						|
-
 | 
						|
-.Ldoblock2:
 | 
						|
-	# hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ]
 | 
						|
-	movd		0x00(m),hc0
 | 
						|
-	movd		0x10(m),t1
 | 
						|
-	punpcklqdq	t1,hc0
 | 
						|
-	pand		ANMASK(%rip),hc0
 | 
						|
-	movd		h0,t1
 | 
						|
-	paddd		t1,hc0
 | 
						|
-	# hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ]
 | 
						|
-	movd		0x03(m),hc1
 | 
						|
-	movd		0x13(m),t1
 | 
						|
-	punpcklqdq	t1,hc1
 | 
						|
-	psrld		$2,hc1
 | 
						|
-	pand		ANMASK(%rip),hc1
 | 
						|
-	movd		h1,t1
 | 
						|
-	paddd		t1,hc1
 | 
						|
-	# hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ]
 | 
						|
-	movd		0x06(m),hc2
 | 
						|
-	movd		0x16(m),t1
 | 
						|
-	punpcklqdq	t1,hc2
 | 
						|
-	psrld		$4,hc2
 | 
						|
-	pand		ANMASK(%rip),hc2
 | 
						|
-	movd		h2,t1
 | 
						|
-	paddd		t1,hc2
 | 
						|
-	# hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ]
 | 
						|
-	movd		0x09(m),hc3
 | 
						|
-	movd		0x19(m),t1
 | 
						|
-	punpcklqdq	t1,hc3
 | 
						|
-	psrld		$6,hc3
 | 
						|
-	pand		ANMASK(%rip),hc3
 | 
						|
-	movd		h3,t1
 | 
						|
-	paddd		t1,hc3
 | 
						|
-	# hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ]
 | 
						|
-	movd		0x0c(m),hc4
 | 
						|
-	movd		0x1c(m),t1
 | 
						|
-	punpcklqdq	t1,hc4
 | 
						|
-	psrld		$8,hc4
 | 
						|
-	por		ORMASK(%rip),hc4
 | 
						|
-	movd		h4,t1
 | 
						|
-	paddd		t1,hc4
 | 
						|
-
 | 
						|
-	# t1 = [ hc0[1] * r0, hc0[0] * u0 ]
 | 
						|
-	movdqa		ru0,t1
 | 
						|
-	pmuludq		hc0,t1
 | 
						|
-	# t1 += [ hc1[1] * s4, hc1[0] * v4 ]
 | 
						|
-	movdqa		sv4,t2
 | 
						|
-	pmuludq		hc1,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	# t1 += [ hc2[1] * s3, hc2[0] * v3 ]
 | 
						|
-	movdqa		sv3,t2
 | 
						|
-	pmuludq		hc2,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	# t1 += [ hc3[1] * s2, hc3[0] * v2 ]
 | 
						|
-	movdqa		sv2,t2
 | 
						|
-	pmuludq		hc3,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	# t1 += [ hc4[1] * s1, hc4[0] * v1 ]
 | 
						|
-	movdqa		sv1,t2
 | 
						|
-	pmuludq		hc4,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	# d0 = t1[0] + t1[1]
 | 
						|
-	movdqa		t1,t2
 | 
						|
-	psrldq		$8,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	movq		t1,d0
 | 
						|
-
 | 
						|
-	# t1 = [ hc0[1] * r1, hc0[0] * u1 ]
 | 
						|
-	movdqa		ru1,t1
 | 
						|
-	pmuludq		hc0,t1
 | 
						|
-	# t1 += [ hc1[1] * r0, hc1[0] * u0 ]
 | 
						|
-	movdqa		ru0,t2
 | 
						|
-	pmuludq		hc1,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	# t1 += [ hc2[1] * s4, hc2[0] * v4 ]
 | 
						|
-	movdqa		sv4,t2
 | 
						|
-	pmuludq		hc2,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	# t1 += [ hc3[1] * s3, hc3[0] * v3 ]
 | 
						|
-	movdqa		sv3,t2
 | 
						|
-	pmuludq		hc3,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	# t1 += [ hc4[1] * s2, hc4[0] * v2 ]
 | 
						|
-	movdqa		sv2,t2
 | 
						|
-	pmuludq		hc4,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	# d1 = t1[0] + t1[1]
 | 
						|
-	movdqa		t1,t2
 | 
						|
-	psrldq		$8,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	movq		t1,d1
 | 
						|
-
 | 
						|
-	# t1 = [ hc0[1] * r2, hc0[0] * u2 ]
 | 
						|
-	movdqa		ru2,t1
 | 
						|
-	pmuludq		hc0,t1
 | 
						|
-	# t1 += [ hc1[1] * r1, hc1[0] * u1 ]
 | 
						|
-	movdqa		ru1,t2
 | 
						|
-	pmuludq		hc1,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	# t1 += [ hc2[1] * r0, hc2[0] * u0 ]
 | 
						|
-	movdqa		ru0,t2
 | 
						|
-	pmuludq		hc2,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	# t1 += [ hc3[1] * s4, hc3[0] * v4 ]
 | 
						|
-	movdqa		sv4,t2
 | 
						|
-	pmuludq		hc3,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	# t1 += [ hc4[1] * s3, hc4[0] * v3 ]
 | 
						|
-	movdqa		sv3,t2
 | 
						|
-	pmuludq		hc4,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	# d2 = t1[0] + t1[1]
 | 
						|
-	movdqa		t1,t2
 | 
						|
-	psrldq		$8,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	movq		t1,d2
 | 
						|
-
 | 
						|
-	# t1 = [ hc0[1] * r3, hc0[0] * u3 ]
 | 
						|
-	movdqa		ru3,t1
 | 
						|
-	pmuludq		hc0,t1
 | 
						|
-	# t1 += [ hc1[1] * r2, hc1[0] * u2 ]
 | 
						|
-	movdqa		ru2,t2
 | 
						|
-	pmuludq		hc1,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	# t1 += [ hc2[1] * r1, hc2[0] * u1 ]
 | 
						|
-	movdqa		ru1,t2
 | 
						|
-	pmuludq		hc2,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	# t1 += [ hc3[1] * r0, hc3[0] * u0 ]
 | 
						|
-	movdqa		ru0,t2
 | 
						|
-	pmuludq		hc3,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	# t1 += [ hc4[1] * s4, hc4[0] * v4 ]
 | 
						|
-	movdqa		sv4,t2
 | 
						|
-	pmuludq		hc4,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	# d3 = t1[0] + t1[1]
 | 
						|
-	movdqa		t1,t2
 | 
						|
-	psrldq		$8,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	movq		t1,d3
 | 
						|
-
 | 
						|
-	# t1 = [ hc0[1] * r4, hc0[0] * u4 ]
 | 
						|
-	movdqa		ru4,t1
 | 
						|
-	pmuludq		hc0,t1
 | 
						|
-	# t1 += [ hc1[1] * r3, hc1[0] * u3 ]
 | 
						|
-	movdqa		ru3,t2
 | 
						|
-	pmuludq		hc1,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	# t1 += [ hc2[1] * r2, hc2[0] * u2 ]
 | 
						|
-	movdqa		ru2,t2
 | 
						|
-	pmuludq		hc2,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	# t1 += [ hc3[1] * r1, hc3[0] * u1 ]
 | 
						|
-	movdqa		ru1,t2
 | 
						|
-	pmuludq		hc3,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	# t1 += [ hc4[1] * r0, hc4[0] * u0 ]
 | 
						|
-	movdqa		ru0,t2
 | 
						|
-	pmuludq		hc4,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	# d4 = t1[0] + t1[1]
 | 
						|
-	movdqa		t1,t2
 | 
						|
-	psrldq		$8,t2
 | 
						|
-	paddq		t2,t1
 | 
						|
-	movq		t1,d4
 | 
						|
-
 | 
						|
-	# Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
 | 
						|
-	# h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
 | 
						|
-	# amount.  Careful: we must not assume the carry bits 'd0 >> 26',
 | 
						|
-	# 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
 | 
						|
-	# integers.  It's true in a single-block implementation, but not here.
 | 
						|
-
 | 
						|
-	# d1 += d0 >> 26
 | 
						|
-	mov		d0,%rax
 | 
						|
-	shr		$26,%rax
 | 
						|
-	add		%rax,d1
 | 
						|
-	# h0 = d0 & 0x3ffffff
 | 
						|
-	mov		d0,%rbx
 | 
						|
-	and		$0x3ffffff,%ebx
 | 
						|
-
 | 
						|
-	# d2 += d1 >> 26
 | 
						|
-	mov		d1,%rax
 | 
						|
-	shr		$26,%rax
 | 
						|
-	add		%rax,d2
 | 
						|
-	# h1 = d1 & 0x3ffffff
 | 
						|
-	mov		d1,%rax
 | 
						|
-	and		$0x3ffffff,%eax
 | 
						|
-	mov		%eax,h1
 | 
						|
-
 | 
						|
-	# d3 += d2 >> 26
 | 
						|
-	mov		d2,%rax
 | 
						|
-	shr		$26,%rax
 | 
						|
-	add		%rax,d3
 | 
						|
-	# h2 = d2 & 0x3ffffff
 | 
						|
-	mov		d2,%rax
 | 
						|
-	and		$0x3ffffff,%eax
 | 
						|
-	mov		%eax,h2
 | 
						|
-
 | 
						|
-	# d4 += d3 >> 26
 | 
						|
-	mov		d3,%rax
 | 
						|
-	shr		$26,%rax
 | 
						|
-	add		%rax,d4
 | 
						|
-	# h3 = d3 & 0x3ffffff
 | 
						|
-	mov		d3,%rax
 | 
						|
-	and		$0x3ffffff,%eax
 | 
						|
-	mov		%eax,h3
 | 
						|
-
 | 
						|
-	# h0 += (d4 >> 26) * 5
 | 
						|
-	mov		d4,%rax
 | 
						|
-	shr		$26,%rax
 | 
						|
-	lea		(%rax,%rax,4),%rax
 | 
						|
-	add		%rax,%rbx
 | 
						|
-	# h4 = d4 & 0x3ffffff
 | 
						|
-	mov		d4,%rax
 | 
						|
-	and		$0x3ffffff,%eax
 | 
						|
-	mov		%eax,h4
 | 
						|
-
 | 
						|
-	# h1 += h0 >> 26
 | 
						|
-	mov		%rbx,%rax
 | 
						|
-	shr		$26,%rax
 | 
						|
-	add		%eax,h1
 | 
						|
-	# h0 = h0 & 0x3ffffff
 | 
						|
-	andl		$0x3ffffff,%ebx
 | 
						|
-	mov		%ebx,h0
 | 
						|
-
 | 
						|
-	add		$0x20,m
 | 
						|
-	dec		%rcx
 | 
						|
-	jnz		.Ldoblock2
 | 
						|
-
 | 
						|
-	pop		%r13
 | 
						|
-	pop		%r12
 | 
						|
-	pop		%rbx
 | 
						|
-	ret
 | 
						|
-ENDPROC(poly1305_2block_sse2)
 | 
						|
--- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
 | 
						|
+++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
 | 
						|
@@ -1,11 +1,14 @@
 | 
						|
-#! /usr/bin/env perl
 | 
						|
-# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
 | 
						|
+#!/usr/bin/env perl
 | 
						|
+# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 | 
						|
 #
 | 
						|
-# Licensed under the OpenSSL license (the "License").  You may not use
 | 
						|
-# this file except in compliance with the License.  You can obtain a copy
 | 
						|
-# in the file LICENSE in the source distribution or at
 | 
						|
-# https://www.openssl.org/source/license.html
 | 
						|
-
 | 
						|
+# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 | 
						|
+# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 | 
						|
+# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
 | 
						|
+#
 | 
						|
+# This code is taken from the OpenSSL project but the author, Andy Polyakov,
 | 
						|
+# has relicensed it under the licenses specified in the SPDX header above.
 | 
						|
+# The original headers, including the original license headers, are
 | 
						|
+# included below for completeness.
 | 
						|
 #
 | 
						|
 # ====================================================================
 | 
						|
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 | 
						|
@@ -32,7 +35,7 @@
 | 
						|
 # Skylake-X system performance. Since we are likely to suppress
 | 
						|
 # AVX512F capability flag [at least on Skylake-X], conversion serves
 | 
						|
 # as kind of "investment protection". Note that next *lake processor,
 | 
						|
-# Cannolake, has AVX512IFMA code path to execute...
 | 
						|
+# Cannonlake, has AVX512IFMA code path to execute...
 | 
						|
 #
 | 
						|
 # Numbers are cycles per processed byte with poly1305_blocks alone,
 | 
						|
 # measured with rdtsc at fixed clock frequency.
 | 
						|
@@ -68,39 +71,114 @@ $output  = shift;
 | 
						|
 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
 | 
						|
 
 | 
						|
 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
 | 
						|
+$kernel=0; $kernel=1 if (!$flavour && !$output);
 | 
						|
 
 | 
						|
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 | 
						|
-( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
 | 
						|
-( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 | 
						|
-die "can't locate x86_64-xlate.pl";
 | 
						|
-
 | 
						|
-if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
 | 
						|
-		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
 | 
						|
-	$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
 | 
						|
+if (!$kernel) {
 | 
						|
+	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 | 
						|
+	( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
 | 
						|
+	( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 | 
						|
+	die "can't locate x86_64-xlate.pl";
 | 
						|
+
 | 
						|
+	open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
 | 
						|
+	*STDOUT=*OUT;
 | 
						|
+
 | 
						|
+	if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
 | 
						|
+	    =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
 | 
						|
+		$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
 | 
						|
+	}
 | 
						|
+
 | 
						|
+	if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
 | 
						|
+	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
 | 
						|
+		$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
 | 
						|
+		$avx += 1 if ($1==2.11 && $2>=8);
 | 
						|
+	}
 | 
						|
+
 | 
						|
+	if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
 | 
						|
+	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
 | 
						|
+		$avx = ($1>=10) + ($1>=11);
 | 
						|
+	}
 | 
						|
+
 | 
						|
+	if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
 | 
						|
+		$avx = ($2>=3.0) + ($2>3.0);
 | 
						|
+	}
 | 
						|
+} else {
 | 
						|
+	$avx = 4; # The kernel uses ifdefs for this.
 | 
						|
 }
 | 
						|
 
 | 
						|
-if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
 | 
						|
-	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
 | 
						|
-	$avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
 | 
						|
-	$avx += 2 if ($1==2.11 && $2>=8);
 | 
						|
+sub declare_function() {
 | 
						|
+	my ($name, $align, $nargs) = @_;
 | 
						|
+	if($kernel) {
 | 
						|
+		$code .= ".align $align\n";
 | 
						|
+		$code .= "ENTRY($name)\n";
 | 
						|
+		$code .= ".L$name:\n";
 | 
						|
+	} else {
 | 
						|
+		$code .= ".globl	$name\n";
 | 
						|
+		$code .= ".type	$name,\@function,$nargs\n";
 | 
						|
+		$code .= ".align	$align\n";
 | 
						|
+		$code .= "$name:\n";
 | 
						|
+	}
 | 
						|
 }
 | 
						|
 
 | 
						|
-if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
 | 
						|
-	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
 | 
						|
-	$avx = ($1>=10) + ($1>=12);
 | 
						|
+sub end_function() {
 | 
						|
+	my ($name) = @_;
 | 
						|
+	if($kernel) {
 | 
						|
+		$code .= "ENDPROC($name)\n";
 | 
						|
+	} else {
 | 
						|
+		$code .= ".size   $name,.-$name\n";
 | 
						|
+	}
 | 
						|
 }
 | 
						|
 
 | 
						|
-if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
 | 
						|
-	$avx = ($2>=3.0) + ($2>3.0);
 | 
						|
-}
 | 
						|
+$code.=<<___ if $kernel;
 | 
						|
+#include <linux/linkage.h>
 | 
						|
+___
 | 
						|
 
 | 
						|
-open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
 | 
						|
-*STDOUT=*OUT;
 | 
						|
+if ($avx) {
 | 
						|
+$code.=<<___ if $kernel;
 | 
						|
+.section .rodata
 | 
						|
+___
 | 
						|
+$code.=<<___;
 | 
						|
+.align	64
 | 
						|
+.Lconst:
 | 
						|
+.Lmask24:
 | 
						|
+.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
 | 
						|
+.L129:
 | 
						|
+.long	`1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
 | 
						|
+.Lmask26:
 | 
						|
+.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
 | 
						|
+.Lpermd_avx2:
 | 
						|
+.long	2,2,2,3,2,0,2,1
 | 
						|
+.Lpermd_avx512:
 | 
						|
+.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
 | 
						|
+
 | 
						|
+.L2_44_inp_permd:
 | 
						|
+.long	0,1,1,2,2,3,7,7
 | 
						|
+.L2_44_inp_shift:
 | 
						|
+.quad	0,12,24,64
 | 
						|
+.L2_44_mask:
 | 
						|
+.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
 | 
						|
+.L2_44_shift_rgt:
 | 
						|
+.quad	44,44,42,64
 | 
						|
+.L2_44_shift_lft:
 | 
						|
+.quad	8,8,10,64
 | 
						|
+
 | 
						|
+.align	64
 | 
						|
+.Lx_mask44:
 | 
						|
+.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
 | 
						|
+.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
 | 
						|
+.Lx_mask42:
 | 
						|
+.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
 | 
						|
+.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
 | 
						|
+___
 | 
						|
+}
 | 
						|
+$code.=<<___ if (!$kernel);
 | 
						|
+.asciz	"Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 | 
						|
+.align	16
 | 
						|
+___
 | 
						|
 
 | 
						|
 my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
 | 
						|
 my ($mac,$nonce)=($inp,$len);	# *_emit arguments
 | 
						|
-my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
 | 
						|
-my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
 | 
						|
+my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
 | 
						|
+my ($h0,$h1,$h2)=("%r14","%rbx","%r10");
 | 
						|
 
 | 
						|
 sub poly1305_iteration {
 | 
						|
 # input:	copy of $r1 in %rax, $h0-$h2, $r0-$r1
 | 
						|
@@ -155,19 +233,19 @@ ___
 | 
						|
 
 | 
						|
 $code.=<<___;
 | 
						|
 .text
 | 
						|
-
 | 
						|
+___
 | 
						|
+$code.=<<___ if (!$kernel);
 | 
						|
 .extern	OPENSSL_ia32cap_P
 | 
						|
 
 | 
						|
-.globl	poly1305_init
 | 
						|
-.hidden	poly1305_init
 | 
						|
-.globl	poly1305_blocks
 | 
						|
-.hidden	poly1305_blocks
 | 
						|
-.globl	poly1305_emit
 | 
						|
-.hidden	poly1305_emit
 | 
						|
-
 | 
						|
-.type	poly1305_init,\@function,3
 | 
						|
-.align	32
 | 
						|
-poly1305_init:
 | 
						|
+.globl	poly1305_init_x86_64
 | 
						|
+.hidden	poly1305_init_x86_64
 | 
						|
+.globl	poly1305_blocks_x86_64
 | 
						|
+.hidden	poly1305_blocks_x86_64
 | 
						|
+.globl	poly1305_emit_x86_64
 | 
						|
+.hidden	poly1305_emit_x86_64
 | 
						|
+___
 | 
						|
+&declare_function("poly1305_init_x86_64", 32, 3);
 | 
						|
+$code.=<<___;
 | 
						|
 	xor	%rax,%rax
 | 
						|
 	mov	%rax,0($ctx)		# initialize hash value
 | 
						|
 	mov	%rax,8($ctx)
 | 
						|
@@ -175,11 +253,12 @@ poly1305_init:
 | 
						|
 
 | 
						|
 	cmp	\$0,$inp
 | 
						|
 	je	.Lno_key
 | 
						|
-
 | 
						|
-	lea	poly1305_blocks(%rip),%r10
 | 
						|
-	lea	poly1305_emit(%rip),%r11
 | 
						|
 ___
 | 
						|
-$code.=<<___	if ($avx);
 | 
						|
+$code.=<<___ if (!$kernel);
 | 
						|
+	lea	poly1305_blocks_x86_64(%rip),%r10
 | 
						|
+	lea	poly1305_emit_x86_64(%rip),%r11
 | 
						|
+___
 | 
						|
+$code.=<<___	if (!$kernel && $avx);
 | 
						|
 	mov	OPENSSL_ia32cap_P+4(%rip),%r9
 | 
						|
 	lea	poly1305_blocks_avx(%rip),%rax
 | 
						|
 	lea	poly1305_emit_avx(%rip),%rcx
 | 
						|
@@ -187,12 +266,12 @@ $code.=<<___	if ($avx);
 | 
						|
 	cmovc	%rax,%r10
 | 
						|
 	cmovc	%rcx,%r11
 | 
						|
 ___
 | 
						|
-$code.=<<___	if ($avx>1);
 | 
						|
+$code.=<<___	if (!$kernel && $avx>1);
 | 
						|
 	lea	poly1305_blocks_avx2(%rip),%rax
 | 
						|
 	bt	\$`5+32`,%r9		# AVX2?
 | 
						|
 	cmovc	%rax,%r10
 | 
						|
 ___
 | 
						|
-$code.=<<___	if ($avx>3);
 | 
						|
+$code.=<<___	if (!$kernel && $avx>3);
 | 
						|
 	mov	\$`(1<<31|1<<21|1<<16)`,%rax
 | 
						|
 	shr	\$32,%r9
 | 
						|
 	and	%rax,%r9
 | 
						|
@@ -207,11 +286,11 @@ $code.=<<___;
 | 
						|
 	mov	%rax,24($ctx)
 | 
						|
 	mov	%rcx,32($ctx)
 | 
						|
 ___
 | 
						|
-$code.=<<___	if ($flavour !~ /elf32/);
 | 
						|
+$code.=<<___	if (!$kernel && $flavour !~ /elf32/);
 | 
						|
 	mov	%r10,0(%rdx)
 | 
						|
 	mov	%r11,8(%rdx)
 | 
						|
 ___
 | 
						|
-$code.=<<___	if ($flavour =~ /elf32/);
 | 
						|
+$code.=<<___	if (!$kernel && $flavour =~ /elf32/);
 | 
						|
 	mov	%r10d,0(%rdx)
 | 
						|
 	mov	%r11d,4(%rdx)
 | 
						|
 ___
 | 
						|
@@ -219,11 +298,11 @@ $code.=<<___;
 | 
						|
 	mov	\$1,%eax
 | 
						|
 .Lno_key:
 | 
						|
 	ret
 | 
						|
-.size	poly1305_init,.-poly1305_init
 | 
						|
+___
 | 
						|
+&end_function("poly1305_init_x86_64");
 | 
						|
 
 | 
						|
-.type	poly1305_blocks,\@function,4
 | 
						|
-.align	32
 | 
						|
-poly1305_blocks:
 | 
						|
+&declare_function("poly1305_blocks_x86_64", 32, 4);
 | 
						|
+$code.=<<___;
 | 
						|
 .cfi_startproc
 | 
						|
 .Lblocks:
 | 
						|
 	shr	\$4,$len
 | 
						|
@@ -231,8 +310,6 @@ poly1305_blocks:
 | 
						|
 
 | 
						|
 	push	%rbx
 | 
						|
 .cfi_push	%rbx
 | 
						|
-	push	%rbp
 | 
						|
-.cfi_push	%rbp
 | 
						|
 	push	%r12
 | 
						|
 .cfi_push	%r12
 | 
						|
 	push	%r13
 | 
						|
@@ -241,6 +318,8 @@ poly1305_blocks:
 | 
						|
 .cfi_push	%r14
 | 
						|
 	push	%r15
 | 
						|
 .cfi_push	%r15
 | 
						|
+	push	$ctx
 | 
						|
+.cfi_push	$ctx
 | 
						|
 .Lblocks_body:
 | 
						|
 
 | 
						|
 	mov	$len,%r15		# reassign $len
 | 
						|
@@ -265,26 +344,29 @@ poly1305_blocks:
 | 
						|
 	lea	16($inp),$inp
 | 
						|
 	adc	$padbit,$h2
 | 
						|
 ___
 | 
						|
+
 | 
						|
 	&poly1305_iteration();
 | 
						|
+
 | 
						|
 $code.=<<___;
 | 
						|
 	mov	$r1,%rax
 | 
						|
 	dec	%r15			# len-=16
 | 
						|
 	jnz	.Loop
 | 
						|
 
 | 
						|
+	mov	0(%rsp),$ctx
 | 
						|
+.cfi_restore	$ctx
 | 
						|
+
 | 
						|
 	mov	$h0,0($ctx)		# store hash value
 | 
						|
 	mov	$h1,8($ctx)
 | 
						|
 	mov	$h2,16($ctx)
 | 
						|
 
 | 
						|
-	mov	0(%rsp),%r15
 | 
						|
+	mov	8(%rsp),%r15
 | 
						|
 .cfi_restore	%r15
 | 
						|
-	mov	8(%rsp),%r14
 | 
						|
+	mov	16(%rsp),%r14
 | 
						|
 .cfi_restore	%r14
 | 
						|
-	mov	16(%rsp),%r13
 | 
						|
+	mov	24(%rsp),%r13
 | 
						|
 .cfi_restore	%r13
 | 
						|
-	mov	24(%rsp),%r12
 | 
						|
+	mov	32(%rsp),%r12
 | 
						|
 .cfi_restore	%r12
 | 
						|
-	mov	32(%rsp),%rbp
 | 
						|
-.cfi_restore	%rbp
 | 
						|
 	mov	40(%rsp),%rbx
 | 
						|
 .cfi_restore	%rbx
 | 
						|
 	lea	48(%rsp),%rsp
 | 
						|
@@ -293,11 +375,11 @@ $code.=<<___;
 | 
						|
 .Lblocks_epilogue:
 | 
						|
 	ret
 | 
						|
 .cfi_endproc
 | 
						|
-.size	poly1305_blocks,.-poly1305_blocks
 | 
						|
+___
 | 
						|
+&end_function("poly1305_blocks_x86_64");
 | 
						|
 
 | 
						|
-.type	poly1305_emit,\@function,3
 | 
						|
-.align	32
 | 
						|
-poly1305_emit:
 | 
						|
+&declare_function("poly1305_emit_x86_64", 32, 3);
 | 
						|
+$code.=<<___;
 | 
						|
 .Lemit:
 | 
						|
 	mov	0($ctx),%r8	# load hash value
 | 
						|
 	mov	8($ctx),%r9
 | 
						|
@@ -318,10 +400,14 @@ poly1305_emit:
 | 
						|
 	mov	%rcx,8($mac)
 | 
						|
 
 | 
						|
 	ret
 | 
						|
-.size	poly1305_emit,.-poly1305_emit
 | 
						|
 ___
 | 
						|
+&end_function("poly1305_emit_x86_64");
 | 
						|
 if ($avx) {
 | 
						|
 
 | 
						|
+if($kernel) {
 | 
						|
+	$code .= "#ifdef CONFIG_AS_AVX\n";
 | 
						|
+}
 | 
						|
+
 | 
						|
 ########################################################################
 | 
						|
 # Layout of opaque area is following.
 | 
						|
 #
 | 
						|
@@ -342,15 +428,19 @@ $code.=<<___;
 | 
						|
 .type	__poly1305_block,\@abi-omnipotent
 | 
						|
 .align	32
 | 
						|
 __poly1305_block:
 | 
						|
+	push $ctx
 | 
						|
 ___
 | 
						|
 	&poly1305_iteration();
 | 
						|
 $code.=<<___;
 | 
						|
+	pop $ctx
 | 
						|
 	ret
 | 
						|
 .size	__poly1305_block,.-__poly1305_block
 | 
						|
 
 | 
						|
 .type	__poly1305_init_avx,\@abi-omnipotent
 | 
						|
 .align	32
 | 
						|
 __poly1305_init_avx:
 | 
						|
+	push %rbp
 | 
						|
+	mov %rsp,%rbp
 | 
						|
 	mov	$r0,$h0
 | 
						|
 	mov	$r1,$h1
 | 
						|
 	xor	$h2,$h2
 | 
						|
@@ -507,12 +597,13 @@ __poly1305_init_avx:
 | 
						|
 	mov	$d1#d,`16*8+8-64`($ctx)
 | 
						|
 
 | 
						|
 	lea	-48-64($ctx),$ctx	# size [de-]optimization
 | 
						|
+	pop %rbp
 | 
						|
 	ret
 | 
						|
 .size	__poly1305_init_avx,.-__poly1305_init_avx
 | 
						|
+___
 | 
						|
 
 | 
						|
-.type	poly1305_blocks_avx,\@function,4
 | 
						|
-.align	32
 | 
						|
-poly1305_blocks_avx:
 | 
						|
+&declare_function("poly1305_blocks_avx", 32, 4);
 | 
						|
+$code.=<<___;
 | 
						|
 .cfi_startproc
 | 
						|
 	mov	20($ctx),%r8d		# is_base2_26
 | 
						|
 	cmp	\$128,$len
 | 
						|
@@ -532,10 +623,11 @@ poly1305_blocks_avx:
 | 
						|
 	test	\$31,$len
 | 
						|
 	jz	.Leven_avx
 | 
						|
 
 | 
						|
-	push	%rbx
 | 
						|
-.cfi_push	%rbx
 | 
						|
 	push	%rbp
 | 
						|
 .cfi_push	%rbp
 | 
						|
+	mov 	%rsp,%rbp
 | 
						|
+	push	%rbx
 | 
						|
+.cfi_push	%rbx
 | 
						|
 	push	%r12
 | 
						|
 .cfi_push	%r12
 | 
						|
 	push	%r13
 | 
						|
@@ -645,20 +737,18 @@ poly1305_blocks_avx:
 | 
						|
 	mov	$h2#d,16($ctx)
 | 
						|
 .align	16
 | 
						|
 .Ldone_avx:
 | 
						|
-	mov	0(%rsp),%r15
 | 
						|
+	pop 		%r15
 | 
						|
 .cfi_restore	%r15
 | 
						|
-	mov	8(%rsp),%r14
 | 
						|
+	pop 		%r14
 | 
						|
 .cfi_restore	%r14
 | 
						|
-	mov	16(%rsp),%r13
 | 
						|
+	pop 		%r13
 | 
						|
 .cfi_restore	%r13
 | 
						|
-	mov	24(%rsp),%r12
 | 
						|
+	pop 		%r12
 | 
						|
 .cfi_restore	%r12
 | 
						|
-	mov	32(%rsp),%rbp
 | 
						|
-.cfi_restore	%rbp
 | 
						|
-	mov	40(%rsp),%rbx
 | 
						|
+	pop 		%rbx
 | 
						|
 .cfi_restore	%rbx
 | 
						|
-	lea	48(%rsp),%rsp
 | 
						|
-.cfi_adjust_cfa_offset	-48
 | 
						|
+	pop 		%rbp
 | 
						|
+.cfi_restore	%rbp
 | 
						|
 .Lno_data_avx:
 | 
						|
 .Lblocks_avx_epilogue:
 | 
						|
 	ret
 | 
						|
@@ -667,10 +757,11 @@ poly1305_blocks_avx:
 | 
						|
 .align	32
 | 
						|
 .Lbase2_64_avx:
 | 
						|
 .cfi_startproc
 | 
						|
-	push	%rbx
 | 
						|
-.cfi_push	%rbx
 | 
						|
 	push	%rbp
 | 
						|
 .cfi_push	%rbp
 | 
						|
+	mov 	%rsp,%rbp
 | 
						|
+	push	%rbx
 | 
						|
+.cfi_push	%rbx
 | 
						|
 	push	%r12
 | 
						|
 .cfi_push	%r12
 | 
						|
 	push	%r13
 | 
						|
@@ -736,22 +827,18 @@ poly1305_blocks_avx:
 | 
						|
 
 | 
						|
 .Lproceed_avx:
 | 
						|
 	mov	%r15,$len
 | 
						|
-
 | 
						|
-	mov	0(%rsp),%r15
 | 
						|
+	pop 		%r15
 | 
						|
 .cfi_restore	%r15
 | 
						|
-	mov	8(%rsp),%r14
 | 
						|
+	pop 		%r14
 | 
						|
 .cfi_restore	%r14
 | 
						|
-	mov	16(%rsp),%r13
 | 
						|
+	pop 		%r13
 | 
						|
 .cfi_restore	%r13
 | 
						|
-	mov	24(%rsp),%r12
 | 
						|
+	pop 		%r12
 | 
						|
 .cfi_restore	%r12
 | 
						|
-	mov	32(%rsp),%rbp
 | 
						|
-.cfi_restore	%rbp
 | 
						|
-	mov	40(%rsp),%rbx
 | 
						|
+	pop 		%rbx
 | 
						|
 .cfi_restore	%rbx
 | 
						|
-	lea	48(%rsp),%rax
 | 
						|
-	lea	48(%rsp),%rsp
 | 
						|
-.cfi_adjust_cfa_offset	-48
 | 
						|
+	pop 		%rbp
 | 
						|
+.cfi_restore	%rbp
 | 
						|
 .Lbase2_64_avx_epilogue:
 | 
						|
 	jmp	.Ldo_avx
 | 
						|
 .cfi_endproc
 | 
						|
@@ -768,8 +855,11 @@ poly1305_blocks_avx:
 | 
						|
 .Ldo_avx:
 | 
						|
 ___
 | 
						|
 $code.=<<___	if (!$win64);
 | 
						|
+	lea		8(%rsp),%r10
 | 
						|
+.cfi_def_cfa_register	%r10
 | 
						|
+	and		\$-32,%rsp
 | 
						|
+	sub		\$-8,%rsp
 | 
						|
 	lea		-0x58(%rsp),%r11
 | 
						|
-.cfi_def_cfa		%r11,0x60
 | 
						|
 	sub		\$0x178,%rsp
 | 
						|
 ___
 | 
						|
 $code.=<<___	if ($win64);
 | 
						|
@@ -1361,18 +1451,18 @@ $code.=<<___	if ($win64);
 | 
						|
 .Ldo_avx_epilogue:
 | 
						|
 ___
 | 
						|
 $code.=<<___	if (!$win64);
 | 
						|
-	lea		0x58(%r11),%rsp
 | 
						|
-.cfi_def_cfa		%rsp,8
 | 
						|
+	lea		-8(%r10),%rsp
 | 
						|
+.cfi_def_cfa_register	%rsp
 | 
						|
 ___
 | 
						|
 $code.=<<___;
 | 
						|
 	vzeroupper
 | 
						|
 	ret
 | 
						|
 .cfi_endproc
 | 
						|
-.size	poly1305_blocks_avx,.-poly1305_blocks_avx
 | 
						|
+___
 | 
						|
+&end_function("poly1305_blocks_avx");
 | 
						|
 
 | 
						|
-.type	poly1305_emit_avx,\@function,3
 | 
						|
-.align	32
 | 
						|
-poly1305_emit_avx:
 | 
						|
+&declare_function("poly1305_emit_avx", 32, 3);
 | 
						|
+$code.=<<___;
 | 
						|
 	cmpl	\$0,20($ctx)	# is_base2_26?
 | 
						|
 	je	.Lemit
 | 
						|
 
 | 
						|
@@ -1423,41 +1513,51 @@ poly1305_emit_avx:
 | 
						|
 	mov	%rcx,8($mac)
 | 
						|
 
 | 
						|
 	ret
 | 
						|
-.size	poly1305_emit_avx,.-poly1305_emit_avx
 | 
						|
 ___
 | 
						|
+&end_function("poly1305_emit_avx");
 | 
						|
+
 | 
						|
+if ($kernel) {
 | 
						|
+	$code .= "#endif\n";
 | 
						|
+}
 | 
						|
 
 | 
						|
 if ($avx>1) {
 | 
						|
+
 | 
						|
+if ($kernel) {
 | 
						|
+	$code .= "#ifdef CONFIG_AS_AVX2\n";
 | 
						|
+}
 | 
						|
+
 | 
						|
 my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
 | 
						|
     map("%ymm$_",(0..15));
 | 
						|
 my $S4=$MASK;
 | 
						|
 
 | 
						|
+sub poly1305_blocks_avxN {
 | 
						|
+	my ($avx512) = @_;
 | 
						|
+	my $suffix = $avx512 ? "_avx512" : "";
 | 
						|
 $code.=<<___;
 | 
						|
-.type	poly1305_blocks_avx2,\@function,4
 | 
						|
-.align	32
 | 
						|
-poly1305_blocks_avx2:
 | 
						|
 .cfi_startproc
 | 
						|
 	mov	20($ctx),%r8d		# is_base2_26
 | 
						|
 	cmp	\$128,$len
 | 
						|
-	jae	.Lblocks_avx2
 | 
						|
+	jae	.Lblocks_avx2$suffix
 | 
						|
 	test	%r8d,%r8d
 | 
						|
 	jz	.Lblocks
 | 
						|
 
 | 
						|
-.Lblocks_avx2:
 | 
						|
+.Lblocks_avx2$suffix:
 | 
						|
 	and	\$-16,$len
 | 
						|
-	jz	.Lno_data_avx2
 | 
						|
+	jz	.Lno_data_avx2$suffix
 | 
						|
 
 | 
						|
 	vzeroupper
 | 
						|
 
 | 
						|
 	test	%r8d,%r8d
 | 
						|
-	jz	.Lbase2_64_avx2
 | 
						|
+	jz	.Lbase2_64_avx2$suffix
 | 
						|
 
 | 
						|
 	test	\$63,$len
 | 
						|
-	jz	.Leven_avx2
 | 
						|
+	jz	.Leven_avx2$suffix
 | 
						|
 
 | 
						|
-	push	%rbx
 | 
						|
-.cfi_push	%rbx
 | 
						|
 	push	%rbp
 | 
						|
 .cfi_push	%rbp
 | 
						|
+	mov 	%rsp,%rbp
 | 
						|
+	push	%rbx
 | 
						|
+.cfi_push	%rbx
 | 
						|
 	push	%r12
 | 
						|
 .cfi_push	%r12
 | 
						|
 	push	%r13
 | 
						|
@@ -1466,7 +1566,7 @@ poly1305_blocks_avx2:
 | 
						|
 .cfi_push	%r14
 | 
						|
 	push	%r15
 | 
						|
 .cfi_push	%r15
 | 
						|
-.Lblocks_avx2_body:
 | 
						|
+.Lblocks_avx2_body$suffix:
 | 
						|
 
 | 
						|
 	mov	$len,%r15		# reassign $len
 | 
						|
 
 | 
						|
@@ -1513,7 +1613,7 @@ poly1305_blocks_avx2:
 | 
						|
 	shr	\$2,$s1
 | 
						|
 	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
 | 
						|
 
 | 
						|
-.Lbase2_26_pre_avx2:
 | 
						|
+.Lbase2_26_pre_avx2$suffix:
 | 
						|
 	add	0($inp),$h0		# accumulate input
 | 
						|
 	adc	8($inp),$h1
 | 
						|
 	lea	16($inp),$inp
 | 
						|
@@ -1524,10 +1624,10 @@ poly1305_blocks_avx2:
 | 
						|
 	mov	$r1,%rax
 | 
						|
 
 | 
						|
 	test	\$63,%r15
 | 
						|
-	jnz	.Lbase2_26_pre_avx2
 | 
						|
+	jnz	.Lbase2_26_pre_avx2$suffix
 | 
						|
 
 | 
						|
 	test	$padbit,$padbit		# if $padbit is zero,
 | 
						|
-	jz	.Lstore_base2_64_avx2	# store hash in base 2^64 format
 | 
						|
+	jz	.Lstore_base2_64_avx2$suffix	# store hash in base 2^64 format
 | 
						|
 
 | 
						|
 	################################# base 2^64 -> base 2^26
 | 
						|
 	mov	$h0,%rax
 | 
						|
@@ -1548,57 +1648,56 @@ poly1305_blocks_avx2:
 | 
						|
 	or	$r1,$h2			# h[4]
 | 
						|
 
 | 
						|
 	test	%r15,%r15
 | 
						|
-	jz	.Lstore_base2_26_avx2
 | 
						|
+	jz	.Lstore_base2_26_avx2$suffix
 | 
						|
 
 | 
						|
 	vmovd	%rax#d,%x#$H0
 | 
						|
 	vmovd	%rdx#d,%x#$H1
 | 
						|
 	vmovd	$h0#d,%x#$H2
 | 
						|
 	vmovd	$h1#d,%x#$H3
 | 
						|
 	vmovd	$h2#d,%x#$H4
 | 
						|
-	jmp	.Lproceed_avx2
 | 
						|
+	jmp	.Lproceed_avx2$suffix
 | 
						|
 
 | 
						|
 .align	32
 | 
						|
-.Lstore_base2_64_avx2:
 | 
						|
+.Lstore_base2_64_avx2$suffix:
 | 
						|
 	mov	$h0,0($ctx)
 | 
						|
 	mov	$h1,8($ctx)
 | 
						|
 	mov	$h2,16($ctx)		# note that is_base2_26 is zeroed
 | 
						|
-	jmp	.Ldone_avx2
 | 
						|
+	jmp	.Ldone_avx2$suffix
 | 
						|
 
 | 
						|
 .align	16
 | 
						|
-.Lstore_base2_26_avx2:
 | 
						|
+.Lstore_base2_26_avx2$suffix:
 | 
						|
 	mov	%rax#d,0($ctx)		# store hash value base 2^26
 | 
						|
 	mov	%rdx#d,4($ctx)
 | 
						|
 	mov	$h0#d,8($ctx)
 | 
						|
 	mov	$h1#d,12($ctx)
 | 
						|
 	mov	$h2#d,16($ctx)
 | 
						|
 .align	16
 | 
						|
-.Ldone_avx2:
 | 
						|
-	mov	0(%rsp),%r15
 | 
						|
+.Ldone_avx2$suffix:
 | 
						|
+	pop 		%r15
 | 
						|
 .cfi_restore	%r15
 | 
						|
-	mov	8(%rsp),%r14
 | 
						|
+	pop 		%r14
 | 
						|
 .cfi_restore	%r14
 | 
						|
-	mov	16(%rsp),%r13
 | 
						|
+	pop 		%r13
 | 
						|
 .cfi_restore	%r13
 | 
						|
-	mov	24(%rsp),%r12
 | 
						|
+	pop 		%r12
 | 
						|
 .cfi_restore	%r12
 | 
						|
-	mov	32(%rsp),%rbp
 | 
						|
-.cfi_restore	%rbp
 | 
						|
-	mov	40(%rsp),%rbx
 | 
						|
+	pop 		%rbx
 | 
						|
 .cfi_restore	%rbx
 | 
						|
-	lea	48(%rsp),%rsp
 | 
						|
-.cfi_adjust_cfa_offset	-48
 | 
						|
-.Lno_data_avx2:
 | 
						|
-.Lblocks_avx2_epilogue:
 | 
						|
+	pop 		%rbp
 | 
						|
+.cfi_restore 	%rbp
 | 
						|
+.Lno_data_avx2$suffix:
 | 
						|
+.Lblocks_avx2_epilogue$suffix:
 | 
						|
 	ret
 | 
						|
 .cfi_endproc
 | 
						|
 
 | 
						|
 .align	32
 | 
						|
-.Lbase2_64_avx2:
 | 
						|
+.Lbase2_64_avx2$suffix:
 | 
						|
 .cfi_startproc
 | 
						|
-	push	%rbx
 | 
						|
-.cfi_push	%rbx
 | 
						|
 	push	%rbp
 | 
						|
 .cfi_push	%rbp
 | 
						|
+	mov 	%rsp,%rbp
 | 
						|
+	push	%rbx
 | 
						|
+.cfi_push	%rbx
 | 
						|
 	push	%r12
 | 
						|
 .cfi_push	%r12
 | 
						|
 	push	%r13
 | 
						|
@@ -1607,7 +1706,7 @@ poly1305_blocks_avx2:
 | 
						|
 .cfi_push	%r14
 | 
						|
 	push	%r15
 | 
						|
 .cfi_push	%r15
 | 
						|
-.Lbase2_64_avx2_body:
 | 
						|
+.Lbase2_64_avx2_body$suffix:
 | 
						|
 
 | 
						|
 	mov	$len,%r15		# reassign $len
 | 
						|
 
 | 
						|
@@ -1624,9 +1723,9 @@ poly1305_blocks_avx2:
 | 
						|
 	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
 | 
						|
 
 | 
						|
 	test	\$63,$len
 | 
						|
-	jz	.Linit_avx2
 | 
						|
+	jz	.Linit_avx2$suffix
 | 
						|
 
 | 
						|
-.Lbase2_64_pre_avx2:
 | 
						|
+.Lbase2_64_pre_avx2$suffix:
 | 
						|
 	add	0($inp),$h0		# accumulate input
 | 
						|
 	adc	8($inp),$h1
 | 
						|
 	lea	16($inp),$inp
 | 
						|
@@ -1637,9 +1736,9 @@ poly1305_blocks_avx2:
 | 
						|
 	mov	$r1,%rax
 | 
						|
 
 | 
						|
 	test	\$63,%r15
 | 
						|
-	jnz	.Lbase2_64_pre_avx2
 | 
						|
+	jnz	.Lbase2_64_pre_avx2$suffix
 | 
						|
 
 | 
						|
-.Linit_avx2:
 | 
						|
+.Linit_avx2$suffix:
 | 
						|
 	################################# base 2^64 -> base 2^26
 | 
						|
 	mov	$h0,%rax
 | 
						|
 	mov	$h0,%rdx
 | 
						|
@@ -1667,69 +1766,77 @@ poly1305_blocks_avx2:
 | 
						|
 
 | 
						|
 	call	__poly1305_init_avx
 | 
						|
 
 | 
						|
-.Lproceed_avx2:
 | 
						|
+.Lproceed_avx2$suffix:
 | 
						|
 	mov	%r15,$len			# restore $len
 | 
						|
-	mov	OPENSSL_ia32cap_P+8(%rip),%r10d
 | 
						|
+___
 | 
						|
+$code.=<<___ if (!$kernel);
 | 
						|
+	mov	OPENSSL_ia32cap_P+8(%rip),%r9d
 | 
						|
 	mov	\$`(1<<31|1<<30|1<<16)`,%r11d
 | 
						|
-
 | 
						|
-	mov	0(%rsp),%r15
 | 
						|
+___
 | 
						|
+$code.=<<___;
 | 
						|
+	pop 		%r15
 | 
						|
 .cfi_restore	%r15
 | 
						|
-	mov	8(%rsp),%r14
 | 
						|
+	pop 		%r14
 | 
						|
 .cfi_restore	%r14
 | 
						|
-	mov	16(%rsp),%r13
 | 
						|
+	pop 		%r13
 | 
						|
 .cfi_restore	%r13
 | 
						|
-	mov	24(%rsp),%r12
 | 
						|
+	pop 		%r12
 | 
						|
 .cfi_restore	%r12
 | 
						|
-	mov	32(%rsp),%rbp
 | 
						|
-.cfi_restore	%rbp
 | 
						|
-	mov	40(%rsp),%rbx
 | 
						|
+	pop 		%rbx
 | 
						|
 .cfi_restore	%rbx
 | 
						|
-	lea	48(%rsp),%rax
 | 
						|
-	lea	48(%rsp),%rsp
 | 
						|
-.cfi_adjust_cfa_offset	-48
 | 
						|
-.Lbase2_64_avx2_epilogue:
 | 
						|
-	jmp	.Ldo_avx2
 | 
						|
+	pop 		%rbp
 | 
						|
+.cfi_restore 	%rbp
 | 
						|
+.Lbase2_64_avx2_epilogue$suffix:
 | 
						|
+	jmp	.Ldo_avx2$suffix
 | 
						|
 .cfi_endproc
 | 
						|
 
 | 
						|
 .align	32
 | 
						|
-.Leven_avx2:
 | 
						|
+.Leven_avx2$suffix:
 | 
						|
 .cfi_startproc
 | 
						|
-	mov		OPENSSL_ia32cap_P+8(%rip),%r10d
 | 
						|
+___
 | 
						|
+$code.=<<___ if (!$kernel);
 | 
						|
+	mov		OPENSSL_ia32cap_P+8(%rip),%r9d
 | 
						|
+___
 | 
						|
+$code.=<<___;
 | 
						|
 	vmovd		4*0($ctx),%x#$H0	# load hash value base 2^26
 | 
						|
 	vmovd		4*1($ctx),%x#$H1
 | 
						|
 	vmovd		4*2($ctx),%x#$H2
 | 
						|
 	vmovd		4*3($ctx),%x#$H3
 | 
						|
 	vmovd		4*4($ctx),%x#$H4
 | 
						|
 
 | 
						|
-.Ldo_avx2:
 | 
						|
+.Ldo_avx2$suffix:
 | 
						|
 ___
 | 
						|
-$code.=<<___		if ($avx>2);
 | 
						|
+$code.=<<___		if (!$kernel && $avx>2);
 | 
						|
 	cmp		\$512,$len
 | 
						|
 	jb		.Lskip_avx512
 | 
						|
-	and		%r11d,%r10d
 | 
						|
-	test		\$`1<<16`,%r10d		# check for AVX512F
 | 
						|
+	and		%r11d,%r9d
 | 
						|
+	test		\$`1<<16`,%r9d		# check for AVX512F
 | 
						|
 	jnz		.Lblocks_avx512
 | 
						|
-.Lskip_avx512:
 | 
						|
+.Lskip_avx512$suffix:
 | 
						|
+___
 | 
						|
+$code.=<<___ if ($avx > 2 && $avx512 && $kernel);
 | 
						|
+	cmp		\$512,$len
 | 
						|
+	jae		.Lblocks_avx512
 | 
						|
 ___
 | 
						|
 $code.=<<___	if (!$win64);
 | 
						|
-	lea		-8(%rsp),%r11
 | 
						|
-.cfi_def_cfa		%r11,16
 | 
						|
+	lea		8(%rsp),%r10
 | 
						|
+.cfi_def_cfa_register	%r10
 | 
						|
 	sub		\$0x128,%rsp
 | 
						|
 ___
 | 
						|
 $code.=<<___	if ($win64);
 | 
						|
-	lea		-0xf8(%rsp),%r11
 | 
						|
+	lea		8(%rsp),%r10
 | 
						|
 	sub		\$0x1c8,%rsp
 | 
						|
-	vmovdqa		%xmm6,0x50(%r11)
 | 
						|
-	vmovdqa		%xmm7,0x60(%r11)
 | 
						|
-	vmovdqa		%xmm8,0x70(%r11)
 | 
						|
-	vmovdqa		%xmm9,0x80(%r11)
 | 
						|
-	vmovdqa		%xmm10,0x90(%r11)
 | 
						|
-	vmovdqa		%xmm11,0xa0(%r11)
 | 
						|
-	vmovdqa		%xmm12,0xb0(%r11)
 | 
						|
-	vmovdqa		%xmm13,0xc0(%r11)
 | 
						|
-	vmovdqa		%xmm14,0xd0(%r11)
 | 
						|
-	vmovdqa		%xmm15,0xe0(%r11)
 | 
						|
-.Ldo_avx2_body:
 | 
						|
+	vmovdqa		%xmm6,-0xb0(%r10)
 | 
						|
+	vmovdqa		%xmm7,-0xa0(%r10)
 | 
						|
+	vmovdqa		%xmm8,-0x90(%r10)
 | 
						|
+	vmovdqa		%xmm9,-0x80(%r10)
 | 
						|
+	vmovdqa		%xmm10,-0x70(%r10)
 | 
						|
+	vmovdqa		%xmm11,-0x60(%r10)
 | 
						|
+	vmovdqa		%xmm12,-0x50(%r10)
 | 
						|
+	vmovdqa		%xmm13,-0x40(%r10)
 | 
						|
+	vmovdqa		%xmm14,-0x30(%r10)
 | 
						|
+	vmovdqa		%xmm15,-0x20(%r10)
 | 
						|
+.Ldo_avx2_body$suffix:
 | 
						|
 ___
 | 
						|
 $code.=<<___;
 | 
						|
 	lea		.Lconst(%rip),%rcx
 | 
						|
@@ -1794,11 +1901,11 @@ $code.=<<___;
 | 
						|
 
 | 
						|
 	vpaddq		$H2,$T2,$H2		# accumulate input
 | 
						|
 	sub		\$64,$len
 | 
						|
-	jz		.Ltail_avx2
 | 
						|
-	jmp		.Loop_avx2
 | 
						|
+	jz		.Ltail_avx2$suffix
 | 
						|
+	jmp		.Loop_avx2$suffix
 | 
						|
 
 | 
						|
 .align	32
 | 
						|
-.Loop_avx2:
 | 
						|
+.Loop_avx2$suffix:
 | 
						|
 	################################################################
 | 
						|
 	# ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
 | 
						|
 	# ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
 | 
						|
@@ -1946,10 +2053,10 @@ $code.=<<___;
 | 
						|
 	 vpor		32(%rcx),$T4,$T4	# padbit, yes, always
 | 
						|
 
 | 
						|
 	sub		\$64,$len
 | 
						|
-	jnz		.Loop_avx2
 | 
						|
+	jnz		.Loop_avx2$suffix
 | 
						|
 
 | 
						|
 	.byte		0x66,0x90
 | 
						|
-.Ltail_avx2:
 | 
						|
+.Ltail_avx2$suffix:
 | 
						|
 	################################################################
 | 
						|
 	# while above multiplications were by r^4 in all lanes, in last
 | 
						|
 	# iteration we multiply least significant lane by r^4 and most
 | 
						|
@@ -2087,37 +2194,29 @@ $code.=<<___;
 | 
						|
 	vmovd		%x#$H4,`4*4-48-64`($ctx)
 | 
						|
 ___
 | 
						|
 $code.=<<___	if ($win64);
 | 
						|
-	vmovdqa		0x50(%r11),%xmm6
 | 
						|
-	vmovdqa		0x60(%r11),%xmm7
 | 
						|
-	vmovdqa		0x70(%r11),%xmm8
 | 
						|
-	vmovdqa		0x80(%r11),%xmm9
 | 
						|
-	vmovdqa		0x90(%r11),%xmm10
 | 
						|
-	vmovdqa		0xa0(%r11),%xmm11
 | 
						|
-	vmovdqa		0xb0(%r11),%xmm12
 | 
						|
-	vmovdqa		0xc0(%r11),%xmm13
 | 
						|
-	vmovdqa		0xd0(%r11),%xmm14
 | 
						|
-	vmovdqa		0xe0(%r11),%xmm15
 | 
						|
-	lea		0xf8(%r11),%rsp
 | 
						|
-.Ldo_avx2_epilogue:
 | 
						|
+	vmovdqa		-0xb0(%r10),%xmm6
 | 
						|
+	vmovdqa		-0xa0(%r10),%xmm7
 | 
						|
+	vmovdqa		-0x90(%r10),%xmm8
 | 
						|
+	vmovdqa		-0x80(%r10),%xmm9
 | 
						|
+	vmovdqa		-0x70(%r10),%xmm10
 | 
						|
+	vmovdqa		-0x60(%r10),%xmm11
 | 
						|
+	vmovdqa		-0x50(%r10),%xmm12
 | 
						|
+	vmovdqa		-0x40(%r10),%xmm13
 | 
						|
+	vmovdqa		-0x30(%r10),%xmm14
 | 
						|
+	vmovdqa		-0x20(%r10),%xmm15
 | 
						|
+	lea		-8(%r10),%rsp
 | 
						|
+.Ldo_avx2_epilogue$suffix:
 | 
						|
 ___
 | 
						|
 $code.=<<___	if (!$win64);
 | 
						|
-	lea		8(%r11),%rsp
 | 
						|
-.cfi_def_cfa		%rsp,8
 | 
						|
+	lea		-8(%r10),%rsp
 | 
						|
+.cfi_def_cfa_register	%rsp
 | 
						|
 ___
 | 
						|
 $code.=<<___;
 | 
						|
 	vzeroupper
 | 
						|
 	ret
 | 
						|
 .cfi_endproc
 | 
						|
-.size	poly1305_blocks_avx2,.-poly1305_blocks_avx2
 | 
						|
 ___
 | 
						|
-#######################################################################
 | 
						|
-if ($avx>2) {
 | 
						|
-# On entry we have input length divisible by 64. But since inner loop
 | 
						|
-# processes 128 bytes per iteration, cases when length is not divisible
 | 
						|
-# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
 | 
						|
-# reason stack layout is kept identical to poly1305_blocks_avx2. If not
 | 
						|
-# for this tail, we wouldn't have to even allocate stack frame...
 | 
						|
-
 | 
						|
+if($avx > 2 && $avx512) {
 | 
						|
 my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
 | 
						|
 my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
 | 
						|
 my $PADBIT="%zmm30";
 | 
						|
@@ -2128,32 +2227,29 @@ map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
 | 
						|
 map(s/%y/%z/,($MASK));
 | 
						|
 
 | 
						|
 $code.=<<___;
 | 
						|
-.type	poly1305_blocks_avx512,\@function,4
 | 
						|
-.align	32
 | 
						|
-poly1305_blocks_avx512:
 | 
						|
 .cfi_startproc
 | 
						|
 .Lblocks_avx512:
 | 
						|
 	mov		\$15,%eax
 | 
						|
 	kmovw		%eax,%k2
 | 
						|
 ___
 | 
						|
 $code.=<<___	if (!$win64);
 | 
						|
-	lea		-8(%rsp),%r11
 | 
						|
-.cfi_def_cfa		%r11,16
 | 
						|
+	lea		8(%rsp),%r10
 | 
						|
+.cfi_def_cfa_register	%r10
 | 
						|
 	sub		\$0x128,%rsp
 | 
						|
 ___
 | 
						|
 $code.=<<___	if ($win64);
 | 
						|
-	lea		-0xf8(%rsp),%r11
 | 
						|
+	lea		8(%rsp),%r10
 | 
						|
 	sub		\$0x1c8,%rsp
 | 
						|
-	vmovdqa		%xmm6,0x50(%r11)
 | 
						|
-	vmovdqa		%xmm7,0x60(%r11)
 | 
						|
-	vmovdqa		%xmm8,0x70(%r11)
 | 
						|
-	vmovdqa		%xmm9,0x80(%r11)
 | 
						|
-	vmovdqa		%xmm10,0x90(%r11)
 | 
						|
-	vmovdqa		%xmm11,0xa0(%r11)
 | 
						|
-	vmovdqa		%xmm12,0xb0(%r11)
 | 
						|
-	vmovdqa		%xmm13,0xc0(%r11)
 | 
						|
-	vmovdqa		%xmm14,0xd0(%r11)
 | 
						|
-	vmovdqa		%xmm15,0xe0(%r11)
 | 
						|
+	vmovdqa		%xmm6,-0xb0(%r10)
 | 
						|
+	vmovdqa		%xmm7,-0xa0(%r10)
 | 
						|
+	vmovdqa		%xmm8,-0x90(%r10)
 | 
						|
+	vmovdqa		%xmm9,-0x80(%r10)
 | 
						|
+	vmovdqa		%xmm10,-0x70(%r10)
 | 
						|
+	vmovdqa		%xmm11,-0x60(%r10)
 | 
						|
+	vmovdqa		%xmm12,-0x50(%r10)
 | 
						|
+	vmovdqa		%xmm13,-0x40(%r10)
 | 
						|
+	vmovdqa		%xmm14,-0x30(%r10)
 | 
						|
+	vmovdqa		%xmm15,-0x20(%r10)
 | 
						|
 .Ldo_avx512_body:
 | 
						|
 ___
 | 
						|
 $code.=<<___;
 | 
						|
@@ -2679,7 +2775,7 @@ $code.=<<___;
 | 
						|
 
 | 
						|
 	lea		0x90(%rsp),%rax		# size optimization for .Ltail_avx2
 | 
						|
 	add		\$64,$len
 | 
						|
-	jnz		.Ltail_avx2
 | 
						|
+	jnz		.Ltail_avx2$suffix
 | 
						|
 
 | 
						|
 	vpsubq		$T2,$H2,$H2		# undo input accumulation
 | 
						|
 	vmovd		%x#$H0,`4*0-48-64`($ctx)# save partially reduced
 | 
						|
@@ -2690,29 +2786,61 @@ $code.=<<___;
 | 
						|
 	vzeroall
 | 
						|
 ___
 | 
						|
 $code.=<<___	if ($win64);
 | 
						|
-	movdqa		0x50(%r11),%xmm6
 | 
						|
-	movdqa		0x60(%r11),%xmm7
 | 
						|
-	movdqa		0x70(%r11),%xmm8
 | 
						|
-	movdqa		0x80(%r11),%xmm9
 | 
						|
-	movdqa		0x90(%r11),%xmm10
 | 
						|
-	movdqa		0xa0(%r11),%xmm11
 | 
						|
-	movdqa		0xb0(%r11),%xmm12
 | 
						|
-	movdqa		0xc0(%r11),%xmm13
 | 
						|
-	movdqa		0xd0(%r11),%xmm14
 | 
						|
-	movdqa		0xe0(%r11),%xmm15
 | 
						|
-	lea		0xf8(%r11),%rsp
 | 
						|
+	movdqa		-0xb0(%r10),%xmm6
 | 
						|
+	movdqa		-0xa0(%r10),%xmm7
 | 
						|
+	movdqa		-0x90(%r10),%xmm8
 | 
						|
+	movdqa		-0x80(%r10),%xmm9
 | 
						|
+	movdqa		-0x70(%r10),%xmm10
 | 
						|
+	movdqa		-0x60(%r10),%xmm11
 | 
						|
+	movdqa		-0x50(%r10),%xmm12
 | 
						|
+	movdqa		-0x40(%r10),%xmm13
 | 
						|
+	movdqa		-0x30(%r10),%xmm14
 | 
						|
+	movdqa		-0x20(%r10),%xmm15
 | 
						|
+	lea		-8(%r10),%rsp
 | 
						|
 .Ldo_avx512_epilogue:
 | 
						|
 ___
 | 
						|
 $code.=<<___	if (!$win64);
 | 
						|
-	lea		8(%r11),%rsp
 | 
						|
-.cfi_def_cfa		%rsp,8
 | 
						|
+	lea		-8(%r10),%rsp
 | 
						|
+.cfi_def_cfa_register	%rsp
 | 
						|
 ___
 | 
						|
 $code.=<<___;
 | 
						|
 	ret
 | 
						|
 .cfi_endproc
 | 
						|
-.size	poly1305_blocks_avx512,.-poly1305_blocks_avx512
 | 
						|
 ___
 | 
						|
-if ($avx>3) {
 | 
						|
+
 | 
						|
+}
 | 
						|
+
 | 
						|
+}
 | 
						|
+
 | 
						|
+&declare_function("poly1305_blocks_avx2", 32, 4);
 | 
						|
+poly1305_blocks_avxN(0);
 | 
						|
+&end_function("poly1305_blocks_avx2");
 | 
						|
+
 | 
						|
+if($kernel) {
 | 
						|
+	$code .= "#endif\n";
 | 
						|
+}
 | 
						|
+
 | 
						|
+#######################################################################
 | 
						|
+if ($avx>2) {
 | 
						|
+# On entry we have input length divisible by 64. But since inner loop
 | 
						|
+# processes 128 bytes per iteration, cases when length is not divisible
 | 
						|
+# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
 | 
						|
+# reason stack layout is kept identical to poly1305_blocks_avx2. If not
 | 
						|
+# for this tail, we wouldn't have to even allocate stack frame...
 | 
						|
+
 | 
						|
+if($kernel) {
 | 
						|
+	$code .= "#ifdef CONFIG_AS_AVX512\n";
 | 
						|
+}
 | 
						|
+
 | 
						|
+&declare_function("poly1305_blocks_avx512", 32, 4);
 | 
						|
+poly1305_blocks_avxN(1);
 | 
						|
+&end_function("poly1305_blocks_avx512");
 | 
						|
+
 | 
						|
+if ($kernel) {
 | 
						|
+	$code .= "#endif\n";
 | 
						|
+}
 | 
						|
+
 | 
						|
+if (!$kernel && $avx>3) {
 | 
						|
 ########################################################################
 | 
						|
 # VPMADD52 version using 2^44 radix.
 | 
						|
 #
 | 
						|
@@ -3753,45 +3881,9 @@ poly1305_emit_base2_44:
 | 
						|
 .size	poly1305_emit_base2_44,.-poly1305_emit_base2_44
 | 
						|
 ___
 | 
						|
 }	}	}
 | 
						|
-$code.=<<___;
 | 
						|
-.align	64
 | 
						|
-.Lconst:
 | 
						|
-.Lmask24:
 | 
						|
-.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
 | 
						|
-.L129:
 | 
						|
-.long	`1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
 | 
						|
-.Lmask26:
 | 
						|
-.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
 | 
						|
-.Lpermd_avx2:
 | 
						|
-.long	2,2,2,3,2,0,2,1
 | 
						|
-.Lpermd_avx512:
 | 
						|
-.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
 | 
						|
-
 | 
						|
-.L2_44_inp_permd:
 | 
						|
-.long	0,1,1,2,2,3,7,7
 | 
						|
-.L2_44_inp_shift:
 | 
						|
-.quad	0,12,24,64
 | 
						|
-.L2_44_mask:
 | 
						|
-.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
 | 
						|
-.L2_44_shift_rgt:
 | 
						|
-.quad	44,44,42,64
 | 
						|
-.L2_44_shift_lft:
 | 
						|
-.quad	8,8,10,64
 | 
						|
-
 | 
						|
-.align	64
 | 
						|
-.Lx_mask44:
 | 
						|
-.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
 | 
						|
-.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
 | 
						|
-.Lx_mask42:
 | 
						|
-.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
 | 
						|
-.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
 | 
						|
-___
 | 
						|
 }
 | 
						|
-$code.=<<___;
 | 
						|
-.asciz	"Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 | 
						|
-.align	16
 | 
						|
-___
 | 
						|
 
 | 
						|
+if (!$kernel)
 | 
						|
 {	# chacha20-poly1305 helpers
 | 
						|
 my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
 | 
						|
                                   ("%rdi","%rsi","%rdx","%rcx");  # Unix order
 | 
						|
@@ -4038,17 +4130,17 @@ avx_handler:
 | 
						|
 
 | 
						|
 .section	.pdata
 | 
						|
 .align	4
 | 
						|
-	.rva	.LSEH_begin_poly1305_init
 | 
						|
-	.rva	.LSEH_end_poly1305_init
 | 
						|
-	.rva	.LSEH_info_poly1305_init
 | 
						|
-
 | 
						|
-	.rva	.LSEH_begin_poly1305_blocks
 | 
						|
-	.rva	.LSEH_end_poly1305_blocks
 | 
						|
-	.rva	.LSEH_info_poly1305_blocks
 | 
						|
-
 | 
						|
-	.rva	.LSEH_begin_poly1305_emit
 | 
						|
-	.rva	.LSEH_end_poly1305_emit
 | 
						|
-	.rva	.LSEH_info_poly1305_emit
 | 
						|
+	.rva	.LSEH_begin_poly1305_init_x86_64
 | 
						|
+	.rva	.LSEH_end_poly1305_init_x86_64
 | 
						|
+	.rva	.LSEH_info_poly1305_init_x86_64
 | 
						|
+
 | 
						|
+	.rva	.LSEH_begin_poly1305_blocks_x86_64
 | 
						|
+	.rva	.LSEH_end_poly1305_blocks_x86_64
 | 
						|
+	.rva	.LSEH_info_poly1305_blocks_x86_64
 | 
						|
+
 | 
						|
+	.rva	.LSEH_begin_poly1305_emit_x86_64
 | 
						|
+	.rva	.LSEH_end_poly1305_emit_x86_64
 | 
						|
+	.rva	.LSEH_info_poly1305_emit_x86_64
 | 
						|
 ___
 | 
						|
 $code.=<<___ if ($avx);
 | 
						|
 	.rva	.LSEH_begin_poly1305_blocks_avx
 | 
						|
@@ -4088,20 +4180,20 @@ ___
 | 
						|
 $code.=<<___;
 | 
						|
 .section	.xdata
 | 
						|
 .align	8
 | 
						|
-.LSEH_info_poly1305_init:
 | 
						|
+.LSEH_info_poly1305_init_x86_64:
 | 
						|
 	.byte	9,0,0,0
 | 
						|
 	.rva	se_handler
 | 
						|
-	.rva	.LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
 | 
						|
+	.rva	.LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64
 | 
						|
 
 | 
						|
-.LSEH_info_poly1305_blocks:
 | 
						|
+.LSEH_info_poly1305_blocks_x86_64:
 | 
						|
 	.byte	9,0,0,0
 | 
						|
 	.rva	se_handler
 | 
						|
 	.rva	.Lblocks_body,.Lblocks_epilogue
 | 
						|
 
 | 
						|
-.LSEH_info_poly1305_emit:
 | 
						|
+.LSEH_info_poly1305_emit_x86_64:
 | 
						|
 	.byte	9,0,0,0
 | 
						|
 	.rva	se_handler
 | 
						|
-	.rva	.LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
 | 
						|
+	.rva	.LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
 | 
						|
 ___
 | 
						|
 $code.=<<___ if ($avx);
 | 
						|
 .LSEH_info_poly1305_blocks_avx_1:
 | 
						|
@@ -4148,12 +4240,26 @@ $code.=<<___ if ($avx>2);
 | 
						|
 ___
 | 
						|
 }
 | 
						|
 
 | 
						|
+open SELF,$0;
 | 
						|
+while(<SELF>) {
 | 
						|
+	next if (/^#!/);
 | 
						|
+	last if (!s/^#/\/\// and !/^$/);
 | 
						|
+	print;
 | 
						|
+}
 | 
						|
+close SELF;
 | 
						|
+
 | 
						|
 foreach (split('\n',$code)) {
 | 
						|
 	s/\`([^\`]*)\`/eval($1)/ge;
 | 
						|
 	s/%r([a-z]+)#d/%e$1/g;
 | 
						|
 	s/%r([0-9]+)#d/%r$1d/g;
 | 
						|
 	s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
 | 
						|
 
 | 
						|
+	if ($kernel) {
 | 
						|
+		s/(^\.type.*),[0-9]+$/\1/;
 | 
						|
+		s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/;
 | 
						|
+		next if /^\.cfi.*/;
 | 
						|
+	}
 | 
						|
+
 | 
						|
 	print $_,"\n";
 | 
						|
 }
 | 
						|
 close STDOUT;
 | 
						|
--- a/arch/x86/crypto/poly1305_glue.c
 | 
						|
+++ b/arch/x86/crypto/poly1305_glue.c
 | 
						|
@@ -1,8 +1,6 @@
 | 
						|
-// SPDX-License-Identifier: GPL-2.0-or-later
 | 
						|
+// SPDX-License-Identifier: GPL-2.0 OR MIT
 | 
						|
 /*
 | 
						|
- * Poly1305 authenticator algorithm, RFC7539, SIMD glue code
 | 
						|
- *
 | 
						|
- * Copyright (C) 2015 Martin Willi
 | 
						|
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 | 
						|
  */
 | 
						|
 
 | 
						|
 #include <crypto/algapi.h>
 | 
						|
@@ -13,279 +11,170 @@
 | 
						|
 #include <linux/jump_label.h>
 | 
						|
 #include <linux/kernel.h>
 | 
						|
 #include <linux/module.h>
 | 
						|
+#include <asm/intel-family.h>
 | 
						|
 #include <asm/simd.h>
 | 
						|
 
 | 
						|
-asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src,
 | 
						|
-				    const u32 *r, unsigned int blocks);
 | 
						|
-asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r,
 | 
						|
-				     unsigned int blocks, const u32 *u);
 | 
						|
-asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r,
 | 
						|
-				     unsigned int blocks, const u32 *u);
 | 
						|
+asmlinkage void poly1305_init_x86_64(void *ctx,
 | 
						|
+				     const u8 key[POLY1305_KEY_SIZE]);
 | 
						|
+asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
 | 
						|
+				       const size_t len, const u32 padbit);
 | 
						|
+asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
 | 
						|
+				     const u32 nonce[4]);
 | 
						|
+asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
 | 
						|
+				  const u32 nonce[4]);
 | 
						|
+asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len,
 | 
						|
+				    const u32 padbit);
 | 
						|
+asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len,
 | 
						|
+				     const u32 padbit);
 | 
						|
+asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp,
 | 
						|
+				       const size_t len, const u32 padbit);
 | 
						|
 
 | 
						|
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_simd);
 | 
						|
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx);
 | 
						|
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
 | 
						|
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512);
 | 
						|
 
 | 
						|
-static inline u64 mlt(u64 a, u64 b)
 | 
						|
-{
 | 
						|
-	return a * b;
 | 
						|
-}
 | 
						|
-
 | 
						|
-static inline u32 sr(u64 v, u_char n)
 | 
						|
-{
 | 
						|
-	return v >> n;
 | 
						|
-}
 | 
						|
-
 | 
						|
-static inline u32 and(u32 v, u32 mask)
 | 
						|
-{
 | 
						|
-	return v & mask;
 | 
						|
-}
 | 
						|
-
 | 
						|
-static void poly1305_simd_mult(u32 *a, const u32 *b)
 | 
						|
-{
 | 
						|
-	u8 m[POLY1305_BLOCK_SIZE];
 | 
						|
-
 | 
						|
-	memset(m, 0, sizeof(m));
 | 
						|
-	/* The poly1305 block function adds a hi-bit to the accumulator which
 | 
						|
-	 * we don't need for key multiplication; compensate for it. */
 | 
						|
-	a[4] -= 1 << 24;
 | 
						|
-	poly1305_block_sse2(a, m, b, 1);
 | 
						|
-}
 | 
						|
-
 | 
						|
-static void poly1305_integer_setkey(struct poly1305_key *key, const u8 *raw_key)
 | 
						|
-{
 | 
						|
-	/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
 | 
						|
-	key->r[0] = (get_unaligned_le32(raw_key +  0) >> 0) & 0x3ffffff;
 | 
						|
-	key->r[1] = (get_unaligned_le32(raw_key +  3) >> 2) & 0x3ffff03;
 | 
						|
-	key->r[2] = (get_unaligned_le32(raw_key +  6) >> 4) & 0x3ffc0ff;
 | 
						|
-	key->r[3] = (get_unaligned_le32(raw_key +  9) >> 6) & 0x3f03fff;
 | 
						|
-	key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff;
 | 
						|
-}
 | 
						|
+struct poly1305_arch_internal {
 | 
						|
+	union {
 | 
						|
+		struct {
 | 
						|
+			u32 h[5];
 | 
						|
+			u32 is_base2_26;
 | 
						|
+		};
 | 
						|
+		u64 hs[3];
 | 
						|
+	};
 | 
						|
+	u64 r[2];
 | 
						|
+	u64 pad;
 | 
						|
+	struct { u32 r2, r1, r4, r3; } rn[9];
 | 
						|
+};
 | 
						|
 
 | 
						|
-static void poly1305_integer_blocks(struct poly1305_state *state,
 | 
						|
-				    const struct poly1305_key *key,
 | 
						|
-				    const void *src,
 | 
						|
-				    unsigned int nblocks, u32 hibit)
 | 
						|
+/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit
 | 
						|
+ * the unfortunate situation of using AVX and then having to go back to scalar
 | 
						|
+ * -- because the user is silly and has called the update function from two
 | 
						|
+ * separate contexts -- then we need to convert back to the original base before
 | 
						|
+ * proceeding. It is possible to reason that the initial reduction below is
 | 
						|
+ * sufficient given the implementation invariants. However, for an avoidance of
 | 
						|
+ * doubt and because this is not performance critical, we do the full reduction
 | 
						|
+ * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py
 | 
						|
+ */
 | 
						|
+static void convert_to_base2_64(void *ctx)
 | 
						|
 {
 | 
						|
-	u32 r0, r1, r2, r3, r4;
 | 
						|
-	u32 s1, s2, s3, s4;
 | 
						|
-	u32 h0, h1, h2, h3, h4;
 | 
						|
-	u64 d0, d1, d2, d3, d4;
 | 
						|
+	struct poly1305_arch_internal *state = ctx;
 | 
						|
+	u32 cy;
 | 
						|
 
 | 
						|
-	if (!nblocks)
 | 
						|
+	if (!state->is_base2_26)
 | 
						|
 		return;
 | 
						|
 
 | 
						|
-	r0 = key->r[0];
 | 
						|
-	r1 = key->r[1];
 | 
						|
-	r2 = key->r[2];
 | 
						|
-	r3 = key->r[3];
 | 
						|
-	r4 = key->r[4];
 | 
						|
-
 | 
						|
-	s1 = r1 * 5;
 | 
						|
-	s2 = r2 * 5;
 | 
						|
-	s3 = r3 * 5;
 | 
						|
-	s4 = r4 * 5;
 | 
						|
-
 | 
						|
-	h0 = state->h[0];
 | 
						|
-	h1 = state->h[1];
 | 
						|
-	h2 = state->h[2];
 | 
						|
-	h3 = state->h[3];
 | 
						|
-	h4 = state->h[4];
 | 
						|
-
 | 
						|
-	do {
 | 
						|
-		/* h += m[i] */
 | 
						|
-		h0 += (get_unaligned_le32(src +  0) >> 0) & 0x3ffffff;
 | 
						|
-		h1 += (get_unaligned_le32(src +  3) >> 2) & 0x3ffffff;
 | 
						|
-		h2 += (get_unaligned_le32(src +  6) >> 4) & 0x3ffffff;
 | 
						|
-		h3 += (get_unaligned_le32(src +  9) >> 6) & 0x3ffffff;
 | 
						|
-		h4 += (get_unaligned_le32(src + 12) >> 8) | (hibit << 24);
 | 
						|
-
 | 
						|
-		/* h *= r */
 | 
						|
-		d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) +
 | 
						|
-		     mlt(h3, s2) + mlt(h4, s1);
 | 
						|
-		d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) +
 | 
						|
-		     mlt(h3, s3) + mlt(h4, s2);
 | 
						|
-		d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) +
 | 
						|
-		     mlt(h3, s4) + mlt(h4, s3);
 | 
						|
-		d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) +
 | 
						|
-		     mlt(h3, r0) + mlt(h4, s4);
 | 
						|
-		d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) +
 | 
						|
-		     mlt(h3, r1) + mlt(h4, r0);
 | 
						|
-
 | 
						|
-		/* (partial) h %= p */
 | 
						|
-		d1 += sr(d0, 26);     h0 = and(d0, 0x3ffffff);
 | 
						|
-		d2 += sr(d1, 26);     h1 = and(d1, 0x3ffffff);
 | 
						|
-		d3 += sr(d2, 26);     h2 = and(d2, 0x3ffffff);
 | 
						|
-		d4 += sr(d3, 26);     h3 = and(d3, 0x3ffffff);
 | 
						|
-		h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff);
 | 
						|
-		h1 += h0 >> 26;       h0 = h0 & 0x3ffffff;
 | 
						|
-
 | 
						|
-		src += POLY1305_BLOCK_SIZE;
 | 
						|
-	} while (--nblocks);
 | 
						|
-
 | 
						|
-	state->h[0] = h0;
 | 
						|
-	state->h[1] = h1;
 | 
						|
-	state->h[2] = h2;
 | 
						|
-	state->h[3] = h3;
 | 
						|
-	state->h[4] = h4;
 | 
						|
-}
 | 
						|
-
 | 
						|
-static void poly1305_integer_emit(const struct poly1305_state *state, void *dst)
 | 
						|
-{
 | 
						|
-	u32 h0, h1, h2, h3, h4;
 | 
						|
-	u32 g0, g1, g2, g3, g4;
 | 
						|
-	u32 mask;
 | 
						|
-
 | 
						|
-	/* fully carry h */
 | 
						|
-	h0 = state->h[0];
 | 
						|
-	h1 = state->h[1];
 | 
						|
-	h2 = state->h[2];
 | 
						|
-	h3 = state->h[3];
 | 
						|
-	h4 = state->h[4];
 | 
						|
-
 | 
						|
-	h2 += (h1 >> 26);     h1 = h1 & 0x3ffffff;
 | 
						|
-	h3 += (h2 >> 26);     h2 = h2 & 0x3ffffff;
 | 
						|
-	h4 += (h3 >> 26);     h3 = h3 & 0x3ffffff;
 | 
						|
-	h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff;
 | 
						|
-	h1 += (h0 >> 26);     h0 = h0 & 0x3ffffff;
 | 
						|
-
 | 
						|
-	/* compute h + -p */
 | 
						|
-	g0 = h0 + 5;
 | 
						|
-	g1 = h1 + (g0 >> 26);             g0 &= 0x3ffffff;
 | 
						|
-	g2 = h2 + (g1 >> 26);             g1 &= 0x3ffffff;
 | 
						|
-	g3 = h3 + (g2 >> 26);             g2 &= 0x3ffffff;
 | 
						|
-	g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff;
 | 
						|
-
 | 
						|
-	/* select h if h < p, or h + -p if h >= p */
 | 
						|
-	mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
 | 
						|
-	g0 &= mask;
 | 
						|
-	g1 &= mask;
 | 
						|
-	g2 &= mask;
 | 
						|
-	g3 &= mask;
 | 
						|
-	g4 &= mask;
 | 
						|
-	mask = ~mask;
 | 
						|
-	h0 = (h0 & mask) | g0;
 | 
						|
-	h1 = (h1 & mask) | g1;
 | 
						|
-	h2 = (h2 & mask) | g2;
 | 
						|
-	h3 = (h3 & mask) | g3;
 | 
						|
-	h4 = (h4 & mask) | g4;
 | 
						|
-
 | 
						|
-	/* h = h % (2^128) */
 | 
						|
-	put_unaligned_le32((h0 >>  0) | (h1 << 26), dst +  0);
 | 
						|
-	put_unaligned_le32((h1 >>  6) | (h2 << 20), dst +  4);
 | 
						|
-	put_unaligned_le32((h2 >> 12) | (h3 << 14), dst +  8);
 | 
						|
-	put_unaligned_le32((h3 >> 18) | (h4 <<  8), dst + 12);
 | 
						|
-}
 | 
						|
-
 | 
						|
-void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key)
 | 
						|
-{
 | 
						|
-	poly1305_integer_setkey(desc->opaque_r, key);
 | 
						|
-	desc->s[0] = get_unaligned_le32(key + 16);
 | 
						|
-	desc->s[1] = get_unaligned_le32(key + 20);
 | 
						|
-	desc->s[2] = get_unaligned_le32(key + 24);
 | 
						|
-	desc->s[3] = get_unaligned_le32(key + 28);
 | 
						|
-	poly1305_core_init(&desc->h);
 | 
						|
-	desc->buflen = 0;
 | 
						|
-	desc->sset = true;
 | 
						|
-	desc->rset = 1;
 | 
						|
-}
 | 
						|
-EXPORT_SYMBOL_GPL(poly1305_init_arch);
 | 
						|
-
 | 
						|
-static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
 | 
						|
-					       const u8 *src, unsigned int srclen)
 | 
						|
-{
 | 
						|
-	if (!dctx->sset) {
 | 
						|
-		if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
 | 
						|
-			poly1305_integer_setkey(dctx->r, src);
 | 
						|
-			src += POLY1305_BLOCK_SIZE;
 | 
						|
-			srclen -= POLY1305_BLOCK_SIZE;
 | 
						|
-			dctx->rset = 1;
 | 
						|
-		}
 | 
						|
-		if (srclen >= POLY1305_BLOCK_SIZE) {
 | 
						|
-			dctx->s[0] = get_unaligned_le32(src +  0);
 | 
						|
-			dctx->s[1] = get_unaligned_le32(src +  4);
 | 
						|
-			dctx->s[2] = get_unaligned_le32(src +  8);
 | 
						|
-			dctx->s[3] = get_unaligned_le32(src + 12);
 | 
						|
-			src += POLY1305_BLOCK_SIZE;
 | 
						|
-			srclen -= POLY1305_BLOCK_SIZE;
 | 
						|
-			dctx->sset = true;
 | 
						|
-		}
 | 
						|
+	cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy;
 | 
						|
+	cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy;
 | 
						|
+	cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy;
 | 
						|
+	cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy;
 | 
						|
+	state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0];
 | 
						|
+	state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12);
 | 
						|
+	state->hs[2] = state->h[4] >> 24;
 | 
						|
+#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
 | 
						|
+	cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL);
 | 
						|
+	state->hs[2] &= 3;
 | 
						|
+	state->hs[0] += cy;
 | 
						|
+	state->hs[1] += (cy = ULT(state->hs[0], cy));
 | 
						|
+	state->hs[2] += ULT(state->hs[1], cy);
 | 
						|
+#undef ULT
 | 
						|
+	state->is_base2_26 = 0;
 | 
						|
+}
 | 
						|
+
 | 
						|
+static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_KEY_SIZE])
 | 
						|
+{
 | 
						|
+	poly1305_init_x86_64(ctx, key);
 | 
						|
+}
 | 
						|
+
 | 
						|
+static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
 | 
						|
+				 const u32 padbit)
 | 
						|
+{
 | 
						|
+	struct poly1305_arch_internal *state = ctx;
 | 
						|
+
 | 
						|
+	/* SIMD disables preemption, so relax after processing each page. */
 | 
						|
+	BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE ||
 | 
						|
+		     PAGE_SIZE % POLY1305_BLOCK_SIZE);
 | 
						|
+
 | 
						|
+	if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) ||
 | 
						|
+	    (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) ||
 | 
						|
+	    !crypto_simd_usable()) {
 | 
						|
+		convert_to_base2_64(ctx);
 | 
						|
+		poly1305_blocks_x86_64(ctx, inp, len, padbit);
 | 
						|
+		return;
 | 
						|
 	}
 | 
						|
-	return srclen;
 | 
						|
-}
 | 
						|
 
 | 
						|
-static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx,
 | 
						|
-					   const u8 *src, unsigned int srclen)
 | 
						|
-{
 | 
						|
-	unsigned int datalen;
 | 
						|
+	for (;;) {
 | 
						|
+		const size_t bytes = min_t(size_t, len, PAGE_SIZE);
 | 
						|
 
 | 
						|
-	if (unlikely(!dctx->sset)) {
 | 
						|
-		datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
 | 
						|
-		src += srclen - datalen;
 | 
						|
-		srclen = datalen;
 | 
						|
-	}
 | 
						|
-	if (srclen >= POLY1305_BLOCK_SIZE) {
 | 
						|
-		poly1305_integer_blocks(&dctx->h, dctx->opaque_r, src,
 | 
						|
-					srclen / POLY1305_BLOCK_SIZE, 1);
 | 
						|
-		srclen %= POLY1305_BLOCK_SIZE;
 | 
						|
+		kernel_fpu_begin();
 | 
						|
+		if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512))
 | 
						|
+			poly1305_blocks_avx512(ctx, inp, bytes, padbit);
 | 
						|
+		else if (IS_ENABLED(CONFIG_AS_AVX2) && static_branch_likely(&poly1305_use_avx2))
 | 
						|
+			poly1305_blocks_avx2(ctx, inp, bytes, padbit);
 | 
						|
+		else
 | 
						|
+			poly1305_blocks_avx(ctx, inp, bytes, padbit);
 | 
						|
+		kernel_fpu_end();
 | 
						|
+		len -= bytes;
 | 
						|
+		if (!len)
 | 
						|
+			break;
 | 
						|
+		inp += bytes;
 | 
						|
 	}
 | 
						|
-	return srclen;
 | 
						|
 }
 | 
						|
 
 | 
						|
-static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
 | 
						|
-					 const u8 *src, unsigned int srclen)
 | 
						|
-{
 | 
						|
-	unsigned int blocks, datalen;
 | 
						|
+static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
 | 
						|
+			       const u32 nonce[4])
 | 
						|
+{
 | 
						|
+	struct poly1305_arch_internal *state = ctx;
 | 
						|
+
 | 
						|
+	if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) ||
 | 
						|
+	    !state->is_base2_26 || !crypto_simd_usable()) {
 | 
						|
+		convert_to_base2_64(ctx);
 | 
						|
+		poly1305_emit_x86_64(ctx, mac, nonce);
 | 
						|
+	} else
 | 
						|
+		poly1305_emit_avx(ctx, mac, nonce);
 | 
						|
+}
 | 
						|
+
 | 
						|
+void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
 | 
						|
+{
 | 
						|
+	poly1305_simd_init(&dctx->h, key);
 | 
						|
+	dctx->s[0] = get_unaligned_le32(&key[16]);
 | 
						|
+	dctx->s[1] = get_unaligned_le32(&key[20]);
 | 
						|
+	dctx->s[2] = get_unaligned_le32(&key[24]);
 | 
						|
+	dctx->s[3] = get_unaligned_le32(&key[28]);
 | 
						|
+	dctx->buflen = 0;
 | 
						|
+	dctx->sset = true;
 | 
						|
+}
 | 
						|
+EXPORT_SYMBOL(poly1305_init_arch);
 | 
						|
 
 | 
						|
+static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx,
 | 
						|
+					       const u8 *inp, unsigned int len)
 | 
						|
+{
 | 
						|
+	unsigned int acc = 0;
 | 
						|
 	if (unlikely(!dctx->sset)) {
 | 
						|
-		datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
 | 
						|
-		src += srclen - datalen;
 | 
						|
-		srclen = datalen;
 | 
						|
-	}
 | 
						|
-
 | 
						|
-	if (IS_ENABLED(CONFIG_AS_AVX2) &&
 | 
						|
-	    static_branch_likely(&poly1305_use_avx2) &&
 | 
						|
-	    srclen >= POLY1305_BLOCK_SIZE * 4) {
 | 
						|
-		if (unlikely(dctx->rset < 4)) {
 | 
						|
-			if (dctx->rset < 2) {
 | 
						|
-				dctx->r[1] = dctx->r[0];
 | 
						|
-				poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r);
 | 
						|
-			}
 | 
						|
-			dctx->r[2] = dctx->r[1];
 | 
						|
-			poly1305_simd_mult(dctx->r[2].r, dctx->r[0].r);
 | 
						|
-			dctx->r[3] = dctx->r[2];
 | 
						|
-			poly1305_simd_mult(dctx->r[3].r, dctx->r[0].r);
 | 
						|
-			dctx->rset = 4;
 | 
						|
+		if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) {
 | 
						|
+			poly1305_simd_init(&dctx->h, inp);
 | 
						|
+			inp += POLY1305_BLOCK_SIZE;
 | 
						|
+			len -= POLY1305_BLOCK_SIZE;
 | 
						|
+			acc += POLY1305_BLOCK_SIZE;
 | 
						|
+			dctx->rset = 1;
 | 
						|
 		}
 | 
						|
-		blocks = srclen / (POLY1305_BLOCK_SIZE * 4);
 | 
						|
-		poly1305_4block_avx2(dctx->h.h, src, dctx->r[0].r, blocks,
 | 
						|
-				     dctx->r[1].r);
 | 
						|
-		src += POLY1305_BLOCK_SIZE * 4 * blocks;
 | 
						|
-		srclen -= POLY1305_BLOCK_SIZE * 4 * blocks;
 | 
						|
-	}
 | 
						|
-
 | 
						|
-	if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) {
 | 
						|
-		if (unlikely(dctx->rset < 2)) {
 | 
						|
-			dctx->r[1] = dctx->r[0];
 | 
						|
-			poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r);
 | 
						|
-			dctx->rset = 2;
 | 
						|
+		if (len >= POLY1305_BLOCK_SIZE) {
 | 
						|
+			dctx->s[0] = get_unaligned_le32(&inp[0]);
 | 
						|
+			dctx->s[1] = get_unaligned_le32(&inp[4]);
 | 
						|
+			dctx->s[2] = get_unaligned_le32(&inp[8]);
 | 
						|
+			dctx->s[3] = get_unaligned_le32(&inp[12]);
 | 
						|
+			inp += POLY1305_BLOCK_SIZE;
 | 
						|
+			len -= POLY1305_BLOCK_SIZE;
 | 
						|
+			acc += POLY1305_BLOCK_SIZE;
 | 
						|
+			dctx->sset = true;
 | 
						|
 		}
 | 
						|
-		blocks = srclen / (POLY1305_BLOCK_SIZE * 2);
 | 
						|
-		poly1305_2block_sse2(dctx->h.h, src, dctx->r[0].r,
 | 
						|
-				     blocks, dctx->r[1].r);
 | 
						|
-		src += POLY1305_BLOCK_SIZE * 2 * blocks;
 | 
						|
-		srclen -= POLY1305_BLOCK_SIZE * 2 * blocks;
 | 
						|
-	}
 | 
						|
-	if (srclen >= POLY1305_BLOCK_SIZE) {
 | 
						|
-		poly1305_block_sse2(dctx->h.h, src, dctx->r[0].r, 1);
 | 
						|
-		srclen -= POLY1305_BLOCK_SIZE;
 | 
						|
 	}
 | 
						|
-	return srclen;
 | 
						|
+	return acc;
 | 
						|
 }
 | 
						|
 
 | 
						|
 void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
 			  unsigned int srclen)
 {
-	unsigned int bytes;
+	unsigned int bytes, used;
 
 	if (unlikely(dctx->buflen)) {
 		bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen);
@@ -295,31 +184,19 @@ void poly1305_update_arch(struct poly130
 		dctx->buflen += bytes;
 
 		if (dctx->buflen == POLY1305_BLOCK_SIZE) {
-			if (static_branch_likely(&poly1305_use_simd) &&
-			    likely(crypto_simd_usable())) {
-				kernel_fpu_begin();
-				poly1305_simd_blocks(dctx, dctx->buf,
-						     POLY1305_BLOCK_SIZE);
-				kernel_fpu_end();
-			} else {
-				poly1305_scalar_blocks(dctx, dctx->buf,
-						       POLY1305_BLOCK_SIZE);
-			}
+			if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, POLY1305_BLOCK_SIZE)))
+				poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
 			dctx->buflen = 0;
 		}
 	}
 
 	if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
-		if (static_branch_likely(&poly1305_use_simd) &&
-		    likely(crypto_simd_usable())) {
-			kernel_fpu_begin();
-			bytes = poly1305_simd_blocks(dctx, src, srclen);
-			kernel_fpu_end();
-		} else {
-			bytes = poly1305_scalar_blocks(dctx, src, srclen);
-		}
-		src += srclen - bytes;
-		srclen = bytes;
+		bytes = round_down(srclen, POLY1305_BLOCK_SIZE);
+		srclen -= bytes;
+		used = crypto_poly1305_setdctxkey(dctx, src, bytes);
+		if (likely(bytes - used))
+			poly1305_simd_blocks(&dctx->h, src + used, bytes - used, 1);
+		src += bytes;
 	}
 
 	if (unlikely(srclen)) {
@@ -329,31 +206,17 @@ void poly1305_update_arch(struct poly130
 }
 EXPORT_SYMBOL(poly1305_update_arch);
 
-void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *dst)
+void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
 {
-	__le32 digest[4];
-	u64 f = 0;
-
-	if (unlikely(desc->buflen)) {
-		desc->buf[desc->buflen++] = 1;
-		memset(desc->buf + desc->buflen, 0,
-		       POLY1305_BLOCK_SIZE - desc->buflen);
-		poly1305_integer_blocks(&desc->h, desc->opaque_r, desc->buf, 1, 0);
+	if (unlikely(dctx->buflen)) {
+		dctx->buf[dctx->buflen++] = 1;
+		memset(dctx->buf + dctx->buflen, 0,
+		       POLY1305_BLOCK_SIZE - dctx->buflen);
+		poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
 	}
 
-	poly1305_integer_emit(&desc->h, digest);
-
-	/* mac = (h + s) % (2^128) */
-	f = (f >> 32) + le32_to_cpu(digest[0]) + desc->s[0];
-	put_unaligned_le32(f, dst + 0);
-	f = (f >> 32) + le32_to_cpu(digest[1]) + desc->s[1];
-	put_unaligned_le32(f, dst + 4);
-	f = (f >> 32) + le32_to_cpu(digest[2]) + desc->s[2];
-	put_unaligned_le32(f, dst + 8);
-	f = (f >> 32) + le32_to_cpu(digest[3]) + desc->s[3];
-	put_unaligned_le32(f, dst + 12);
-
-	*desc = (struct poly1305_desc_ctx){};
+	poly1305_simd_emit(&dctx->h, dst, dctx->s);
+	*dctx = (struct poly1305_desc_ctx){};
 }
 EXPORT_SYMBOL(poly1305_final_arch);
 
@@ -361,38 +224,34 @@ static int crypto_poly1305_init(struct s
 {
 	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
 
-	poly1305_core_init(&dctx->h);
-	dctx->buflen = 0;
-	dctx->rset = 0;
-	dctx->sset = false;
-
+	*dctx = (struct poly1305_desc_ctx){};
 	return 0;
 }
 
-static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
+static int crypto_poly1305_update(struct shash_desc *desc,
+				  const u8 *src, unsigned int srclen)
 {
 	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
 
-	if (unlikely(!dctx->sset))
-		return -ENOKEY;
-
-	poly1305_final_arch(dctx, dst);
+	poly1305_update_arch(dctx, src, srclen);
 	return 0;
 }
 
-static int poly1305_simd_update(struct shash_desc *desc,
-				const u8 *src, unsigned int srclen)
+static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
 {
 	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
 
-	poly1305_update_arch(dctx, src, srclen);
+	if (unlikely(!dctx->sset))
+		return -ENOKEY;
+
+	poly1305_final_arch(dctx, dst);
 	return 0;
 }
 
 static struct shash_alg alg = {
 	.digestsize	= POLY1305_DIGEST_SIZE,
 	.init		= crypto_poly1305_init,
-	.update		= poly1305_simd_update,
+	.update		= crypto_poly1305_update,
 	.final		= crypto_poly1305_final,
 	.descsize	= sizeof(struct poly1305_desc_ctx),
 	.base		= {
@@ -406,17 +265,19 @@ static struct shash_alg alg = {
 
 static int __init poly1305_simd_mod_init(void)
 {
-	if (!boot_cpu_has(X86_FEATURE_XMM2))
-		return 0;
-
-	static_branch_enable(&poly1305_use_simd);
-
-	if (IS_ENABLED(CONFIG_AS_AVX2) &&
-	    boot_cpu_has(X86_FEATURE_AVX) &&
+	if (IS_ENABLED(CONFIG_AS_AVX) && boot_cpu_has(X86_FEATURE_AVX) &&
+	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
+		static_branch_enable(&poly1305_use_avx);
+	if (IS_ENABLED(CONFIG_AS_AVX2) && boot_cpu_has(X86_FEATURE_AVX) &&
 	    boot_cpu_has(X86_FEATURE_AVX2) &&
 	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
 		static_branch_enable(&poly1305_use_avx2);
-
+	if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) &&
+	    boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) &&
+	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) &&
+	    /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */
+	    boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X)
+		static_branch_enable(&poly1305_use_avx512);
 	return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0;
 }
 
@@ -430,7 +291,7 @@ module_init(poly1305_simd_mod_init);
 module_exit(poly1305_simd_mod_exit);
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
+MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
 MODULE_DESCRIPTION("Poly1305 authenticator");
 MODULE_ALIAS_CRYPTO("poly1305");
 MODULE_ALIAS_CRYPTO("poly1305-simd");
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -90,7 +90,7 @@ config CRYPTO_LIB_DES
 config CRYPTO_LIB_POLY1305_RSIZE
 	int
 	default 2 if MIPS
-	default 4 if X86_64
+	default 11 if X86_64
 	default 9 if ARM || ARM64
 	default 1
 