/* poly1305_asm
 *
 * Copyright (C) 2006-2022 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */

#ifdef WOLFSSL_USER_SETTINGS
#ifdef WOLFSSL_USER_SETTINGS_ASM
/*
 * user_settings_asm.h is a file generated by the script user_settings_asm.sh.
 * The script takes in a user_settings.h and produces user_settings_asm.h, which
 * is a stripped down version of user_settings.h containing only preprocessor
 * directives. This makes the header safe to include in assembly (.S) files.
 */
#include "user_settings_asm.h"
#else
/*
 * Note: if user_settings.h contains any C code (e.g. a typedef or function
 * prototype), including it here in an assembly (.S) file will cause an
 * assembler failure. See user_settings_asm.h above.
 */
#include "user_settings.h"
#endif /* WOLFSSL_USER_SETTINGS_ASM */
#endif /* WOLFSSL_USER_SETTINGS */

#ifndef HAVE_INTEL_AVX1
#define HAVE_INTEL_AVX1
#endif /* HAVE_INTEL_AVX1 */
#ifndef NO_AVX2_SUPPORT
#define HAVE_INTEL_AVX2
#endif /* NO_AVX2_SUPPORT */

#ifdef WOLFSSL_X86_64_BUILD
#ifdef HAVE_INTEL_AVX1
#ifndef __APPLE__
.text
.globl poly1305_setkey_avx
.type poly1305_setkey_avx,@function
.align 16
poly1305_setkey_avx:
#else
.section __TEXT,__text
.globl _poly1305_setkey_avx
.p2align 4
_poly1305_setkey_avx:
#endif /* __APPLE__ */
        movabsq $0xffffffc0fffffff, %r10
        movabsq $0xffffffc0ffffffc, %r11
        movq (%rsi), %rdx
        movq 8(%rsi), %rax
        movq 16(%rsi), %rcx
        movq 24(%rsi), %r8
        andq %r10, %rdx
        andq %r11, %rax
        movq %rdx, %r10
        movq %rax, %r11
        xorq %r9, %r9
        movq %rdx, (%rdi)
        movq %rax, 8(%rdi)
        movq %r9, 24(%rdi)
        movq %r9, 32(%rdi)
        movq %r9, 40(%rdi)
        movq %rcx, 48(%rdi)
        movq %r8, 56(%rdi)
        movq %r9, 352(%rdi)
        movq %r9, 408(%rdi)
        movq %rdx, 360(%rdi)
        movq %rax, 416(%rdi)
        addq %rdx, %r10
        addq %rax, %r11
        movq %r10, 368(%rdi)
        movq %r11, 424(%rdi)
        addq %rdx, %r10
        addq %rax, %r11
        movq %r10, 376(%rdi)
        movq %r11, 432(%rdi)
        addq %rdx, %r10
        addq %rax, %r11
        movq %r10, 384(%rdi)
        movq %r11, 440(%rdi)
        addq %rdx, %r10
        addq %rax, %r11
        movq %r10, 392(%rdi)
        movq %r11, 448(%rdi)
        addq %rdx, %r10
        addq %rax, %r11
        movq %r10, 400(%rdi)
        movq %r11, 456(%rdi)
        movq %r9, 608(%rdi)
        movb $0x01, 616(%rdi)
        repz retq
#ifndef __APPLE__
.size poly1305_setkey_avx,.-poly1305_setkey_avx
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl poly1305_block_avx
.type poly1305_block_avx,@function
.align 16
poly1305_block_avx:
#else
.section __TEXT,__text
.globl _poly1305_block_avx
.p2align 4
_poly1305_block_avx:
#endif /* __APPLE__ */
        pushq %r15
        pushq %rbx
        pushq %r12
        pushq %r13
        pushq %r14
        movq (%rdi), %r15
        movq 8(%rdi), %rbx
        movq 24(%rdi), %r8
        movq 32(%rdi), %r9
        movq 40(%rdi), %r10
        xorq %r14, %r14
        movb 616(%rdi), %r14b
        # h += m
        movq (%rsi), %r11
        movq 8(%rsi), %r12
        addq %r11, %r8
        adcq %r12, %r9
        movq %rbx, %rax
        adcq %r14, %r10
        # r[1] * h[0] => rdx, rax ==> t2, t1
        mulq %r8
        movq %rax, %r12
        movq %rdx, %r13
        # r[0] * h[1] => rdx, rax ++> t2, t1
        movq %r15, %rax
        mulq %r9
        addq %rax, %r12
        movq %r15, %rax
        adcq %rdx, %r13
        # r[0] * h[0] => rdx, rax ==> t4, t0
        mulq %r8
        movq %rax, %r11
        movq %rdx, %r8
        # r[1] * h[1] => rdx, rax =+> t3, t2
        movq %rbx, %rax
        mulq %r9
        # r[0] * h[2] +> t2
        addq 352(%rdi,%r10,8), %r13
        movq %rdx, %r14
        addq %r8, %r12
        adcq %rax, %r13
        # r[1] * h[2] +> t3
        adcq 408(%rdi,%r10,8), %r14
        # r * h in r14, r13, r12, r11
        # h = (r * h) mod 2^130 - 5
        movq %r13, %r10
        andq $-4, %r13
        andq $3, %r10
        addq %r13, %r11
        movq %r13, %r8
        adcq %r14, %r12
        adcq $0x00, %r10
        shrdq $2, %r14, %r8
        shrq $2, %r14
        addq %r11, %r8
        adcq %r14, %r12
        movq %r12, %r9
        adcq $0x00, %r10
        # h in r10, r9, r8
        # Store h to ctx
        movq %r8, 24(%rdi)
        movq %r9, 32(%rdi)
        movq %r10, 40(%rdi)
        popq %r14
        popq %r13
        popq %r12
        popq %rbx
        popq %r15
        repz retq
#ifndef __APPLE__
.size poly1305_block_avx,.-poly1305_block_avx
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl poly1305_blocks_avx
.type poly1305_blocks_avx,@function
.align 16
poly1305_blocks_avx:
#else
.section __TEXT,__text
.globl _poly1305_blocks_avx
.p2align 4
_poly1305_blocks_avx:
#endif /* __APPLE__ */
        pushq %r15
        pushq %rbx
        pushq %r12
        pushq %r13
        pushq %r14
        movq %rdx, %rcx
        movq (%rdi), %r15
        movq 8(%rdi), %rbx
        movq 24(%rdi), %r8
        movq 32(%rdi), %r9
        movq 40(%rdi), %r10
L_poly1305_avx_blocks_start:
        # h += m
        movq (%rsi), %r11
        movq 8(%rsi), %r12
        addq %r11, %r8
        adcq %r12, %r9
        movq %rbx, %rax
        adcq $0x00, %r10
        # r[1] * h[0] => rdx, rax ==> t2, t1
        mulq %r8
        movq %rax, %r12
        movq %rdx, %r13
        # r[0] * h[1] => rdx, rax ++> t2, t1
        movq %r15, %rax
        mulq %r9
        addq %rax, %r12
        movq %r15, %rax
        adcq %rdx, %r13
        # r[0] * h[0] => rdx, rax ==> t4, t0
        mulq %r8
        movq %rax, %r11
        movq %rdx, %r8
        # r[1] * h[1] => rdx, rax =+> t3, t2
        movq %rbx, %rax
        mulq %r9
        # r[0] * h[2] +> t2
        addq 360(%rdi,%r10,8), %r13
        movq %rdx, %r14
        addq %r8, %r12
        adcq %rax, %r13
        # r[1] * h[2] +> t3
        adcq 416(%rdi,%r10,8), %r14
        # r * h in r14, r13, r12, r11
        # h = (r * h) mod 2^130 - 5
        movq %r13, %r10
        andq $-4, %r13
        andq $3, %r10
        addq %r13, %r11
        movq %r13, %r8
        adcq %r14, %r12
        adcq $0x00, %r10
        shrdq $2, %r14, %r8
        shrq $2, %r14
        addq %r11, %r8
        adcq %r14, %r12
        movq %r12, %r9
        adcq $0x00, %r10
        # h in r10, r9, r8
        # Next block from message
        addq $16, %rsi
        subq $16, %rcx
        jg L_poly1305_avx_blocks_start
        # Store h to ctx
        movq %r8, 24(%rdi)
        movq %r9, 32(%rdi)
        movq %r10, 40(%rdi)
        popq %r14
        popq %r13
        popq %r12
        popq %rbx
        popq %r15
        repz retq
#ifndef __APPLE__
.size poly1305_blocks_avx,.-poly1305_blocks_avx
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl poly1305_final_avx
.type poly1305_final_avx,@function
.align 16
poly1305_final_avx:
#else
.section __TEXT,__text
.globl _poly1305_final_avx
.p2align 4
_poly1305_final_avx:
#endif /* __APPLE__ */
        pushq %rbx
        pushq %r12
        movq %rsi, %rbx
        movq 608(%rdi), %rax
        testq %rax, %rax
        je L_poly1305_avx_final_no_more
        movb $0x01, 480(%rdi,%rax,1)
        jmp L_poly1305_avx_final_cmp_rem
L_poly1305_avx_final_zero_rem:
        movb $0x00, 480(%rdi,%rax,1)
L_poly1305_avx_final_cmp_rem:
        incb %al
        cmpq $16, %rax
        jl L_poly1305_avx_final_zero_rem
        movb $0x00, 616(%rdi)
        leaq 480(%rdi), %rsi
#ifndef __APPLE__
        callq poly1305_block_avx@plt
#else
        callq _poly1305_block_avx
#endif /* __APPLE__ */
L_poly1305_avx_final_no_more:
        movq 24(%rdi), %rax
        movq 32(%rdi), %rdx
        movq 40(%rdi), %rcx
        movq 48(%rdi), %r11
        movq 56(%rdi), %r12
        # h %= p
        # h = (h + pad)
        # mod 2^130 - 5
        movq %rcx, %r8
        andq $3, %rcx
        shrq $2, %r8
        # Multiply by 5
        leaq 0(%r8,%r8,4), %r8
        addq %r8, %rax
        adcq $0x00, %rdx
        adcq $0x00, %rcx
        # Fixup when between (1 << 130) - 5 and (1 << 130) - 1
        movq %rax, %r8
        movq %rdx, %r9
        movq %rcx, %r10
        addq $5, %r8
        adcq $0x00, %r9
        adcq $0x00, %r10
        cmpq $4, %r10
        cmoveq %r8, %rax
        cmoveq %r9, %rdx
        # h += pad
        addq %r11, %rax
        adcq %r12, %rdx
        movq %rax, (%rbx)
        movq %rdx, 8(%rbx)
        # Zero out r
        movq $0x00, (%rdi)
        movq $0x00, 8(%rdi)
        # Zero out h
        movq $0x00, 24(%rdi)
        movq $0x00, 32(%rdi)
        movq $0x00, 40(%rdi)
        # Zero out pad
        movq $0x00, 48(%rdi)
        movq $0x00, 56(%rdi)
        popq %r12
        popq %rbx
        repz retq
#ifndef __APPLE__
.size poly1305_final_avx,.-poly1305_final_avx
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX1 */
#ifdef HAVE_INTEL_AVX2
#ifndef __APPLE__
.text
.globl poly1305_calc_powers_avx2
.type poly1305_calc_powers_avx2,@function
.align 16
poly1305_calc_powers_avx2:
#else
.section __TEXT,__text
.globl _poly1305_calc_powers_avx2
.p2align 4
_poly1305_calc_powers_avx2:
#endif /* __APPLE__ */
        pushq %r12
        pushq %r13
        pushq %r14
        pushq %r15
        pushq %rbx
        pushq %rbp
        movq (%rdi), %rcx
        movq 8(%rdi), %r8
        xorq %r9, %r9
        # Convert to 26 bits in 32
        movq %rcx, %rax
        movq %rcx, %rdx
        movq %rcx, %rsi
        movq %r8, %rbx
        movq %r8, %rbp
        shrq $26, %rdx
        shrdq $52, %r8, %rsi
        shrq $14, %rbx
        shrdq $40, %r9, %rbp
        andq $0x3ffffff, %rax
        andq $0x3ffffff, %rdx
        andq $0x3ffffff, %rsi
        andq $0x3ffffff, %rbx
        andq $0x3ffffff, %rbp
        movl %eax, 224(%rdi)
        movl %edx, 228(%rdi)
        movl %esi, 232(%rdi)
        movl %ebx, 236(%rdi)
        movl %ebp, 240(%rdi)
        movl $0x00, 244(%rdi)
        # Square 128-bit
        movq %r8, %rax
        mulq %rcx
        xorq %r13, %r13
        movq %rax, %r11
        movq %rdx, %r12
        addq %rax, %r11
        adcq %rdx, %r12
        adcq $0x00, %r13
        movq %rcx, %rax
        mulq %rax
        movq %rax, %r10
        movq %rdx, %r15
        movq %r8, %rax
        mulq %rax
        addq %r15, %r11
        adcq %rax, %r12
        adcq %rdx, %r13
        # Reduce 256-bit to 130-bit
        movq %r12, %rax
        movq %r13, %rdx
        andq $-4, %rax
        andq $3, %r12
        addq %rax, %r10
        adcq %rdx, %r11
        adcq $0x00, %r12
        shrdq $2, %rdx, %rax
        shrq $2, %rdx
        addq %rax, %r10
        adcq %rdx, %r11
        adcq $0x00, %r12
        movq %r12, %rax
        shrq $2, %rax
        leaq 0(%rax,%rax,4), %rax
        andq $3, %r12
        addq %rax, %r10
        adcq $0x00, %r11
        adcq $0x00, %r12
        # Convert to 26 bits in 32
        movq %r10, %rax
        movq %r10, %rdx
        movq %r10, %rsi
        movq %r11, %rbx
        movq %r11, %rbp
        shrq $26, %rdx
        shrdq $52, %r11, %rsi
        shrq $14, %rbx
        shrdq $40, %r12, %rbp
        andq $0x3ffffff, %rax
        andq $0x3ffffff, %rdx
        andq $0x3ffffff, %rsi
        andq $0x3ffffff, %rbx
        andq $0x3ffffff, %rbp
        movl %eax, 256(%rdi)
        movl %edx, 260(%rdi)
        movl %esi, 264(%rdi)
        movl %ebx, 268(%rdi)
        movl %ebp, 272(%rdi)
        movl $0x00, 276(%rdi)
        # Multiply 128-bit by 130-bit
        # r1[0] * r2[0]
        movq %rcx, %rax
        mulq %r10
        movq %rax, %r13
        movq %rdx, %r14
        # r1[0] * r2[1]
        movq %rcx, %rax
        mulq %r11
        movq $0x00, %r15
        addq %rax, %r14
        adcq %rdx, %r15
        # r1[1] * r2[0]
        movq %r8, %rax
        mulq %r10
        movq $0x00, %rsi
        addq %rax, %r14
        adcq %rdx, %r15
        adcq $0x00, %rsi
        # r1[0] * r2[2]
        movq %rcx, %rax
        mulq %r12
        addq %rax, %r15
        adcq %rdx, %rsi
        # r1[1] * r2[1]
        movq %r8, %rax
        mulq %r11
        movq $0x00, %rbx
        addq %rax, %r15
        adcq %rdx, %rsi
        adcq $0x00, %rbx
        # r1[1] * r2[2]
        movq %r8, %rax
        mulq %r12
        addq %rax, %rsi
        adcq %rdx, %rbx
        # Reduce 260-bit to 130-bit
        movq %r15, %rax
        movq %rsi, %rdx
        movq %rbx, %rbx
        andq $-4, %rax
        andq $3, %r15
        addq %rax, %r13
        adcq %rdx, %r14
        adcq %rbx, %r15
        shrdq $2, %rdx, %rax
        shrdq $2, %rbx, %rdx
        shrq $2, %rbx
        addq %rax, %r13
        adcq %rdx, %r14
        adcq %rbx, %r15
        movq %r15, %rax
        andq $3, %r15
        shrq $2, %rax
        leaq 0(%rax,%rax,4), %rax
        addq %rax, %r13
        adcq $0x00, %r14
        adcq $0x00, %r15
        # Convert to 26 bits in 32
        movq %r13, %rax
        movq %r13, %rdx
        movq %r13, %rsi
        movq %r14, %rbx
        movq %r14, %rbp
        shrq $26, %rdx
        shrdq $52, %r14, %rsi
        shrq $14, %rbx
        shrdq $40, %r15, %rbp
        andq $0x3ffffff, %rax
        andq $0x3ffffff, %rdx
        andq $0x3ffffff, %rsi
        andq $0x3ffffff, %rbx
        andq $0x3ffffff, %rbp
        movl %eax, 288(%rdi)
        movl %edx, 292(%rdi)
        movl %esi, 296(%rdi)
        movl %ebx, 300(%rdi)
        movl %ebp, 304(%rdi)
        movl $0x00, 308(%rdi)
        # Square 130-bit
        movq %r11, %rax
        mulq %r10
        xorq %r13, %r13
        movq %rax, %r8
        movq %rdx, %r9
        addq %rax, %r8
        adcq %rdx, %r9
        adcq $0x00, %r13
        movq %r10, %rax
        mulq %rax
        movq %rax, %rcx
        movq %rdx, %r15
        movq %r11, %rax
        mulq %rax
        addq %r15, %r8
        adcq %rax, %r9
        adcq %rdx, %r13
        movq %r12, %rax
        mulq %rax
        movq %rax, %r14
        movq %r12, %rax
        mulq %r10
        addq %rax, %r9
        adcq %rdx, %r13
        adcq $0x00, %r14
        addq %rax, %r9
        adcq %rdx, %r13
        adcq $0x00, %r14
        movq %r12, %rax
        mulq %r11
        addq %rax, %r13
        adcq %rdx, %r14
        addq %rax, %r13
        adcq %rdx, %r14
        # Reduce 260-bit to 130-bit
        movq %r9, %rax
        movq %r13, %rdx
        movq %r14, %r15
        andq $-4, %rax
        andq $3, %r9
        addq %rax, %rcx
        adcq %rdx, %r8
        adcq %r15, %r9
        shrdq $2, %rdx, %rax
        shrdq $2, %r15, %rdx
        shrq $2, %r15
        addq %rax, %rcx
        adcq %rdx, %r8
        adcq %r15, %r9
        movq %r9, %rax
        andq $3, %r9
        shrq $2, %rax
        leaq 0(%rax,%rax,4), %rax
        addq %rax, %rcx
        adcq $0x00, %r8
        adcq $0x00, %r9
        # Convert to 26 bits in 32
        movq %rcx, %rax
        movq %rcx, %rdx
        movq %rcx, %rsi
        movq %r8, %rbx
        movq %r8, %rbp
        shrq $26, %rdx
        shrdq $52, %r8, %rsi
        shrq $14, %rbx
        shrdq $40, %r9, %rbp
        andq $0x3ffffff, %rax
        andq $0x3ffffff, %rdx
        andq $0x3ffffff, %rsi
        andq $0x3ffffff, %rbx
        andq $0x3ffffff, %rbp
        movl %eax, 320(%rdi)
        movl %edx, 324(%rdi)
        movl %esi, 328(%rdi)
        movl %ebx, 332(%rdi)
        movl %ebp, 336(%rdi)
        movl $0x00, 340(%rdi)
        popq %rbp
        popq %rbx
        popq %r15
        popq %r14
        popq %r13
        popq %r12
        repz retq
#ifndef __APPLE__
.size poly1305_calc_powers_avx2,.-poly1305_calc_powers_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl poly1305_setkey_avx2
.type poly1305_setkey_avx2,@function
.align 16
poly1305_setkey_avx2:
#else
.section __TEXT,__text
.globl _poly1305_setkey_avx2
.p2align 4
_poly1305_setkey_avx2:
#endif /* __APPLE__ */
#ifndef __APPLE__
        callq poly1305_setkey_avx@plt
#else
        callq _poly1305_setkey_avx
#endif /* __APPLE__ */
        vpxor %ymm0, %ymm0, %ymm0
        vmovdqu %ymm0, 64(%rdi)
        vmovdqu %ymm0, 96(%rdi)
        vmovdqu %ymm0, 128(%rdi)
        vmovdqu %ymm0, 160(%rdi)
        vmovdqu %ymm0, 192(%rdi)
        movq $0x00, 608(%rdi)
        movw $0x00, 616(%rdi)
        repz retq
#ifndef __APPLE__
.size poly1305_setkey_avx2,.-poly1305_setkey_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 32
#else
.p2align 5
#endif /* __APPLE__ */
L_poly1305_avx2_blocks_mask:
.quad 0x3ffffff, 0x3ffffff
.quad 0x3ffffff, 0x3ffffff
#ifndef __APPLE__
.data
#else
.section __DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align 32
#else
.p2align 5
#endif /* __APPLE__ */
L_poly1305_avx2_blocks_hibit:
.quad 0x1000000, 0x1000000
.quad 0x1000000, 0x1000000
#ifndef __APPLE__
.text
.globl poly1305_blocks_avx2
.type poly1305_blocks_avx2,@function
.align 16
poly1305_blocks_avx2:
#else
.section __TEXT,__text
.globl _poly1305_blocks_avx2
.p2align 4
_poly1305_blocks_avx2:
#endif /* __APPLE__ */
        pushq %r12
        pushq %rbx
        subq $0x140, %rsp
        movq %rsp, %rcx
        andq $-32, %rcx
        addq $32, %rcx
        vpxor %ymm15, %ymm15, %ymm15
        movq %rcx, %rbx
        leaq 64(%rdi), %rax
        addq $0xa0, %rbx
        cmpw $0x00, 616(%rdi)
        jne L_poly1305_avx2_blocks_begin_h
        # Load the message data
        vmovdqu (%rsi), %ymm0
        vmovdqu 32(%rsi), %ymm1
        vperm2i128 $32, %ymm1, %ymm0, %ymm2
        vperm2i128 $49, %ymm1, %ymm0, %ymm0
        vpunpckldq %ymm0, %ymm2, %ymm1
        vpunpckhdq %ymm0, %ymm2, %ymm3
        vpunpckldq %ymm15, %ymm1, %ymm0
        vpunpckhdq %ymm15, %ymm1, %ymm1
        vpunpckldq %ymm15, %ymm3, %ymm2
        vpunpckhdq %ymm15, %ymm3, %ymm3
        vmovdqu L_poly1305_avx2_blocks_hibit(%rip), %ymm4
        vpsllq $6, %ymm1, %ymm1
        vpsllq $12, %ymm2, %ymm2
        vpsllq $18, %ymm3, %ymm3
        vmovdqu L_poly1305_avx2_blocks_mask(%rip), %ymm14
        # Reduce, in place, the message data
        vpsrlq $26, %ymm0, %ymm10
        vpsrlq $26, %ymm3, %ymm11
        vpand %ymm14, %ymm0, %ymm0
        vpand %ymm14, %ymm3, %ymm3
        vpaddq %ymm1, %ymm10, %ymm1
        vpaddq %ymm4, %ymm11, %ymm4
        vpsrlq $26, %ymm1, %ymm10
        vpsrlq $26, %ymm4, %ymm11
        vpand %ymm14, %ymm1, %ymm1
        vpand %ymm14, %ymm4, %ymm4
        vpaddq %ymm2, %ymm10, %ymm2
        vpslld $2, %ymm11, %ymm12
        vpaddd %ymm12, %ymm11, %ymm12
        vpsrlq $26, %ymm2, %ymm10
        vpaddq %ymm0, %ymm12, %ymm0
        vpsrlq $26, %ymm0, %ymm11
        vpand %ymm14, %ymm2, %ymm2
        vpand %ymm14, %ymm0, %ymm0
        vpaddq %ymm3, %ymm10, %ymm3
        vpaddq %ymm1, %ymm11, %ymm1
        vpsrlq $26, %ymm3, %ymm10
        vpand %ymm14, %ymm3, %ymm3
        vpaddq %ymm4, %ymm10, %ymm4
        addq $0x40, %rsi
        subq $0x40, %rdx
        jz L_poly1305_avx2_blocks_store
        jmp L_poly1305_avx2_blocks_load_r4
L_poly1305_avx2_blocks_begin_h:
        # Load the H values.
        vmovdqu (%rax), %ymm0
        vmovdqu 32(%rax), %ymm1
        vmovdqu 64(%rax), %ymm2
        vmovdqu 96(%rax), %ymm3
        vmovdqu 128(%rax), %ymm4
        # Check if there is a power of r to load - otherwise use r^4.
        cmpb $0x00, 616(%rdi)
        je L_poly1305_avx2_blocks_load_r4
        # Load the 4 powers of r - r^4, r^3, r^2, r^1.
        vmovdqu 224(%rdi), %ymm8
        vmovdqu 256(%rdi), %ymm7
        vmovdqu 288(%rdi), %ymm6
        vmovdqu 320(%rdi), %ymm5
        vpermq $0xd8, %ymm5, %ymm5
        vpermq $0xd8, %ymm6, %ymm6
        vpermq $0xd8, %ymm7, %ymm7
        vpermq $0xd8, %ymm8, %ymm8
        vpunpcklqdq %ymm6, %ymm5, %ymm10
        vpunpckhqdq %ymm6, %ymm5, %ymm11
        vpunpcklqdq %ymm8, %ymm7, %ymm12
        vpunpckhqdq %ymm8, %ymm7, %ymm13
        vperm2i128 $32, %ymm12, %ymm10, %ymm5
        vperm2i128 $49, %ymm12, %ymm10, %ymm7
        vperm2i128 $32, %ymm13, %ymm11, %ymm9
        vpsrlq $32, %ymm5, %ymm6
        vpsrlq $32, %ymm7, %ymm8
        jmp L_poly1305_avx2_blocks_mul_5
L_poly1305_avx2_blocks_load_r4:
        # Load r^4 into all four positions.
        vmovdqu 320(%rdi), %ymm13
        vpermq $0x00, %ymm13, %ymm5
        vpsrlq $32, %ymm13, %ymm14
        vpermq $0x55, %ymm13, %ymm7
        vpermq $0xaa, %ymm13, %ymm9
        vpermq $0x00, %ymm14, %ymm6
        vpermq $0x55, %ymm14, %ymm8
L_poly1305_avx2_blocks_mul_5:
        # Multiply top 4 26-bit values of all four H by 5
        vpslld $2, %ymm6, %ymm10
        vpslld $2, %ymm7, %ymm11
        vpslld $2, %ymm8, %ymm12
        vpslld $2, %ymm9, %ymm13
        vpaddq %ymm10, %ymm6, %ymm10
        vpaddq %ymm11, %ymm7, %ymm11
        vpaddq %ymm12, %ymm8, %ymm12
        vpaddq %ymm13, %ymm9, %ymm13
        # Store powers of r and multiple of 5 for use in multiply.
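        # (Layout note: %rcx points at the 32-byte aligned scratch area holding the
        #  five 26-bit limb vectors of the r powers, and %rbx = %rcx + 0xa0 holds
        #  limbs 1..4 pre-multiplied by 5 for the modular reduction.)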
        vmovdqa %ymm10, (%rbx)
        vmovdqa %ymm11, 32(%rbx)
        vmovdqa %ymm12, 64(%rbx)
        vmovdqa %ymm13, 96(%rbx)
        vmovdqa %ymm5, (%rcx)
        vmovdqa %ymm6, 32(%rcx)
        vmovdqa %ymm7, 64(%rcx)
        vmovdqa %ymm8, 96(%rcx)
        vmovdqa %ymm9, 128(%rcx)
        vmovdqu L_poly1305_avx2_blocks_mask(%rip), %ymm14
        # If not finished then loop over data
        cmpb $0x01, 616(%rdi)
        jne L_poly1305_avx2_blocks_start
        # Do last multiply, reduce, add the four H together and move to
        # 32-bit registers
        vpmuludq (%rbx), %ymm4, %ymm5
        vpmuludq 32(%rbx), %ymm3, %ymm10
        vpmuludq 32(%rbx), %ymm4, %ymm6
        vpmuludq 64(%rbx), %ymm3, %ymm11
        vpmuludq 64(%rbx), %ymm4, %ymm7
        vpaddq %ymm5, %ymm10, %ymm5
        vpmuludq 64(%rbx), %ymm2, %ymm12
        vpmuludq 96(%rbx), %ymm4, %ymm8
        vpaddq %ymm6, %ymm11, %ymm6
        vpmuludq 96(%rbx), %ymm1, %ymm13
        vpmuludq 96(%rbx), %ymm2, %ymm10
        vpaddq %ymm5, %ymm12, %ymm5
        vpmuludq 96(%rbx), %ymm3, %ymm11
        vpmuludq (%rcx), %ymm3, %ymm12
        vpaddq %ymm5, %ymm13, %ymm5
        vpmuludq (%rcx), %ymm4, %ymm9
        vpaddq %ymm6, %ymm10, %ymm6
        vpmuludq (%rcx), %ymm0, %ymm13
        vpaddq %ymm7, %ymm11, %ymm7
        vpmuludq (%rcx), %ymm1, %ymm10
        vpaddq %ymm8, %ymm12, %ymm8
        vpmuludq (%rcx), %ymm2, %ymm11
        vpmuludq 32(%rcx), %ymm2, %ymm12
        vpaddq %ymm5, %ymm13, %ymm5
        vpmuludq 32(%rcx), %ymm3, %ymm13
        vpaddq %ymm6, %ymm10, %ymm6
        vpmuludq 32(%rcx), %ymm0, %ymm10
        vpaddq %ymm7, %ymm11, %ymm7
        vpmuludq 32(%rcx), %ymm1, %ymm11
        vpaddq %ymm8, %ymm12, %ymm8
        vpmuludq 64(%rcx), %ymm1, %ymm12
        vpaddq %ymm9, %ymm13, %ymm9
        vpmuludq 64(%rcx), %ymm2, %ymm13
        vpaddq %ymm6, %ymm10, %ymm6
        vpmuludq 64(%rcx), %ymm0, %ymm10
        vpaddq %ymm7, %ymm11, %ymm7
        vpmuludq 96(%rcx), %ymm0, %ymm11
        vpaddq %ymm8, %ymm12, %ymm8
        vpmuludq 96(%rcx), %ymm1, %ymm12
        vpaddq %ymm9, %ymm13, %ymm9
        vpaddq %ymm7, %ymm10, %ymm7
        vpmuludq 128(%rcx), %ymm0, %ymm13
        vpaddq %ymm8, %ymm11, %ymm8
        vpaddq %ymm9, %ymm12, %ymm9
        vpaddq %ymm9, %ymm13, %ymm9
        vpsrlq $26, %ymm5, %ymm10
        vpsrlq $26, %ymm8, %ymm11
        vpand %ymm14, %ymm5, %ymm5
        vpand %ymm14, %ymm8, %ymm8
        vpaddq %ymm6, %ymm10, %ymm6
        vpaddq %ymm9, %ymm11, %ymm9
        vpsrlq $26, %ymm6, %ymm10
        vpsrlq $26, %ymm9, %ymm11
        vpand %ymm14, %ymm6, %ymm1
        vpand %ymm14, %ymm9, %ymm4
        vpaddq %ymm7, %ymm10, %ymm7
        vpslld $2, %ymm11, %ymm12
        vpaddd %ymm12, %ymm11, %ymm12
        vpsrlq $26, %ymm7, %ymm10
        vpaddq %ymm5, %ymm12, %ymm5
        vpsrlq $26, %ymm5, %ymm11
        vpand %ymm14, %ymm7, %ymm2
        vpand %ymm14, %ymm5, %ymm0
        vpaddq %ymm8, %ymm10, %ymm8
        vpaddq %ymm1, %ymm11, %ymm1
        vpsrlq $26, %ymm8, %ymm10
        vpand %ymm14, %ymm8, %ymm3
        vpaddq %ymm4, %ymm10, %ymm4
        vpsrldq $8, %ymm0, %ymm5
        vpsrldq $8, %ymm1, %ymm6
        vpsrldq $8, %ymm2, %ymm7
        vpsrldq $8, %ymm3, %ymm8
        vpsrldq $8, %ymm4, %ymm9
        vpaddq %ymm0, %ymm5, %ymm0
        vpaddq %ymm1, %ymm6, %ymm1
        vpaddq %ymm2, %ymm7, %ymm2
        vpaddq %ymm3, %ymm8, %ymm3
        vpaddq %ymm4, %ymm9, %ymm4
        vpermq $2, %ymm0, %ymm5
        vpermq $2, %ymm1, %ymm6
        vpermq $2, %ymm2, %ymm7
        vpermq $2, %ymm3, %ymm8
        vpermq $2, %ymm4, %ymm9
        vpaddq %ymm0, %ymm5, %ymm0
        vpaddq %ymm1, %ymm6, %ymm1
        vpaddq %ymm2, %ymm7, %ymm2
        vpaddq %ymm3, %ymm8, %ymm3
        vpaddq %ymm4, %ymm9, %ymm4
        vmovd %xmm0, %r8d
        vmovd %xmm1, %r9d
        vmovd %xmm2, %r10d
        vmovd %xmm3, %r11d
        vmovd %xmm4, %r12d
        jmp L_poly1305_avx2_blocks_end_calc
L_poly1305_avx2_blocks_start:
        vmovdqu (%rsi), %ymm5
        vmovdqu 32(%rsi), %ymm6
        vperm2i128 $32, %ymm6, %ymm5, %ymm7
        vperm2i128 $49, %ymm6, %ymm5, %ymm5
        vpunpckldq %ymm5, %ymm7, %ymm6
        vpunpckhdq %ymm5, %ymm7, %ymm8
        vpunpckldq %ymm15, %ymm6, %ymm5
        vpunpckhdq %ymm15, %ymm6, %ymm6
        vpunpckldq %ymm15, %ymm8, %ymm7
        vpunpckhdq %ymm15, %ymm8, %ymm8
        vmovdqu L_poly1305_avx2_blocks_hibit(%rip), %ymm9
        vpsllq $6, %ymm6, %ymm6
        vpsllq $12, %ymm7, %ymm7
        vpsllq $18, %ymm8, %ymm8
        vpmuludq (%rbx), %ymm4, %ymm10
        vpaddq %ymm5, %ymm10, %ymm5
        vpmuludq 32(%rbx), %ymm3, %ymm10
        vpmuludq 32(%rbx), %ymm4, %ymm11
        vpaddq %ymm6, %ymm11, %ymm6
        vpmuludq 64(%rbx), %ymm3, %ymm11
        vpmuludq 64(%rbx), %ymm4, %ymm12
        vpaddq %ymm7, %ymm12, %ymm7
        vpaddq %ymm5, %ymm10, %ymm5
        vpmuludq 64(%rbx), %ymm2, %ymm12
        vpmuludq 96(%rbx), %ymm4, %ymm13
        vpaddq %ymm8, %ymm13, %ymm8
        vpaddq %ymm6, %ymm11, %ymm6
        vpmuludq 96(%rbx), %ymm1, %ymm13
        vpmuludq 96(%rbx), %ymm2, %ymm10
        vpaddq %ymm5, %ymm12, %ymm5
        vpmuludq 96(%rbx), %ymm3, %ymm11
        vpmuludq (%rcx), %ymm3, %ymm12
        vpaddq %ymm5, %ymm13, %ymm5
        vpmuludq (%rcx), %ymm4, %ymm13
        vpaddq %ymm9, %ymm13, %ymm9
        vpaddq %ymm6, %ymm10, %ymm6
        vpmuludq (%rcx), %ymm0, %ymm13
        vpaddq %ymm7, %ymm11, %ymm7
        vpmuludq (%rcx), %ymm1, %ymm10
        vpaddq %ymm8, %ymm12, %ymm8
        vpmuludq (%rcx), %ymm2, %ymm11
        vpmuludq 32(%rcx), %ymm2, %ymm12
        vpaddq %ymm5, %ymm13, %ymm5
        vpmuludq 32(%rcx), %ymm3, %ymm13
        vpaddq %ymm6, %ymm10, %ymm6
        vpmuludq 32(%rcx), %ymm0, %ymm10
        vpaddq %ymm7, %ymm11, %ymm7
        vpmuludq 32(%rcx), %ymm1, %ymm11
        vpaddq %ymm8, %ymm12, %ymm8
        vpmuludq 64(%rcx), %ymm1, %ymm12
        vpaddq %ymm9, %ymm13, %ymm9
        vpmuludq 64(%rcx), %ymm2, %ymm13
        vpaddq %ymm6, %ymm10, %ymm6
        vpmuludq 64(%rcx), %ymm0, %ymm10
        vpaddq %ymm7, %ymm11, %ymm7
        vpmuludq 96(%rcx), %ymm0, %ymm11
        vpaddq %ymm8, %ymm12, %ymm8
        vpmuludq 96(%rcx), %ymm1, %ymm12
        vpaddq %ymm9, %ymm13, %ymm9
        vpaddq %ymm7, %ymm10, %ymm7
        vpmuludq 128(%rcx), %ymm0, %ymm13
        vpaddq %ymm8, %ymm11, %ymm8
        vpaddq %ymm9, %ymm12, %ymm9
        vpaddq %ymm9, %ymm13, %ymm9
        vpsrlq $26, %ymm5, %ymm10
        vpsrlq $26, %ymm8, %ymm11
        vpand %ymm14, %ymm5, %ymm5
        vpand %ymm14, %ymm8, %ymm8
        vpaddq %ymm6, %ymm10, %ymm6
        vpaddq %ymm9, %ymm11, %ymm9
        vpsrlq $26, %ymm6, %ymm10
        vpsrlq $26, %ymm9, %ymm11
        vpand %ymm14, %ymm6, %ymm1
        vpand %ymm14, %ymm9, %ymm4
        vpaddq %ymm7, %ymm10, %ymm7
        vpslld $2, %ymm11, %ymm12
        vpaddd %ymm12, %ymm11, %ymm12
        vpsrlq $26, %ymm7, %ymm10
        vpaddq %ymm5, %ymm12, %ymm5
        vpsrlq $26, %ymm5, %ymm11
        vpand %ymm14, %ymm7, %ymm2
        vpand %ymm14, %ymm5, %ymm0
        vpaddq %ymm8, %ymm10, %ymm8
        vpaddq %ymm1, %ymm11, %ymm1
        vpsrlq $26, %ymm8, %ymm10
        vpand %ymm14, %ymm8, %ymm3
        vpaddq %ymm4, %ymm10, %ymm4
        addq $0x40, %rsi
        subq $0x40, %rdx
        jnz L_poly1305_avx2_blocks_start
L_poly1305_avx2_blocks_store:
        # Store four H values - state
        vmovdqu %ymm0, (%rax)
        vmovdqu %ymm1, 32(%rax)
        vmovdqu %ymm2, 64(%rax)
        vmovdqu %ymm3, 96(%rax)
        vmovdqu %ymm4, 128(%rax)
L_poly1305_avx2_blocks_end_calc:
        cmpb $0x00, 616(%rdi)
        je L_poly1305_avx2_blocks_complete
        movq %r8, %rax
        movq %r10, %rdx
        movq %r12, %rcx
        shrq $12, %rdx
        shrq $24, %rcx
        shlq $26, %r9
        shlq $52, %r10
        shlq $14, %r11
        shlq $40, %r12
        addq %r9, %rax
        adcq %r10, %rax
        adcq %r11, %rdx
        adcq %r12, %rdx
        adcq $0x00, %rcx
        movq %rcx, %r8
        andq $3, %rcx
        shrq $2, %r8
        leaq 0(%r8,%r8,4), %r8
        addq %r8, %rax
        adcq $0x00, %rdx
        adcq $0x00, %rcx
        movq %rax, 24(%rdi)
        movq %rdx, 32(%rdi)
        movq %rcx, 40(%rdi)
L_poly1305_avx2_blocks_complete:
        movb $0x01, 617(%rdi)
        addq $0x140, %rsp
        popq %rbx
        popq %r12
        repz retq
#ifndef __APPLE__
.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl poly1305_final_avx2
.type poly1305_final_avx2,@function
.align 16
poly1305_final_avx2:
#else
.section __TEXT,__text
.globl _poly1305_final_avx2
.p2align 4
_poly1305_final_avx2:
#endif /* __APPLE__ */
        movb $0x01, 616(%rdi)
        movb 617(%rdi), %cl
        cmpb $0x00, %cl
        je L_poly1305_avx2_final_done_blocks_X4
        pushq %rsi
        movq $0x40, %rdx
        xorq %rsi, %rsi
#ifndef __APPLE__
        callq poly1305_blocks_avx2@plt
#else
        callq _poly1305_blocks_avx2
#endif /* __APPLE__ */
        popq %rsi
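        # Any bytes still buffered in the context (count at 608(%rdi)) are handled
        # next: full 16-byte blocks go through poly1305_blocks_avx and any remainder
        # is moved to the front of the buffer for poly1305_final_avx.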
L_poly1305_avx2_final_done_blocks_X4:
        movq 608(%rdi), %rax
        movq %rax, %rcx
        andq $-16, %rcx
        cmpb $0x00, %cl
        je L_poly1305_avx2_final_done_blocks
        pushq %rcx
        pushq %rax
        pushq %rsi
        movq %rcx, %rdx
        leaq 480(%rdi), %rsi
#ifndef __APPLE__
        callq poly1305_blocks_avx@plt
#else
        callq _poly1305_blocks_avx
#endif /* __APPLE__ */
        popq %rsi
        popq %rax
        popq %rcx
L_poly1305_avx2_final_done_blocks:
        subq %rcx, 608(%rdi)
        xorq %rdx, %rdx
        jmp L_poly1305_avx2_final_cmp_copy
L_poly1305_avx2_final_start_copy:
        movb 480(%rdi,%rcx,1), %r8b
        movb %r8b, 480(%rdi,%rdx,1)
        incb %cl
        incb %dl
L_poly1305_avx2_final_cmp_copy:
        cmp %rcx, %rax
        jne L_poly1305_avx2_final_start_copy
#ifndef __APPLE__
        callq poly1305_final_avx@plt
#else
        callq _poly1305_final_avx
#endif /* __APPLE__ */
        vpxor %ymm0, %ymm0, %ymm0
        vmovdqu %ymm0, 64(%rdi)
        vmovdqu %ymm0, 96(%rdi)
        vmovdqu %ymm0, 128(%rdi)
        vmovdqu %ymm0, 160(%rdi)
        vmovdqu %ymm0, 192(%rdi)
        vmovdqu %ymm0, 224(%rdi)
        vmovdqu %ymm0, 256(%rdi)
        vmovdqu %ymm0, 288(%rdi)
        vmovdqu %ymm0, 320(%rdi)
        movq $0x00, 608(%rdi)
        movw $0x00, 616(%rdi)
        repz retq
#ifndef __APPLE__
.size poly1305_final_avx2,.-poly1305_final_avx2
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX2 */
#endif /* WOLFSSL_X86_64_BUILD */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif