; /* chacha_asm.asm */
; /*
;  * Copyright (C) 2006-2024 wolfSSL Inc.
;  *
;  * This file is part of wolfSSL.
;  *
;  * wolfSSL is free software; you can redistribute it and/or modify
;  * it under the terms of the GNU General Public License as published by
;  * the Free Software Foundation; either version 2 of the License, or
;  * (at your option) any later version.
;  *
;  * wolfSSL is distributed in the hope that it will be useful,
;  * but WITHOUT ANY WARRANTY; without even the implied warranty of
;  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;  * GNU General Public License for more details.
;  *
;  * You should have received a copy of the GNU General Public License
;  * along with this program; if not, write to the Free Software
;  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
;  */
IF @Version LT 1200
; AVX2 instructions not recognized by old versions of MASM
IFNDEF NO_AVX2_SUPPORT
NO_AVX2_SUPPORT = 1
ENDIF
; MOVBE instruction not recognized by old versions of MASM
IFNDEF NO_MOVBE_SUPPORT
NO_MOVBE_SUPPORT = 1
ENDIF
ENDIF

IFNDEF HAVE_INTEL_AVX1
HAVE_INTEL_AVX1 = 1
ENDIF
IFNDEF NO_AVX2_SUPPORT
HAVE_INTEL_AVX2 = 1
ENDIF

IFNDEF _WIN64
_WIN64 = 1
ENDIF

;-----------------------------------------------------------------------
; Helper macros for chacha_encrypt_x64.
;
; Register roles shared by all of them (xN = N-th 32-bit state word):
;   rcx        state base pointer (never modified)
;   eax, ebx   x0, x1
;   edi, esi   x2, x3   (kept out of r8d/r9d so they cannot alias x4/x5)
;   r8d-r11d   x4-x7
;   r12d-r15d  x12-x15
;   edx, ebp   x8,x9 or x10,x11 - whichever pair is not spilled
;
; 48-byte scratch frame addressed from rsp:
;   [rsp]      byte  remaining loop iterations
;   [rsp+8]    4 dwords, spill slots for x8..x11
;   [rsp+24]   saved r8  (output pointer)
;   [rsp+32]   saved rdx (input pointer)
;   [rsp+40]   saved r9  (byte count)
;-----------------------------------------------------------------------

; Save the argument registers into the scratch frame, load the 16 state
; words from [rcx] and set the iteration counter to 10.
CHACHA_X64_LOAD_STATE MACRO
        mov     QWORD PTR [rsp+24], r8
        mov     QWORD PTR [rsp+32], rdx
        mov     QWORD PTR [rsp+40], r9
        mov     rax, QWORD PTR [rcx+32]
        mov     rbx, QWORD PTR [rcx+40]
        mov     QWORD PTR [rsp+8], rax          ; x8, x9  -> spill slots
        mov     QWORD PTR [rsp+16], rbx         ; x10, x11 -> spill slots
        mov     eax, DWORD PTR [rcx]
        mov     ebx, DWORD PTR [rcx+4]
        mov     edi, DWORD PTR [rcx+8]
        mov     esi, DWORD PTR [rcx+12]
        mov     r8d, DWORD PTR [rcx+16]
        mov     r9d, DWORD PTR [rcx+20]
        mov     r10d, DWORD PTR [rcx+24]
        mov     r11d, DWORD PTR [rcx+28]
        mov     r12d, DWORD PTR [rcx+48]
        mov     r13d, DWORD PTR [rcx+52]
        mov     r14d, DWORD PTR [rcx+56]
        mov     r15d, DWORD PTR [rcx+60]
        mov     BYTE PTR [rsp], 10
        mov     edx, DWORD PTR [rsp+8]          ; edx = x8
        mov     ebp, DWORD PTR [rsp+12]         ; ebp = x9
ENDM

; One loop iteration: four column quarter-rounds followed by four
; diagonal quarter-rounds, processed two at a time.
; On entry and on exit edx/ebp hold x8/x9 and x10/x11 are in their spill
; slots at [rsp+16]/[rsp+20].
CHACHA_X64_DOUBLE_ROUND MACRO
        ; columns (x0,x4,x8,x12) and (x1,x5,x9,x13)
        add     eax, r8d
        add     ebx, r9d
        xor     r12d, eax
        xor     r13d, ebx
        rol     r12d, 16
        rol     r13d, 16
        add     edx, r12d
        add     ebp, r13d
        xor     r8d, edx
        xor     r9d, ebp
        rol     r8d, 12
        rol     r9d, 12
        add     eax, r8d
        add     ebx, r9d
        xor     r12d, eax
        xor     r13d, ebx
        rol     r12d, 8
        rol     r13d, 8
        add     edx, r12d
        add     ebp, r13d
        xor     r8d, edx
        xor     r9d, ebp
        rol     r8d, 7
        rol     r9d, 7
        ; swap x8/x9 out, x10/x11 in
        mov     DWORD PTR [rsp+8], edx
        mov     DWORD PTR [rsp+12], ebp
        mov     edx, DWORD PTR [rsp+16]
        mov     ebp, DWORD PTR [rsp+20]
        ; columns (x2,x6,x10,x14) and (x3,x7,x11,x15)
        add     edi, r10d
        add     esi, r11d
        xor     r14d, edi
        xor     r15d, esi
        rol     r14d, 16
        rol     r15d, 16
        add     edx, r14d
        add     ebp, r15d
        xor     r10d, edx
        xor     r11d, ebp
        rol     r10d, 12
        rol     r11d, 12
        add     edi, r10d
        add     esi, r11d
        xor     r14d, edi
        xor     r15d, esi
        rol     r14d, 8
        rol     r15d, 8
        add     edx, r14d
        add     ebp, r15d
        xor     r10d, edx
        xor     r11d, ebp
        rol     r10d, 7
        rol     r11d, 7
        ; diagonals (x0,x5,x10,x15) and (x1,x6,x11,x12)
        add     eax, r9d
        add     ebx, r10d
        xor     r15d, eax
        xor     r12d, ebx
        rol     r15d, 16
        rol     r12d, 16
        add     edx, r15d
        add     ebp, r12d
        xor     r9d, edx
        xor     r10d, ebp
        rol     r9d, 12
        rol     r10d, 12
        add     eax, r9d
        add     ebx, r10d
        xor     r15d, eax
        xor     r12d, ebx
        rol     r15d, 8
        rol     r12d, 8
        add     edx, r15d
        add     ebp, r12d
        xor     r9d, edx
        xor     r10d, ebp
        rol     r9d, 7
        rol     r10d, 7
        ; swap x10/x11 out, x8/x9 in
        mov     DWORD PTR [rsp+16], edx
        mov     DWORD PTR [rsp+20], ebp
        mov     edx, DWORD PTR [rsp+8]
        mov     ebp, DWORD PTR [rsp+12]
        ; diagonals (x2,x7,x8,x13) and (x3,x4,x9,x14)
        add     edi, r11d
        add     esi, r8d
        xor     r13d, edi
        xor     r14d, esi
        rol     r13d, 16
        rol     r14d, 16
        add     edx, r13d
        add     ebp, r14d
        xor     r11d, edx
        xor     r8d, ebp
        rol     r11d, 12
        rol     r8d, 12
        add     edi, r11d
        add     esi, r8d
        xor     r13d, edi
        xor     r14d, esi
        rol     r13d, 8
        rol     r14d, 8
        add     edx, r13d
        add     ebp, r14d
        xor     r11d, edx
        xor     r8d, ebp
        rol     r11d, 7
        rol     r8d, 7
ENDM

; Add the input state words 0-7 and 12-15 to the working registers.
CHACHA_X64_ADD_STATE_REGS MACRO
        add     eax, DWORD PTR [rcx]
        add     ebx, DWORD PTR [rcx+4]
        add     edi, DWORD PTR [rcx+8]
        add     esi, DWORD PTR [rcx+12]
        add     r8d, DWORD PTR [rcx+16]
        add     r9d, DWORD PTR [rcx+20]
        add     r10d, DWORD PTR [rcx+24]
        add     r11d, DWORD PTR [rcx+28]
        add     r12d, DWORD PTR [rcx+48]
        add     r13d, DWORD PTR [rcx+52]
        add     r14d, DWORD PTR [rcx+56]
        add     r15d, DWORD PTR [rcx+60]
ENDM

; Reload x8..x11 from the spill slots into eax, ebx, edi, esi (x0..x3 must
; already have been written out) and add the input state words 8-11.
CHACHA_X64_ADD_STATE_SPILLED MACRO
        mov     eax, DWORD PTR [rsp+8]
        mov     ebx, DWORD PTR [rsp+12]
        mov     edi, DWORD PTR [rsp+16]
        mov     esi, DWORD PTR [rsp+20]
        add     eax, DWORD PTR [rcx+32]
        add     ebx, DWORD PTR [rcx+36]
        add     edi, DWORD PTR [rcx+40]
        add     esi, DWORD PTR [rcx+44]
ENDM

;-----------------------------------------------------------------------
; chacha_encrypt_x64 - scalar ChaCha keystream generation and XOR.
; NOTE(review): presumably declared in C as
;   void chacha_encrypt_x64(ChaCha* ctx, const byte* m, byte* c,
;                           word32 bytes) - confirm against the caller.
;
; ABI:   Microsoft x64
; In:    rcx = state; 16 dwords are read from [rcx]..[rcx+60]
;        rdx = input bytes
;        r8  = output bytes
;        r9d = number of bytes to process (treated as unsigned)
; Out:   For every 64 input bytes: output = input XOR generated block and
;        the dword at [rcx+48] is incremented by 1.
;        For a final chunk of 1..63 bytes: the generated 64-byte block is
;        stored at [rcx+80], the dword at [rcx+48] is incremented by 1,
;        the chunk is XORed into the output and (64 - chunk length) is
;        written to the dword at [rcx+76].
; Saved: rbx, rbp, rdi, rsi, r12-r15 (all callee-saved on this ABI).
; Clobb: rax, rdx, r8-r11, flags.
;-----------------------------------------------------------------------
_text SEGMENT READONLY PARA
chacha_encrypt_x64 PROC
        push    rbx
        push    rbp
        push    r12
        push    r13
        push    r14
        push    r15
        push    rdi                             ; used for x2 below
        push    rsi                             ; used for x3 below
        sub     rsp, 64                         ; kept from the original frame; not referenced below
        cmp     r9d, 64
        jb      L_chacha_x64_small              ; unsigned: length is a byte count
L_chacha_x64_start:
        ; ---- one full 64-byte block ----
        sub     rsp, 48
        CHACHA_X64_LOAD_STATE
L_chacha_x64_block_crypt_start:
        CHACHA_X64_DOUBLE_ROUND
        dec     BYTE PTR [rsp]
        jnz     L_chacha_x64_block_crypt_start
        mov     DWORD PTR [rsp+8], edx          ; spill x8, x9: rdx/rbp become pointers
        mov     DWORD PTR [rsp+12], ebp
        mov     rdx, QWORD PTR [rsp+32]         ; rdx = input
        mov     rbp, QWORD PTR [rsp+24]         ; rbp = output
        CHACHA_X64_ADD_STATE_REGS
        xor     eax, DWORD PTR [rdx]
        xor     ebx, DWORD PTR [rdx+4]
        xor     edi, DWORD PTR [rdx+8]
        xor     esi, DWORD PTR [rdx+12]
        xor     r8d, DWORD PTR [rdx+16]
        xor     r9d, DWORD PTR [rdx+20]
        xor     r10d, DWORD PTR [rdx+24]
        xor     r11d, DWORD PTR [rdx+28]
        xor     r12d, DWORD PTR [rdx+48]
        xor     r13d, DWORD PTR [rdx+52]
        xor     r14d, DWORD PTR [rdx+56]
        xor     r15d, DWORD PTR [rdx+60]
        mov     DWORD PTR [rbp], eax
        mov     DWORD PTR [rbp+4], ebx
        mov     DWORD PTR [rbp+8], edi
        mov     DWORD PTR [rbp+12], esi
        mov     DWORD PTR [rbp+16], r8d
        mov     DWORD PTR [rbp+20], r9d
        mov     DWORD PTR [rbp+24], r10d
        mov     DWORD PTR [rbp+28], r11d
        mov     DWORD PTR [rbp+48], r12d
        mov     DWORD PTR [rbp+52], r13d
        mov     DWORD PTR [rbp+56], r14d
        mov     DWORD PTR [rbp+60], r15d
        CHACHA_X64_ADD_STATE_SPILLED
        xor     eax, DWORD PTR [rdx+32]
        xor     ebx, DWORD PTR [rdx+36]
        xor     edi, DWORD PTR [rdx+40]
        xor     esi, DWORD PTR [rdx+44]
        mov     DWORD PTR [rbp+32], eax
        mov     DWORD PTR [rbp+36], ebx
        mov     DWORD PTR [rbp+40], edi
        mov     DWORD PTR [rbp+44], esi
        mov     r8, QWORD PTR [rsp+24]          ; r8  = output
        mov     r9, QWORD PTR [rsp+40]          ; r9d = bytes remaining
        add     DWORD PTR [rcx+48], 1
        add     rsp, 48
        sub     r9d, 64
        add     rdx, 64
        add     r8, 64
        cmp     r9d, 64
        jae     L_chacha_x64_start
L_chacha_x64_small:
        ; ---- trailing chunk of 0..63 bytes ----
        test    r9d, r9d
        jz      L_chacha_x64_done
        sub     rsp, 48
        CHACHA_X64_LOAD_STATE
L_chacha_x64_partial_crypt_start:
        CHACHA_X64_DOUBLE_ROUND
        dec     BYTE PTR [rsp]
        jnz     L_chacha_x64_partial_crypt_start
        mov     DWORD PTR [rsp+8], edx
        mov     DWORD PTR [rsp+12], ebp
        mov     rdx, QWORD PTR [rsp+32]         ; rdx = input
        CHACHA_X64_ADD_STATE_REGS
        lea     rbp, QWORD PTR [rcx+80]         ; rbp = 64-byte buffer inside the state
        mov     DWORD PTR [rbp], eax
        mov     DWORD PTR [rbp+4], ebx
        mov     DWORD PTR [rbp+8], edi
        mov     DWORD PTR [rbp+12], esi
        mov     DWORD PTR [rbp+16], r8d
        mov     DWORD PTR [rbp+20], r9d
        mov     DWORD PTR [rbp+24], r10d
        mov     DWORD PTR [rbp+28], r11d
        mov     DWORD PTR [rbp+48], r12d
        mov     DWORD PTR [rbp+52], r13d
        mov     DWORD PTR [rbp+56], r14d
        mov     DWORD PTR [rbp+60], r15d
        CHACHA_X64_ADD_STATE_SPILLED
        mov     DWORD PTR [rbp+32], eax
        mov     DWORD PTR [rbp+36], ebx
        mov     DWORD PTR [rbp+40], edi
        mov     DWORD PTR [rbp+44], esi
        mov     r8, QWORD PTR [rsp+24]          ; r8  = output
        mov     r9, QWORD PTR [rsp+40]          ; r9d = chunk length
        add     DWORD PTR [rcx+48], 1
        add     rsp, 48
        mov     r10d, r9d                       ; r10d, not r8d: r8 is the output pointer
        xor     rbx, rbx                        ; rbx = bytes done
        and     r10d, 7                         ; r10d = length mod 8
        jz      L_chacha_x64_partial_start64
L_chacha_x64_partial_start8:
        ; first (length mod 8) bytes, one at a time
        movzx   eax, BYTE PTR [rbp+rbx]
        xor     al, BYTE PTR [rdx+rbx]
        mov     BYTE PTR [r8+rbx], al
        inc     ebx
        cmp     ebx, r10d
        jne     L_chacha_x64_partial_start8
        jmp     L_chacha_x64_partial_end64
L_chacha_x64_partial_start64:
        ; remaining bytes, eight at a time
        mov     rax, QWORD PTR [rbp+rbx]
        xor     rax, QWORD PTR [rdx+rbx]
        mov     QWORD PTR [r8+rbx], rax
        add     ebx, 8
L_chacha_x64_partial_end64:
        cmp     ebx, r9d
        jne     L_chacha_x64_partial_start64
        mov     r9d, 64
        sub     r9d, ebx
        mov     DWORD PTR [rcx+76], r9d         ; 64 - bytes consumed from the buffer
L_chacha_x64_done:
        add     rsp, 64
        pop     rsi
        pop     rdi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbp
        pop     rbx
        ret
chacha_encrypt_x64 ENDP
_text ENDS
IFDEF HAVE_INTEL_AVX1
; Constants for chacha_encrypt_avx1. Each table is followed by a QWORD
; holding its address, which the procedure loads into a register.
_DATA SEGMENT
ALIGN 16
; vpshufb control bytes: 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
L_chacha20_avx1_rotl8 QWORD 433757367256023043, 1012478749960636427
ptr_L_chacha20_avx1_rotl8 QWORD L_chacha20_avx1_rotl8
_DATA ENDS
_DATA SEGMENT
ALIGN 16
; vpshufb control bytes: 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
L_chacha20_avx1_rotl16 QWORD 361421592464458498, 940142975169071882
ptr_L_chacha20_avx1_rotl16 QWORD L_chacha20_avx1_rotl16
_DATA ENDS
_DATA SEGMENT
ALIGN 16
; dwords 0, 1, 2, 3
L_chacha20_avx1_add QWORD 4294967296, 12884901890
ptr_L_chacha20_avx1_add QWORD L_chacha20_avx1_add
_DATA ENDS
_DATA SEGMENT
ALIGN 16
; dwords 4, 4, 4, 4
L_chacha20_avx1_four QWORD 17179869188, 17179869188
ptr_L_chacha20_avx1_four QWORD L_chacha20_avx1_four
_DATA ENDS
;-----------------------------------------------------------------------
; chacha_encrypt_avx1 - ChaCha keystream generation and XOR using
; 128-bit VEX-encoded instructions.
;
; ABI:   Microsoft x64 (rdi, rsi, r12-r15 and xmm6-xmm15 are saved).
; In:    rcx = state; 16 dwords are read from [rcx]..[rcx+60]
;        rdx = input bytes
;        r8  = output bytes
;        r9d = number of bytes to process
; Flow:  1. while at least 256 bytes remain: four blocks at a time,
;           one block per 32-bit lane;
;        2. while at least 64 bytes remain: one block at a time;
;        3. a final 1..63 byte chunk: the block is stored at [rcx+80],
;           the chunk is XORed and (64 - length) is written to [rcx+76].
;        The dword at [rcx+48] is advanced by one per generated block.
; Stack: 560 bytes: two 16-byte-aligned work areas (r11 = 256 bytes,
;        r12 = 128 bytes) below the xmm6-xmm15 save area at rsp+400.
; NOTE(review): lengths are compared with signed jl/jge, so a byte count
;        with bit 31 set would be treated as negative - confirm callers
;        never pass such a length.
;-----------------------------------------------------------------------
_text SEGMENT READONLY PARA
chacha_encrypt_avx1 PROC
        push r12
        push r13
        push r14
        push r15
        push rdi
        push rsi
        sub rsp, 560
        ; save the callee-saved vector registers
        vmovdqu OWORD PTR [rsp+400], xmm6
        vmovdqu OWORD PTR [rsp+416], xmm7
        vmovdqu OWORD PTR [rsp+432], xmm8
        vmovdqu OWORD PTR [rsp+448], xmm9
        vmovdqu OWORD PTR [rsp+464], xmm10
        vmovdqu OWORD PTR [rsp+480], xmm11
        vmovdqu OWORD PTR [rsp+496], xmm12
        vmovdqu OWORD PTR [rsp+512], xmm13
        vmovdqu OWORD PTR [rsp+528], xmm14
        vmovdqu OWORD PTR [rsp+544], xmm15
        mov r11, rsp
        lea r12, QWORD PTR [rsp+256]
        ; r14/r15 = addresses of the rotl8/rotl16 shuffle tables,
        ; rdi/rsi = addresses of the "add" and "four" tables
        mov r14, QWORD PTR [ptr_L_chacha20_avx1_rotl8]
        mov r15, QWORD PTR [ptr_L_chacha20_avx1_rotl16]
        mov rdi, QWORD PTR [ptr_L_chacha20_avx1_add]
        mov rsi, QWORD PTR [ptr_L_chacha20_avx1_four]
        ; round r11 and r12 up to 16 bytes so vmovdqa can be used on them
        add r11, 15
        add r12, 15
        and r11, -16
        and r12, -16
        ; eax = number of whole 256-byte chunks; none -> skip the 4-block path
        mov eax, r9d
        shr eax, 8
        jz L_chacha20_avx1_end128
        ; replicate state word i into all four lanes of xmm<i>
        vpshufd xmm0, [rcx], 0
        vpshufd xmm1, [rcx+4], 0
        vpshufd xmm2, [rcx+8], 0
        vpshufd xmm3, [rcx+12], 0
        vpshufd xmm4, [rcx+16], 0
        vpshufd xmm5, [rcx+20], 0
        vpshufd xmm6, [rcx+24], 0
        vpshufd xmm7, [rcx+28], 0
        vpshufd xmm8, [rcx+32], 0
        vpshufd xmm9, [rcx+36], 0
        vpshufd xmm10, [rcx+40], 0
        vpshufd xmm11, [rcx+44], 0
        vpshufd xmm12, [rcx+48], 0
        vpshufd xmm13, [rcx+52], 0
        vpshufd xmm14, [rcx+56], 0
        vpshufd xmm15, [rcx+60], 0
        ; word 12 gets +0, +1, +2, +3 in lanes 0..3
        vpaddd xmm12, xmm12, OWORD PTR [rdi]
        ; keep the per-lane input state at [r11] for the final addition
        vmovdqa OWORD PTR [r11], xmm0
        vmovdqa OWORD PTR [r11+16], xmm1
        vmovdqa OWORD PTR [r11+32], xmm2
        vmovdqa OWORD PTR [r11+48], xmm3
        vmovdqa OWORD PTR [r11+64], xmm4
        vmovdqa OWORD PTR [r11+80], xmm5
        vmovdqa OWORD PTR [r11+96], xmm6
        vmovdqa OWORD PTR [r11+112], xmm7
        vmovdqa OWORD PTR [r11+128], xmm8
        vmovdqa OWORD PTR [r11+144], xmm9
        vmovdqa OWORD PTR [r11+160], xmm10
        vmovdqa OWORD PTR [r11+176], xmm11
        vmovdqa OWORD PTR [r11+192], xmm12
        vmovdqa OWORD PTR [r11+208], xmm13
        vmovdqa OWORD PTR [r11+224], xmm14
        vmovdqa OWORD PTR [r11+240], xmm15
L_chacha20_avx1_start128:
        ; xmm11 doubles as the shift temporary, so word 11 lives at [r12+48]
        ; whenever it is not being updated
        vmovdqa OWORD PTR [r12+48], xmm11
        mov r10b, 10
L_chacha20_avx1_loop128:
        ; column step: add, xor, rotate left 16 (vpshufb via r15)
        vpaddd xmm0, xmm0, xmm4
        vpxor xmm12, xmm12, xmm0
        vmovdqa xmm11, OWORD PTR [r12+48]
        vpshufb xmm12, xmm12, OWORD PTR [r15]
        vpaddd xmm8, xmm8, xmm12
        vpxor xmm4, xmm4, xmm8
        vpaddd xmm1, xmm1, xmm5
        vpxor xmm13, xmm13, xmm1
        vpshufb xmm13, xmm13, OWORD PTR [r15]
        vpaddd xmm9, xmm9, xmm13
        vpxor xmm5, xmm5, xmm9
        vpaddd xmm2, xmm2, xmm6
        vpxor xmm14, xmm14, xmm2
        vpshufb xmm14, xmm14, OWORD PTR [r15]
        vpaddd xmm10, xmm10, xmm14
        vpxor xmm6, xmm6, xmm10
        vpaddd xmm3, xmm3, xmm7
        vpxor xmm15, xmm15, xmm3
        vpshufb xmm15, xmm15, OWORD PTR [r15]
        vpaddd xmm11, xmm11, xmm15
        vpxor xmm7, xmm7, xmm11
        vmovdqa OWORD PTR [r12+48], xmm11
        ; rotate words 4..7 left by 12 (shift right 20, shift left 12, combine)
        vpsrld xmm11, xmm4, 20
        vpslld xmm4, xmm4, 12
        vpxor xmm4, xmm4, xmm11
        vpsrld xmm11, xmm5, 20
        vpslld xmm5, xmm5, 12
        vpxor xmm5, xmm5, xmm11
        vpsrld xmm11, xmm6, 20
        vpslld xmm6, xmm6, 12
        vpxor xmm6, xmm6, xmm11
        vpsrld xmm11, xmm7, 20
        vpslld xmm7, xmm7, 12
        vpxor xmm7, xmm7, xmm11
        ; column step: add, xor, rotate left 8 (vpshufb via r14)
        vpaddd xmm0, xmm0, xmm4
        vpxor xmm12, xmm12, xmm0
        vmovdqa xmm11, OWORD PTR [r12+48]
        vpshufb xmm12, xmm12, OWORD PTR [r14]
        vpaddd xmm8, xmm8, xmm12
        vpxor xmm4, xmm4, xmm8
        vpaddd xmm1, xmm1, xmm5
        vpxor xmm13, xmm13, xmm1
        vpshufb xmm13, xmm13, OWORD PTR [r14]
        vpaddd xmm9, xmm9, xmm13
        vpxor xmm5, xmm5, xmm9
        vpaddd xmm2, xmm2, xmm6
        vpxor xmm14, xmm14, xmm2
        vpshufb xmm14, xmm14, OWORD PTR [r14]
        vpaddd xmm10, xmm10, xmm14
        vpxor xmm6, xmm6, xmm10
        vpaddd xmm3, xmm3, xmm7
        vpxor xmm15, xmm15, xmm3
        vpshufb xmm15, xmm15, OWORD PTR [r14]
        vpaddd xmm11, xmm11, xmm15
        vpxor xmm7, xmm7, xmm11
        vmovdqa OWORD PTR [r12+48], xmm11
        ; rotate words 4..7 left by 7
        vpsrld xmm11, xmm4, 25
        vpslld xmm4, xmm4, 7
        vpxor xmm4, xmm4, xmm11
        vpsrld xmm11, xmm5, 25
        vpslld xmm5, xmm5, 7
        vpxor xmm5, xmm5, xmm11
        vpsrld xmm11, xmm6, 25
        vpslld xmm6, xmm6, 7
        vpxor xmm6, xmm6, xmm11
        vpsrld xmm11, xmm7, 25
        vpslld xmm7, xmm7, 7
        vpxor xmm7, xmm7, xmm11
        ; diagonal step: (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14), rotl 16
        vpaddd xmm0, xmm0, xmm5
        vpxor xmm15, xmm15, xmm0
        vmovdqa xmm11, OWORD PTR [r12+48]
        vpshufb xmm15, xmm15, OWORD PTR [r15]
        vpaddd xmm10, xmm10, xmm15
        vpxor xmm5, xmm5, xmm10
        vpaddd xmm1, xmm1, xmm6
        vpxor xmm12, xmm12, xmm1
        vpshufb xmm12, xmm12, OWORD PTR [r15]
        vpaddd xmm11, xmm11, xmm12
        vpxor xmm6, xmm6, xmm11
        vpaddd xmm2, xmm2, xmm7
        vpxor xmm13, xmm13, xmm2
        vpshufb xmm13, xmm13, OWORD PTR [r15]
        vpaddd xmm8, xmm8, xmm13
        vpxor xmm7, xmm7, xmm8
        vpaddd xmm3, xmm3, xmm4
        vpxor xmm14, xmm14, xmm3
        vpshufb xmm14, xmm14, OWORD PTR [r15]
        vpaddd xmm9, xmm9, xmm14
        vpxor xmm4, xmm4, xmm9
        vmovdqa OWORD PTR [r12+48], xmm11
        ; rotate words 5,6,7,4 left by 12
        vpsrld xmm11, xmm5, 20
        vpslld xmm5, xmm5, 12
        vpxor xmm5, xmm5, xmm11
        vpsrld xmm11, xmm6, 20
        vpslld xmm6, xmm6, 12
        vpxor xmm6, xmm6, xmm11
        vpsrld xmm11, xmm7, 20
        vpslld xmm7, xmm7, 12
        vpxor xmm7, xmm7, xmm11
        vpsrld xmm11, xmm4, 20
        vpslld xmm4, xmm4, 12
        vpxor xmm4, xmm4, xmm11
        ; diagonal step, rotl 8
        vpaddd xmm0, xmm0, xmm5
        vpxor xmm15, xmm15, xmm0
        vmovdqa xmm11, OWORD PTR [r12+48]
        vpshufb xmm15, xmm15, OWORD PTR [r14]
        vpaddd xmm10, xmm10, xmm15
        vpxor xmm5, xmm5, xmm10
        vpaddd xmm1, xmm1, xmm6
        vpxor xmm12, xmm12, xmm1
        vpshufb xmm12, xmm12, OWORD PTR [r14]
        vpaddd xmm11, xmm11, xmm12
        vpxor xmm6, xmm6, xmm11
        vpaddd xmm2, xmm2, xmm7
        vpxor xmm13, xmm13, xmm2
        vpshufb xmm13, xmm13, OWORD PTR [r14]
        vpaddd xmm8, xmm8, xmm13
        vpxor xmm7, xmm7, xmm8
        vpaddd xmm3, xmm3, xmm4
        vpxor xmm14, xmm14, xmm3
        vpshufb xmm14, xmm14, OWORD PTR [r14]
        vpaddd xmm9, xmm9, xmm14
        vpxor xmm4, xmm4, xmm9
        vmovdqa OWORD PTR [r12+48], xmm11
        ; rotate words 5,6,7,4 left by 7
        vpsrld xmm11, xmm5, 25
        vpslld xmm5, xmm5, 7
        vpxor xmm5, xmm5, xmm11
        vpsrld xmm11, xmm6, 25
        vpslld xmm6, xmm6, 7
        vpxor xmm6, xmm6, xmm11
        vpsrld xmm11, xmm7, 25
        vpslld xmm7, xmm7, 7
        vpxor xmm7, xmm7, xmm11
        vpsrld xmm11, xmm4, 25
        vpslld xmm4, xmm4, 7
        vpxor xmm4, xmm4, xmm11
        dec r10b
        jnz L_chacha20_avx1_loop128
        ; add the saved input state to the result of the 10 iterations
        vmovdqa xmm11, OWORD PTR [r12+48]
        vpaddd xmm0, xmm0, OWORD PTR [r11]
        vpaddd xmm1, xmm1, OWORD PTR [r11+16]
        vpaddd xmm2, xmm2, OWORD PTR [r11+32]
        vpaddd xmm3, xmm3, OWORD PTR [r11+48]
        vpaddd xmm4, xmm4, OWORD PTR [r11+64]
        vpaddd xmm5, xmm5, OWORD PTR [r11+80]
        vpaddd xmm6, xmm6, OWORD PTR [r11+96]
        vpaddd xmm7, xmm7, OWORD PTR [r11+112]
        vpaddd xmm8, xmm8, OWORD PTR [r11+128]
        vpaddd xmm9, xmm9, OWORD PTR [r11+144]
        vpaddd xmm10, xmm10, OWORD PTR [r11+160]
        vpaddd xmm11, xmm11, OWORD PTR [r11+176]
        vpaddd xmm12, xmm12, OWORD PTR [r11+192]
        vpaddd xmm13, xmm13, OWORD PTR [r11+208]
        vpaddd xmm14, xmm14, OWORD PTR [r11+224]
        vpaddd xmm15, xmm15, OWORD PTR [r11+240]
        ; park words 8..15 at [r12] while words 0..7 are written out
        vmovdqa OWORD PTR [r12], xmm8
        vmovdqa OWORD PTR [r12+16], xmm9
        vmovdqa OWORD PTR [r12+32], xmm10
        vmovdqa OWORD PTR [r12+48], xmm11
        vmovdqa OWORD PTR [r12+64], xmm12
        vmovdqa OWORD PTR [r12+80], xmm13
        vmovdqa OWORD PTR [r12+96], xmm14
        vmovdqa OWORD PTR [r12+112], xmm15
        ; interleave words 0..7 so each register holds 16 consecutive output
        ; bytes of one block (offsets 0/16 of each 64-byte block)
        vpunpckldq xmm8, xmm0, xmm1
        vpunpckldq xmm9, xmm2, xmm3
        vpunpckhdq xmm12, xmm0, xmm1
        vpunpckhdq xmm13, xmm2, xmm3
        vpunpckldq xmm10, xmm4, xmm5
        vpunpckldq xmm11, xmm6, xmm7
        vpunpckhdq xmm14, xmm4, xmm5
        vpunpckhdq xmm15, xmm6, xmm7
        vpunpcklqdq xmm0, xmm8, xmm9
        vpunpcklqdq xmm1, xmm10, xmm11
        vpunpckhqdq xmm2, xmm8, xmm9
        vpunpckhqdq xmm3, xmm10, xmm11
        vpunpcklqdq xmm4, xmm12, xmm13
        vpunpcklqdq xmm5, xmm14, xmm15
        vpunpckhqdq xmm6, xmm12, xmm13
        vpunpckhqdq xmm7, xmm14, xmm15
        vmovdqu xmm8, OWORD PTR [rdx]
        vmovdqu xmm9, OWORD PTR [rdx+16]
        vmovdqu xmm10, OWORD PTR [rdx+64]
        vmovdqu xmm11, OWORD PTR [rdx+80]
        vmovdqu xmm12, OWORD PTR [rdx+128]
        vmovdqu xmm13, OWORD PTR [rdx+144]
        vmovdqu xmm14, OWORD PTR [rdx+192]
        vmovdqu xmm15, OWORD PTR [rdx+208]
        vpxor xmm0, xmm0, xmm8
        vpxor xmm1, xmm1, xmm9
        vpxor xmm2, xmm2, xmm10
        vpxor xmm3, xmm3, xmm11
        vpxor xmm4, xmm4, xmm12
        vpxor xmm5, xmm5, xmm13
        vpxor xmm6, xmm6, xmm14
        vpxor xmm7, xmm7, xmm15
        vmovdqu OWORD PTR [r8], xmm0
        vmovdqu OWORD PTR [r8+16], xmm1
        vmovdqu OWORD PTR [r8+64], xmm2
        vmovdqu OWORD PTR [r8+80], xmm3
        vmovdqu OWORD PTR [r8+128], xmm4
        vmovdqu OWORD PTR [r8+144], xmm5
        vmovdqu OWORD PTR [r8+192], xmm6
        vmovdqu OWORD PTR [r8+208], xmm7
        ; same again for words 8..15 (offsets 32/48 of each 64-byte block)
        vmovdqa xmm0, OWORD PTR [r12]
        vmovdqa xmm1, OWORD PTR [r12+16]
        vmovdqa xmm2, OWORD PTR [r12+32]
        vmovdqa xmm3, OWORD PTR [r12+48]
        vmovdqa xmm4, OWORD PTR [r12+64]
        vmovdqa xmm5, OWORD PTR [r12+80]
        vmovdqa xmm6, OWORD PTR [r12+96]
        vmovdqa xmm7, OWORD PTR [r12+112]
        vpunpckldq xmm8, xmm0, xmm1
        vpunpckldq xmm9, xmm2, xmm3
        vpunpckhdq xmm12, xmm0, xmm1
        vpunpckhdq xmm13, xmm2, xmm3
        vpunpckldq xmm10, xmm4, xmm5
        vpunpckldq xmm11, xmm6, xmm7
        vpunpckhdq xmm14, xmm4, xmm5
        vpunpckhdq xmm15, xmm6, xmm7
        vpunpcklqdq xmm0, xmm8, xmm9
        vpunpcklqdq xmm1, xmm10, xmm11
        vpunpckhqdq xmm2, xmm8, xmm9
        vpunpckhqdq xmm3, xmm10, xmm11
        vpunpcklqdq xmm4, xmm12, xmm13
        vpunpcklqdq xmm5, xmm14, xmm15
        vpunpckhqdq xmm6, xmm12, xmm13
        vpunpckhqdq xmm7, xmm14, xmm15
        vmovdqu xmm8, OWORD PTR [rdx+32]
        vmovdqu xmm9, OWORD PTR [rdx+48]
        vmovdqu xmm10, OWORD PTR [rdx+96]
        vmovdqu xmm11, OWORD PTR [rdx+112]
        vmovdqu xmm12, OWORD PTR [rdx+160]
        vmovdqu xmm13, OWORD PTR [rdx+176]
        vmovdqu xmm14, OWORD PTR [rdx+224]
        vmovdqu xmm15, OWORD PTR [rdx+240]
        vpxor xmm0, xmm0, xmm8
        vpxor xmm1, xmm1, xmm9
        vpxor xmm2, xmm2, xmm10
        vpxor xmm3, xmm3, xmm11
        vpxor xmm4, xmm4, xmm12
        vpxor xmm5, xmm5, xmm13
        vpxor xmm6, xmm6, xmm14
        vpxor xmm7, xmm7, xmm15
        vmovdqu OWORD PTR [r8+32], xmm0
        vmovdqu OWORD PTR [r8+48], xmm1
        vmovdqu OWORD PTR [r8+96], xmm2
        vmovdqu OWORD PTR [r8+112], xmm3
        vmovdqu OWORD PTR [r8+160], xmm4
        vmovdqu OWORD PTR [r8+176], xmm5
        vmovdqu OWORD PTR [r8+224], xmm6
        vmovdqu OWORD PTR [r8+240], xmm7
        ; advance pointers and length by 256 bytes; add 4 to every lane of the
        ; saved word 12
        vmovdqa xmm12, OWORD PTR [r11+192]
        add rdx, 256
        add r8, 256
        vpaddd xmm12, xmm12, OWORD PTR [rsi]
        sub r9d, 256
        vmovdqa OWORD PTR [r11+192], xmm12
        cmp r9d, 256
        jl L_chacha20_avx1_done128
        ; another 256-byte chunk: reload the working registers from [r11]
        vmovdqa xmm0, OWORD PTR [r11]
        vmovdqa xmm1, OWORD PTR [r11+16]
        vmovdqa xmm2, OWORD PTR [r11+32]
        vmovdqa xmm3, OWORD PTR [r11+48]
        vmovdqa xmm4, OWORD PTR [r11+64]
        vmovdqa xmm5, OWORD PTR [r11+80]
        vmovdqa xmm6, OWORD PTR [r11+96]
        vmovdqa xmm7, OWORD PTR [r11+112]
        vmovdqa xmm8, OWORD PTR [r11+128]
        vmovdqa xmm9, OWORD PTR [r11+144]
        vmovdqa xmm10, OWORD PTR [r11+160]
        vmovdqa xmm11, OWORD PTR [r11+176]
        vmovdqa xmm12, OWORD PTR [r11+192]
        vmovdqa xmm13, OWORD PTR [r11+208]
        vmovdqa xmm14, OWORD PTR [r11+224]
        vmovdqa xmm15, OWORD PTR [r11+240]
        jmp L_chacha20_avx1_start128
L_chacha20_avx1_done128:
        ; eax still holds the initial count of 256-byte chunks; advance the
        ; dword at [rcx+48] by 4 per chunk
        shl eax, 2
        add DWORD PTR [rcx+48], eax
L_chacha20_avx1_end128:
        cmp r9d, 64
        jl L_chacha20_avx1_block_done
L_chacha20_avx1_block_start:
        ; ---- one 64-byte block: xmm0..xmm3 = the four 16-byte state rows ----
        vmovdqu xmm0, OWORD PTR [rcx]
        vmovdqu xmm1, OWORD PTR [rcx+16]
        vmovdqu xmm2, OWORD PTR [rcx+32]
        vmovdqu xmm3, OWORD PTR [rcx+48]
        ; xmm5..xmm8 keep a copy of the input rows for the final addition
        vmovdqa xmm5, xmm0
        vmovdqa xmm6, xmm1
        vmovdqa xmm7, xmm2
        vmovdqa xmm8, xmm3
        mov al, 10
L_chacha20_avx1_block_crypt_start:
        vpaddd xmm0, xmm0, xmm1
        vpxor xmm3, xmm3, xmm0
        vpshufb xmm3, xmm3, OWORD PTR [r15]
        vpaddd xmm2, xmm2, xmm3
        vpxor xmm1, xmm1, xmm2
        vpsrld xmm4, xmm1, 20
        vpslld xmm1, xmm1, 12
        vpxor xmm1, xmm1, xmm4
        vpaddd xmm0, xmm0, xmm1
        vpxor xmm3, xmm3, xmm0
        vpshufb xmm3, xmm3, OWORD PTR [r14]
        vpaddd xmm2, xmm2, xmm3
        vpxor xmm1, xmm1, xmm2
        vpsrld xmm4, xmm1, 25
        vpslld xmm1, xmm1, 7
        vpxor xmm1, xmm1, xmm4
        ; rotate the lanes of rows 1..3 so the same code works on diagonals
        vpshufd xmm1, xmm1, 57
        vpshufd xmm2, xmm2, 78
        vpshufd xmm3, xmm3, 147
        vpaddd xmm0, xmm0, xmm1
        vpxor xmm3, xmm3, xmm0
        vpshufb xmm3, xmm3, OWORD PTR [r15]
        vpaddd xmm2, xmm2, xmm3
        vpxor xmm1, xmm1, xmm2
        vpsrld xmm4, xmm1, 20
        vpslld xmm1, xmm1, 12
        vpxor xmm1, xmm1, xmm4
        vpaddd xmm0, xmm0, xmm1
        vpxor xmm3, xmm3, xmm0
        vpshufb xmm3, xmm3, OWORD PTR [r14]
        vpaddd xmm2, xmm2, xmm3
        vpxor xmm1, xmm1, xmm2
        vpsrld xmm4, xmm1, 25
        vpslld xmm1, xmm1, 7
        vpxor xmm1, xmm1, xmm4
        ; undo the lane rotation
        vpshufd xmm1, xmm1, 147
        vpshufd xmm2, xmm2, 78
        vpshufd xmm3, xmm3, 57
        dec al
        jnz L_chacha20_avx1_block_crypt_start
        vpaddd xmm0, xmm0, xmm5
        vpaddd xmm1, xmm1, xmm6
        vpaddd xmm2, xmm2, xmm7
        vpaddd xmm3, xmm3, xmm8
        ; output = input XOR generated block
        vmovdqu xmm5, OWORD PTR [rdx]
        vmovdqu xmm6, OWORD PTR [rdx+16]
        vmovdqu xmm7, OWORD PTR [rdx+32]
        vmovdqu xmm8, OWORD PTR [rdx+48]
        vpxor xmm0, xmm0, xmm5
        vpxor xmm1, xmm1, xmm6
        vpxor xmm2, xmm2, xmm7
        vpxor xmm3, xmm3, xmm8
        vmovdqu OWORD PTR [r8], xmm0
        vmovdqu OWORD PTR [r8+16], xmm1
        vmovdqu OWORD PTR [r8+32], xmm2
        vmovdqu OWORD PTR [r8+48], xmm3
        add DWORD PTR [rcx+48], 1
        sub r9d, 64
        add rdx, 64
        add r8, 64
        cmp r9d, 64
        jge L_chacha20_avx1_block_start
L_chacha20_avx1_block_done:
        ; ---- final chunk of 0..63 bytes ----
        cmp r9d, 0
        je L_chacha20_avx1_partial_done
        ; r12 = 64-byte buffer inside the state that receives the whole block
        lea r12, QWORD PTR [rcx+80]
        vmovdqu xmm0, OWORD PTR [rcx]
        vmovdqu xmm1, OWORD PTR [rcx+16]
        vmovdqu xmm2, OWORD PTR [rcx+32]
        vmovdqu xmm3, OWORD PTR [rcx+48]
        vmovdqa xmm5, xmm0
        vmovdqa xmm6, xmm1
        vmovdqa xmm7, xmm2
        vmovdqa xmm8, xmm3
        mov al, 10
L_chacha20_avx1_partial_crypt_start:
        vpaddd xmm0, xmm0, xmm1
        vpxor xmm3, xmm3, xmm0
        vpshufb xmm3, xmm3, OWORD PTR [r15]
        vpaddd xmm2, xmm2, xmm3
        vpxor xmm1, xmm1, xmm2
        vpsrld xmm4, xmm1, 20
        vpslld xmm1, xmm1, 12
        vpxor xmm1, xmm1, xmm4
        vpaddd xmm0, xmm0, xmm1
        vpxor xmm3, xmm3, xmm0
        vpshufb xmm3, xmm3, OWORD PTR [r14]
        vpaddd xmm2, xmm2, xmm3
        vpxor xmm1, xmm1, xmm2
        vpsrld xmm4, xmm1, 25
        vpslld xmm1, xmm1, 7
        vpxor xmm1, xmm1, xmm4
        vpshufd xmm1, xmm1, 57
        vpshufd xmm2, xmm2, 78
        vpshufd xmm3, xmm3, 147
        vpaddd xmm0, xmm0, xmm1
        vpxor xmm3, xmm3, xmm0
        vpshufb xmm3, xmm3, OWORD PTR [r15]
        vpaddd xmm2, xmm2, xmm3
        vpxor xmm1, xmm1, xmm2
        vpsrld xmm4, xmm1, 20
        vpslld xmm1, xmm1, 12
        vpxor xmm1, xmm1, xmm4
        vpaddd xmm0, xmm0, xmm1
        vpxor xmm3, xmm3, xmm0
        vpshufb xmm3, xmm3, OWORD PTR [r14]
        vpaddd xmm2, xmm2, xmm3
        vpxor xmm1, xmm1, xmm2
        vpsrld xmm4, xmm1, 25
        vpslld xmm1, xmm1, 7
        vpxor xmm1, xmm1, xmm4
        vpshufd xmm1, xmm1, 147
        vpshufd xmm2, xmm2, 78
        vpshufd xmm3, xmm3, 57
        dec al
        jnz L_chacha20_avx1_partial_crypt_start
        vpaddd xmm0, xmm0, xmm5
        vpaddd xmm1, xmm1, xmm6
        vpaddd xmm2, xmm2, xmm7
        vpaddd xmm3, xmm3, xmm8
        vmovdqu OWORD PTR [r12], xmm0
        vmovdqu OWORD PTR [r12+16], xmm1
        vmovdqu OWORD PTR [r12+32], xmm2
        vmovdqu OWORD PTR [r12+48], xmm3
        add DWORD PTR [rcx+48], 1
        ; r13 = bytes done; r10d = length mod 8
        mov r10d, r9d
        xor r13, r13
        and r10d, 7
        jz L_chacha20_avx1_partial_start64
L_chacha20_avx1_partial_start8:
        ; first (length mod 8) bytes, one at a time
        movzx eax, BYTE PTR [r12+r13]
        xor al, BYTE PTR [rdx+r13]
        mov BYTE PTR [r8+r13], al
        inc r13d
        cmp r13d, r10d
        jne L_chacha20_avx1_partial_start8
        je L_chacha20_avx1_partial_end64
L_chacha20_avx1_partial_start64:
        ; remaining bytes, eight at a time
        mov rax, QWORD PTR [r12+r13]
        xor rax, QWORD PTR [rdx+r13]
        mov QWORD PTR [r8+r13], rax
        add r13d, 8
L_chacha20_avx1_partial_end64:
        cmp r13d, r9d
        jne L_chacha20_avx1_partial_start64
        ; record 64 - bytes consumed from the buffer at [rcx+80]
        mov r10d, 64
        sub r10d, r13d
        mov DWORD PTR [rcx+76], r10d
L_chacha20_avx1_partial_done:
        vzeroupper
        ; restore the callee-saved vector and integer registers
        vmovdqu xmm6, OWORD PTR [rsp+400]
        vmovdqu xmm7, OWORD PTR [rsp+416]
        vmovdqu xmm8, OWORD PTR [rsp+432]
        vmovdqu xmm9, OWORD PTR [rsp+448]
        vmovdqu xmm10, OWORD PTR [rsp+464]
        vmovdqu xmm11, OWORD PTR [rsp+480]
        vmovdqu xmm12, OWORD PTR [rsp+496]
        vmovdqu xmm13, OWORD PTR [rsp+512]
        vmovdqu xmm14, OWORD PTR [rsp+528]
        vmovdqu xmm15, OWORD PTR [rsp+544]
        add rsp, 560
        pop rsi
        pop rdi
        pop r15
        pop r14
        pop r13
        pop r12
        ret
chacha_encrypt_avx1 ENDP
_text ENDS
ENDIF
IFDEF HAVE_INTEL_AVX2
; Constants for chacha_encrypt_avx2: 32-byte versions of the AVX1 tables.
; Each table is followed by a QWORD holding its address.
_DATA SEGMENT
ALIGN 16
L_chacha20_avx2_rotl8 QWORD 433757367256023043, 1012478749960636427, 433757367256023043, 1012478749960636427
ptr_L_chacha20_avx2_rotl8 QWORD L_chacha20_avx2_rotl8
_DATA ENDS
_DATA SEGMENT
ALIGN 16
L_chacha20_avx2_rotl16 QWORD 361421592464458498, 940142975169071882, 361421592464458498, 940142975169071882
ptr_L_chacha20_avx2_rotl16 QWORD L_chacha20_avx2_rotl16
_DATA ENDS
_DATA SEGMENT
ALIGN 16
; dwords 0, 1, 2, 3, 4, 5, 6, 7
L_chacha20_avx2_add QWORD 4294967296, 12884901890, 21474836484, 30064771078
ptr_L_chacha20_avx2_add QWORD L_chacha20_avx2_add
_DATA ENDS
_DATA SEGMENT
ALIGN 16
; dwords 8, 8, 8, 8, 8, 8, 8, 8
L_chacha20_avx2_eight QWORD 34359738376, 34359738376, 34359738376, 34359738376
ptr_L_chacha20_avx2_eight QWORD L_chacha20_avx2_eight
_DATA ENDS
;-----------------------------------------------------------------------
; chacha_encrypt_avx2 - ChaCha keystream generation and XOR using
; 256-bit AVX2 instructions.
;
; ABI:   Microsoft x64 (rdi, r12-r15 and xmm6-xmm15 are saved).
; In:    rcx = state; 16 dwords are read from [rcx]..[rcx+60]
;        rdx = input bytes
;        r8  = output bytes
;        r9d = number of bytes to process
; Flow:  While at least 512 bytes remain, eight blocks are produced at a
;        time (one per 32-bit lane) and the dword at [rcx+48] is advanced
;        by 8 per chunk. The rest is handled by calling
;        chacha_encrypt_avx1 with rcx/rdx/r8/r9 as they stand.
; Stack: 960 bytes: two 32-byte-aligned work areas (r11 = 512 bytes,
;        r12 = 256 bytes) below the xmm6-xmm15 save area at rsp+800.
; NOTE(review): `cmp r9d, 512 / jl` is a signed comparison - confirm
;        callers never pass a length with bit 31 set.
;-----------------------------------------------------------------------
_text SEGMENT READONLY PARA
chacha_encrypt_avx2 PROC
        push r12
        push r13
        push r14
        push r15
        push rdi
        sub rsp, 960
        ; save the callee-saved vector registers
        vmovdqu OWORD PTR [rsp+800], xmm6
        vmovdqu OWORD PTR [rsp+816], xmm7
        vmovdqu OWORD PTR [rsp+832], xmm8
        vmovdqu OWORD PTR [rsp+848], xmm9
        vmovdqu OWORD PTR [rsp+864], xmm10
        vmovdqu OWORD PTR [rsp+880], xmm11
        vmovdqu OWORD PTR [rsp+896], xmm12
        vmovdqu OWORD PTR [rsp+912], xmm13
        vmovdqu OWORD PTR [rsp+928], xmm14
        vmovdqu OWORD PTR [rsp+944], xmm15
        mov r11, rsp
        ; r13/r14 = addresses of the rotl8/rotl16 shuffle tables,
        ; r15/rdi = addresses of the "add" and "eight" tables
        mov r13, QWORD PTR [ptr_L_chacha20_avx2_rotl8]
        mov r14, QWORD PTR [ptr_L_chacha20_avx2_rotl16]
        mov r15, QWORD PTR [ptr_L_chacha20_avx2_add]
        mov rdi, QWORD PTR [ptr_L_chacha20_avx2_eight]
        lea r12, QWORD PTR [rsp+512]
        ; round r11 and r12 up to 32 bytes so vmovdqa can be used on them
        add r11, 31
        add r12, 31
        and r11, -32
        and r12, -32
        ; eax = number of whole 512-byte chunks; none -> straight to the AVX1 tail
        mov eax, r9d
        shr eax, 9
        jz L_chacha20_avx2_end256
        ; replicate state word i into all eight lanes of ymm<i>
        vpbroadcastd ymm0, DWORD PTR [rcx]
        vpbroadcastd ymm1, DWORD PTR [rcx+4]
        vpbroadcastd ymm2, DWORD PTR [rcx+8]
        vpbroadcastd ymm3, DWORD PTR [rcx+12]
        vpbroadcastd ymm4, DWORD PTR [rcx+16]
        vpbroadcastd ymm5, DWORD PTR [rcx+20]
        vpbroadcastd ymm6, DWORD PTR [rcx+24]
        vpbroadcastd ymm7, DWORD PTR [rcx+28]
        vpbroadcastd ymm8, DWORD PTR [rcx+32]
        vpbroadcastd ymm9, DWORD PTR [rcx+36]
        vpbroadcastd ymm10, DWORD PTR [rcx+40]
        vpbroadcastd ymm11, DWORD PTR [rcx+44]
        vpbroadcastd ymm12, DWORD PTR [rcx+48]
        vpbroadcastd ymm13, DWORD PTR [rcx+52]
        vpbroadcastd ymm14, DWORD PTR [rcx+56]
        vpbroadcastd ymm15, DWORD PTR [rcx+60]
        ; word 12 gets +0..+7 in lanes 0..7
        vpaddd ymm12, ymm12, YMMWORD PTR [r15]
        ; keep the per-lane input state at [r11] for the final addition
        vmovdqa YMMWORD PTR [r11], ymm0
        vmovdqa YMMWORD PTR [r11+32], ymm1
        vmovdqa YMMWORD PTR [r11+64], ymm2
        vmovdqa YMMWORD PTR [r11+96], ymm3
        vmovdqa YMMWORD PTR [r11+128], ymm4
        vmovdqa YMMWORD PTR [r11+160], ymm5
        vmovdqa YMMWORD PTR [r11+192], ymm6
        vmovdqa YMMWORD PTR [r11+224], ymm7
        vmovdqa YMMWORD PTR [r11+256], ymm8
        vmovdqa YMMWORD PTR [r11+288], ymm9
        vmovdqa YMMWORD PTR [r11+320], ymm10
        vmovdqa YMMWORD PTR [r11+352], ymm11
        vmovdqa YMMWORD PTR [r11+384], ymm12
        vmovdqa YMMWORD PTR [r11+416], ymm13
        vmovdqa YMMWORD PTR [r11+448], ymm14
        vmovdqa YMMWORD PTR [r11+480], ymm15
L_chacha20_avx2_start256:
        mov r10b, 10
        ; ymm11 doubles as the shift temporary, so word 11 lives at [r12+96]
        ; whenever it is not being updated
        vmovdqa YMMWORD PTR [r12+96], ymm11
L_chacha20_avx2_loop256:
        ; column step: add, xor, rotate left 16 (vpshufb via r14)
        vpaddd ymm0, ymm0, ymm4
        vpxor ymm12, ymm12, ymm0
        vmovdqa ymm11, YMMWORD PTR [r12+96]
        vpshufb ymm12, ymm12, YMMWORD PTR [r14]
        vpaddd ymm8, ymm8, ymm12
        vpxor ymm4, ymm4, ymm8
        vpaddd ymm1, ymm1, ymm5
        vpxor ymm13, ymm13, ymm1
        vpshufb ymm13, ymm13, YMMWORD PTR [r14]
        vpaddd ymm9, ymm9, ymm13
        vpxor ymm5, ymm5, ymm9
        vpaddd ymm2, ymm2, ymm6
        vpxor ymm14, ymm14, ymm2
        vpshufb ymm14, ymm14, YMMWORD PTR [r14]
        vpaddd ymm10, ymm10, ymm14
        vpxor ymm6, ymm6, ymm10
        vpaddd ymm3, ymm3, ymm7
        vpxor ymm15, ymm15, ymm3
        vpshufb ymm15, ymm15, YMMWORD PTR [r14]
        vpaddd ymm11, ymm11, ymm15
        vpxor ymm7, ymm7, ymm11
        vmovdqa YMMWORD PTR [r12+96], ymm11
        ; rotate words 4..7 left by 12
        vpsrld ymm11, ymm4, 20
        vpslld ymm4, ymm4, 12
        vpxor ymm4, ymm4, ymm11
        vpsrld ymm11, ymm5, 20
        vpslld ymm5, ymm5, 12
        vpxor ymm5, ymm5, ymm11
        vpsrld ymm11, ymm6, 20
        vpslld ymm6, ymm6, 12
        vpxor ymm6, ymm6, ymm11
        vpsrld ymm11, ymm7, 20
        vpslld ymm7, ymm7, 12
        vpxor ymm7, ymm7, ymm11
        ; column step: add, xor, rotate left 8 (vpshufb via r13)
        vpaddd ymm0, ymm0, ymm4
        vpxor ymm12, ymm12, ymm0
        vmovdqa ymm11, YMMWORD PTR [r12+96]
        vpshufb ymm12, ymm12, YMMWORD PTR [r13]
        vpaddd ymm8, ymm8, ymm12
        vpxor ymm4, ymm4, ymm8
        vpaddd ymm1, ymm1, ymm5
        vpxor ymm13, ymm13, ymm1
        vpshufb ymm13, ymm13, YMMWORD PTR [r13]
        vpaddd ymm9, ymm9, ymm13
        vpxor ymm5, ymm5, ymm9
        vpaddd ymm2, ymm2, ymm6
        vpxor ymm14, ymm14, ymm2
        vpshufb ymm14, ymm14, YMMWORD PTR [r13]
        vpaddd ymm10, ymm10, ymm14
        vpxor ymm6, ymm6, ymm10
        vpaddd ymm3, ymm3, ymm7
        vpxor ymm15, ymm15, ymm3
        vpshufb ymm15, ymm15, YMMWORD PTR [r13]
        vpaddd ymm11, ymm11, ymm15
        vpxor ymm7, ymm7, ymm11
        vmovdqa YMMWORD PTR [r12+96], ymm11
        ; rotate words 4..7 left by 7
        vpsrld ymm11, ymm4, 25
        vpslld ymm4, ymm4, 7
        vpxor ymm4, ymm4, ymm11
        vpsrld ymm11, ymm5, 25
        vpslld ymm5, ymm5, 7
        vpxor ymm5, ymm5, ymm11
        vpsrld ymm11, ymm6, 25
        vpslld ymm6, ymm6, 7
        vpxor ymm6, ymm6, ymm11
        vpsrld ymm11, ymm7, 25
        vpslld ymm7, ymm7, 7
        vpxor ymm7, ymm7, ymm11
        ; diagonal step: (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14), rotl 16
        vpaddd ymm0, ymm0, ymm5
        vpxor ymm15, ymm15, ymm0
        vmovdqa ymm11, YMMWORD PTR [r12+96]
        vpshufb ymm15, ymm15, YMMWORD PTR [r14]
        vpaddd ymm10, ymm10, ymm15
        vpxor ymm5, ymm5, ymm10
        vpaddd ymm1, ymm1, ymm6
        vpxor ymm12, ymm12, ymm1
        vpshufb ymm12, ymm12, YMMWORD PTR [r14]
        vpaddd ymm11, ymm11, ymm12
        vpxor ymm6, ymm6, ymm11
        vpaddd ymm2, ymm2, ymm7
        vpxor ymm13, ymm13, ymm2
        vpshufb ymm13, ymm13, YMMWORD PTR [r14]
        vpaddd ymm8, ymm8, ymm13
        vpxor ymm7, ymm7, ymm8
        vpaddd ymm3, ymm3, ymm4
        vpxor ymm14, ymm14, ymm3
        vpshufb ymm14, ymm14, YMMWORD PTR [r14]
        vpaddd ymm9, ymm9, ymm14
        vpxor ymm4, ymm4, ymm9
        vmovdqa YMMWORD PTR [r12+96], ymm11
        ; rotate words 5,6,7,4 left by 12
        vpsrld ymm11, ymm5, 20
        vpslld ymm5, ymm5, 12
        vpxor ymm5, ymm5, ymm11
        vpsrld ymm11, ymm6, 20
        vpslld ymm6, ymm6, 12
        vpxor ymm6, ymm6, ymm11
        vpsrld ymm11, ymm7, 20
        vpslld ymm7, ymm7, 12
        vpxor ymm7, ymm7, ymm11
        vpsrld ymm11, ymm4, 20
        vpslld ymm4, ymm4, 12
        vpxor ymm4, ymm4, ymm11
        ; diagonal step, rotl 8
        vpaddd ymm0, ymm0, ymm5
        vpxor ymm15, ymm15, ymm0
        vmovdqa ymm11, YMMWORD PTR [r12+96]
        vpshufb ymm15, ymm15, YMMWORD PTR [r13]
        vpaddd ymm10, ymm10, ymm15
        vpxor ymm5, ymm5, ymm10
        vpaddd ymm1, ymm1, ymm6
        vpxor ymm12, ymm12, ymm1
        vpshufb ymm12, ymm12, YMMWORD PTR [r13]
        vpaddd ymm11, ymm11, ymm12
        vpxor ymm6, ymm6, ymm11
        vpaddd ymm2, ymm2, ymm7
        vpxor ymm13, ymm13, ymm2
        vpshufb ymm13, ymm13, YMMWORD PTR [r13]
        vpaddd ymm8, ymm8, ymm13
        vpxor ymm7, ymm7, ymm8
        vpaddd ymm3, ymm3, ymm4
        vpxor ymm14, ymm14, ymm3
        vpshufb ymm14, ymm14, YMMWORD PTR [r13]
        vpaddd ymm9, ymm9, ymm14
        vpxor ymm4, ymm4, ymm9
        vmovdqa YMMWORD PTR [r12+96], ymm11
        ; rotate words 5,6,7,4 left by 7
        vpsrld ymm11, ymm5, 25
        vpslld ymm5, ymm5, 7
        vpxor ymm5, ymm5, ymm11
        vpsrld ymm11, ymm6, 25
        vpslld ymm6, ymm6, 7
        vpxor ymm6, ymm6, ymm11
        vpsrld ymm11, ymm7, 25
        vpslld ymm7, ymm7, 7
        vpxor ymm7, ymm7, ymm11
        vpsrld ymm11, ymm4, 25
        vpslld ymm4, ymm4, 7
        vpxor ymm4, ymm4, ymm11
        dec r10b
        jnz L_chacha20_avx2_loop256
        ; add the saved input state to the result of the 10 iterations
        vmovdqa ymm11, YMMWORD PTR [r12+96]
        vpaddd ymm0, ymm0, YMMWORD PTR [r11]
        vpaddd ymm1, ymm1, YMMWORD PTR [r11+32]
        vpaddd ymm2, ymm2, YMMWORD PTR [r11+64]
        vpaddd ymm3, ymm3, YMMWORD PTR [r11+96]
        vpaddd ymm4, ymm4, YMMWORD PTR [r11+128]
        vpaddd ymm5, ymm5, YMMWORD PTR [r11+160]
        vpaddd ymm6, ymm6, YMMWORD PTR [r11+192]
        vpaddd ymm7, ymm7, YMMWORD PTR [r11+224]
        vpaddd ymm8, ymm8, YMMWORD PTR [r11+256]
        vpaddd ymm9, ymm9, YMMWORD PTR [r11+288]
        vpaddd ymm10, ymm10, YMMWORD PTR [r11+320]
        vpaddd ymm11, ymm11, YMMWORD PTR [r11+352]
        vpaddd ymm12, ymm12, YMMWORD PTR [r11+384]
        vpaddd ymm13, ymm13, YMMWORD PTR [r11+416]
        vpaddd ymm14, ymm14, YMMWORD PTR [r11+448]
        vpaddd ymm15, ymm15, YMMWORD PTR [r11+480]
        ; park words 8..15 at [r12] while words 0..7 are written out
        vmovdqa YMMWORD PTR [r12], ymm8
        vmovdqa YMMWORD PTR [r12+32], ymm9
        vmovdqa YMMWORD PTR [r12+64], ymm10
        vmovdqa YMMWORD PTR [r12+96], ymm11
        vmovdqa YMMWORD PTR [r12+128], ymm12
        vmovdqa YMMWORD PTR [r12+160], ymm13
        vmovdqa YMMWORD PTR [r12+192], ymm14
        vmovdqa YMMWORD PTR [r12+224], ymm15
        ; interleave words 0..7 so each register holds 32 consecutive output
        ; bytes of one block (offset 0 of each 64-byte block)
        vpunpckldq ymm8, ymm0, ymm1
        vpunpckldq ymm9, ymm2, ymm3
        vpunpckhdq ymm12, ymm0, ymm1
        vpunpckhdq ymm13, ymm2, ymm3
        vpunpckldq ymm10, ymm4, ymm5
        vpunpckldq ymm11, ymm6, ymm7
        vpunpckhdq ymm14, ymm4, ymm5
        vpunpckhdq ymm15, ymm6, ymm7
        vpunpcklqdq ymm0, ymm8, ymm9
        vpunpcklqdq ymm1, ymm10, ymm11
        vpunpckhqdq ymm2, ymm8, ymm9
        vpunpckhqdq ymm3, ymm10, ymm11
        vpunpcklqdq ymm4, ymm12, ymm13
        vpunpcklqdq ymm5, ymm14, ymm15
        vpunpckhqdq ymm6, ymm12, ymm13
        vpunpckhqdq ymm7, ymm14, ymm15
        vperm2i128 ymm8, ymm0, ymm1, 32
        vperm2i128 ymm9, ymm2, ymm3, 32
        vperm2i128 ymm12, ymm0, ymm1, 49
        vperm2i128 ymm13, ymm2, ymm3, 49
        vperm2i128 ymm10, ymm4, ymm5, 32
        vperm2i128 ymm11, ymm6, ymm7, 32
        vperm2i128 ymm14, ymm4, ymm5, 49
        vperm2i128 ymm15, ymm6, ymm7, 49
        vmovdqu ymm0, YMMWORD PTR [rdx]
        vmovdqu ymm1, YMMWORD PTR [rdx+64]
        vmovdqu ymm2, YMMWORD PTR [rdx+128]
        vmovdqu ymm3, YMMWORD PTR [rdx+192]
        vmovdqu ymm4, YMMWORD PTR [rdx+256]
        vmovdqu ymm5, YMMWORD PTR [rdx+320]
        vmovdqu ymm6, YMMWORD PTR [rdx+384]
        vmovdqu ymm7, YMMWORD PTR [rdx+448]
        vpxor ymm8, ymm8, ymm0
        vpxor ymm9, ymm9, ymm1
        vpxor ymm10, ymm10, ymm2
        vpxor ymm11, ymm11, ymm3
        vpxor ymm12, ymm12, ymm4
        vpxor ymm13, ymm13, ymm5
        vpxor ymm14, ymm14, ymm6
        vpxor ymm15, ymm15, ymm7
        vmovdqu YMMWORD PTR [r8], ymm8
        vmovdqu YMMWORD PTR [r8+64], ymm9
        vmovdqu YMMWORD PTR [r8+128], ymm10
        vmovdqu YMMWORD PTR [r8+192], ymm11
        vmovdqu YMMWORD PTR [r8+256], ymm12
        vmovdqu YMMWORD PTR [r8+320], ymm13
        vmovdqu YMMWORD PTR [r8+384], ymm14
        vmovdqu YMMWORD PTR [r8+448], ymm15
        ; same again for words 8..15 (offset 32 of each 64-byte block)
        vmovdqa ymm0, YMMWORD PTR [r12]
        vmovdqa ymm1, YMMWORD PTR [r12+32]
        vmovdqa ymm2, YMMWORD PTR [r12+64]
        vmovdqa ymm3, YMMWORD PTR [r12+96]
        vmovdqa ymm4, YMMWORD PTR [r12+128]
        vmovdqa ymm5, YMMWORD PTR [r12+160]
        vmovdqa ymm6, YMMWORD PTR [r12+192]
        vmovdqa ymm7, YMMWORD PTR [r12+224]
        vpunpckldq ymm8, ymm0, ymm1
        vpunpckldq ymm9, ymm2, ymm3
        vpunpckhdq ymm12, ymm0, ymm1
        vpunpckhdq ymm13, ymm2, ymm3
        vpunpckldq ymm10, ymm4, ymm5
        vpunpckldq ymm11, ymm6, ymm7
        vpunpckhdq ymm14, ymm4, ymm5
        vpunpckhdq ymm15, ymm6, ymm7
        vpunpcklqdq ymm0, ymm8, ymm9
        vpunpcklqdq ymm1, ymm10, ymm11
        vpunpckhqdq ymm2, ymm8, ymm9
        vpunpckhqdq ymm3, ymm10, ymm11
        vpunpcklqdq ymm4, ymm12, ymm13
        vpunpcklqdq ymm5, ymm14, ymm15
        vpunpckhqdq ymm6, ymm12, ymm13
        vpunpckhqdq ymm7, ymm14, ymm15
        vperm2i128 ymm8, ymm0, ymm1, 32
        vperm2i128 ymm9, ymm2, ymm3, 32
        vperm2i128 ymm12, ymm0, ymm1, 49
        vperm2i128 ymm13, ymm2, ymm3, 49
        vperm2i128 ymm10, ymm4, ymm5, 32
        vperm2i128 ymm11, ymm6, ymm7, 32
        vperm2i128 ymm14, ymm4, ymm5, 49
        vperm2i128 ymm15, ymm6, ymm7, 49
        vmovdqu ymm0, YMMWORD PTR [rdx+32]
        vmovdqu ymm1, YMMWORD PTR [rdx+96]
        vmovdqu ymm2, YMMWORD PTR [rdx+160]
        vmovdqu ymm3, YMMWORD PTR [rdx+224]
        vmovdqu ymm4, YMMWORD PTR [rdx+288]
        vmovdqu ymm5, YMMWORD PTR [rdx+352]
        vmovdqu ymm6, YMMWORD PTR [rdx+416]
        vmovdqu ymm7, YMMWORD PTR [rdx+480]
        vpxor ymm8, ymm8, ymm0
        vpxor ymm9, ymm9, ymm1
        vpxor ymm10, ymm10, ymm2
        vpxor ymm11, ymm11, ymm3
        vpxor ymm12, ymm12, ymm4
        vpxor ymm13, ymm13, ymm5
        vpxor ymm14, ymm14, ymm6
        vpxor ymm15, ymm15, ymm7
        vmovdqu YMMWORD PTR [r8+32], ymm8
        vmovdqu YMMWORD PTR [r8+96], ymm9
        vmovdqu YMMWORD PTR [r8+160], ymm10
        vmovdqu YMMWORD PTR [r8+224], ymm11
        vmovdqu YMMWORD PTR [r8+288], ymm12
        vmovdqu YMMWORD PTR [r8+352], ymm13
        vmovdqu YMMWORD PTR [r8+416], ymm14
        vmovdqu YMMWORD PTR [r8+480], ymm15
        ; advance pointers and length by 512 bytes; add 8 to every lane of the
        ; saved word 12
        vmovdqa ymm12, YMMWORD PTR [r11+384]
        add rdx, 512
        add r8, 512
        vpaddd ymm12, ymm12, YMMWORD PTR [rdi]
        sub r9d, 512
        vmovdqa YMMWORD PTR [r11+384], ymm12
        cmp r9d, 512
        jl L_chacha20_avx2_done256
        ; another 512-byte chunk: reload the working registers from [r11]
        vmovdqa ymm0, YMMWORD PTR [r11]
        vmovdqa ymm1, YMMWORD PTR [r11+32]
        vmovdqa ymm2, YMMWORD PTR [r11+64]
        vmovdqa ymm3, YMMWORD PTR [r11+96]
        vmovdqa ymm4, YMMWORD PTR [r11+128]
        vmovdqa ymm5, YMMWORD PTR [r11+160]
        vmovdqa ymm6, YMMWORD PTR [r11+192]
        vmovdqa ymm7, YMMWORD PTR [r11+224]
        vmovdqa ymm8, YMMWORD PTR [r11+256]
        vmovdqa ymm9, YMMWORD PTR [r11+288]
        vmovdqa ymm10, YMMWORD PTR [r11+320]
        vmovdqa ymm11, YMMWORD PTR [r11+352]
        vmovdqa ymm12, YMMWORD PTR [r11+384]
        vmovdqa ymm13, YMMWORD PTR [r11+416]
        vmovdqa ymm14, YMMWORD PTR [r11+448]
        vmovdqa ymm15, YMMWORD PTR [r11+480]
        jmp L_chacha20_avx2_start256
L_chacha20_avx2_done256:
        ; eax still holds the initial count of 512-byte chunks; advance the
        ; dword at [rcx+48] by 8 per chunk
        shl eax, 3
        add DWORD PTR [rcx+48], eax
L_chacha20_avx2_end256:
        ; rcx/rdx/r8/r9 already describe the remaining data; the AVX1 routine
        ; handles whatever is left (including nothing)
        call chacha_encrypt_avx1
        vzeroupper
        ; restore the callee-saved vector and integer registers
        vmovdqu xmm6, OWORD PTR [rsp+800]
        vmovdqu xmm7, OWORD PTR [rsp+816]
        vmovdqu xmm8, OWORD PTR [rsp+832]
        vmovdqu xmm9, OWORD PTR [rsp+848]
        vmovdqu xmm10, OWORD PTR [rsp+864]
        vmovdqu xmm11, OWORD PTR [rsp+880]
        vmovdqu xmm12, OWORD PTR [rsp+896]
        vmovdqu xmm13, OWORD PTR [rsp+912]
        vmovdqu xmm14, OWORD PTR [rsp+928]
        vmovdqu xmm15, OWORD PTR [rsp+944]
        add rsp, 960
        pop rdi
        pop r15
        pop r14
        pop r13
        pop r12
        ret
chacha_encrypt_avx2 ENDP
_text ENDS
ENDIF
END