/* fe_x25519_asm
 *
 * Copyright (C) 2006-2022 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */

#ifdef WOLFSSL_USER_SETTINGS
#ifdef WOLFSSL_USER_SETTINGS_ASM
/*
 * user_settings_asm.h is a file generated by the script user_settings_asm.sh.
 * The script takes in a user_settings.h and produces user_settings_asm.h, which
 * is a stripped down version of user_settings.h containing only preprocessor
 * directives. This makes the header safe to include in assembly (.S) files.
 */
#include "user_settings_asm.h"
#else
/*
 * Note: if user_settings.h contains any C code (e.g. a typedef or function
 * prototype), including it here in an assembly (.S) file will cause an
 * assembler failure. See user_settings_asm.h above.
 */
#include "user_settings.h"
#endif /* WOLFSSL_USER_SETTINGS_ASM */
#endif /* WOLFSSL_USER_SETTINGS */

#ifndef HAVE_INTEL_AVX1
#define HAVE_INTEL_AVX1
#endif /* HAVE_INTEL_AVX1 */
#ifndef NO_AVX2_SUPPORT
#define HAVE_INTEL_AVX2
#endif /* NO_AVX2_SUPPORT */

#ifndef __APPLE__
.text
.globl fe_init
.type fe_init,@function
.align 16
fe_init:
#else
.section __TEXT,__text
.globl _fe_init
.p2align 4
_fe_init:
#endif /* __APPLE__ */
#ifdef HAVE_INTEL_AVX2
#ifndef __APPLE__
        movq    cpuFlagsSet@GOTPCREL(%rip), %rax
        movl    (%rax), %eax
#else
        movl    _cpuFlagsSet(%rip), %eax
#endif /* __APPLE__ */
        testl   %eax, %eax
        je      L_fe_init_get_flags
        repz retq
L_fe_init_get_flags:
#ifndef __APPLE__
        callq   cpuid_get_flags@plt
#else
        callq   _cpuid_get_flags
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq    intelFlags@GOTPCREL(%rip), %rdx
        movl    %eax, (%rdx)
#else
        movl    %eax, _intelFlags(%rip)
#endif /* __APPLE__ */
        andl    $0x50, %eax
        cmpl    $0x50, %eax
        jne     L_fe_init_flags_done
#ifndef __APPLE__
        movq    fe_mul_avx2@GOTPCREL(%rip), %rax
#else
        leaq    _fe_mul_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq    fe_mul_p@GOTPCREL(%rip), %rdx
        movq    %rax, (%rdx)
#else
        movq    %rax, _fe_mul_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq    fe_sq_avx2@GOTPCREL(%rip), %rax
#else
        leaq    _fe_sq_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq    fe_sq_p@GOTPCREL(%rip), %rdx
        movq    %rax, (%rdx)
#else
        movq    %rax, _fe_sq_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq    fe_mul121666_avx2@GOTPCREL(%rip), %rax
#else
        leaq    _fe_mul121666_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq    fe_mul121666_p@GOTPCREL(%rip), %rdx
        movq    %rax, (%rdx)
#else
        movq    %rax, _fe_mul121666_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq    fe_sq2_avx2@GOTPCREL(%rip), %rax
#else
        leaq    _fe_sq2_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq    fe_sq2_p@GOTPCREL(%rip), %rdx
        movq    %rax, (%rdx)
#else
        movq    %rax, _fe_sq2_p(%rip)
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq    fe_invert_avx2@GOTPCREL(%rip), %rax
#else
        leaq    _fe_invert_avx2(%rip), %rax
#endif /* __APPLE__ */
#ifndef __APPLE__
        movq    fe_invert_p@GOTPCREL(%rip), %rdx
        movq    %rax, (%rdx)
#else
        movq    %rax,
_fe_invert_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ movq curve25519_avx2@GOTPCREL(%rip), %rax #else leaq _curve25519_avx2(%rip), %rax #endif /* __APPLE__ */ #ifndef __APPLE__ movq curve25519_p@GOTPCREL(%rip), %rdx movq %rax, (%rdx) #else movq %rax, _curve25519_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ movq fe_pow22523_avx2@GOTPCREL(%rip), %rax #else leaq _fe_pow22523_avx2(%rip), %rax #endif /* __APPLE__ */ #ifndef __APPLE__ movq fe_pow22523_p@GOTPCREL(%rip), %rdx movq %rax, (%rdx) #else movq %rax, _fe_pow22523_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ movq fe_ge_to_p2_avx2@GOTPCREL(%rip), %rax #else leaq _fe_ge_to_p2_avx2(%rip), %rax #endif /* __APPLE__ */ #ifndef __APPLE__ movq fe_ge_to_p2_p@GOTPCREL(%rip), %rdx movq %rax, (%rdx) #else movq %rax, _fe_ge_to_p2_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ movq fe_ge_to_p3_avx2@GOTPCREL(%rip), %rax #else leaq _fe_ge_to_p3_avx2(%rip), %rax #endif /* __APPLE__ */ #ifndef __APPLE__ movq fe_ge_to_p3_p@GOTPCREL(%rip), %rdx movq %rax, (%rdx) #else movq %rax, _fe_ge_to_p3_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ movq fe_ge_dbl_avx2@GOTPCREL(%rip), %rax #else leaq _fe_ge_dbl_avx2(%rip), %rax #endif /* __APPLE__ */ #ifndef __APPLE__ movq fe_ge_dbl_p@GOTPCREL(%rip), %rdx movq %rax, (%rdx) #else movq %rax, _fe_ge_dbl_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ movq fe_ge_madd_avx2@GOTPCREL(%rip), %rax #else leaq _fe_ge_madd_avx2(%rip), %rax #endif /* __APPLE__ */ #ifndef __APPLE__ movq fe_ge_madd_p@GOTPCREL(%rip), %rdx movq %rax, (%rdx) #else movq %rax, _fe_ge_madd_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ movq fe_ge_msub_avx2@GOTPCREL(%rip), %rax #else leaq _fe_ge_msub_avx2(%rip), %rax #endif /* __APPLE__ */ #ifndef __APPLE__ movq fe_ge_msub_p@GOTPCREL(%rip), %rdx movq %rax, (%rdx) #else movq %rax, _fe_ge_msub_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ movq fe_ge_add_avx2@GOTPCREL(%rip), %rax #else leaq _fe_ge_add_avx2(%rip), %rax #endif /* __APPLE__ */ #ifndef __APPLE__ movq fe_ge_add_p@GOTPCREL(%rip), %rdx movq %rax, (%rdx) #else movq %rax, _fe_ge_add_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ movq fe_ge_sub_avx2@GOTPCREL(%rip), %rax #else leaq _fe_ge_sub_avx2(%rip), %rax #endif /* __APPLE__ */ #ifndef __APPLE__ movq fe_ge_sub_p@GOTPCREL(%rip), %rdx movq %rax, (%rdx) #else movq %rax, _fe_ge_sub_p(%rip) #endif /* __APPLE__ */ L_fe_init_flags_done: #ifndef __APPLE__ movq cpuFlagsSet@GOTPCREL(%rip), %rdx movl $0x1, (%rdx) #else movl $0x1, _cpuFlagsSet(%rip) #endif /* __APPLE__ */ #endif /* HAVE_INTEL_AVX2 */ repz retq #ifndef __APPLE__ .size fe_init,.-fe_init #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_frombytes .type fe_frombytes,@function .align 16 fe_frombytes: #else .section __TEXT,__text .globl _fe_frombytes .p2align 4 _fe_frombytes: #endif /* __APPLE__ */ movq $0x7fffffffffffffff, %r9 movq (%rsi), %rdx movq 8(%rsi), %rax movq 16(%rsi), %rcx movq 24(%rsi), %r8 andq %r9, %r8 movq %rdx, (%rdi) movq %rax, 8(%rdi) movq %rcx, 16(%rdi) movq %r8, 24(%rdi) repz retq #ifndef __APPLE__ .size fe_frombytes,.-fe_frombytes #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_tobytes .type fe_tobytes,@function .align 16 fe_tobytes: #else .section __TEXT,__text .globl _fe_tobytes .p2align 4 _fe_tobytes: #endif /* __APPLE__ */ movq $0x7fffffffffffffff, %r10 movq (%rsi), %rdx movq 8(%rsi), %rax movq 16(%rsi), %rcx movq 24(%rsi), %r8 addq $19, %rdx adcq $0x00, %rax adcq $0x00, %rcx adcq $0x00, %r8 shrq $63, %r8 imulq $19, %r8, %r9 movq (%rsi), %rdx movq 8(%rsi), %rax movq 16(%rsi), %rcx movq 
24(%rsi), %r8 addq %r9, %rdx adcq $0x00, %rax adcq $0x00, %rcx adcq $0x00, %r8 andq %r10, %r8 movq %rdx, (%rdi) movq %rax, 8(%rdi) movq %rcx, 16(%rdi) movq %r8, 24(%rdi) repz retq #ifndef __APPLE__ .size fe_tobytes,.-fe_tobytes #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_1 .type fe_1,@function .align 16 fe_1: #else .section __TEXT,__text .globl _fe_1 .p2align 4 _fe_1: #endif /* __APPLE__ */ # Set one movq $0x01, (%rdi) movq $0x00, 8(%rdi) movq $0x00, 16(%rdi) movq $0x00, 24(%rdi) repz retq #ifndef __APPLE__ .size fe_1,.-fe_1 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_0 .type fe_0,@function .align 16 fe_0: #else .section __TEXT,__text .globl _fe_0 .p2align 4 _fe_0: #endif /* __APPLE__ */ # Set zero movq $0x00, (%rdi) movq $0x00, 8(%rdi) movq $0x00, 16(%rdi) movq $0x00, 24(%rdi) repz retq #ifndef __APPLE__ .size fe_0,.-fe_0 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_copy .type fe_copy,@function .align 16 fe_copy: #else .section __TEXT,__text .globl _fe_copy .p2align 4 _fe_copy: #endif /* __APPLE__ */ # Copy movq (%rsi), %rdx movq 8(%rsi), %rax movq 16(%rsi), %rcx movq 24(%rsi), %r8 movq %rdx, (%rdi) movq %rax, 8(%rdi) movq %rcx, 16(%rdi) movq %r8, 24(%rdi) repz retq #ifndef __APPLE__ .size fe_copy,.-fe_copy #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_sub .type fe_sub,@function .align 16 fe_sub: #else .section __TEXT,__text .globl _fe_sub .p2align 4 _fe_sub: #endif /* __APPLE__ */ pushq %r12 # Sub movq (%rsi), %rax movq 8(%rsi), %rcx movq 16(%rsi), %r8 movq 24(%rsi), %r9 subq (%rdx), %rax movq $0x00, %r10 sbbq 8(%rdx), %rcx movq $-19, %r11 sbbq 16(%rdx), %r8 movq $0x7fffffffffffffff, %r12 sbbq 24(%rdx), %r9 sbbq $0x00, %r10 # Mask the modulus andq %r10, %r11 andq %r10, %r12 # Add modulus (if underflow) addq %r11, %rax adcq %r10, %rcx adcq %r10, %r8 adcq %r12, %r9 movq %rax, (%rdi) movq %rcx, 8(%rdi) movq %r8, 16(%rdi) movq %r9, 24(%rdi) popq %r12 repz retq #ifndef __APPLE__ .size fe_sub,.-fe_sub #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_add .type fe_add,@function .align 16 fe_add: #else .section __TEXT,__text .globl _fe_add .p2align 4 _fe_add: #endif /* __APPLE__ */ pushq %r12 # Add movq (%rsi), %rax movq 8(%rsi), %rcx addq (%rdx), %rax movq 16(%rsi), %r8 adcq 8(%rdx), %rcx movq 24(%rsi), %r10 adcq 16(%rdx), %r8 movq $-19, %r11 adcq 24(%rdx), %r10 movq $0x7fffffffffffffff, %r12 movq %r10, %r9 sarq $63, %r10 # Mask the modulus andq %r10, %r11 andq %r10, %r12 # Sub modulus (if overflow) subq %r11, %rax sbbq %r10, %rcx sbbq %r10, %r8 sbbq %r12, %r9 movq %rax, (%rdi) movq %rcx, 8(%rdi) movq %r8, 16(%rdi) movq %r9, 24(%rdi) popq %r12 repz retq #ifndef __APPLE__ .size fe_add,.-fe_add #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_neg .type fe_neg,@function .align 16 fe_neg: #else .section __TEXT,__text .globl _fe_neg .p2align 4 _fe_neg: #endif /* __APPLE__ */ movq $-19, %rdx movq $-1, %rax movq $-1, %rcx movq $0x7fffffffffffffff, %r8 subq (%rsi), %rdx sbbq 8(%rsi), %rax sbbq 16(%rsi), %rcx sbbq 24(%rsi), %r8 movq %rdx, (%rdi) movq %rax, 8(%rdi) movq %rcx, 16(%rdi) movq %r8, 24(%rdi) repz retq #ifndef __APPLE__ .size fe_neg,.-fe_neg #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_cmov .type fe_cmov,@function .align 16 fe_cmov: #else .section __TEXT,__text .globl _fe_cmov .p2align 4 _fe_cmov: #endif /* __APPLE__ */ cmpl $0x01, %edx movq (%rdi), %rcx movq 8(%rdi), %r8 movq 16(%rdi), %r9 movq 24(%rdi), %r10 cmoveq (%rsi), %rcx cmoveq 8(%rsi), %r8 cmoveq 16(%rsi), %r9 cmoveq 24(%rsi), %r10 movq %rcx, (%rdi) movq %r8, 
8(%rdi) movq %r9, 16(%rdi) movq %r10, 24(%rdi) repz retq #ifndef __APPLE__ .size fe_cmov,.-fe_cmov #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_isnonzero .type fe_isnonzero,@function .align 16 fe_isnonzero: #else .section __TEXT,__text .globl _fe_isnonzero .p2align 4 _fe_isnonzero: #endif /* __APPLE__ */ movq $0x7fffffffffffffff, %r10 movq (%rdi), %rax movq 8(%rdi), %rdx movq 16(%rdi), %rcx movq 24(%rdi), %r8 addq $19, %rax adcq $0x00, %rdx adcq $0x00, %rcx adcq $0x00, %r8 shrq $63, %r8 imulq $19, %r8, %r9 movq (%rdi), %rax movq 8(%rdi), %rdx movq 16(%rdi), %rcx movq 24(%rdi), %r8 addq %r9, %rax adcq $0x00, %rdx adcq $0x00, %rcx adcq $0x00, %r8 andq %r10, %r8 orq %rdx, %rax orq %rcx, %rax orq %r8, %rax repz retq #ifndef __APPLE__ .size fe_isnonzero,.-fe_isnonzero #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_isnegative .type fe_isnegative,@function .align 16 fe_isnegative: #else .section __TEXT,__text .globl _fe_isnegative .p2align 4 _fe_isnegative: #endif /* __APPLE__ */ movq $0x7fffffffffffffff, %r11 movq (%rdi), %rdx movq 8(%rdi), %rcx movq 16(%rdi), %r8 movq 24(%rdi), %r9 movq %rdx, %rax addq $19, %rdx adcq $0x00, %rcx adcq $0x00, %r8 adcq $0x00, %r9 shrq $63, %r9 imulq $19, %r9, %r10 addq %r10, %rax andq $0x01, %rax repz retq #ifndef __APPLE__ .size fe_isnegative,.-fe_isnegative #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_cmov_table .type fe_cmov_table,@function .align 16 fe_cmov_table: #else .section __TEXT,__text .globl _fe_cmov_table .p2align 4 _fe_cmov_table: #endif /* __APPLE__ */ pushq %r12 pushq %r13 pushq %r14 pushq %r15 movq %rdx, %rcx movsbq %cl, %rax cdq xorb %dl, %al subb %dl, %al movb %al, %r15b movq $0x01, %rax xorq %rdx, %rdx xorq %r8, %r8 xorq %r9, %r9 movq $0x01, %r10 xorq %r11, %r11 xorq %r12, %r12 xorq %r13, %r13 cmpb $0x01, %r15b movq (%rsi), %r14 cmoveq %r14, %rax movq 8(%rsi), %r14 cmoveq %r14, %rdx movq 16(%rsi), %r14 cmoveq %r14, %r8 movq 24(%rsi), %r14 cmoveq %r14, %r9 movq 32(%rsi), %r14 cmoveq %r14, %r10 movq 40(%rsi), %r14 cmoveq %r14, %r11 movq 48(%rsi), %r14 cmoveq %r14, %r12 movq 56(%rsi), %r14 cmoveq %r14, %r13 cmpb $2, %r15b movq 96(%rsi), %r14 cmoveq %r14, %rax movq 104(%rsi), %r14 cmoveq %r14, %rdx movq 112(%rsi), %r14 cmoveq %r14, %r8 movq 120(%rsi), %r14 cmoveq %r14, %r9 movq 128(%rsi), %r14 cmoveq %r14, %r10 movq 136(%rsi), %r14 cmoveq %r14, %r11 movq 144(%rsi), %r14 cmoveq %r14, %r12 movq 152(%rsi), %r14 cmoveq %r14, %r13 cmpb $3, %r15b movq 192(%rsi), %r14 cmoveq %r14, %rax movq 200(%rsi), %r14 cmoveq %r14, %rdx movq 208(%rsi), %r14 cmoveq %r14, %r8 movq 216(%rsi), %r14 cmoveq %r14, %r9 movq 224(%rsi), %r14 cmoveq %r14, %r10 movq 232(%rsi), %r14 cmoveq %r14, %r11 movq 240(%rsi), %r14 cmoveq %r14, %r12 movq 248(%rsi), %r14 cmoveq %r14, %r13 cmpb $4, %r15b movq 288(%rsi), %r14 cmoveq %r14, %rax movq 296(%rsi), %r14 cmoveq %r14, %rdx movq 304(%rsi), %r14 cmoveq %r14, %r8 movq 312(%rsi), %r14 cmoveq %r14, %r9 movq 320(%rsi), %r14 cmoveq %r14, %r10 movq 328(%rsi), %r14 cmoveq %r14, %r11 movq 336(%rsi), %r14 cmoveq %r14, %r12 movq 344(%rsi), %r14 cmoveq %r14, %r13 cmpb $5, %r15b movq 384(%rsi), %r14 cmoveq %r14, %rax movq 392(%rsi), %r14 cmoveq %r14, %rdx movq 400(%rsi), %r14 cmoveq %r14, %r8 movq 408(%rsi), %r14 cmoveq %r14, %r9 movq 416(%rsi), %r14 cmoveq %r14, %r10 movq 424(%rsi), %r14 cmoveq %r14, %r11 movq 432(%rsi), %r14 cmoveq %r14, %r12 movq 440(%rsi), %r14 cmoveq %r14, %r13 cmpb $6, %r15b movq 480(%rsi), %r14 cmoveq %r14, %rax movq 488(%rsi), %r14 cmoveq %r14, %rdx movq 496(%rsi), %r14 cmoveq %r14, %r8 movq 
504(%rsi), %r14 cmoveq %r14, %r9 movq 512(%rsi), %r14 cmoveq %r14, %r10 movq 520(%rsi), %r14 cmoveq %r14, %r11 movq 528(%rsi), %r14 cmoveq %r14, %r12 movq 536(%rsi), %r14 cmoveq %r14, %r13 cmpb $7, %r15b movq 576(%rsi), %r14 cmoveq %r14, %rax movq 584(%rsi), %r14 cmoveq %r14, %rdx movq 592(%rsi), %r14 cmoveq %r14, %r8 movq 600(%rsi), %r14 cmoveq %r14, %r9 movq 608(%rsi), %r14 cmoveq %r14, %r10 movq 616(%rsi), %r14 cmoveq %r14, %r11 movq 624(%rsi), %r14 cmoveq %r14, %r12 movq 632(%rsi), %r14 cmoveq %r14, %r13 cmpb $8, %r15b movq 672(%rsi), %r14 cmoveq %r14, %rax movq 680(%rsi), %r14 cmoveq %r14, %rdx movq 688(%rsi), %r14 cmoveq %r14, %r8 movq 696(%rsi), %r14 cmoveq %r14, %r9 movq 704(%rsi), %r14 cmoveq %r14, %r10 movq 712(%rsi), %r14 cmoveq %r14, %r11 movq 720(%rsi), %r14 cmoveq %r14, %r12 movq 728(%rsi), %r14 cmoveq %r14, %r13 cmpb $0x00, %cl movq %rax, %r14 cmovlq %r10, %rax cmovlq %r14, %r10 movq %rdx, %r14 cmovlq %r11, %rdx cmovlq %r14, %r11 movq %r8, %r14 cmovlq %r12, %r8 cmovlq %r14, %r12 movq %r9, %r14 cmovlq %r13, %r9 cmovlq %r14, %r13 movq %rax, (%rdi) movq %rdx, 8(%rdi) movq %r8, 16(%rdi) movq %r9, 24(%rdi) movq %r10, 32(%rdi) movq %r11, 40(%rdi) movq %r12, 48(%rdi) movq %r13, 56(%rdi) xorq %rax, %rax xorq %rdx, %rdx xorq %r8, %r8 xorq %r9, %r9 cmpb $0x01, %r15b movq 64(%rsi), %r14 cmoveq %r14, %rax movq 72(%rsi), %r14 cmoveq %r14, %rdx movq 80(%rsi), %r14 cmoveq %r14, %r8 movq 88(%rsi), %r14 cmoveq %r14, %r9 cmpb $2, %r15b movq 160(%rsi), %r14 cmoveq %r14, %rax movq 168(%rsi), %r14 cmoveq %r14, %rdx movq 176(%rsi), %r14 cmoveq %r14, %r8 movq 184(%rsi), %r14 cmoveq %r14, %r9 cmpb $3, %r15b movq 256(%rsi), %r14 cmoveq %r14, %rax movq 264(%rsi), %r14 cmoveq %r14, %rdx movq 272(%rsi), %r14 cmoveq %r14, %r8 movq 280(%rsi), %r14 cmoveq %r14, %r9 cmpb $4, %r15b movq 352(%rsi), %r14 cmoveq %r14, %rax movq 360(%rsi), %r14 cmoveq %r14, %rdx movq 368(%rsi), %r14 cmoveq %r14, %r8 movq 376(%rsi), %r14 cmoveq %r14, %r9 cmpb $5, %r15b movq 448(%rsi), %r14 cmoveq %r14, %rax movq 456(%rsi), %r14 cmoveq %r14, %rdx movq 464(%rsi), %r14 cmoveq %r14, %r8 movq 472(%rsi), %r14 cmoveq %r14, %r9 cmpb $6, %r15b movq 544(%rsi), %r14 cmoveq %r14, %rax movq 552(%rsi), %r14 cmoveq %r14, %rdx movq 560(%rsi), %r14 cmoveq %r14, %r8 movq 568(%rsi), %r14 cmoveq %r14, %r9 cmpb $7, %r15b movq 640(%rsi), %r14 cmoveq %r14, %rax movq 648(%rsi), %r14 cmoveq %r14, %rdx movq 656(%rsi), %r14 cmoveq %r14, %r8 movq 664(%rsi), %r14 cmoveq %r14, %r9 cmpb $8, %r15b movq 736(%rsi), %r14 cmoveq %r14, %rax movq 744(%rsi), %r14 cmoveq %r14, %rdx movq 752(%rsi), %r14 cmoveq %r14, %r8 movq 760(%rsi), %r14 cmoveq %r14, %r9 movq $-19, %r10 movq $-1, %r11 movq $-1, %r12 movq $0x7fffffffffffffff, %r13 subq %rax, %r10 sbbq %rdx, %r11 sbbq %r8, %r12 sbbq %r9, %r13 cmpb $0x00, %cl cmovlq %r10, %rax cmovlq %r11, %rdx cmovlq %r12, %r8 cmovlq %r13, %r9 movq %rax, 64(%rdi) movq %rdx, 72(%rdi) movq %r8, 80(%rdi) movq %r9, 88(%rdi) popq %r15 popq %r14 popq %r13 popq %r12 repz retq #ifndef __APPLE__ .size fe_cmov_table,.-fe_cmov_table #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_mul .type fe_mul,@function .align 16 fe_mul: #else .section __TEXT,__text .globl _fe_mul .p2align 4 _fe_mul: #endif /* __APPLE__ */ #ifndef __APPLE__ jmpq *fe_mul_p(%rip) #else jmpq *_fe_mul_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ .size fe_mul,.-fe_mul #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_sq .type fe_sq,@function .align 16 fe_sq: #else .section __TEXT,__text .globl _fe_sq .p2align 4 _fe_sq: #endif /* __APPLE__ */ #ifndef __APPLE__ 
jmpq *fe_sq_p(%rip) #else jmpq *_fe_sq_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ .size fe_sq,.-fe_sq #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_mul121666 .type fe_mul121666,@function .align 16 fe_mul121666: #else .section __TEXT,__text .globl _fe_mul121666 .p2align 4 _fe_mul121666: #endif /* __APPLE__ */ #ifndef __APPLE__ jmpq *fe_mul121666_p(%rip) #else jmpq *_fe_mul121666_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ .size fe_mul121666,.-fe_mul121666 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_sq2 .type fe_sq2,@function .align 16 fe_sq2: #else .section __TEXT,__text .globl _fe_sq2 .p2align 4 _fe_sq2: #endif /* __APPLE__ */ #ifndef __APPLE__ jmpq *fe_sq2_p(%rip) #else jmpq *_fe_sq2_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ .size fe_sq2,.-fe_sq2 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_invert .type fe_invert,@function .align 16 fe_invert: #else .section __TEXT,__text .globl _fe_invert .p2align 4 _fe_invert: #endif /* __APPLE__ */ #ifndef __APPLE__ jmpq *fe_invert_p(%rip) #else jmpq *_fe_invert_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ .size fe_invert,.-fe_invert #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl curve25519 .type curve25519,@function .align 16 curve25519: #else .section __TEXT,__text .globl _curve25519 .p2align 4 _curve25519: #endif /* __APPLE__ */ #ifndef __APPLE__ jmpq *curve25519_p(%rip) #else jmpq *_curve25519_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ .size curve25519,.-curve25519 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_pow22523 .type fe_pow22523,@function .align 16 fe_pow22523: #else .section __TEXT,__text .globl _fe_pow22523 .p2align 4 _fe_pow22523: #endif /* __APPLE__ */ #ifndef __APPLE__ jmpq *fe_pow22523_p(%rip) #else jmpq *_fe_pow22523_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ .size fe_pow22523,.-fe_pow22523 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_ge_to_p2 .type fe_ge_to_p2,@function .align 16 fe_ge_to_p2: #else .section __TEXT,__text .globl _fe_ge_to_p2 .p2align 4 _fe_ge_to_p2: #endif /* __APPLE__ */ #ifndef __APPLE__ jmpq *fe_ge_to_p2_p(%rip) #else jmpq *_fe_ge_to_p2_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ .size fe_ge_to_p2,.-fe_ge_to_p2 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_ge_to_p3 .type fe_ge_to_p3,@function .align 16 fe_ge_to_p3: #else .section __TEXT,__text .globl _fe_ge_to_p3 .p2align 4 _fe_ge_to_p3: #endif /* __APPLE__ */ #ifndef __APPLE__ jmpq *fe_ge_to_p3_p(%rip) #else jmpq *_fe_ge_to_p3_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ .size fe_ge_to_p3,.-fe_ge_to_p3 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_ge_dbl .type fe_ge_dbl,@function .align 16 fe_ge_dbl: #else .section __TEXT,__text .globl _fe_ge_dbl .p2align 4 _fe_ge_dbl: #endif /* __APPLE__ */ #ifndef __APPLE__ jmpq *fe_ge_dbl_p(%rip) #else jmpq *_fe_ge_dbl_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ .size fe_ge_dbl,.-fe_ge_dbl #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_ge_madd .type fe_ge_madd,@function .align 16 fe_ge_madd: #else .section __TEXT,__text .globl _fe_ge_madd .p2align 4 _fe_ge_madd: #endif /* __APPLE__ */ #ifndef __APPLE__ jmpq *fe_ge_madd_p(%rip) #else jmpq *_fe_ge_madd_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ .size fe_ge_madd,.-fe_ge_madd #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_ge_msub .type fe_ge_msub,@function .align 16 fe_ge_msub: #else .section __TEXT,__text .globl _fe_ge_msub .p2align 4 _fe_ge_msub: #endif /* __APPLE__ */ #ifndef __APPLE__ jmpq *fe_ge_msub_p(%rip) #else 
jmpq *_fe_ge_msub_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ .size fe_ge_msub,.-fe_ge_msub #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_ge_add .type fe_ge_add,@function .align 16 fe_ge_add: #else .section __TEXT,__text .globl _fe_ge_add .p2align 4 _fe_ge_add: #endif /* __APPLE__ */ #ifndef __APPLE__ jmpq *fe_ge_add_p(%rip) #else jmpq *_fe_ge_add_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ .size fe_ge_add,.-fe_ge_add #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_ge_sub .type fe_ge_sub,@function .align 16 fe_ge_sub: #else .section __TEXT,__text .globl _fe_ge_sub .p2align 4 _fe_ge_sub: #endif /* __APPLE__ */ #ifndef __APPLE__ jmpq *fe_ge_sub_p(%rip) #else jmpq *_fe_ge_sub_p(%rip) #endif /* __APPLE__ */ #ifndef __APPLE__ .size fe_ge_sub,.-fe_ge_sub #endif /* __APPLE__ */ #ifndef __APPLE__ .data .type cpuFlagsSet, @object .size cpuFlagsSet,4 cpuFlagsSet: .long 0 #else .section __DATA,__data .p2align 2 _cpuFlagsSet: .long 0 #endif /* __APPLE__ */ #ifndef __APPLE__ .data .type intelFlags, @object .size intelFlags,4 intelFlags: .long 0 #else .section __DATA,__data .p2align 2 _intelFlags: .long 0 #endif /* __APPLE__ */ #ifndef __APPLE__ .data .type fe_mul_p, @object .size fe_mul_p,8 fe_mul_p: .quad fe_mul_x64 #else .section __DATA,__data .p2align 2 _fe_mul_p: .quad _fe_mul_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .data .type fe_sq_p, @object .size fe_sq_p,8 fe_sq_p: .quad fe_sq_x64 #else .section __DATA,__data .p2align 2 _fe_sq_p: .quad _fe_sq_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .data .type fe_mul121666_p, @object .size fe_mul121666_p,8 fe_mul121666_p: .quad fe_mul121666_x64 #else .section __DATA,__data .p2align 2 _fe_mul121666_p: .quad _fe_mul121666_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .data .type fe_sq2_p, @object .size fe_sq2_p,8 fe_sq2_p: .quad fe_sq2_x64 #else .section __DATA,__data .p2align 2 _fe_sq2_p: .quad _fe_sq2_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .data .type fe_invert_p, @object .size fe_invert_p,8 fe_invert_p: .quad fe_invert_x64 #else .section __DATA,__data .p2align 2 _fe_invert_p: .quad _fe_invert_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .data .type curve25519_p, @object .size curve25519_p,8 curve25519_p: .quad curve25519_x64 #else .section __DATA,__data .p2align 2 _curve25519_p: .quad _curve25519_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .data .type fe_pow22523_p, @object .size fe_pow22523_p,8 fe_pow22523_p: .quad fe_pow22523_x64 #else .section __DATA,__data .p2align 2 _fe_pow22523_p: .quad _fe_pow22523_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .data .type fe_ge_to_p2_p, @object .size fe_ge_to_p2_p,8 fe_ge_to_p2_p: .quad fe_ge_to_p2_x64 #else .section __DATA,__data .p2align 2 _fe_ge_to_p2_p: .quad _fe_ge_to_p2_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .data .type fe_ge_to_p3_p, @object .size fe_ge_to_p3_p,8 fe_ge_to_p3_p: .quad fe_ge_to_p3_x64 #else .section __DATA,__data .p2align 2 _fe_ge_to_p3_p: .quad _fe_ge_to_p3_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .data .type fe_ge_dbl_p, @object .size fe_ge_dbl_p,8 fe_ge_dbl_p: .quad fe_ge_dbl_x64 #else .section __DATA,__data .p2align 2 _fe_ge_dbl_p: .quad _fe_ge_dbl_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .data .type fe_ge_madd_p, @object .size fe_ge_madd_p,8 fe_ge_madd_p: .quad fe_ge_madd_x64 #else .section __DATA,__data .p2align 2 _fe_ge_madd_p: .quad _fe_ge_madd_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .data .type fe_ge_msub_p, @object .size fe_ge_msub_p,8 fe_ge_msub_p: .quad fe_ge_msub_x64 #else .section __DATA,__data .p2align 2 _fe_ge_msub_p: 
.quad _fe_ge_msub_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .data .type fe_ge_add_p, @object .size fe_ge_add_p,8 fe_ge_add_p: .quad fe_ge_add_x64 #else .section __DATA,__data .p2align 2 _fe_ge_add_p: .quad _fe_ge_add_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .data .type fe_ge_sub_p, @object .size fe_ge_sub_p,8 fe_ge_sub_p: .quad fe_ge_sub_x64 #else .section __DATA,__data .p2align 2 _fe_ge_sub_p: .quad _fe_ge_sub_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_mul_x64 .type fe_mul_x64,@function .align 16 fe_mul_x64: #else .section __TEXT,__text .globl _fe_mul_x64 .p2align 4 _fe_mul_x64: #endif /* __APPLE__ */ pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %rbx movq %rdx, %rcx # Multiply # A[0] * B[0] movq (%rcx), %rax mulq (%rsi) movq %rax, %r8 movq %rdx, %r9 # A[0] * B[1] movq 8(%rcx), %rax mulq (%rsi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rcx), %rax mulq 8(%rsi) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rcx), %rax mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rcx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq (%rcx), %rax mulq 16(%rsi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 24(%rcx), %rax mulq (%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 16(%rcx), %rax mulq 8(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rcx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rcx), %rax mulq 24(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rcx), %rax mulq 8(%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rcx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rcx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rcx), %rax mulq 16(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rcx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rcx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 # Reduce movq $0x7fffffffffffffff, %rbx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rbx, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rbx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rbx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 repz retq #ifndef __APPLE__ .size fe_mul_x64,.-fe_mul_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_sq_x64 .type fe_sq_x64,@function .align 16 fe_sq_x64: #else .section __TEXT,__text .globl _fe_sq_x64 .p2align 4 _fe_sq_x64: #endif /* __APPLE__ */ pushq %r12 
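        # fe_sq_x64: square a 4x64-bit field element modulo p = 2^255 - 19.
        # The commented steps that follow use the schoolbook pattern: the
        # off-diagonal products A[i]*A[j] (i < j) are accumulated and then
        # doubled, the diagonal squares A[i]*A[i] are added in, and the
        # 512-bit result is reduced by folding everything at or above bit 255
        # back into the low limbs multiplied by 19, using 2^255 = 19 (mod p).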
pushq %r13 pushq %r14 pushq %r15 # Square # A[0] * A[1] movq (%rsi), %rax mulq 8(%rsi) movq %rax, %r8 movq %rdx, %r9 # A[0] * A[2] movq (%rsi), %rax mulq 16(%rsi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[0] * A[3] movq (%rsi), %rax mulq 24(%rsi) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[1] * A[2] movq 8(%rsi), %rax mulq 16(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[1] * A[3] movq 8(%rsi), %rax mulq 24(%rsi) addq %rax, %r11 adcq %rdx, %r12 # A[2] * A[3] movq 16(%rsi), %rax mulq 24(%rsi) xorq %r13, %r13 addq %rax, %r12 adcq %rdx, %r13 # Double xorq %r14, %r14 addq %r8, %r8 adcq %r9, %r9 adcq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq $0x00, %r14 # A[0] * A[0] movq (%rsi), %rax mulq %rax movq %rax, %rcx movq %rdx, %r15 # A[1] * A[1] movq 8(%rsi), %rax mulq %rax addq %r15, %r8 adcq %rax, %r9 adcq $0x00, %rdx movq %rdx, %r15 # A[2] * A[2] movq 16(%rsi), %rax mulq %rax addq %r15, %r10 adcq %rax, %r11 adcq $0x00, %rdx movq %rdx, %r15 # A[3] * A[3] movq 24(%rsi), %rax mulq %rax addq %rax, %r13 adcq %rdx, %r14 addq %r15, %r12 adcq $0x00, %r13 adcq $0x00, %r14 # Reduce movq $0x7fffffffffffffff, %r15 # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 shldq $0x01, %r10, %r11 andq %r15, %r10 # Multiply top half by 19 movq $19, %rax mulq %r11 xorq %r11, %r11 addq %rax, %rcx movq $19, %rax adcq %rdx, %r11 mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 # Add remaining product results in addq %r11, %r8 adcq %r12, %r9 adcq %r13, %r10 adcq %rax, %r10 adcq $0x00, %rdx # Overflow shldq $0x01, %r10, %rdx imulq $19, %rdx, %rax andq %r15, %r10 addq %rax, %rcx adcq $0x00, %r8 adcq $0x00, %r9 adcq $0x00, %r10 # Reduce if top bit set movq %r10, %rdx sarq $63, %rdx andq $19, %rdx andq %r15, %r10 addq %rdx, %rcx adcq $0x00, %r8 adcq $0x00, %r9 adcq $0x00, %r10 # Store movq %rcx, (%rdi) movq %r8, 8(%rdi) movq %r9, 16(%rdi) movq %r10, 24(%rdi) popq %r15 popq %r14 popq %r13 popq %r12 repz retq #ifndef __APPLE__ .size fe_sq_x64,.-fe_sq_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_sq_n_x64 .type fe_sq_n_x64,@function .align 16 fe_sq_n_x64: #else .section __TEXT,__text .globl _fe_sq_n_x64 .p2align 4 _fe_sq_n_x64: #endif /* __APPLE__ */ pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %rbx movq %rdx, %rcx L_fe_sq_n_x64: # Square # A[0] * A[1] movq (%rsi), %rax mulq 8(%rsi) movq %rax, %r9 movq %rdx, %r10 # A[0] * A[2] movq (%rsi), %rax mulq 16(%rsi) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[0] * A[3] movq (%rsi), %rax mulq 24(%rsi) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 # A[1] * A[2] movq 8(%rsi), %rax mulq 16(%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * A[3] movq 8(%rsi), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 # A[2] * A[3] movq 16(%rsi), %rax mulq 24(%rsi) xorq %r14, %r14 addq %rax, %r13 adcq %rdx, %r14 # Double xorq %r15, %r15 addq %r9, %r9 adcq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq %r14, %r14 adcq $0x00, %r15 # A[0] * A[0] movq (%rsi), %rax mulq %rax movq %rax, %r8 movq %rdx, %rbx # A[1] * A[1] movq 8(%rsi), %rax mulq %rax addq %rbx, %r9 adcq %rax, %r10 adcq $0x00, %rdx movq %rdx, %rbx # A[2] * A[2] movq 16(%rsi), %rax mulq %rax addq %rbx, %r11 adcq %rax, %r12 adcq $0x00, %rdx movq %rdx, %rbx # A[3] * A[3] movq 24(%rsi), %rax mulq %rax addq %rax, %r14 adcq %rdx, 
%r15 addq %rbx, %r13 adcq $0x00, %r14 adcq $0x00, %r15 # Reduce movq $0x7fffffffffffffff, %rbx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rbx, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rbx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rbx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) decb %cl jnz L_fe_sq_n_x64 popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 repz retq #ifndef __APPLE__ .size fe_sq_n_x64,.-fe_sq_n_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_mul121666_x64 .type fe_mul121666_x64,@function .align 16 fe_mul121666_x64: #else .section __TEXT,__text .globl _fe_mul121666_x64 .p2align 4 _fe_mul121666_x64: #endif /* __APPLE__ */ pushq %r12 # Multiply by 121666 movq $0x1db42, %rax mulq (%rsi) xorq %r10, %r10 movq %rax, %r8 movq %rdx, %r9 movq $0x1db42, %rax mulq 8(%rsi) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 movq $0x1db42, %rax mulq 16(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 movq $0x1db42, %rax mulq 24(%rsi) movq $0x7fffffffffffffff, %rcx addq %rax, %r11 adcq %rdx, %r12 shldq $0x01, %r11, %r12 andq %rcx, %r11 movq $19, %rax mulq %r12 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) popq %r12 repz retq #ifndef __APPLE__ .size fe_mul121666_x64,.-fe_mul121666_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_sq2_x64 .type fe_sq2_x64,@function .align 16 fe_sq2_x64: #else .section __TEXT,__text .globl _fe_sq2_x64 .p2align 4 _fe_sq2_x64: #endif /* __APPLE__ */ pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %rbx # Square * 2 # A[0] * A[1] movq (%rsi), %rax mulq 8(%rsi) movq %rax, %r8 movq %rdx, %r9 # A[0] * A[2] movq (%rsi), %rax mulq 16(%rsi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[0] * A[3] movq (%rsi), %rax mulq 24(%rsi) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[1] * A[2] movq 8(%rsi), %rax mulq 16(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[1] * A[3] movq 8(%rsi), %rax mulq 24(%rsi) addq %rax, %r11 adcq %rdx, %r12 # A[2] * A[3] movq 16(%rsi), %rax mulq 24(%rsi) xorq %r13, %r13 addq %rax, %r12 adcq %rdx, %r13 # Double xorq %r14, %r14 addq %r8, %r8 adcq %r9, %r9 adcq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq $0x00, %r14 # A[0] * A[0] movq (%rsi), %rax mulq %rax movq %rax, %rcx movq %rdx, %r15 # A[1] * A[1] movq 8(%rsi), %rax mulq %rax addq %r15, %r8 adcq %rax, %r9 adcq $0x00, %rdx movq %rdx, %r15 # A[2] * A[2] movq 16(%rsi), %rax mulq %rax addq %r15, %r10 adcq %rax, %r11 adcq $0x00, %rdx movq %rdx, %r15 # A[3] * A[3] movq 24(%rsi), %rax mulq %rax addq %rax, %r13 adcq %rdx, %r14 addq %r15, %r12 adcq $0x00, %r13 adcq $0x00, %r14 # Reduce movq $0x7fffffffffffffff, %rbx xorq %rax, %rax # Move top half into t4-t7 and remove top bit from t3 shldq $3, %r14, %rax 
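        # fe_sq2 folds the final doubling of the square into this reduction:
        # the upper limbs shift by 2 (one bit for the doubling, one to align
        # the half above 2^255) while the lower limbs shift by 1 (the doubling
        # only), and the bits gathered into %rax above lie at or beyond
        # 2^510 = (2^255)^2, so they are weighted by 19*19 = 0x169 before
        # being added back in.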
shldq $2, %r13, %r14 shldq $2, %r12, %r13 shldq $2, %r11, %r12 shldq $2, %r10, %r11 shldq $0x01, %r9, %r10 shldq $0x01, %r8, %r9 shldq $0x01, %rcx, %r8 shlq $0x01, %rcx andq %rbx, %r10 # Two out left, one in right andq %rbx, %r14 # Multiply top bits by 19*19 imulq $0x169, %rax, %r15 # Multiply top half by 19 movq $19, %rax mulq %r11 xorq %r11, %r11 addq %rax, %rcx movq $19, %rax adcq %rdx, %r11 mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 # Add remaining produce results in addq %r15, %rcx adcq %r11, %r8 adcq %r12, %r9 adcq %r13, %r10 adcq %rax, %r10 adcq $0x00, %rdx # Overflow shldq $0x01, %r10, %rdx imulq $19, %rdx, %rax andq %rbx, %r10 addq %rax, %rcx adcq $0x00, %r8 adcq $0x00, %r9 adcq $0x00, %r10 # Reduce if top bit set movq %r10, %rdx sarq $63, %rdx andq $19, %rdx andq %rbx, %r10 addq %rdx, %rcx adcq $0x00, %r8 adcq $0x00, %r9 adcq $0x00, %r10 # Store movq %rcx, (%rdi) movq %r8, 8(%rdi) movq %r9, 16(%rdi) movq %r10, 24(%rdi) popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 repz retq #ifndef __APPLE__ .size fe_sq2_x64,.-fe_sq2_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_invert_x64 .type fe_invert_x64,@function .align 16 fe_invert_x64: #else .section __TEXT,__text .globl _fe_invert_x64 .p2align 4 _fe_invert_x64: #endif /* __APPLE__ */ subq $0x90, %rsp # Invert movq %rdi, 128(%rsp) movq %rsi, 136(%rsp) movq %rsp, %rdi movq 136(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi movq %rsp, %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi movq 136(%rsp), %rsi leaq 32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ movq %rsp, %rdi movq %rsp, %rsi leaq 32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi movq %rsp, %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi movq $4, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 64(%rsp), %rsi leaq 32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi movq $9, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi leaq 32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $19, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq 
fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi movq $9, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 64(%rsp), %rsi leaq 32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi movq $49, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi leaq 32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $0x63, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi movq $49, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 64(%rsp), %rsi leaq 32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi movq $4, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ movq 128(%rsp), %rdi leaq 32(%rsp), %rsi movq %rsp, %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ movq 136(%rsp), %rsi movq 128(%rsp), %rdi addq $0x90, %rsp repz retq #ifndef __APPLE__ .text .globl curve25519_x64 .type curve25519_x64,@function .align 16 curve25519_x64: #else .section __TEXT,__text .globl _curve25519_x64 .p2align 4 _curve25519_x64: #endif /* __APPLE__ */ pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %rbx pushq %rbp movq %rdx, %r8 subq $0xb8, %rsp xorq %rbx, %rbx movq %rdi, 176(%rsp) # Set one movq $0x01, (%rdi) movq $0x00, 8(%rdi) movq $0x00, 16(%rdi) movq $0x00, 24(%rdi) # Set zero movq $0x00, (%rsp) movq $0x00, 8(%rsp) movq $0x00, 16(%rsp) movq $0x00, 24(%rsp) # Set one movq $0x01, 32(%rsp) movq $0x00, 40(%rsp) movq $0x00, 48(%rsp) movq $0x00, 56(%rsp) # Copy movq (%r8), %rcx movq 8(%r8), %r9 movq 16(%r8), %r10 movq 24(%r8), %r11 movq %rcx, 64(%rsp) movq %r9, 72(%rsp) movq %r10, 80(%rsp) movq %r11, 88(%rsp) movb $62, 168(%rsp) movq $3, 160(%rsp) L_curve25519_x64_words: L_curve25519_x64_bits: movq 160(%rsp), %r9 movb 168(%rsp), %cl movq (%rsi,%r9,8), %rbp shrq %cl, %rbp andq $0x01, %rbp xorq %rbp, %rbx negq %rbx # Conditional Swap movq (%rdi), %rcx movq 8(%rdi), %r9 movq 16(%rdi), %r10 movq 24(%rdi), %r11 xorq 64(%rsp), %rcx xorq 72(%rsp), %r9 xorq 80(%rsp), %r10 xorq 88(%rsp), %r11 andq %rbx, %rcx andq %rbx, %r9 andq %rbx, %r10 andq %rbx, %r11 xorq %rcx, (%rdi) xorq %r9, 8(%rdi) xorq %r10, 16(%rdi) xorq %r11, 24(%rdi) xorq %rcx, 64(%rsp) xorq %r9, 72(%rsp) xorq %r10, 80(%rsp) xorq 
%r11, 88(%rsp) # Conditional Swap movq (%rsp), %rcx movq 8(%rsp), %r9 movq 16(%rsp), %r10 movq 24(%rsp), %r11 xorq 32(%rsp), %rcx xorq 40(%rsp), %r9 xorq 48(%rsp), %r10 xorq 56(%rsp), %r11 andq %rbx, %rcx andq %rbx, %r9 andq %rbx, %r10 andq %rbx, %r11 xorq %rcx, (%rsp) xorq %r9, 8(%rsp) xorq %r10, 16(%rsp) xorq %r11, 24(%rsp) xorq %rcx, 32(%rsp) xorq %r9, 40(%rsp) xorq %r10, 48(%rsp) xorq %r11, 56(%rsp) movq %rbp, %rbx # Add movq (%rdi), %rcx movq 8(%rdi), %r9 movq 16(%rdi), %r10 movq 24(%rdi), %rbp movq %rcx, %r12 addq (%rsp), %rcx movq %r9, %r13 adcq 8(%rsp), %r9 movq %r10, %r14 adcq 16(%rsp), %r10 movq %rbp, %r15 adcq 24(%rsp), %rbp movq $-19, %rax movq %rbp, %r11 movq $0x7fffffffffffffff, %rdx sarq $63, %rbp # Mask the modulus andq %rbp, %rax andq %rbp, %rdx # Sub modulus (if overflow) subq %rax, %rcx sbbq %rbp, %r9 sbbq %rbp, %r10 sbbq %rdx, %r11 # Sub subq (%rsp), %r12 movq $0x00, %rbp sbbq 8(%rsp), %r13 movq $-19, %rax sbbq 16(%rsp), %r14 movq $0x7fffffffffffffff, %rdx sbbq 24(%rsp), %r15 sbbq $0x00, %rbp # Mask the modulus andq %rbp, %rax andq %rbp, %rdx # Add modulus (if underflow) addq %rax, %r12 adcq %rbp, %r13 adcq %rbp, %r14 adcq %rdx, %r15 movq %rcx, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq %r12, 128(%rsp) movq %r13, 136(%rsp) movq %r14, 144(%rsp) movq %r15, 152(%rsp) # Add movq 64(%rsp), %rcx movq 72(%rsp), %r9 movq 80(%rsp), %r10 movq 88(%rsp), %rbp movq %rcx, %r12 addq 32(%rsp), %rcx movq %r9, %r13 adcq 40(%rsp), %r9 movq %r10, %r14 adcq 48(%rsp), %r10 movq %rbp, %r15 adcq 56(%rsp), %rbp movq $-19, %rax movq %rbp, %r11 movq $0x7fffffffffffffff, %rdx sarq $63, %rbp # Mask the modulus andq %rbp, %rax andq %rbp, %rdx # Sub modulus (if overflow) subq %rax, %rcx sbbq %rbp, %r9 sbbq %rbp, %r10 sbbq %rdx, %r11 # Sub subq 32(%rsp), %r12 movq $0x00, %rbp sbbq 40(%rsp), %r13 movq $-19, %rax sbbq 48(%rsp), %r14 movq $0x7fffffffffffffff, %rdx sbbq 56(%rsp), %r15 sbbq $0x00, %rbp # Mask the modulus andq %rbp, %rax andq %rbp, %rdx # Add modulus (if underflow) addq %rax, %r12 adcq %rbp, %r13 adcq %rbp, %r14 adcq %rdx, %r15 movq %rcx, (%rsp) movq %r9, 8(%rsp) movq %r10, 16(%rsp) movq %r11, 24(%rsp) movq %r12, 96(%rsp) movq %r13, 104(%rsp) movq %r14, 112(%rsp) movq %r15, 120(%rsp) # Multiply # A[0] * B[0] movq (%rdi), %rax mulq 96(%rsp) movq %rax, %rcx movq %rdx, %r9 # A[0] * B[1] movq 8(%rdi), %rax mulq 96(%rsp) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rdi), %rax mulq 104(%rsp) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rdi), %rax mulq 96(%rsp) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rdi), %rax mulq 104(%rsp) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq (%rdi), %rax mulq 112(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 24(%rdi), %rax mulq 96(%rsp) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 16(%rdi), %rax mulq 104(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rdi), %rax mulq 112(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rdi), %rax mulq 120(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rdi), %rax mulq 104(%rsp) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rdi), %rax mulq 112(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rdi), %rax mulq 120(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * 
B[3] movq 24(%rdi), %rax mulq 112(%rsp) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rdi), %rax mulq 120(%rsp) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rdi), %rax mulq 120(%rsp) addq %rax, %r14 adcq %rdx, %r15 # Reduce movq $0x7fffffffffffffff, %rbp # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rbp, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rbp, %r11 addq %rax, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rbp, %r11 addq %rdx, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %rcx, 32(%rsp) movq %r9, 40(%rsp) movq %r10, 48(%rsp) movq %r11, 56(%rsp) # Multiply # A[0] * B[0] movq 128(%rsp), %rax mulq (%rsp) movq %rax, %rcx movq %rdx, %r9 # A[0] * B[1] movq 136(%rsp), %rax mulq (%rsp) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq 128(%rsp), %rax mulq 8(%rsp) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 144(%rsp), %rax mulq (%rsp) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 136(%rsp), %rax mulq 8(%rsp) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq 128(%rsp), %rax mulq 16(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 152(%rsp), %rax mulq (%rsp) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 144(%rsp), %rax mulq 8(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 136(%rsp), %rax mulq 16(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq 128(%rsp), %rax mulq 24(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 152(%rsp), %rax mulq 8(%rsp) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 144(%rsp), %rax mulq 16(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 136(%rsp), %rax mulq 24(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 152(%rsp), %rax mulq 16(%rsp) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 144(%rsp), %rax mulq 24(%rsp) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 152(%rsp), %rax mulq 24(%rsp) addq %rax, %r14 adcq %rdx, %r15 # Reduce movq $0x7fffffffffffffff, %rbp # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rbp, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rbp, %r11 addq %rax, %rcx adcq $0x00, %r9 adcq $0x00, 
%r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rbp, %r11 addq %rdx, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %rcx, (%rsp) movq %r9, 8(%rsp) movq %r10, 16(%rsp) movq %r11, 24(%rsp) # Square # A[0] * A[1] movq 128(%rsp), %rax mulq 136(%rsp) movq %rax, %r9 movq %rdx, %r10 # A[0] * A[2] movq 128(%rsp), %rax mulq 144(%rsp) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[0] * A[3] movq 128(%rsp), %rax mulq 152(%rsp) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 # A[1] * A[2] movq 136(%rsp), %rax mulq 144(%rsp) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * A[3] movq 136(%rsp), %rax mulq 152(%rsp) addq %rax, %r12 adcq %rdx, %r13 # A[2] * A[3] movq 144(%rsp), %rax mulq 152(%rsp) xorq %r14, %r14 addq %rax, %r13 adcq %rdx, %r14 # Double xorq %r15, %r15 addq %r9, %r9 adcq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq %r14, %r14 adcq $0x00, %r15 # A[0] * A[0] movq 128(%rsp), %rax mulq %rax movq %rax, %rcx movq %rdx, %rbp # A[1] * A[1] movq 136(%rsp), %rax mulq %rax addq %rbp, %r9 adcq %rax, %r10 adcq $0x00, %rdx movq %rdx, %rbp # A[2] * A[2] movq 144(%rsp), %rax mulq %rax addq %rbp, %r11 adcq %rax, %r12 adcq $0x00, %rdx movq %rdx, %rbp # A[3] * A[3] movq 152(%rsp), %rax mulq %rax addq %rax, %r14 adcq %rdx, %r15 addq %rbp, %r13 adcq $0x00, %r14 adcq $0x00, %r15 # Reduce movq $0x7fffffffffffffff, %rbp # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rbp, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rbp, %r11 addq %rax, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rbp, %r11 addq %rdx, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %rcx, 96(%rsp) movq %r9, 104(%rsp) movq %r10, 112(%rsp) movq %r11, 120(%rsp) # Square # A[0] * A[1] movq (%rdi), %rax mulq 8(%rdi) movq %rax, %r9 movq %rdx, %r10 # A[0] * A[2] movq (%rdi), %rax mulq 16(%rdi) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[0] * A[3] movq (%rdi), %rax mulq 24(%rdi) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 # A[1] * A[2] movq 8(%rdi), %rax mulq 16(%rdi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * A[3] movq 8(%rdi), %rax mulq 24(%rdi) addq %rax, %r12 adcq %rdx, %r13 # A[2] * A[3] movq 16(%rdi), %rax mulq 24(%rdi) xorq %r14, %r14 addq %rax, %r13 adcq %rdx, %r14 # Double xorq %r15, %r15 addq %r9, %r9 adcq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq %r14, %r14 adcq $0x00, %r15 # A[0] * A[0] movq (%rdi), %rax mulq %rax movq %rax, %rcx movq %rdx, %rbp # A[1] * A[1] movq 8(%rdi), %rax mulq %rax addq %rbp, %r9 adcq %rax, %r10 adcq $0x00, %rdx movq %rdx, %rbp # A[2] * A[2] movq 16(%rdi), %rax mulq %rax addq %rbp, %r11 adcq %rax, %r12 adcq $0x00, %rdx movq %rdx, %rbp # A[3] * A[3] movq 24(%rdi), %rax mulq %rax addq %rax, %r14 adcq %rdx, %r15 addq %rbp, %r13 adcq $0x00, %r14 adcq $0x00, %r15 # Reduce movq $0x7fffffffffffffff, %rbp # Move top half into t4-t7 and 
remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rbp, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rbp, %r11 addq %rax, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rbp, %r11 addq %rdx, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %rcx, 128(%rsp) movq %r9, 136(%rsp) movq %r10, 144(%rsp) movq %r11, 152(%rsp) # Add movq 32(%rsp), %rcx movq 40(%rsp), %r9 movq 48(%rsp), %r10 movq 56(%rsp), %rbp movq %rcx, %r12 addq (%rsp), %rcx movq %r9, %r13 adcq 8(%rsp), %r9 movq %r10, %r14 adcq 16(%rsp), %r10 movq %rbp, %r15 adcq 24(%rsp), %rbp movq $-19, %rax movq %rbp, %r11 movq $0x7fffffffffffffff, %rdx sarq $63, %rbp # Mask the modulus andq %rbp, %rax andq %rbp, %rdx # Sub modulus (if overflow) subq %rax, %rcx sbbq %rbp, %r9 sbbq %rbp, %r10 sbbq %rdx, %r11 # Sub subq (%rsp), %r12 movq $0x00, %rbp sbbq 8(%rsp), %r13 movq $-19, %rax sbbq 16(%rsp), %r14 movq $0x7fffffffffffffff, %rdx sbbq 24(%rsp), %r15 sbbq $0x00, %rbp # Mask the modulus andq %rbp, %rax andq %rbp, %rdx # Add modulus (if underflow) addq %rax, %r12 adcq %rbp, %r13 adcq %rbp, %r14 adcq %rdx, %r15 movq %rcx, 64(%rsp) movq %r9, 72(%rsp) movq %r10, 80(%rsp) movq %r11, 88(%rsp) movq %r12, (%rsp) movq %r13, 8(%rsp) movq %r14, 16(%rsp) movq %r15, 24(%rsp) # Multiply # A[0] * B[0] movq 96(%rsp), %rax mulq 128(%rsp) movq %rax, %rcx movq %rdx, %r9 # A[0] * B[1] movq 104(%rsp), %rax mulq 128(%rsp) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq 96(%rsp), %rax mulq 136(%rsp) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 112(%rsp), %rax mulq 128(%rsp) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 104(%rsp), %rax mulq 136(%rsp) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq 96(%rsp), %rax mulq 144(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 120(%rsp), %rax mulq 128(%rsp) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 112(%rsp), %rax mulq 136(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 104(%rsp), %rax mulq 144(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq 96(%rsp), %rax mulq 152(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 120(%rsp), %rax mulq 136(%rsp) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 112(%rsp), %rax mulq 144(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 104(%rsp), %rax mulq 152(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 120(%rsp), %rax mulq 144(%rsp) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 112(%rsp), %rax mulq 152(%rsp) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 120(%rsp), %rax mulq 152(%rsp) addq %rax, %r14 adcq %rdx, %r15 # Reduce movq $0x7fffffffffffffff, %rbp # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq 
$0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rbp, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rbp, %r11 addq %rax, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rbp, %r11 addq %rdx, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %rcx, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) # Sub movq 128(%rsp), %rcx movq 136(%rsp), %r9 movq 144(%rsp), %r10 movq 152(%rsp), %r11 subq 96(%rsp), %rcx movq $0x00, %rbp sbbq 104(%rsp), %r9 movq $-19, %rax sbbq 112(%rsp), %r10 movq $0x7fffffffffffffff, %rdx sbbq 120(%rsp), %r11 sbbq $0x00, %rbp # Mask the modulus andq %rbp, %rax andq %rbp, %rdx # Add modulus (if underflow) addq %rax, %rcx adcq %rbp, %r9 adcq %rbp, %r10 adcq %rdx, %r11 movq %rcx, 128(%rsp) movq %r9, 136(%rsp) movq %r10, 144(%rsp) movq %r11, 152(%rsp) # Square # A[0] * A[1] movq (%rsp), %rax mulq 8(%rsp) movq %rax, %r9 movq %rdx, %r10 # A[0] * A[2] movq (%rsp), %rax mulq 16(%rsp) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[0] * A[3] movq (%rsp), %rax mulq 24(%rsp) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 # A[1] * A[2] movq 8(%rsp), %rax mulq 16(%rsp) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * A[3] movq 8(%rsp), %rax mulq 24(%rsp) addq %rax, %r12 adcq %rdx, %r13 # A[2] * A[3] movq 16(%rsp), %rax mulq 24(%rsp) xorq %r14, %r14 addq %rax, %r13 adcq %rdx, %r14 # Double xorq %r15, %r15 addq %r9, %r9 adcq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq %r14, %r14 adcq $0x00, %r15 # A[0] * A[0] movq (%rsp), %rax mulq %rax movq %rax, %rcx movq %rdx, %rbp # A[1] * A[1] movq 8(%rsp), %rax mulq %rax addq %rbp, %r9 adcq %rax, %r10 adcq $0x00, %rdx movq %rdx, %rbp # A[2] * A[2] movq 16(%rsp), %rax mulq %rax addq %rbp, %r11 adcq %rax, %r12 adcq $0x00, %rdx movq %rdx, %rbp # A[3] * A[3] movq 24(%rsp), %rax mulq %rax addq %rax, %r14 adcq %rdx, %r15 addq %rbp, %r13 adcq $0x00, %r14 adcq $0x00, %r15 # Reduce movq $0x7fffffffffffffff, %rbp # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rbp, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rbp, %r11 addq %rax, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rbp, %r11 addq %rdx, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %rcx, (%rsp) movq %r9, 8(%rsp) movq %r10, 16(%rsp) movq %r11, 24(%rsp) # Multiply by 121666 movq $0x1db42, %rax mulq 128(%rsp) xorq %r10, %r10 movq %rax, %rcx movq %rdx, %r9 movq $0x1db42, %rax mulq 136(%rsp) xorq %r11, %r11 addq 
%rax, %r9 adcq %rdx, %r10 movq $0x1db42, %rax mulq 144(%rsp) xorq %r13, %r13 addq %rax, %r10 adcq %rdx, %r11 movq $0x1db42, %rax mulq 152(%rsp) movq $0x7fffffffffffffff, %r12 addq %rax, %r11 adcq %rdx, %r13 shldq $0x01, %r11, %r13 andq %r12, %r11 movq $19, %rax mulq %r13 addq %rax, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 movq %rcx, 32(%rsp) movq %r9, 40(%rsp) movq %r10, 48(%rsp) movq %r11, 56(%rsp) # Square # A[0] * A[1] movq 64(%rsp), %rax mulq 72(%rsp) movq %rax, %r9 movq %rdx, %r10 # A[0] * A[2] movq 64(%rsp), %rax mulq 80(%rsp) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[0] * A[3] movq 64(%rsp), %rax mulq 88(%rsp) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 # A[1] * A[2] movq 72(%rsp), %rax mulq 80(%rsp) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * A[3] movq 72(%rsp), %rax mulq 88(%rsp) addq %rax, %r12 adcq %rdx, %r13 # A[2] * A[3] movq 80(%rsp), %rax mulq 88(%rsp) xorq %r14, %r14 addq %rax, %r13 adcq %rdx, %r14 # Double xorq %r15, %r15 addq %r9, %r9 adcq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq %r14, %r14 adcq $0x00, %r15 # A[0] * A[0] movq 64(%rsp), %rax mulq %rax movq %rax, %rcx movq %rdx, %rbp # A[1] * A[1] movq 72(%rsp), %rax mulq %rax addq %rbp, %r9 adcq %rax, %r10 adcq $0x00, %rdx movq %rdx, %rbp # A[2] * A[2] movq 80(%rsp), %rax mulq %rax addq %rbp, %r11 adcq %rax, %r12 adcq $0x00, %rdx movq %rdx, %rbp # A[3] * A[3] movq 88(%rsp), %rax mulq %rax addq %rax, %r14 adcq %rdx, %r15 addq %rbp, %r13 adcq $0x00, %r14 adcq $0x00, %r15 # Reduce movq $0x7fffffffffffffff, %rbp # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rbp, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rbp, %r11 addq %rax, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rbp, %r11 addq %rdx, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %rcx, 64(%rsp) movq %r9, 72(%rsp) movq %r10, 80(%rsp) movq %r11, 88(%rsp) # Add movq 96(%rsp), %rcx movq 104(%rsp), %r9 addq 32(%rsp), %rcx movq 112(%rsp), %r10 adcq 40(%rsp), %r9 movq 120(%rsp), %rbp adcq 48(%rsp), %r10 movq $-19, %rax adcq 56(%rsp), %rbp movq $0x7fffffffffffffff, %rdx movq %rbp, %r11 sarq $63, %rbp # Mask the modulus andq %rbp, %rax andq %rbp, %rdx # Sub modulus (if overflow) subq %rax, %rcx sbbq %rbp, %r9 sbbq %rbp, %r10 sbbq %rdx, %r11 movq %rcx, 96(%rsp) movq %r9, 104(%rsp) movq %r10, 112(%rsp) movq %r11, 120(%rsp) # Multiply # A[0] * B[0] movq (%rsp), %rax mulq (%r8) movq %rax, %rcx movq %rdx, %r9 # A[0] * B[1] movq 8(%rsp), %rax mulq (%r8) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rsp), %rax mulq 8(%r8) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rsp), %rax mulq (%r8) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rsp), %rax mulq 8(%r8) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq (%rsp), %rax mulq 16(%r8) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * 
B[3] movq 24(%rsp), %rax mulq (%r8) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 16(%rsp), %rax mulq 8(%r8) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rsp), %rax mulq 16(%r8) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rsp), %rax mulq 24(%r8) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rsp), %rax mulq 8(%r8) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rsp), %rax mulq 16(%r8) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rsp), %rax mulq 24(%r8) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rsp), %rax mulq 16(%r8) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rsp), %rax mulq 24(%r8) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rsp), %rax mulq 24(%r8) addq %rax, %r14 adcq %rdx, %r15 # Reduce movq $0x7fffffffffffffff, %rbp # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rbp, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rbp, %r11 addq %rax, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rbp, %r11 addq %rdx, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %rcx, 32(%rsp) movq %r9, 40(%rsp) movq %r10, 48(%rsp) movq %r11, 56(%rsp) # Multiply # A[0] * B[0] movq 96(%rsp), %rax mulq 128(%rsp) movq %rax, %rcx movq %rdx, %r9 # A[0] * B[1] movq 104(%rsp), %rax mulq 128(%rsp) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq 96(%rsp), %rax mulq 136(%rsp) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 112(%rsp), %rax mulq 128(%rsp) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 104(%rsp), %rax mulq 136(%rsp) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq 96(%rsp), %rax mulq 144(%rsp) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 120(%rsp), %rax mulq 128(%rsp) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 112(%rsp), %rax mulq 136(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 104(%rsp), %rax mulq 144(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq 96(%rsp), %rax mulq 152(%rsp) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 120(%rsp), %rax mulq 136(%rsp) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 112(%rsp), %rax mulq 144(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 104(%rsp), %rax mulq 152(%rsp) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 120(%rsp), %rax mulq 144(%rsp) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 112(%rsp), %rax mulq 152(%rsp) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 120(%rsp), %rax mulq 152(%rsp) addq %rax, %r14 adcq %rdx, 
%r15 # Reduce movq $0x7fffffffffffffff, %rbp # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rbp, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rbp, %r11 addq %rax, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rbp, %r11 addq %rdx, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %rcx, (%rsp) movq %r9, 8(%rsp) movq %r10, 16(%rsp) movq %r11, 24(%rsp) decb 168(%rsp) jge L_curve25519_x64_bits movq $63, 168(%rsp) decb 160(%rsp) jge L_curve25519_x64_words # Invert leaq 32(%rsp), %rdi movq %rsp, %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi movq %rsp, %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi leaq 96(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $4, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $9, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 128(%rsp), %rdi leaq 96(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 128(%rsp), %rdi leaq 128(%rsp), %rsi movq $19, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 128(%rsp), %rsi leaq 96(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $9, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 96(%rsp), %rsi 
leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $49, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 128(%rsp), %rdi leaq 96(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 128(%rsp), %rdi leaq 128(%rsp), %rsi movq $0x63, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 128(%rsp), %rsi leaq 96(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $49, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi movq $4, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ movq %rsp, %rdi leaq 64(%rsp), %rsi leaq 32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ movq 176(%rsp), %rdi # Multiply # A[0] * B[0] movq (%rsp), %rax mulq (%rdi) movq %rax, %rcx movq %rdx, %r9 # A[0] * B[1] movq 8(%rsp), %rax mulq (%rdi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rsp), %rax mulq 8(%rdi) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rsp), %rax mulq (%rdi) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rsp), %rax mulq 8(%rdi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq (%rsp), %rax mulq 16(%rdi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 24(%rsp), %rax mulq (%rdi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 16(%rsp), %rax mulq 8(%rdi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rsp), %rax mulq 16(%rdi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rsp), %rax mulq 24(%rdi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rsp), %rax mulq 8(%rdi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rsp), %rax mulq 16(%rdi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rsp), %rax mulq 24(%rdi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rsp), %rax mulq 16(%rdi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rsp), %rax mulq 24(%rdi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rsp), %rax mulq 24(%rdi) addq %rax, %r14 adcq %rdx, %r15 # Reduce movq $0x7fffffffffffffff, %rbp # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rbp, %r11 # 
Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %rcx movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rbp, %r11 addq %rax, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rbp, %r11 addq %rdx, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 movq %rcx, %rax addq $19, %rax movq %r9, %rax adcq $0x00, %rax movq %r10, %rax adcq $0x00, %rax movq %r11, %rax adcq $0x00, %rax sarq $63, %rax andq $19, %rax addq %rax, %rcx adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 andq %rbp, %r11 # Store movq %rcx, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) xorq %rax, %rax addq $0xb8, %rsp popq %rbp popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 repz retq #ifndef __APPLE__ .size curve25519_x64,.-curve25519_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_pow22523_x64 .type fe_pow22523_x64,@function .align 16 fe_pow22523_x64: #else .section __TEXT,__text .globl _fe_pow22523_x64 .p2align 4 _fe_pow22523_x64: #endif /* __APPLE__ */ subq $0x70, %rsp # pow22523 movq %rdi, 96(%rsp) movq %rsi, 104(%rsp) movq %rsp, %rdi movq 104(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi movq %rsp, %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi movq 104(%rsp), %rsi leaq 32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ movq %rsp, %rdi movq %rsp, %rsi leaq 32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ movq %rsp, %rdi movq %rsp, %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ movq %rsp, %rdi leaq 32(%rsp), %rsi movq %rsp, %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi movq %rsp, %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi movq $4, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ movq %rsp, %rdi leaq 32(%rsp), %rsi movq %rsp, %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi movq %rsp, %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi movq $9, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi movq %rsp, %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi movq $19, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 64(%rsp), %rsi leaq 32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq 
_fe_mul_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi movq $9, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ movq %rsp, %rdi leaq 32(%rsp), %rsi movq %rsp, %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi movq %rsp, %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi movq $49, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi movq %rsp, %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi movq $0x63, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 64(%rsp), %rsi leaq 32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi movq $49, %rdx #ifndef __APPLE__ callq fe_sq_n_x64@plt #else callq _fe_sq_n_x64 #endif /* __APPLE__ */ movq %rsp, %rdi leaq 32(%rsp), %rsi movq %rsp, %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ movq %rsp, %rdi movq %rsp, %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ movq %rsp, %rdi movq %rsp, %rsi #ifndef __APPLE__ callq fe_sq_x64@plt #else callq _fe_sq_x64 #endif /* __APPLE__ */ movq 96(%rsp), %rdi movq %rsp, %rsi movq 104(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_x64@plt #else callq _fe_mul_x64 #endif /* __APPLE__ */ movq 104(%rsp), %rsi movq 96(%rsp), %rdi addq $0x70, %rsp repz retq #ifndef __APPLE__ .text .globl fe_ge_to_p2_x64 .type fe_ge_to_p2_x64,@function .align 16 fe_ge_to_p2_x64: #else .section __TEXT,__text .globl _fe_ge_to_p2_x64 .p2align 4 _fe_ge_to_p2_x64: #endif /* __APPLE__ */ pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $40, %rsp movq %rsi, (%rsp) movq %rdx, 8(%rsp) movq %rcx, 16(%rsp) movq %r8, 24(%rsp) movq %r9, 32(%rsp) movq 16(%rsp), %rsi movq 88(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rax mulq (%rsi) movq %rax, %r8 movq %rdx, %r9 # A[0] * B[1] movq 8(%rbx), %rax mulq (%rsi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rbx), %rax mulq 8(%rsi) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rbx), %rax mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rbx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq (%rbx), %rax mulq 16(%rsi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 24(%rbx), %rax mulq (%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 16(%rbx), %rax mulq 8(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rbx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rbx), %rax mulq 24(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rbx), %rax mulq 8(%rsi) xorq %r14, %r14 
addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rbx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rbx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rbx), %rax mulq 16(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rbx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rbx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rcx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq (%rsp), %rdi movq 24(%rsp), %rsi movq 32(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rax mulq (%rsi) movq %rax, %r8 movq %rdx, %r9 # A[0] * B[1] movq 8(%rbx), %rax mulq (%rsi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rbx), %rax mulq 8(%rsi) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rbx), %rax mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rbx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq (%rbx), %rax mulq 16(%rsi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 24(%rbx), %rax mulq (%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 16(%rbx), %rax mulq 8(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rbx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rbx), %rax mulq 24(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rbx), %rax mulq 8(%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rbx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rbx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rbx), %rax mulq 16(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rbx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rbx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq 
$19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rcx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 8(%rsp), %rdi movq 32(%rsp), %rsi movq 88(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rax mulq (%rsi) movq %rax, %r8 movq %rdx, %r9 # A[0] * B[1] movq 8(%rbx), %rax mulq (%rsi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rbx), %rax mulq 8(%rsi) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rbx), %rax mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rbx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq (%rbx), %rax mulq 16(%rsi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 24(%rbx), %rax mulq (%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 16(%rbx), %rax mulq 8(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rbx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rbx), %rax mulq 24(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rbx), %rax mulq 8(%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rbx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rbx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rbx), %rax mulq 16(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rbx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rbx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rcx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) addq $40, %rsp popq %r15 popq %r14 popq %r13 popq %r12 popq %rbx repz retq #ifndef __APPLE__ .size fe_ge_to_p2_x64,.-fe_ge_to_p2_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_ge_to_p3_x64 .type fe_ge_to_p3_x64,@function .align 16 fe_ge_to_p3_x64: #else .section __TEXT,__text .globl _fe_ge_to_p3_x64 .p2align 4 _fe_ge_to_p3_x64: #endif /* __APPLE__ */ pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $40, %rsp movq %rsi, 
(%rsp) movq %rdx, 8(%rsp) movq %rcx, 16(%rsp) movq %r8, 24(%rsp) movq %r9, 32(%rsp) movq 24(%rsp), %rsi movq 96(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rax mulq (%rsi) movq %rax, %r8 movq %rdx, %r9 # A[0] * B[1] movq 8(%rbx), %rax mulq (%rsi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rbx), %rax mulq 8(%rsi) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rbx), %rax mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rbx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq (%rbx), %rax mulq 16(%rsi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 24(%rbx), %rax mulq (%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 16(%rbx), %rax mulq 8(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rbx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rbx), %rax mulq 24(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rbx), %rax mulq 8(%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rbx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rbx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rbx), %rax mulq 16(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rbx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rbx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rcx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq (%rsp), %rdi movq 32(%rsp), %rsi movq 88(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rax mulq (%rsi) movq %rax, %r8 movq %rdx, %r9 # A[0] * B[1] movq 8(%rbx), %rax mulq (%rsi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rbx), %rax mulq 8(%rsi) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rbx), %rax mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rbx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq (%rbx), %rax mulq 16(%rsi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 24(%rbx), %rax mulq (%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 16(%rbx), %rax mulq 8(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rbx), %rax mulq 16(%rsi) addq %rax, %r11 adcq 
%rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rbx), %rax mulq 24(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rbx), %rax mulq 8(%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rbx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rbx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rbx), %rax mulq 16(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rbx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rbx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rcx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 8(%rsp), %rdi movq 88(%rsp), %rsi movq 96(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rax mulq (%rsi) movq %rax, %r8 movq %rdx, %r9 # A[0] * B[1] movq 8(%rbx), %rax mulq (%rsi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rbx), %rax mulq 8(%rsi) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rbx), %rax mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rbx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq (%rbx), %rax mulq 16(%rsi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 24(%rbx), %rax mulq (%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 16(%rbx), %rax mulq 8(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rbx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rbx), %rax mulq 24(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rbx), %rax mulq 8(%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rbx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rbx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rbx), %rax mulq 16(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rbx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rbx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rax mulq 
%r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rcx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 16(%rsp), %rdi movq 24(%rsp), %rsi movq 32(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rax mulq (%rsi) movq %rax, %r8 movq %rdx, %r9 # A[0] * B[1] movq 8(%rbx), %rax mulq (%rsi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rbx), %rax mulq 8(%rsi) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rbx), %rax mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rbx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq (%rbx), %rax mulq 16(%rsi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 24(%rbx), %rax mulq (%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 16(%rbx), %rax mulq 8(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rbx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rbx), %rax mulq 24(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rbx), %rax mulq 8(%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rbx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rbx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rbx), %rax mulq 16(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rbx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rbx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rcx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) addq $40, %rsp popq %r15 popq %r14 popq %r13 popq %r12 popq %rbx repz retq #ifndef __APPLE__ .size fe_ge_to_p3_x64,.-fe_ge_to_p3_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_ge_dbl_x64 .type fe_ge_dbl_x64,@function .align 16 fe_ge_dbl_x64: #else 
.section __TEXT,__text .globl _fe_ge_dbl_x64 .p2align 4 _fe_ge_dbl_x64: #endif /* __APPLE__ */ pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $0x50, %rsp movq %rdi, (%rsp) movq %rsi, 8(%rsp) movq %rdx, 16(%rsp) movq %rcx, 24(%rsp) movq %r8, 32(%rsp) movq %r9, 40(%rsp) movq (%rsp), %rdi movq 32(%rsp), %rsi # Square # A[0] * A[1] movq (%rsi), %rax mulq 8(%rsi) movq %rax, %r9 movq %rdx, %r10 # A[0] * A[2] movq (%rsi), %rax mulq 16(%rsi) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[0] * A[3] movq (%rsi), %rax mulq 24(%rsi) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 # A[1] * A[2] movq 8(%rsi), %rax mulq 16(%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * A[3] movq 8(%rsi), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 # A[2] * A[3] movq 16(%rsi), %rax mulq 24(%rsi) xorq %r14, %r14 addq %rax, %r13 adcq %rdx, %r14 # Double xorq %r15, %r15 addq %r9, %r9 adcq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq %r14, %r14 adcq $0x00, %r15 # A[0] * A[0] movq (%rsi), %rax mulq %rax movq %rax, %r8 movq %rdx, %rcx # A[1] * A[1] movq 8(%rsi), %rax mulq %rax addq %rcx, %r9 adcq %rax, %r10 adcq $0x00, %rdx movq %rdx, %rcx # A[2] * A[2] movq 16(%rsi), %rax mulq %rax addq %rcx, %r11 adcq %rax, %r12 adcq $0x00, %rdx movq %rdx, %rcx # A[3] * A[3] movq 24(%rsi), %rax mulq %rax addq %rax, %r14 adcq %rdx, %r15 addq %rcx, %r13 adcq $0x00, %r14 adcq $0x00, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rcx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 16(%rsp), %rdi movq 40(%rsp), %rsi # Square # A[0] * A[1] movq (%rsi), %rax mulq 8(%rsi) movq %rax, %r9 movq %rdx, %r10 # A[0] * A[2] movq (%rsi), %rax mulq 16(%rsi) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[0] * A[3] movq (%rsi), %rax mulq 24(%rsi) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 # A[1] * A[2] movq 8(%rsi), %rax mulq 16(%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * A[3] movq 8(%rsi), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 # A[2] * A[3] movq 16(%rsi), %rax mulq 24(%rsi) xorq %r14, %r14 addq %rax, %r13 adcq %rdx, %r14 # Double xorq %r15, %r15 addq %r9, %r9 adcq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq %r14, %r14 adcq $0x00, %r15 # A[0] * A[0] movq (%rsi), %rax mulq %rax movq %rax, %r8 movq %rdx, %rcx # A[1] * A[1] movq 8(%rsi), %rax mulq %rax addq %rcx, %r9 adcq %rax, %r10 adcq $0x00, %rdx movq %rdx, %rcx # A[2] * A[2] movq 16(%rsi), %rax mulq %rax addq %rcx, %r11 adcq %rax, %r12 adcq $0x00, %rdx movq %rdx, %rcx # A[3] * A[3] movq 24(%rsi), %rax mulq %rax addq %rax, %r14 adcq %rdx, %r15 addq %rcx, %r13 adcq $0x00, %r14 adcq $0x00, %r15 # Reduce 
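/* The "Reduce" steps in this file fold a 512-bit intermediate back below
 * 2^255 using 2^255 == 19 (mod p) with p = 2^255 - 19: the bits at and above
 * bit 255 are shifted down, multiplied by 19 and added onto the low 255 bits,
 * and the few bits that spill over are folded in the same way.  A rough C
 * equivalent of that idea is sketched below; it is illustrative only
 * (reduce_sketch and its limb layout are not part of this file), assumes both
 * multiplicands were already below 2^255 as this file maintains, and assumes
 * the GCC/Clang unsigned __int128 extension.  As in the assembly, the result
 * stays below 2^255 but is not necessarily fully canonical; the full
 * reduction only happens when converting to bytes.
 *
 *   #include <stdint.h>
 *
 *   typedef unsigned __int128 u128;
 *
 *   // Fold an 8x64-bit little-endian value t modulo p = 2^255 - 19.
 *   static void reduce_sketch(uint64_t r[4], const uint64_t t[8])
 *   {
 *       const uint64_t mask63 = 0x7fffffffffffffffULL;
 *       uint64_t hi[4], top;
 *       u128 c = 0;
 *       // hi = floor(t / 2^255), r = t mod 2^255
 *       hi[0] = (t[4] << 1) | (t[3] >> 63);
 *       hi[1] = (t[5] << 1) | (t[4] >> 63);
 *       hi[2] = (t[6] << 1) | (t[5] >> 63);
 *       hi[3] = (t[7] << 1) | (t[6] >> 63);
 *       r[0] = t[0]; r[1] = t[1]; r[2] = t[2];
 *       r[3] = t[3] & mask63;
 *       // r += 19 * hi, since 2^255 == 19 (mod p)
 *       for (int i = 0; i < 4; i++) {
 *           c += (u128)19 * hi[i] + r[i];
 *           r[i] = (uint64_t)c;
 *           c >>= 64;
 *       }
 *       // fold the few bits that spilled above bit 255 once more
 *       top = ((uint64_t)c << 1) | (r[3] >> 63);
 *       r[3] &= mask63;
 *       c = (u128)19 * top + r[0];
 *       r[0] = (uint64_t)c;
 *       c >>= 64;
 *       for (int i = 1; i < 4; i++) {
 *           c += r[i];
 *           r[i] = (uint64_t)c;
 *           c >>= 64;
 *       }
 *       // if bit 255 was set again, clear it and add 19 one last time
 *       c = (u128)((r[3] >> 63) * 19) + r[0];
 *       r[3] &= mask63;
 *       r[0] = (uint64_t)c;
 *       c >>= 64;
 *       for (int i = 1; i < 4; i++) {
 *           c += r[i];
 *           r[i] = (uint64_t)c;
 *           c >>= 64;
 *       }
 *   }
 */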
movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rcx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 24(%rsp), %rdi movq 128(%rsp), %rsi # Square * 2 # A[0] * A[1] movq (%rsi), %rax mulq 8(%rsi) movq %rax, %r9 movq %rdx, %r10 # A[0] * A[2] movq (%rsi), %rax mulq 16(%rsi) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[0] * A[3] movq (%rsi), %rax mulq 24(%rsi) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 # A[1] * A[2] movq 8(%rsi), %rax mulq 16(%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * A[3] movq 8(%rsi), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 # A[2] * A[3] movq 16(%rsi), %rax mulq 24(%rsi) xorq %r14, %r14 addq %rax, %r13 adcq %rdx, %r14 # Double xorq %r15, %r15 addq %r9, %r9 adcq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq %r14, %r14 adcq $0x00, %r15 # A[0] * A[0] movq (%rsi), %rax mulq %rax movq %rax, %r8 movq %rdx, %rcx # A[1] * A[1] movq 8(%rsi), %rax mulq %rax addq %rcx, %r9 adcq %rax, %r10 adcq $0x00, %rdx movq %rdx, %rcx # A[2] * A[2] movq 16(%rsi), %rax mulq %rax addq %rcx, %r11 adcq %rax, %r12 adcq $0x00, %rdx movq %rdx, %rcx # A[3] * A[3] movq 24(%rsi), %rax mulq %rax addq %rax, %r14 adcq %rdx, %r15 addq %rcx, %r13 adcq $0x00, %r14 adcq $0x00, %r15 # Reduce movq $0x7fffffffffffffff, %rbx xorq %rax, %rax # Move top half into t4-t7 and remove top bit from t3 shldq $3, %r15, %rax shldq $2, %r14, %r15 shldq $2, %r13, %r14 shldq $2, %r12, %r13 shldq $2, %r11, %r12 shldq $0x01, %r10, %r11 shldq $0x01, %r9, %r10 shldq $0x01, %r8, %r9 shlq $0x01, %r8 andq %rbx, %r11 # Two out left, one in right andq %rbx, %r15 # Multiply top bits by 19*19 imulq $0x169, %rax, %rcx # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining produce results in addq %rcx, %r8 adcq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rbx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rbx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 8(%rsp), %rdi movq 32(%rsp), %rsi movq 40(%rsp), %rbx # Add movq (%rsi), %r8 movq 8(%rsi), %r9 addq (%rbx), %r8 movq 16(%rsi), %r10 adcq 8(%rbx), %r9 movq 24(%rsi), %rcx adcq 16(%rbx), %r10 movq $-19, %rax adcq 24(%rbx), %rcx movq $0x7fffffffffffffff, %rdx 
movq %rcx, %r11 sarq $63, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Sub modulus (if overflow) subq %rax, %r8 sbbq %rcx, %r9 sbbq %rcx, %r10 sbbq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) leaq 48(%rsp), %rdi movq 8(%rsp), %rsi # Square # A[0] * A[1] movq (%rsi), %rax mulq 8(%rsi) movq %rax, %r9 movq %rdx, %r10 # A[0] * A[2] movq (%rsi), %rax mulq 16(%rsi) xorq %r11, %r11 addq %rax, %r10 adcq %rdx, %r11 # A[0] * A[3] movq (%rsi), %rax mulq 24(%rsi) xorq %r12, %r12 addq %rax, %r11 adcq %rdx, %r12 # A[1] * A[2] movq 8(%rsi), %rax mulq 16(%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * A[3] movq 8(%rsi), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 # A[2] * A[3] movq 16(%rsi), %rax mulq 24(%rsi) xorq %r14, %r14 addq %rax, %r13 adcq %rdx, %r14 # Double xorq %r15, %r15 addq %r9, %r9 adcq %r10, %r10 adcq %r11, %r11 adcq %r12, %r12 adcq %r13, %r13 adcq %r14, %r14 adcq $0x00, %r15 # A[0] * A[0] movq (%rsi), %rax mulq %rax movq %rax, %r8 movq %rdx, %rcx # A[1] * A[1] movq 8(%rsi), %rax mulq %rax addq %rcx, %r9 adcq %rax, %r10 adcq $0x00, %rdx movq %rdx, %rcx # A[2] * A[2] movq 16(%rsi), %rax mulq %rax addq %rcx, %r11 adcq %rax, %r12 adcq $0x00, %rdx movq %rdx, %rcx # A[3] * A[3] movq 24(%rsi), %rax mulq %rax addq %rax, %r14 adcq %rdx, %r15 addq %rcx, %r13 adcq $0x00, %r14 adcq $0x00, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rcx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 8(%rsp), %rdi movq 16(%rsp), %rsi movq (%rsp), %rbx # Add movq (%rsi), %r8 movq 8(%rsi), %r9 addq (%rbx), %r8 movq 16(%rsi), %r10 adcq 8(%rbx), %r9 movq 24(%rsi), %rcx adcq 16(%rbx), %r10 movq $-19, %rax adcq 24(%rbx), %rcx movq $0x7fffffffffffffff, %rdx movq %rcx, %r11 sarq $63, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Sub modulus (if overflow) subq %rax, %r8 sbbq %rcx, %r9 sbbq %rcx, %r10 sbbq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 16(%rsp), %rdi movq 16(%rsp), %rsi movq (%rsp), %rbx # Sub movq (%rsi), %r8 movq 8(%rsi), %r9 movq 16(%rsi), %r10 movq 24(%rsi), %r11 subq (%rbx), %r8 movq $0x00, %rcx sbbq 8(%rbx), %r9 movq $-19, %rax sbbq 16(%rbx), %r10 movq $0x7fffffffffffffff, %rdx sbbq 24(%rbx), %r11 sbbq $0x00, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Add modulus (if underflow) addq %rax, %r8 adcq %rcx, %r9 adcq %rcx, %r10 adcq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq (%rsp), %rdi leaq 48(%rsp), %rsi movq 8(%rsp), %rbx # Sub movq (%rsi), %r8 movq 8(%rsi), %r9 movq 16(%rsi), %r10 movq 24(%rsi), %r11 subq (%rbx), %r8 movq $0x00, %rcx sbbq 
8(%rbx), %r9 movq $-19, %rax sbbq 16(%rbx), %r10 movq $0x7fffffffffffffff, %rdx sbbq 24(%rbx), %r11 sbbq $0x00, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Add modulus (if underflow) addq %rax, %r8 adcq %rcx, %r9 adcq %rcx, %r10 adcq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 24(%rsp), %rdi movq 24(%rsp), %rsi movq 16(%rsp), %rbx # Sub movq (%rsi), %r8 movq 8(%rsi), %r9 movq 16(%rsi), %r10 movq 24(%rsi), %r11 subq (%rbx), %r8 movq $0x00, %rcx sbbq 8(%rbx), %r9 movq $-19, %rax sbbq 16(%rbx), %r10 movq $0x7fffffffffffffff, %rdx sbbq 24(%rbx), %r11 sbbq $0x00, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Add modulus (if underflow) addq %rax, %r8 adcq %rcx, %r9 adcq %rcx, %r10 adcq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) addq $0x50, %rsp popq %r15 popq %r14 popq %r13 popq %r12 popq %rbx repz retq #ifndef __APPLE__ .size fe_ge_dbl_x64,.-fe_ge_dbl_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_ge_madd_x64 .type fe_ge_madd_x64,@function .align 16 fe_ge_madd_x64: #else .section __TEXT,__text .globl _fe_ge_madd_x64 .p2align 4 _fe_ge_madd_x64: #endif /* __APPLE__ */ pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $0x50, %rsp movq %rdi, (%rsp) movq %rsi, 8(%rsp) movq %rdx, 16(%rsp) movq %rcx, 24(%rsp) movq %r8, 32(%rsp) movq %r9, 40(%rsp) movq (%rsp), %rdi movq 40(%rsp), %rsi movq 32(%rsp), %rbx # Add movq (%rsi), %r8 movq 8(%rsi), %r9 addq (%rbx), %r8 movq 16(%rsi), %r10 adcq 8(%rbx), %r9 movq 24(%rsi), %rcx adcq 16(%rbx), %r10 movq $-19, %rax adcq 24(%rbx), %rcx movq $0x7fffffffffffffff, %rdx movq %rcx, %r11 sarq $63, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Sub modulus (if overflow) subq %rax, %r8 sbbq %rcx, %r9 sbbq %rcx, %r10 sbbq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 8(%rsp), %rdi movq 40(%rsp), %rsi movq 32(%rsp), %rbx # Sub movq (%rsi), %r8 movq 8(%rsi), %r9 movq 16(%rsi), %r10 movq 24(%rsi), %r11 subq (%rbx), %r8 movq $0x00, %rcx sbbq 8(%rbx), %r9 movq $-19, %rax sbbq 16(%rbx), %r10 movq $0x7fffffffffffffff, %rdx sbbq 24(%rbx), %r11 sbbq $0x00, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Add modulus (if underflow) addq %rax, %r8 adcq %rcx, %r9 adcq %rcx, %r10 adcq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 16(%rsp), %rdi movq (%rsp), %rsi movq 152(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rax mulq (%rsi) movq %rax, %r8 movq %rdx, %r9 # A[0] * B[1] movq 8(%rbx), %rax mulq (%rsi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rbx), %rax mulq 8(%rsi) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rbx), %rax mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rbx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq (%rbx), %rax mulq 16(%rsi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 24(%rbx), %rax mulq (%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 16(%rbx), %rax mulq 8(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rbx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rbx), %rax mulq 24(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rbx), %rax mulq 8(%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 
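/* Each "Multiply" block in this file is a plain 4x4-limb schoolbook product:
 * the partial products A[i] * B[j] are accumulated in order of increasing
 * i + j, so each 128-bit mulq result only carries into the next two result
 * words, and the 8-limb product is then folded by the "Reduce" pattern.  A
 * compact C equivalent is sketched below for reference; it accumulates row by
 * row rather than by diagonals, mul4_sketch and its names are illustrative
 * only (not part of this file), and it assumes the GCC/Clang unsigned
 * __int128 extension.
 *
 *   #include <stdint.h>
 *
 *   typedef unsigned __int128 u128;
 *
 *   // t[0..7] = a[0..3] * b[0..3], little-endian 64-bit limbs.
 *   static void mul4_sketch(uint64_t t[8], const uint64_t a[4],
 *                           const uint64_t b[4])
 *   {
 *       for (int i = 0; i < 8; i++)
 *           t[i] = 0;
 *       for (int i = 0; i < 4; i++) {
 *           u128 c = 0;
 *           for (int j = 0; j < 4; j++) {
 *               c += (u128)a[i] * b[j] + t[i + j];
 *               t[i + j] = (uint64_t)c;
 *               c >>= 64;
 *           }
 *           t[i + 4] = (uint64_t)c;
 *       }
 *   }
 */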
# A[2] * B[2] movq 16(%rbx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rbx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rbx), %rax mulq 16(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rbx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rbx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rcx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 8(%rsp), %rdi movq 8(%rsp), %rsi movq 160(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rax mulq (%rsi) movq %rax, %r8 movq %rdx, %r9 # A[0] * B[1] movq 8(%rbx), %rax mulq (%rsi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rbx), %rax mulq 8(%rsi) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rbx), %rax mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rbx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq (%rbx), %rax mulq 16(%rsi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 24(%rbx), %rax mulq (%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 16(%rbx), %rax mulq 8(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rbx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rbx), %rax mulq 24(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rbx), %rax mulq 8(%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rbx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rbx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rbx), %rax mulq 16(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rbx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rbx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add 
remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rcx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 24(%rsp), %rdi movq 144(%rsp), %rsi movq 136(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rax mulq (%rsi) movq %rax, %r8 movq %rdx, %r9 # A[0] * B[1] movq 8(%rbx), %rax mulq (%rsi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rbx), %rax mulq 8(%rsi) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rbx), %rax mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rbx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq (%rbx), %rax mulq 16(%rsi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 24(%rbx), %rax mulq (%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 16(%rbx), %rax mulq 8(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rbx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rbx), %rax mulq 24(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rbx), %rax mulq 8(%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rbx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rbx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rbx), %rax mulq 16(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rbx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rbx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rcx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) leaq 48(%rsp), %rdi movq 128(%rsp), %rsi movq 128(%rsp), %rbx # Add movq (%rsi), %r8 movq 8(%rsi), %r9 addq (%rbx), %r8 movq 16(%rsi), %r10 adcq 8(%rbx), %r9 movq 24(%rsi), %rcx adcq 16(%rbx), %r10 movq $-19, %rax adcq 24(%rbx), %rcx movq $0x7fffffffffffffff, %rdx movq %rcx, %r11 sarq $63, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Sub modulus (if overflow) subq %rax, %r8 sbbq %rcx, %r9 sbbq %rcx, %r10 sbbq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 
16(%rdi) movq %r11, 24(%rdi) movq (%rsp), %rdi movq 16(%rsp), %rsi movq 8(%rsp), %rbx # Sub movq (%rsi), %r8 movq 8(%rsi), %r9 movq 16(%rsi), %r10 movq 24(%rsi), %r11 subq (%rbx), %r8 movq $0x00, %rcx sbbq 8(%rbx), %r9 movq $-19, %rax sbbq 16(%rbx), %r10 movq $0x7fffffffffffffff, %rdx sbbq 24(%rbx), %r11 sbbq $0x00, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Add modulus (if underflow) addq %rax, %r8 adcq %rcx, %r9 adcq %rcx, %r10 adcq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 8(%rsp), %rdi movq 16(%rsp), %rsi movq 8(%rsp), %rbx # Add movq (%rsi), %r8 movq 8(%rsi), %r9 addq (%rbx), %r8 movq 16(%rsi), %r10 adcq 8(%rbx), %r9 movq 24(%rsi), %rcx adcq 16(%rbx), %r10 movq $-19, %rax adcq 24(%rbx), %rcx movq $0x7fffffffffffffff, %rdx movq %rcx, %r11 sarq $63, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Sub modulus (if overflow) subq %rax, %r8 sbbq %rcx, %r9 sbbq %rcx, %r10 sbbq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 16(%rsp), %rdi leaq 48(%rsp), %rsi movq 24(%rsp), %rbx # Add movq (%rsi), %r8 movq 8(%rsi), %r9 addq (%rbx), %r8 movq 16(%rsi), %r10 adcq 8(%rbx), %r9 movq 24(%rsi), %rcx adcq 16(%rbx), %r10 movq $-19, %rax adcq 24(%rbx), %rcx movq $0x7fffffffffffffff, %rdx movq %rcx, %r11 sarq $63, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Sub modulus (if overflow) subq %rax, %r8 sbbq %rcx, %r9 sbbq %rcx, %r10 sbbq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 24(%rsp), %rdi leaq 48(%rsp), %rsi movq 24(%rsp), %rbx # Sub movq (%rsi), %r8 movq 8(%rsi), %r9 movq 16(%rsi), %r10 movq 24(%rsi), %r11 subq (%rbx), %r8 movq $0x00, %rcx sbbq 8(%rbx), %r9 movq $-19, %rax sbbq 16(%rbx), %r10 movq $0x7fffffffffffffff, %rdx sbbq 24(%rbx), %r11 sbbq $0x00, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Add modulus (if underflow) addq %rax, %r8 adcq %rcx, %r9 adcq %rcx, %r10 adcq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) addq $0x50, %rsp popq %r15 popq %r14 popq %r13 popq %r12 popq %rbx repz retq #ifndef __APPLE__ .size fe_ge_madd_x64,.-fe_ge_madd_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_ge_msub_x64 .type fe_ge_msub_x64,@function .align 16 fe_ge_msub_x64: #else .section __TEXT,__text .globl _fe_ge_msub_x64 .p2align 4 _fe_ge_msub_x64: #endif /* __APPLE__ */ pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $0x50, %rsp movq %rdi, (%rsp) movq %rsi, 8(%rsp) movq %rdx, 16(%rsp) movq %rcx, 24(%rsp) movq %r8, 32(%rsp) movq %r9, 40(%rsp) movq (%rsp), %rdi movq 40(%rsp), %rsi movq 32(%rsp), %rbx # Add movq (%rsi), %r8 movq 8(%rsi), %r9 addq (%rbx), %r8 movq 16(%rsi), %r10 adcq 8(%rbx), %r9 movq 24(%rsi), %rcx adcq 16(%rbx), %r10 movq $-19, %rax adcq 24(%rbx), %rcx movq $0x7fffffffffffffff, %rdx movq %rcx, %r11 sarq $63, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Sub modulus (if overflow) subq %rax, %r8 sbbq %rcx, %r9 sbbq %rcx, %r10 sbbq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 8(%rsp), %rdi movq 40(%rsp), %rsi movq 32(%rsp), %rbx # Sub movq (%rsi), %r8 movq 8(%rsi), %r9 movq 16(%rsi), %r10 movq 24(%rsi), %r11 subq (%rbx), %r8 movq $0x00, %rcx sbbq 8(%rbx), %r9 movq $-19, %rax sbbq 16(%rbx), %r10 movq $0x7fffffffffffffff, %rdx sbbq 24(%rbx), %r11 sbbq $0x00, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Add modulus (if underflow) addq %rax, %r8 adcq %rcx, %r9 adcq %rcx, %r10 
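# Note: every "# Add" / "# Sub" block in these fe_ge_* routines performs a
# branch-free modular correction.  A subtract captures its final borrow in
# %rcx (0 or -1) via "sbbq $0x00, %rcx" and uses it to mask the limbs of
# p = 2^255 - 19, i.e. {-19, ~0, ~0, 0x7fffffffffffffff}, which are then
# added back ("Add modulus (if underflow)"); an add derives the mask from
# the sign of the top limb ("sarq $63") and subtracts p instead.  A rough C
# sketch of the subtract case, assuming 4x64-bit little-endian limbs:
#     r = a - b;        /* 256-bit subtract, final borrow -> mask (0 or ~0) */
#     r += mask & p;    /* add p back only when the subtract wrapped        */
# Using a mask rather than a branch keeps the correction free of
# data-dependent control flow.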
adcq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 16(%rsp), %rdi movq (%rsp), %rsi movq 160(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rax mulq (%rsi) movq %rax, %r8 movq %rdx, %r9 # A[0] * B[1] movq 8(%rbx), %rax mulq (%rsi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rbx), %rax mulq 8(%rsi) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rbx), %rax mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rbx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq (%rbx), %rax mulq 16(%rsi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 24(%rbx), %rax mulq (%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 16(%rbx), %rax mulq 8(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rbx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rbx), %rax mulq 24(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rbx), %rax mulq 8(%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rbx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rbx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rbx), %rax mulq 16(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rbx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rbx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rcx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 8(%rsp), %rdi movq 8(%rsp), %rsi movq 152(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rax mulq (%rsi) movq %rax, %r8 movq %rdx, %r9 # A[0] * B[1] movq 8(%rbx), %rax mulq (%rsi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rbx), %rax mulq 8(%rsi) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rbx), %rax mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rbx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq (%rbx), %rax mulq 16(%rsi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 24(%rbx), %rax mulq (%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 16(%rbx), %rax mulq 8(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rbx), %rax mulq 
16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rbx), %rax mulq 24(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rbx), %rax mulq 8(%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rbx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rbx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rbx), %rax mulq 16(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rbx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rbx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rcx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 24(%rsp), %rdi movq 144(%rsp), %rsi movq 136(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rax mulq (%rsi) movq %rax, %r8 movq %rdx, %r9 # A[0] * B[1] movq 8(%rbx), %rax mulq (%rsi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rbx), %rax mulq 8(%rsi) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rbx), %rax mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rbx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq (%rbx), %rax mulq 16(%rsi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 24(%rbx), %rax mulq (%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 16(%rbx), %rax mulq 8(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rbx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rbx), %rax mulq 24(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rbx), %rax mulq 8(%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rbx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rbx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rbx), %rax mulq 16(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rbx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rbx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top 
half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rcx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) leaq 48(%rsp), %rdi movq 128(%rsp), %rsi movq 128(%rsp), %rbx # Add movq (%rsi), %r8 movq 8(%rsi), %r9 addq (%rbx), %r8 movq 16(%rsi), %r10 adcq 8(%rbx), %r9 movq 24(%rsi), %rcx adcq 16(%rbx), %r10 movq $-19, %rax adcq 24(%rbx), %rcx movq $0x7fffffffffffffff, %rdx movq %rcx, %r11 sarq $63, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Sub modulus (if overflow) subq %rax, %r8 sbbq %rcx, %r9 sbbq %rcx, %r10 sbbq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq (%rsp), %rdi movq 16(%rsp), %rsi movq 8(%rsp), %rbx # Sub movq (%rsi), %r8 movq 8(%rsi), %r9 movq 16(%rsi), %r10 movq 24(%rsi), %r11 subq (%rbx), %r8 movq $0x00, %rcx sbbq 8(%rbx), %r9 movq $-19, %rax sbbq 16(%rbx), %r10 movq $0x7fffffffffffffff, %rdx sbbq 24(%rbx), %r11 sbbq $0x00, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Add modulus (if underflow) addq %rax, %r8 adcq %rcx, %r9 adcq %rcx, %r10 adcq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 8(%rsp), %rdi movq 16(%rsp), %rsi movq 8(%rsp), %rbx # Add movq (%rsi), %r8 movq 8(%rsi), %r9 addq (%rbx), %r8 movq 16(%rsi), %r10 adcq 8(%rbx), %r9 movq 24(%rsi), %rcx adcq 16(%rbx), %r10 movq $-19, %rax adcq 24(%rbx), %rcx movq $0x7fffffffffffffff, %rdx movq %rcx, %r11 sarq $63, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Sub modulus (if overflow) subq %rax, %r8 sbbq %rcx, %r9 sbbq %rcx, %r10 sbbq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 16(%rsp), %rdi leaq 48(%rsp), %rsi movq 24(%rsp), %rbx # Sub movq (%rsi), %r8 movq 8(%rsi), %r9 movq 16(%rsi), %r10 movq 24(%rsi), %r11 subq (%rbx), %r8 movq $0x00, %rcx sbbq 8(%rbx), %r9 movq $-19, %rax sbbq 16(%rbx), %r10 movq $0x7fffffffffffffff, %rdx sbbq 24(%rbx), %r11 sbbq $0x00, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Add modulus (if underflow) addq %rax, %r8 adcq %rcx, %r9 adcq %rcx, %r10 adcq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 24(%rsp), %rdi leaq 48(%rsp), %rsi movq 24(%rsp), %rbx # Add movq (%rsi), %r8 movq 8(%rsi), %r9 addq (%rbx), %r8 movq 16(%rsi), %r10 adcq 8(%rbx), %r9 movq 24(%rsi), %rcx adcq 16(%rbx), %r10 movq $-19, %rax adcq 24(%rbx), %rcx movq $0x7fffffffffffffff, %rdx movq %rcx, %r11 sarq $63, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Sub modulus (if overflow) subq %rax, %r8 sbbq %rcx, %r9 sbbq %rcx, %r10 sbbq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) addq $0x50, %rsp popq %r15 popq %r14 popq %r13 popq %r12 popq %rbx repz retq #ifndef __APPLE__ .size fe_ge_msub_x64,.-fe_ge_msub_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_ge_add_x64 .type fe_ge_add_x64,@function .align 16 fe_ge_add_x64: 
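# fe_ge_add_x64 follows the same layout as fe_ge_madd_x64/fe_ge_msub_x64
# above: a straight-line sequence of field additions, subtractions and
# multiplications modulo p = 2^255 - 19 on 4x64-bit limbs.  The first six
# field-element pointers arrive in %rdi, %rsi, %rdx, %rcx, %r8 and %r9
# (System V AMD64) and are spilled to the local frame at 0(%rsp)..40(%rsp);
# after the five pushed registers, the 0x50-byte frame and the return
# address, the remaining pointer arguments are read from the caller's stack
# at 128(%rsp) and up.  48(%rsp) is scratch space for a doubled intermediate.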
#else .section __TEXT,__text .globl _fe_ge_add_x64 .p2align 4 _fe_ge_add_x64: #endif /* __APPLE__ */ pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $0x50, %rsp movq %rdi, (%rsp) movq %rsi, 8(%rsp) movq %rdx, 16(%rsp) movq %rcx, 24(%rsp) movq %r8, 32(%rsp) movq %r9, 40(%rsp) movq (%rsp), %rdi movq 40(%rsp), %rsi movq 32(%rsp), %rbx # Add movq (%rsi), %r8 movq 8(%rsi), %r9 addq (%rbx), %r8 movq 16(%rsi), %r10 adcq 8(%rbx), %r9 movq 24(%rsi), %rcx adcq 16(%rbx), %r10 movq $-19, %rax adcq 24(%rbx), %rcx movq $0x7fffffffffffffff, %rdx movq %rcx, %r11 sarq $63, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Sub modulus (if overflow) subq %rax, %r8 sbbq %rcx, %r9 sbbq %rcx, %r10 sbbq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 8(%rsp), %rdi movq 40(%rsp), %rsi movq 32(%rsp), %rbx # Sub movq (%rsi), %r8 movq 8(%rsi), %r9 movq 16(%rsi), %r10 movq 24(%rsi), %r11 subq (%rbx), %r8 movq $0x00, %rcx sbbq 8(%rbx), %r9 movq $-19, %rax sbbq 16(%rbx), %r10 movq $0x7fffffffffffffff, %rdx sbbq 24(%rbx), %r11 sbbq $0x00, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Add modulus (if underflow) addq %rax, %r8 adcq %rcx, %r9 adcq %rcx, %r10 adcq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 16(%rsp), %rdi movq (%rsp), %rsi movq 160(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rax mulq (%rsi) movq %rax, %r8 movq %rdx, %r9 # A[0] * B[1] movq 8(%rbx), %rax mulq (%rsi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rbx), %rax mulq 8(%rsi) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rbx), %rax mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rbx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq (%rbx), %rax mulq 16(%rsi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 24(%rbx), %rax mulq (%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 16(%rbx), %rax mulq 8(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rbx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rbx), %rax mulq 24(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rbx), %rax mulq 8(%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rbx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rbx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rbx), %rax mulq 16(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rbx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rbx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx 
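# At this point %rdx holds the part of the intermediate result at bit 255
# and above, expressed in units of 2^255: the shldq above shifted the
# running carry left by one and pulled in the top bit of %r11.  Because
# 2^255 is congruent to 19 modulo p = 2^255 - 19, those bits are folded
# back in below as 19 * %rdx, while the 2^63 - 1 mask in %rcx clears the
# top bit of %r11.  The later "Reduce if top bit set" step folds in the one
# extra carry this addition can still produce.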
imulq $19, %rdx, %rax andq %rcx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 8(%rsp), %rdi movq 8(%rsp), %rsi movq 168(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rax mulq (%rsi) movq %rax, %r8 movq %rdx, %r9 # A[0] * B[1] movq 8(%rbx), %rax mulq (%rsi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rbx), %rax mulq 8(%rsi) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rbx), %rax mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rbx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq (%rbx), %rax mulq 16(%rsi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 24(%rbx), %rax mulq (%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 16(%rbx), %rax mulq 8(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rbx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rbx), %rax mulq 24(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rbx), %rax mulq 8(%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rbx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rbx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rbx), %rax mulq 16(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rbx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rbx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rcx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 24(%rsp), %rdi movq 152(%rsp), %rsi movq 136(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rax mulq (%rsi) movq %rax, %r8 movq %rdx, %r9 # A[0] * B[1] movq 8(%rbx), %rax mulq (%rsi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rbx), %rax mulq 8(%rsi) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rbx), %rax mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rbx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq (%rbx), %rax mulq 16(%rsi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # 
A[0] * B[3] movq 24(%rbx), %rax mulq (%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 16(%rbx), %rax mulq 8(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rbx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rbx), %rax mulq 24(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rbx), %rax mulq 8(%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rbx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rbx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rbx), %rax mulq 16(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rbx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rbx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rcx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq (%rsp), %rdi movq 128(%rsp), %rsi movq 144(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rax mulq (%rsi) movq %rax, %r8 movq %rdx, %r9 # A[0] * B[1] movq 8(%rbx), %rax mulq (%rsi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rbx), %rax mulq 8(%rsi) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rbx), %rax mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rbx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq (%rbx), %rax mulq 16(%rsi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 24(%rbx), %rax mulq (%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 16(%rbx), %rax mulq 8(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rbx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rbx), %rax mulq 24(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rbx), %rax mulq 8(%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rbx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rbx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rbx), %rax mulq 16(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rbx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rbx), %rax mulq 24(%rsi) addq %rax, 
%r14 adcq %rdx, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rcx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) leaq 48(%rsp), %rdi movq (%rsp), %rsi movq (%rsp), %rbx # Add movq (%rsi), %r8 movq 8(%rsi), %r9 addq (%rbx), %r8 movq 16(%rsi), %r10 adcq 8(%rbx), %r9 movq 24(%rsi), %rcx adcq 16(%rbx), %r10 movq $-19, %rax adcq 24(%rbx), %rcx movq $0x7fffffffffffffff, %rdx movq %rcx, %r11 sarq $63, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Sub modulus (if overflow) subq %rax, %r8 sbbq %rcx, %r9 sbbq %rcx, %r10 sbbq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq (%rsp), %rdi movq 16(%rsp), %rsi movq 8(%rsp), %rbx # Sub movq (%rsi), %r8 movq 8(%rsi), %r9 movq 16(%rsi), %r10 movq 24(%rsi), %r11 subq (%rbx), %r8 movq $0x00, %rcx sbbq 8(%rbx), %r9 movq $-19, %rax sbbq 16(%rbx), %r10 movq $0x7fffffffffffffff, %rdx sbbq 24(%rbx), %r11 sbbq $0x00, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Add modulus (if underflow) addq %rax, %r8 adcq %rcx, %r9 adcq %rcx, %r10 adcq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 8(%rsp), %rdi movq 16(%rsp), %rsi movq 8(%rsp), %rbx # Add movq (%rsi), %r8 movq 8(%rsi), %r9 addq (%rbx), %r8 movq 16(%rsi), %r10 adcq 8(%rbx), %r9 movq 24(%rsi), %rcx adcq 16(%rbx), %r10 movq $-19, %rax adcq 24(%rbx), %rcx movq $0x7fffffffffffffff, %rdx movq %rcx, %r11 sarq $63, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Sub modulus (if overflow) subq %rax, %r8 sbbq %rcx, %r9 sbbq %rcx, %r10 sbbq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 16(%rsp), %rdi leaq 48(%rsp), %rsi movq 24(%rsp), %rbx # Add movq (%rsi), %r8 movq 8(%rsi), %r9 addq (%rbx), %r8 movq 16(%rsi), %r10 adcq 8(%rbx), %r9 movq 24(%rsi), %rcx adcq 16(%rbx), %r10 movq $-19, %rax adcq 24(%rbx), %rcx movq $0x7fffffffffffffff, %rdx movq %rcx, %r11 sarq $63, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Sub modulus (if overflow) subq %rax, %r8 sbbq %rcx, %r9 sbbq %rcx, %r10 sbbq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 24(%rsp), %rdi leaq 48(%rsp), %rsi movq 24(%rsp), %rbx # Sub movq (%rsi), %r8 movq 8(%rsi), %r9 movq 16(%rsi), %r10 movq 24(%rsi), %r11 subq (%rbx), %r8 movq $0x00, %rcx sbbq 8(%rbx), %r9 movq $-19, %rax sbbq 16(%rbx), %r10 movq $0x7fffffffffffffff, %rdx sbbq 24(%rbx), %r11 sbbq $0x00, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Add modulus (if underflow) addq %rax, %r8 adcq %rcx, %r9 adcq %rcx, %r10 adcq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) addq $0x50, %rsp popq 
%r15 popq %r14 popq %r13 popq %r12 popq %rbx repz retq #ifndef __APPLE__ .size fe_ge_add_x64,.-fe_ge_add_x64 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_ge_sub_x64 .type fe_ge_sub_x64,@function .align 16 fe_ge_sub_x64: #else .section __TEXT,__text .globl _fe_ge_sub_x64 .p2align 4 _fe_ge_sub_x64: #endif /* __APPLE__ */ pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $0x50, %rsp movq %rdi, (%rsp) movq %rsi, 8(%rsp) movq %rdx, 16(%rsp) movq %rcx, 24(%rsp) movq %r8, 32(%rsp) movq %r9, 40(%rsp) movq (%rsp), %rdi movq 40(%rsp), %rsi movq 32(%rsp), %rbx # Add movq (%rsi), %r8 movq 8(%rsi), %r9 addq (%rbx), %r8 movq 16(%rsi), %r10 adcq 8(%rbx), %r9 movq 24(%rsi), %rcx adcq 16(%rbx), %r10 movq $-19, %rax adcq 24(%rbx), %rcx movq $0x7fffffffffffffff, %rdx movq %rcx, %r11 sarq $63, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Sub modulus (if overflow) subq %rax, %r8 sbbq %rcx, %r9 sbbq %rcx, %r10 sbbq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 8(%rsp), %rdi movq 40(%rsp), %rsi movq 32(%rsp), %rbx # Sub movq (%rsi), %r8 movq 8(%rsi), %r9 movq 16(%rsi), %r10 movq 24(%rsi), %r11 subq (%rbx), %r8 movq $0x00, %rcx sbbq 8(%rbx), %r9 movq $-19, %rax sbbq 16(%rbx), %r10 movq $0x7fffffffffffffff, %rdx sbbq 24(%rbx), %r11 sbbq $0x00, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Add modulus (if underflow) addq %rax, %r8 adcq %rcx, %r9 adcq %rcx, %r10 adcq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 16(%rsp), %rdi movq (%rsp), %rsi movq 168(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rax mulq (%rsi) movq %rax, %r8 movq %rdx, %r9 # A[0] * B[1] movq 8(%rbx), %rax mulq (%rsi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rbx), %rax mulq 8(%rsi) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rbx), %rax mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rbx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq (%rbx), %rax mulq 16(%rsi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 24(%rbx), %rax mulq (%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 16(%rbx), %rax mulq 8(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rbx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rbx), %rax mulq 24(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rbx), %rax mulq 8(%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rbx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rbx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rbx), %rax mulq 16(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rbx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rbx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 
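# Reduction of the 512-bit schoolbook product splits it at bit 255: the low
# 255 bits stay in %r8..%r11 (top bit of %r11 masked off above) while the
# high part, shifted into %r12..%r15 by the shldq chain, is multiplied limb
# by limb by 19 and accumulated, since
#     hi * 2^255 + lo  ==  19 * hi + lo   (mod 2^255 - 19).
# %rax has just been reloaded with 19 for the mulq of %r14 below.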
mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rcx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 8(%rsp), %rdi movq 8(%rsp), %rsi movq 160(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rax mulq (%rsi) movq %rax, %r8 movq %rdx, %r9 # A[0] * B[1] movq 8(%rbx), %rax mulq (%rsi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rbx), %rax mulq 8(%rsi) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rbx), %rax mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rbx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq (%rbx), %rax mulq 16(%rsi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 24(%rbx), %rax mulq (%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 16(%rbx), %rax mulq 8(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rbx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rbx), %rax mulq 24(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rbx), %rax mulq 8(%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rbx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rbx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rbx), %rax mulq 16(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rbx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rbx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rcx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 24(%rsp), %rdi movq 152(%rsp), %rsi movq 136(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rax mulq (%rsi) movq %rax, %r8 movq %rdx, %r9 # A[0] * B[1] movq 8(%rbx), %rax mulq (%rsi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rbx), %rax mulq 8(%rsi) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rbx), %rax mulq (%rsi) addq 
%rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rbx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq (%rbx), %rax mulq 16(%rsi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 24(%rbx), %rax mulq (%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 16(%rbx), %rax mulq 8(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rbx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rbx), %rax mulq 24(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rbx), %rax mulq 8(%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rbx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rbx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rbx), %rax mulq 16(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rbx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rbx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rcx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq (%rsp), %rdi movq 128(%rsp), %rsi movq 144(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rax mulq (%rsi) movq %rax, %r8 movq %rdx, %r9 # A[0] * B[1] movq 8(%rbx), %rax mulq (%rsi) xorq %r10, %r10 addq %rax, %r9 adcq %rdx, %r10 # A[1] * B[0] movq (%rbx), %rax mulq 8(%rsi) xorq %r11, %r11 addq %rax, %r9 adcq %rdx, %r10 adcq $0x00, %r11 # A[0] * B[2] movq 16(%rbx), %rax mulq (%rsi) addq %rax, %r10 adcq %rdx, %r11 # A[1] * B[1] movq 8(%rbx), %rax mulq 8(%rsi) xorq %r12, %r12 addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[2] * B[0] movq (%rbx), %rax mulq 16(%rsi) addq %rax, %r10 adcq %rdx, %r11 adcq $0x00, %r12 # A[0] * B[3] movq 24(%rbx), %rax mulq (%rsi) xorq %r13, %r13 addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[2] movq 16(%rbx), %rax mulq 8(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[2] * B[1] movq 8(%rbx), %rax mulq 16(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[3] * B[0] movq (%rbx), %rax mulq 24(%rsi) addq %rax, %r11 adcq %rdx, %r12 adcq $0x00, %r13 # A[1] * B[3] movq 24(%rbx), %rax mulq 8(%rsi) xorq %r14, %r14 addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[2] movq 16(%rbx), %rax mulq 16(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[3] * B[1] movq 8(%rbx), %rax mulq 24(%rsi) addq %rax, %r12 adcq %rdx, %r13 adcq $0x00, %r14 # A[2] * B[3] movq 24(%rbx), %rax 
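# (The mulq for the A[2] * B[3] term loaded above follows below.)  These x64
# "Multiply" blocks compute a plain 4x4-limb schoolbook product: each
# widening mulq leaves A[i] * B[j] in %rdx:%rax, whose low word is added
# into the column accumulator %r(8+i+j) and whose high word carries into the
# next register.  Each new accumulator is zeroed with xorq before its column
# is first touched, and trailing "adcq $0x00" instructions push any carry
# one register further up, so the full 512-bit product ends in %r8..%r15.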
mulq 16(%rsi) xorq %r15, %r15 addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[2] movq 16(%rbx), %rax mulq 24(%rsi) addq %rax, %r13 adcq %rdx, %r14 adcq $0x00, %r15 # A[3] * B[3] movq 24(%rbx), %rax mulq 24(%rsi) addq %rax, %r14 adcq %rdx, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rax mulq %r12 xorq %r12, %r12 addq %rax, %r8 movq $19, %rax adcq %rdx, %r12 mulq %r13 xorq %r13, %r13 addq %rax, %r9 movq $19, %rax adcq %rdx, %r13 mulq %r14 xorq %r14, %r14 addq %rax, %r10 movq $19, %rax adcq %rdx, %r14 mulq %r15 # Add remaining product results in addq %r12, %r9 adcq %r13, %r10 adcq %r14, %r11 adcq %rax, %r11 adcq $0x00, %rdx # Overflow shldq $0x01, %r11, %rdx imulq $19, %rdx, %rax andq %rcx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) leaq 48(%rsp), %rdi movq (%rsp), %rsi movq (%rsp), %rbx # Add movq (%rsi), %r8 movq 8(%rsi), %r9 addq (%rbx), %r8 movq 16(%rsi), %r10 adcq 8(%rbx), %r9 movq 24(%rsi), %rcx adcq 16(%rbx), %r10 movq $-19, %rax adcq 24(%rbx), %rcx movq $0x7fffffffffffffff, %rdx movq %rcx, %r11 sarq $63, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Sub modulus (if overflow) subq %rax, %r8 sbbq %rcx, %r9 sbbq %rcx, %r10 sbbq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq (%rsp), %rdi movq 16(%rsp), %rsi movq 8(%rsp), %rbx # Sub movq (%rsi), %r8 movq 8(%rsi), %r9 movq 16(%rsi), %r10 movq 24(%rsi), %r11 subq (%rbx), %r8 movq $0x00, %rcx sbbq 8(%rbx), %r9 movq $-19, %rax sbbq 16(%rbx), %r10 movq $0x7fffffffffffffff, %rdx sbbq 24(%rbx), %r11 sbbq $0x00, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Add modulus (if underflow) addq %rax, %r8 adcq %rcx, %r9 adcq %rcx, %r10 adcq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 8(%rsp), %rdi movq 16(%rsp), %rsi movq 8(%rsp), %rbx # Add movq (%rsi), %r8 movq 8(%rsi), %r9 addq (%rbx), %r8 movq 16(%rsi), %r10 adcq 8(%rbx), %r9 movq 24(%rsi), %rcx adcq 16(%rbx), %r10 movq $-19, %rax adcq 24(%rbx), %rcx movq $0x7fffffffffffffff, %rdx movq %rcx, %r11 sarq $63, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Sub modulus (if overflow) subq %rax, %r8 sbbq %rcx, %r9 sbbq %rcx, %r10 sbbq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 16(%rsp), %rdi leaq 48(%rsp), %rsi movq 24(%rsp), %rbx # Sub movq (%rsi), %r8 movq 8(%rsi), %r9 movq 16(%rsi), %r10 movq 24(%rsi), %r11 subq (%rbx), %r8 movq $0x00, %rcx sbbq 8(%rbx), %r9 movq $-19, %rax sbbq 16(%rbx), %r10 movq $0x7fffffffffffffff, %rdx sbbq 24(%rbx), %r11 sbbq $0x00, %rcx # Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Add modulus (if underflow) addq %rax, %r8 adcq %rcx, %r9 adcq %rcx, %r10 adcq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 24(%rsp), %rdi leaq 48(%rsp), %rsi movq 24(%rsp), %rbx # Add movq (%rsi), %r8 movq 8(%rsi), %r9 addq (%rbx), %r8 movq 16(%rsi), %r10 adcq 8(%rbx), %r9 movq 24(%rsi), %rcx adcq 16(%rbx), %r10 movq $-19, %rax adcq 24(%rbx), %rcx movq $0x7fffffffffffffff, %rdx movq %rcx, %r11 sarq $63, %rcx # 
Mask the modulus andq %rcx, %rax andq %rcx, %rdx # Sub modulus (if overflow) subq %rax, %r8 sbbq %rcx, %r9 sbbq %rcx, %r10 sbbq %rdx, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) addq $0x50, %rsp popq %r15 popq %r14 popq %r13 popq %r12 popq %rbx repz retq #ifndef __APPLE__ .size fe_ge_sub_x64,.-fe_ge_sub_x64 #endif /* __APPLE__ */ #ifdef HAVE_INTEL_AVX2 #ifndef __APPLE__ .text .globl fe_mul_avx2 .type fe_mul_avx2,@function .align 16 fe_mul_avx2: #else .section __TEXT,__text .globl _fe_mul_avx2 .p2align 4 _fe_mul_avx2: #endif /* __APPLE__ */ pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %rbx movq %rdx, %rbx # Multiply # A[0] * B[0] movq (%rbx), %rdx mulxq (%rsi), %r8, %r9 # A[2] * B[0] mulxq 16(%rsi), %r10, %r11 # A[1] * B[0] mulxq 8(%rsi), %rax, %rcx xorq %r15, %r15 adcxq %rax, %r9 # A[1] * B[3] movq 24(%rbx), %rdx mulxq 8(%rsi), %r12, %r13 adcxq %rcx, %r10 # A[0] * B[1] movq 8(%rbx), %rdx mulxq (%rsi), %rax, %rcx adoxq %rax, %r9 # A[2] * B[1] mulxq 16(%rsi), %rax, %r14 adoxq %rcx, %r10 adcxq %rax, %r11 # A[1] * B[2] movq 16(%rbx), %rdx mulxq 8(%rsi), %rax, %rcx adcxq %r14, %r12 adoxq %rax, %r11 adcxq %r15, %r13 adoxq %rcx, %r12 # A[0] * B[2] mulxq (%rsi), %rax, %rcx adoxq %r15, %r13 xorq %r14, %r14 adcxq %rax, %r10 # A[1] * B[1] movq 8(%rbx), %rdx mulxq 8(%rsi), %rdx, %rax adcxq %rcx, %r11 adoxq %rdx, %r10 # A[3] * B[1] movq 8(%rbx), %rdx adoxq %rax, %r11 mulxq 24(%rsi), %rax, %rcx adcxq %rax, %r12 # A[2] * B[2] movq 16(%rbx), %rdx mulxq 16(%rsi), %rdx, %rax adcxq %rcx, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 24(%rbx), %rdx adoxq %rax, %r13 mulxq 24(%rsi), %rax, %rcx adoxq %r15, %r14 adcxq %rax, %r14 # A[0] * B[3] mulxq (%rsi), %rdx, %rax adcxq %rcx, %r15 xorq %rcx, %rcx adcxq %rdx, %r11 # A[3] * B[0] movq (%rbx), %rdx adcxq %rax, %r12 mulxq 24(%rsi), %rdx, %rax adoxq %rdx, %r11 adoxq %rax, %r12 # A[2] * B[3] movq 24(%rbx), %rdx mulxq 16(%rsi), %rdx, %rax adcxq %rdx, %r13 # A[3] * B[2] movq 16(%rbx), %rdx adcxq %rax, %r14 mulxq 24(%rsi), %rax, %rdx adcxq %rcx, %r15 adoxq %rax, %r13 adoxq %rdx, %r14 adoxq %rcx, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rdx xorq %rcx, %rcx mulxq %r12, %rax, %r12 adcxq %rax, %r8 adoxq %r12, %r9 mulxq %r13, %rax, %r13 adcxq %rax, %r9 adoxq %r13, %r10 mulxq %r14, %rax, %r14 adcxq %rax, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rcx, %rdx adcxq %rcx, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, %rcx imulq $19, %rdx, %rax andq %rcx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) popq %rbx popq %r15 popq %r14 popq %r13 popq %r12 repz retq #ifndef __APPLE__ .size fe_mul_avx2,.-fe_mul_avx2 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_sq_avx2 .type fe_sq_avx2,@function .align 16 fe_sq_avx2: #else .section __TEXT,__text .globl _fe_sq_avx2 .p2align 4 _fe_sq_avx2: #endif /* __APPLE__ */ pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 # Square # A[0] * A[1] movq (%rsi), %rdx mulxq 8(%rsi), %r9, %r10 # A[0] * A[3] mulxq 24(%rsi), %r11, %r12 # A[2] * A[1] movq 16(%rsi), %rdx mulxq 8(%rsi), %rcx, %rbx xorq %r15, 
%r15 adoxq %rcx, %r11 # A[2] * A[3] mulxq 24(%rsi), %r13, %r14 adoxq %rbx, %r12 # A[2] * A[0] mulxq (%rsi), %rcx, %rbx adoxq %r15, %r13 adcxq %rcx, %r10 adoxq %r15, %r14 # A[1] * A[3] movq 8(%rsi), %rdx mulxq 24(%rsi), %rax, %r8 adcxq %rbx, %r11 adcxq %rax, %r12 adcxq %r8, %r13 adcxq %r15, %r14 # Double with Carry Flag xorq %r15, %r15 # A[0] * A[0] movq (%rsi), %rdx mulxq %rdx, %r8, %rax adcxq %r9, %r9 # A[1] * A[1] movq 8(%rsi), %rdx mulxq %rdx, %rcx, %rbx adcxq %r10, %r10 adoxq %rax, %r9 adcxq %r11, %r11 adoxq %rcx, %r10 # A[2] * A[2] movq 16(%rsi), %rdx mulxq %rdx, %rax, %rcx adcxq %r12, %r12 adoxq %rbx, %r11 adcxq %r13, %r13 adoxq %rax, %r12 # A[3] * A[3] movq 24(%rsi), %rdx mulxq %rdx, %rax, %rbx adcxq %r14, %r14 adoxq %rcx, %r13 adcxq %r15, %r15 adoxq %rax, %r14 adoxq %rbx, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rdx xorq %rcx, %rcx mulxq %r12, %rax, %r12 adcxq %rax, %r8 adoxq %r12, %r9 mulxq %r13, %rax, %r13 adcxq %rax, %r9 adoxq %r13, %r10 mulxq %r14, %rax, %r14 adcxq %rax, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rcx, %rdx adcxq %rcx, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, %rcx imulq $19, %rdx, %rax andq %rcx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) popq %r15 popq %r14 popq %r13 popq %r12 popq %rbx repz retq #ifndef __APPLE__ .size fe_sq_avx2,.-fe_sq_avx2 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_sq_n_avx2 .type fe_sq_n_avx2,@function .align 16 fe_sq_n_avx2: #else .section __TEXT,__text .globl _fe_sq_n_avx2 .p2align 4 _fe_sq_n_avx2: #endif /* __APPLE__ */ pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %rbp movq %rdx, %rbp L_fe_sq_n_avx2: # Square # A[0] * A[1] movq (%rsi), %rdx mulxq 8(%rsi), %r9, %r10 # A[0] * A[3] mulxq 24(%rsi), %r11, %r12 # A[2] * A[1] movq 16(%rsi), %rdx mulxq 8(%rsi), %rcx, %rbx xorq %r15, %r15 adoxq %rcx, %r11 # A[2] * A[3] mulxq 24(%rsi), %r13, %r14 adoxq %rbx, %r12 # A[2] * A[0] mulxq (%rsi), %rcx, %rbx adoxq %r15, %r13 adcxq %rcx, %r10 adoxq %r15, %r14 # A[1] * A[3] movq 8(%rsi), %rdx mulxq 24(%rsi), %rax, %r8 adcxq %rbx, %r11 adcxq %rax, %r12 adcxq %r8, %r13 adcxq %r15, %r14 # Double with Carry Flag xorq %r15, %r15 # A[0] * A[0] movq (%rsi), %rdx mulxq %rdx, %r8, %rax adcxq %r9, %r9 # A[1] * A[1] movq 8(%rsi), %rdx mulxq %rdx, %rcx, %rbx adcxq %r10, %r10 adoxq %rax, %r9 adcxq %r11, %r11 adoxq %rcx, %r10 # A[2] * A[2] movq 16(%rsi), %rdx mulxq %rdx, %rax, %rcx adcxq %r12, %r12 adoxq %rbx, %r11 adcxq %r13, %r13 adoxq %rax, %r12 # A[3] * A[3] movq 24(%rsi), %rdx mulxq %rdx, %rax, %rbx adcxq %r14, %r14 adoxq %rcx, %r13 adcxq %r15, %r15 adoxq %rax, %r14 adoxq %rbx, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rdx xorq %rcx, %rcx mulxq %r12, %rax, %r12 adcxq %rax, %r8 adoxq %r12, %r9 mulxq %r13, %rax, %r13 adcxq %rax, %r9 adoxq %r13, %r10 mulxq %r14, %rax, %r14 adcxq %rax, %r10 adoxq %r14, %r11 mulxq %r15, %r15, 
%rdx adcxq %r15, %r11 adoxq %rcx, %rdx adcxq %rcx, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, %rcx imulq $19, %rdx, %rax andq %rcx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) decb %bpl jnz L_fe_sq_n_avx2 popq %rbp popq %r15 popq %r14 popq %r13 popq %r12 popq %rbx repz retq #ifndef __APPLE__ .size fe_sq_n_avx2,.-fe_sq_n_avx2 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_mul121666_avx2 .type fe_mul121666_avx2,@function .align 16 fe_mul121666_avx2: #else .section __TEXT,__text .globl _fe_mul121666_avx2 .p2align 4 _fe_mul121666_avx2: #endif /* __APPLE__ */ pushq %r12 pushq %r13 movq $0x1db42, %rdx mulxq (%rsi), %rax, %r13 mulxq 8(%rsi), %rcx, %r12 mulxq 16(%rsi), %r8, %r11 mulxq 24(%rsi), %r9, %r10 addq %r13, %rcx adcq %r12, %r8 adcq %r11, %r9 adcq $0x00, %r10 movq $0x7fffffffffffffff, %r13 shldq $0x01, %r9, %r10 andq %r13, %r9 imulq $19, %r10, %r10 addq %r10, %rax adcq $0x00, %rcx adcq $0x00, %r8 adcq $0x00, %r9 movq %rax, (%rdi) movq %rcx, 8(%rdi) movq %r8, 16(%rdi) movq %r9, 24(%rdi) popq %r13 popq %r12 repz retq #ifndef __APPLE__ .size fe_mul121666_avx2,.-fe_mul121666_avx2 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_sq2_avx2 .type fe_sq2_avx2,@function .align 16 fe_sq2_avx2: #else .section __TEXT,__text .globl _fe_sq2_avx2 .p2align 4 _fe_sq2_avx2: #endif /* __APPLE__ */ pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 # Square * 2 # A[0] * A[1] movq (%rsi), %rdx mulxq 8(%rsi), %r9, %r10 # A[0] * A[3] mulxq 24(%rsi), %r11, %r12 # A[2] * A[1] movq 16(%rsi), %rdx mulxq 8(%rsi), %rcx, %rbx xorq %r15, %r15 adoxq %rcx, %r11 # A[2] * A[3] mulxq 24(%rsi), %r13, %r14 adoxq %rbx, %r12 # A[2] * A[0] mulxq (%rsi), %rcx, %rbx adoxq %r15, %r13 adcxq %rcx, %r10 adoxq %r15, %r14 # A[1] * A[3] movq 8(%rsi), %rdx mulxq 24(%rsi), %rax, %r8 adcxq %rbx, %r11 adcxq %rax, %r12 adcxq %r8, %r13 adcxq %r15, %r14 # Double with Carry Flag xorq %r15, %r15 # A[0] * A[0] movq (%rsi), %rdx mulxq %rdx, %r8, %rax adcxq %r9, %r9 # A[1] * A[1] movq 8(%rsi), %rdx mulxq %rdx, %rcx, %rbx adcxq %r10, %r10 adoxq %rax, %r9 adcxq %r11, %r11 adoxq %rcx, %r10 # A[2] * A[2] movq 16(%rsi), %rdx mulxq %rdx, %rax, %rcx adcxq %r12, %r12 adoxq %rbx, %r11 adcxq %r13, %r13 adoxq %rax, %r12 # A[3] * A[3] movq 24(%rsi), %rdx mulxq %rdx, %rax, %rbx adcxq %r14, %r14 adoxq %rcx, %r13 adcxq %r15, %r15 adoxq %rax, %r14 adoxq %rbx, %r15 # Reduce movq $0x7fffffffffffffff, %rbx xorq %rax, %rax # Move top half into t4-t7 and remove top bit from t3 and double shldq $3, %r15, %rax shldq $2, %r14, %r15 shldq $2, %r13, %r14 shldq $2, %r12, %r13 shldq $2, %r11, %r12 shldq $0x01, %r10, %r11 shldq $0x01, %r9, %r10 shldq $0x01, %r8, %r9 shlq $0x01, %r8 andq %rbx, %r11 # Two out left, one in right andq %rbx, %r15 # Multiply top bits by 19*19 imulq $0x169, %rax, %rcx xorq %rbx, %rbx # Multiply top half by 19 movq $19, %rdx adoxq %rcx, %r8 mulxq %r12, %rax, %r12 adcxq %rax, %r8 adoxq %r12, %r9 mulxq %r13, %rax, %r13 adcxq %rax, %r9 adoxq %r13, %r10 mulxq %r14, %rax, %r14 adcxq %rax, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rbx, %rdx adcxq %rbx, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, %rbx imulq $19, %rdx, %rax andq %rbx, %r11 addq %rax, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if 
top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rbx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) popq %r15 popq %r14 popq %r13 popq %r12 popq %rbx repz retq #ifndef __APPLE__ .size fe_sq2_avx2,.-fe_sq2_avx2 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_invert_avx2 .type fe_invert_avx2,@function .align 16 fe_invert_avx2: #else .section __TEXT,__text .globl _fe_invert_avx2 .p2align 4 _fe_invert_avx2: #endif /* __APPLE__ */ subq $0x90, %rsp # Invert movq %rdi, 128(%rsp) movq %rsi, 136(%rsp) movq %rsp, %rdi movq 136(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi movq %rsp, %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi movq 136(%rsp), %rsi leaq 32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ movq %rsp, %rdi movq %rsp, %rsi leaq 32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi movq %rsp, %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi movq $4, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 64(%rsp), %rsi leaq 32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi movq $9, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi leaq 32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $19, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi movq $9, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 64(%rsp), %rsi leaq 32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi movq $49, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi leaq 
32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $0x63, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi movq $49, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 64(%rsp), %rsi leaq 32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi movq $4, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ movq 128(%rsp), %rdi leaq 32(%rsp), %rsi movq %rsp, %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ movq 136(%rsp), %rsi movq 128(%rsp), %rdi addq $0x90, %rsp repz retq #ifndef __APPLE__ .text .globl curve25519_avx2 .type curve25519_avx2,@function .align 16 curve25519_avx2: #else .section __TEXT,__text .globl _curve25519_avx2 .p2align 4 _curve25519_avx2: #endif /* __APPLE__ */ pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 pushq %rbp movq %rdx, %r8 subq $0xc0, %rsp movq $0x00, 184(%rsp) movq %rdi, 176(%rsp) # Set one movq $0x01, (%rdi) movq $0x00, 8(%rdi) movq $0x00, 16(%rdi) movq $0x00, 24(%rdi) # Set zero movq $0x00, (%rsp) movq $0x00, 8(%rsp) movq $0x00, 16(%rsp) movq $0x00, 24(%rsp) # Set one movq $0x01, 32(%rsp) movq $0x00, 40(%rsp) movq $0x00, 48(%rsp) movq $0x00, 56(%rsp) # Copy movq (%r8), %r9 movq 8(%r8), %r10 movq 16(%r8), %r11 movq 24(%r8), %r12 movq %r9, 64(%rsp) movq %r10, 72(%rsp) movq %r11, 80(%rsp) movq %r12, 88(%rsp) movb $62, 168(%rsp) movq $3, 160(%rsp) L_curve25519_avx2_words: L_curve25519_avx2_bits: movq 184(%rsp), %rbx movq 160(%rsp), %r9 movb 168(%rsp), %cl movq (%rsi,%r9,8), %rax shrq %cl, %rax andq $0x01, %rax xorq %rax, %rbx negq %rbx # Conditional Swap movq (%rdi), %r9 movq 8(%rdi), %r10 movq 16(%rdi), %r11 movq 24(%rdi), %r12 xorq 64(%rsp), %r9 xorq 72(%rsp), %r10 xorq 80(%rsp), %r11 xorq 88(%rsp), %r12 andq %rbx, %r9 andq %rbx, %r10 andq %rbx, %r11 andq %rbx, %r12 xorq %r9, (%rdi) xorq %r10, 8(%rdi) xorq %r11, 16(%rdi) xorq %r12, 24(%rdi) xorq %r9, 64(%rsp) xorq %r10, 72(%rsp) xorq %r11, 80(%rsp) xorq %r12, 88(%rsp) # Conditional Swap movq (%rsp), %r9 movq 8(%rsp), %r10 movq 16(%rsp), %r11 movq 24(%rsp), %r12 xorq 32(%rsp), %r9 xorq 40(%rsp), %r10 xorq 48(%rsp), %r11 xorq 56(%rsp), %r12 andq %rbx, %r9 andq %rbx, %r10 andq %rbx, %r11 andq %rbx, %r12 xorq %r9, (%rsp) xorq %r10, 8(%rsp) xorq %r11, 16(%rsp) xorq %r12, 24(%rsp) xorq %r9, 32(%rsp) xorq %r10, 40(%rsp) xorq %r11, 48(%rsp) xorq %r12, 56(%rsp) movq %rax, 184(%rsp) # Add movq (%rdi), %r9 movq 8(%rdi), %r10 movq 16(%rdi), %r11 movq 24(%rdi), %rax movq %r9, %r13 addq (%rsp), %r9 movq %r10, %r14 adcq 8(%rsp), %r10 movq %r11, %r15 adcq 16(%rsp), %r11 movq %rax, %rbp adcq 24(%rsp), %rax movq $-19, %rcx movq %rax, %r12 movq $0x7fffffffffffffff, %rbx sarq $63, %rax # 
Mask the modulus andq %rax, %rcx andq %rax, %rbx # Sub modulus (if overflow) subq %rcx, %r9 sbbq %rax, %r10 sbbq %rax, %r11 sbbq %rbx, %r12 # Sub subq (%rsp), %r13 movq $0x00, %rax sbbq 8(%rsp), %r14 movq $-19, %rcx sbbq 16(%rsp), %r15 movq $0x7fffffffffffffff, %rbx sbbq 24(%rsp), %rbp sbbq $0x00, %rax # Mask the modulus andq %rax, %rcx andq %rax, %rbx # Add modulus (if underflow) addq %rcx, %r13 adcq %rax, %r14 adcq %rax, %r15 adcq %rbx, %rbp movq %r9, (%rdi) movq %r10, 8(%rdi) movq %r11, 16(%rdi) movq %r12, 24(%rdi) movq %r13, 128(%rsp) movq %r14, 136(%rsp) movq %r15, 144(%rsp) movq %rbp, 152(%rsp) # Add movq 64(%rsp), %r9 movq 72(%rsp), %r10 movq 80(%rsp), %r11 movq 88(%rsp), %rax movq %r9, %r13 addq 32(%rsp), %r9 movq %r10, %r14 adcq 40(%rsp), %r10 movq %r11, %r15 adcq 48(%rsp), %r11 movq %rax, %rbp adcq 56(%rsp), %rax movq $-19, %rcx movq %rax, %r12 movq $0x7fffffffffffffff, %rbx sarq $63, %rax # Mask the modulus andq %rax, %rcx andq %rax, %rbx # Sub modulus (if overflow) subq %rcx, %r9 sbbq %rax, %r10 sbbq %rax, %r11 sbbq %rbx, %r12 # Sub subq 32(%rsp), %r13 movq $0x00, %rax sbbq 40(%rsp), %r14 movq $-19, %rcx sbbq 48(%rsp), %r15 movq $0x7fffffffffffffff, %rbx sbbq 56(%rsp), %rbp sbbq $0x00, %rax # Mask the modulus andq %rax, %rcx andq %rax, %rbx # Add modulus (if underflow) addq %rcx, %r13 adcq %rax, %r14 adcq %rax, %r15 adcq %rbx, %rbp movq %r9, (%rsp) movq %r10, 8(%rsp) movq %r11, 16(%rsp) movq %r12, 24(%rsp) movq %r13, 96(%rsp) movq %r14, 104(%rsp) movq %r15, 112(%rsp) movq %rbp, 120(%rsp) # Multiply # A[0] * B[0] movq (%rdi), %rdx mulxq 96(%rsp), %r9, %r10 # A[2] * B[0] mulxq 112(%rsp), %r11, %r12 # A[1] * B[0] mulxq 104(%rsp), %rcx, %rbx xorq %rbp, %rbp adcxq %rcx, %r10 # A[1] * B[3] movq 24(%rdi), %rdx mulxq 104(%rsp), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] movq 8(%rdi), %rdx mulxq 96(%rsp), %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] mulxq 112(%rsp), %rcx, %r15 adoxq %rbx, %r11 adcxq %rcx, %r12 # A[1] * B[2] movq 16(%rdi), %rdx mulxq 104(%rsp), %rcx, %rbx adcxq %r15, %r13 adoxq %rcx, %r12 adcxq %rbp, %r14 adoxq %rbx, %r13 # A[0] * B[2] mulxq 96(%rsp), %rcx, %rbx adoxq %rbp, %r14 xorq %r15, %r15 adcxq %rcx, %r11 # A[1] * B[1] movq 8(%rdi), %rdx mulxq 104(%rsp), %rdx, %rcx adcxq %rbx, %r12 adoxq %rdx, %r11 # A[3] * B[1] movq 8(%rdi), %rdx adoxq %rcx, %r12 mulxq 120(%rsp), %rcx, %rbx adcxq %rcx, %r13 # A[2] * B[2] movq 16(%rdi), %rdx mulxq 112(%rsp), %rdx, %rcx adcxq %rbx, %r14 adoxq %rdx, %r13 # A[3] * B[3] movq 24(%rdi), %rdx adoxq %rcx, %r14 mulxq 120(%rsp), %rcx, %rbx adoxq %rbp, %r15 adcxq %rcx, %r15 # A[0] * B[3] mulxq 96(%rsp), %rdx, %rcx adcxq %rbx, %rbp xorq %rbx, %rbx adcxq %rdx, %r12 # A[3] * B[0] movq (%rdi), %rdx adcxq %rcx, %r13 mulxq 120(%rsp), %rdx, %rcx adoxq %rdx, %r12 adoxq %rcx, %r13 # A[2] * B[3] movq 24(%rdi), %rdx mulxq 112(%rsp), %rdx, %rcx adcxq %rdx, %r14 # A[3] * B[2] movq 16(%rdi), %rdx adcxq %rcx, %r15 mulxq 120(%rsp), %rcx, %rdx adcxq %rbx, %rbp adoxq %rcx, %r14 adoxq %rdx, %r15 adoxq %rbx, %rbp # Reduce movq $0x7fffffffffffffff, %rbx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r15, %rbp shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 andq %rbx, %r12 # Multiply top half by 19 movq $19, %rdx xorq %rbx, %rbx mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %rcx, %r15 adcxq %rcx, %r11 adoxq %r15, %r12 mulxq %rbp, %rbp, %rdx adcxq %rbp, %r12 adoxq %rbx, %rdx adcxq %rbx, %rdx # Overflow shldq $0x01, %r12, %rdx movq 
$0x7fffffffffffffff, %rbx imulq $19, %rdx, %rcx andq %rbx, %r12 addq %rcx, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 # Reduce if top bit set movq %r12, %rdx sarq $63, %rdx andq $19, %rdx andq %rbx, %r12 addq %rdx, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 # Store movq %r9, 32(%rsp) movq %r10, 40(%rsp) movq %r11, 48(%rsp) movq %r12, 56(%rsp) # Multiply # A[0] * B[0] movq 128(%rsp), %rdx mulxq (%rsp), %r9, %r10 # A[2] * B[0] mulxq 16(%rsp), %r11, %r12 # A[1] * B[0] mulxq 8(%rsp), %rcx, %rbx xorq %rbp, %rbp adcxq %rcx, %r10 # A[1] * B[3] movq 152(%rsp), %rdx mulxq 8(%rsp), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] movq 136(%rsp), %rdx mulxq (%rsp), %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] mulxq 16(%rsp), %rcx, %r15 adoxq %rbx, %r11 adcxq %rcx, %r12 # A[1] * B[2] movq 144(%rsp), %rdx mulxq 8(%rsp), %rcx, %rbx adcxq %r15, %r13 adoxq %rcx, %r12 adcxq %rbp, %r14 adoxq %rbx, %r13 # A[0] * B[2] mulxq (%rsp), %rcx, %rbx adoxq %rbp, %r14 xorq %r15, %r15 adcxq %rcx, %r11 # A[1] * B[1] movq 136(%rsp), %rdx mulxq 8(%rsp), %rdx, %rcx adcxq %rbx, %r12 adoxq %rdx, %r11 # A[3] * B[1] movq 136(%rsp), %rdx adoxq %rcx, %r12 mulxq 24(%rsp), %rcx, %rbx adcxq %rcx, %r13 # A[2] * B[2] movq 144(%rsp), %rdx mulxq 16(%rsp), %rdx, %rcx adcxq %rbx, %r14 adoxq %rdx, %r13 # A[3] * B[3] movq 152(%rsp), %rdx adoxq %rcx, %r14 mulxq 24(%rsp), %rcx, %rbx adoxq %rbp, %r15 adcxq %rcx, %r15 # A[0] * B[3] mulxq (%rsp), %rdx, %rcx adcxq %rbx, %rbp xorq %rbx, %rbx adcxq %rdx, %r12 # A[3] * B[0] movq 128(%rsp), %rdx adcxq %rcx, %r13 mulxq 24(%rsp), %rdx, %rcx adoxq %rdx, %r12 adoxq %rcx, %r13 # A[2] * B[3] movq 152(%rsp), %rdx mulxq 16(%rsp), %rdx, %rcx adcxq %rdx, %r14 # A[3] * B[2] movq 144(%rsp), %rdx adcxq %rcx, %r15 mulxq 24(%rsp), %rcx, %rdx adcxq %rbx, %rbp adoxq %rcx, %r14 adoxq %rdx, %r15 adoxq %rbx, %rbp # Reduce movq $0x7fffffffffffffff, %rbx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r15, %rbp shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 andq %rbx, %r12 # Multiply top half by 19 movq $19, %rdx xorq %rbx, %rbx mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %rcx, %r15 adcxq %rcx, %r11 adoxq %r15, %r12 mulxq %rbp, %rbp, %rdx adcxq %rbp, %r12 adoxq %rbx, %rdx adcxq %rbx, %rdx # Overflow shldq $0x01, %r12, %rdx movq $0x7fffffffffffffff, %rbx imulq $19, %rdx, %rcx andq %rbx, %r12 addq %rcx, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 # Reduce if top bit set movq %r12, %rdx sarq $63, %rdx andq $19, %rdx andq %rbx, %r12 addq %rdx, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 # Store movq %r9, (%rsp) movq %r10, 8(%rsp) movq %r11, 16(%rsp) movq %r12, 24(%rsp) # Square # A[0] * A[1] movq 128(%rsp), %rdx mulxq 136(%rsp), %r10, %r11 # A[0] * A[3] mulxq 152(%rsp), %r12, %r13 # A[2] * A[1] movq 144(%rsp), %rdx mulxq 136(%rsp), %rcx, %rbx xorq %rbp, %rbp adoxq %rcx, %r12 # A[2] * A[3] mulxq 152(%rsp), %r14, %r15 adoxq %rbx, %r13 # A[2] * A[0] mulxq 128(%rsp), %rcx, %rbx adoxq %rbp, %r14 adcxq %rcx, %r11 adoxq %rbp, %r15 # A[1] * A[3] movq 136(%rsp), %rdx mulxq 152(%rsp), %rax, %r9 adcxq %rbx, %r12 adcxq %rax, %r13 adcxq %r9, %r14 adcxq %rbp, %r15 # Double with Carry Flag xorq %rbp, %rbp # A[0] * A[0] movq 128(%rsp), %rdx mulxq %rdx, %r9, %rax adcxq %r10, %r10 # A[1] * A[1] movq 136(%rsp), %rdx mulxq %rdx, %rcx, %rbx adcxq %r11, %r11 adoxq %rax, %r10 adcxq %r12, %r12 adoxq %rcx, %r11 # A[2] * A[2] movq 144(%rsp), %rdx mulxq %rdx, %rax, %rcx adcxq %r13, %r13 adoxq 
%rbx, %r12 adcxq %r14, %r14 adoxq %rax, %r13 # A[3] * A[3] movq 152(%rsp), %rdx mulxq %rdx, %rax, %rbx adcxq %r15, %r15 adoxq %rcx, %r14 adcxq %rbp, %rbp adoxq %rax, %r15 adoxq %rbx, %rbp # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r15, %rbp shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 andq %rcx, %r12 # Multiply top half by 19 movq $19, %rdx xorq %rcx, %rcx mulxq %r13, %rax, %r13 adcxq %rax, %r9 adoxq %r13, %r10 mulxq %r14, %rax, %r14 adcxq %rax, %r10 adoxq %r14, %r11 mulxq %r15, %rax, %r15 adcxq %rax, %r11 adoxq %r15, %r12 mulxq %rbp, %rbp, %rdx adcxq %rbp, %r12 adoxq %rcx, %rdx adcxq %rcx, %rdx # Overflow shldq $0x01, %r12, %rdx movq $0x7fffffffffffffff, %rcx imulq $19, %rdx, %rax andq %rcx, %r12 addq %rax, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 # Reduce if top bit set movq %r12, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r12 addq %rdx, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 # Store movq %r9, 96(%rsp) movq %r10, 104(%rsp) movq %r11, 112(%rsp) movq %r12, 120(%rsp) # Square # A[0] * A[1] movq (%rdi), %rdx mulxq 8(%rdi), %r10, %r11 # A[0] * A[3] mulxq 24(%rdi), %r12, %r13 # A[2] * A[1] movq 16(%rdi), %rdx mulxq 8(%rdi), %rcx, %rbx xorq %rbp, %rbp adoxq %rcx, %r12 # A[2] * A[3] mulxq 24(%rdi), %r14, %r15 adoxq %rbx, %r13 # A[2] * A[0] mulxq (%rdi), %rcx, %rbx adoxq %rbp, %r14 adcxq %rcx, %r11 adoxq %rbp, %r15 # A[1] * A[3] movq 8(%rdi), %rdx mulxq 24(%rdi), %rax, %r9 adcxq %rbx, %r12 adcxq %rax, %r13 adcxq %r9, %r14 adcxq %rbp, %r15 # Double with Carry Flag xorq %rbp, %rbp # A[0] * A[0] movq (%rdi), %rdx mulxq %rdx, %r9, %rax adcxq %r10, %r10 # A[1] * A[1] movq 8(%rdi), %rdx mulxq %rdx, %rcx, %rbx adcxq %r11, %r11 adoxq %rax, %r10 adcxq %r12, %r12 adoxq %rcx, %r11 # A[2] * A[2] movq 16(%rdi), %rdx mulxq %rdx, %rax, %rcx adcxq %r13, %r13 adoxq %rbx, %r12 adcxq %r14, %r14 adoxq %rax, %r13 # A[3] * A[3] movq 24(%rdi), %rdx mulxq %rdx, %rax, %rbx adcxq %r15, %r15 adoxq %rcx, %r14 adcxq %rbp, %rbp adoxq %rax, %r15 adoxq %rbx, %rbp # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r15, %rbp shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 andq %rcx, %r12 # Multiply top half by 19 movq $19, %rdx xorq %rcx, %rcx mulxq %r13, %rax, %r13 adcxq %rax, %r9 adoxq %r13, %r10 mulxq %r14, %rax, %r14 adcxq %rax, %r10 adoxq %r14, %r11 mulxq %r15, %rax, %r15 adcxq %rax, %r11 adoxq %r15, %r12 mulxq %rbp, %rbp, %rdx adcxq %rbp, %r12 adoxq %rcx, %rdx adcxq %rcx, %rdx # Overflow shldq $0x01, %r12, %rdx movq $0x7fffffffffffffff, %rcx imulq $19, %rdx, %rax andq %rcx, %r12 addq %rax, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 # Reduce if top bit set movq %r12, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r12 addq %rdx, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 # Store movq %r9, 128(%rsp) movq %r10, 136(%rsp) movq %r11, 144(%rsp) movq %r12, 152(%rsp) # Add movq 32(%rsp), %r9 movq 40(%rsp), %r10 movq 48(%rsp), %r11 movq 56(%rsp), %rax movq %r9, %r13 addq (%rsp), %r9 movq %r10, %r14 adcq 8(%rsp), %r10 movq %r11, %r15 adcq 16(%rsp), %r11 movq %rax, %rbp adcq 24(%rsp), %rax movq $-19, %rcx movq %rax, %r12 movq $0x7fffffffffffffff, %rbx sarq $63, %rax # Mask the modulus andq %rax, %rcx andq %rax, %rbx # Sub modulus (if overflow) subq %rcx, %r9 sbbq %rax, %r10 sbbq %rax, %r11 sbbq %rbx, %r12 # Sub subq (%rsp), %r13 movq $0x00, %rax sbbq 8(%rsp), %r14 movq $-19, %rcx sbbq 16(%rsp), %r15 movq 
$0x7fffffffffffffff, %rbx sbbq 24(%rsp), %rbp sbbq $0x00, %rax # Mask the modulus andq %rax, %rcx andq %rax, %rbx # Add modulus (if underflow) addq %rcx, %r13 adcq %rax, %r14 adcq %rax, %r15 adcq %rbx, %rbp movq %r9, 64(%rsp) movq %r10, 72(%rsp) movq %r11, 80(%rsp) movq %r12, 88(%rsp) movq %r13, (%rsp) movq %r14, 8(%rsp) movq %r15, 16(%rsp) movq %rbp, 24(%rsp) # Multiply # A[0] * B[0] movq 96(%rsp), %rdx mulxq 128(%rsp), %r9, %r10 # A[2] * B[0] mulxq 144(%rsp), %r11, %r12 # A[1] * B[0] mulxq 136(%rsp), %rcx, %rbx xorq %rbp, %rbp adcxq %rcx, %r10 # A[1] * B[3] movq 120(%rsp), %rdx mulxq 136(%rsp), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] movq 104(%rsp), %rdx mulxq 128(%rsp), %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] mulxq 144(%rsp), %rcx, %r15 adoxq %rbx, %r11 adcxq %rcx, %r12 # A[1] * B[2] movq 112(%rsp), %rdx mulxq 136(%rsp), %rcx, %rbx adcxq %r15, %r13 adoxq %rcx, %r12 adcxq %rbp, %r14 adoxq %rbx, %r13 # A[0] * B[2] mulxq 128(%rsp), %rcx, %rbx adoxq %rbp, %r14 xorq %r15, %r15 adcxq %rcx, %r11 # A[1] * B[1] movq 104(%rsp), %rdx mulxq 136(%rsp), %rdx, %rcx adcxq %rbx, %r12 adoxq %rdx, %r11 # A[3] * B[1] movq 104(%rsp), %rdx adoxq %rcx, %r12 mulxq 152(%rsp), %rcx, %rbx adcxq %rcx, %r13 # A[2] * B[2] movq 112(%rsp), %rdx mulxq 144(%rsp), %rdx, %rcx adcxq %rbx, %r14 adoxq %rdx, %r13 # A[3] * B[3] movq 120(%rsp), %rdx adoxq %rcx, %r14 mulxq 152(%rsp), %rcx, %rbx adoxq %rbp, %r15 adcxq %rcx, %r15 # A[0] * B[3] mulxq 128(%rsp), %rdx, %rcx adcxq %rbx, %rbp xorq %rbx, %rbx adcxq %rdx, %r12 # A[3] * B[0] movq 96(%rsp), %rdx adcxq %rcx, %r13 mulxq 152(%rsp), %rdx, %rcx adoxq %rdx, %r12 adoxq %rcx, %r13 # A[2] * B[3] movq 120(%rsp), %rdx mulxq 144(%rsp), %rdx, %rcx adcxq %rdx, %r14 # A[3] * B[2] movq 112(%rsp), %rdx adcxq %rcx, %r15 mulxq 152(%rsp), %rcx, %rdx adcxq %rbx, %rbp adoxq %rcx, %r14 adoxq %rdx, %r15 adoxq %rbx, %rbp # Reduce movq $0x7fffffffffffffff, %rbx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r15, %rbp shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 andq %rbx, %r12 # Multiply top half by 19 movq $19, %rdx xorq %rbx, %rbx mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %rcx, %r15 adcxq %rcx, %r11 adoxq %r15, %r12 mulxq %rbp, %rbp, %rdx adcxq %rbp, %r12 adoxq %rbx, %rdx adcxq %rbx, %rdx # Overflow shldq $0x01, %r12, %rdx movq $0x7fffffffffffffff, %rbx imulq $19, %rdx, %rcx andq %rbx, %r12 addq %rcx, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 # Reduce if top bit set movq %r12, %rdx sarq $63, %rdx andq $19, %rdx andq %rbx, %r12 addq %rdx, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 # Store movq %r9, (%rdi) movq %r10, 8(%rdi) movq %r11, 16(%rdi) movq %r12, 24(%rdi) # Sub movq 128(%rsp), %r9 movq 136(%rsp), %r10 movq 144(%rsp), %r11 movq 152(%rsp), %r12 subq 96(%rsp), %r9 movq $0x00, %rax sbbq 104(%rsp), %r10 movq $-19, %rcx sbbq 112(%rsp), %r11 movq $0x7fffffffffffffff, %rbx sbbq 120(%rsp), %r12 sbbq $0x00, %rax # Mask the modulus andq %rax, %rcx andq %rax, %rbx # Add modulus (if underflow) addq %rcx, %r9 adcq %rax, %r10 adcq %rax, %r11 adcq %rbx, %r12 movq %r9, 128(%rsp) movq %r10, 136(%rsp) movq %r11, 144(%rsp) movq %r12, 152(%rsp) # Square # A[0] * A[1] movq (%rsp), %rdx mulxq 8(%rsp), %r10, %r11 # A[0] * A[3] mulxq 24(%rsp), %r12, %r13 # A[2] * A[1] movq 16(%rsp), %rdx mulxq 8(%rsp), %rcx, %rbx xorq %rbp, %rbp adoxq %rcx, %r12 # A[2] * A[3] mulxq 24(%rsp), %r14, %r15 adoxq %rbx, %r13 # A[2] * A[0] mulxq (%rsp), %rcx, %rbx adoxq %rbp, %r14 
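# This square, like the 4x4 multiplies in this file, relies on the ADX
# extensions: mulx forms each 64x64->128 partial product without touching
# the flags, while adcx (carry flag) and adox (overflow flag) accumulate on
# two independent carry chains that can run in parallel.  Only the
# off-diagonal products A[i]*A[j], i < j, are summed first; they are doubled
# through the carry flag ("Double with Carry Flag" below) before the
# diagonal terms A[i]*A[i] are added, since squaring counts every cross
# product twice.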
adcxq %rcx, %r11 adoxq %rbp, %r15 # A[1] * A[3] movq 8(%rsp), %rdx mulxq 24(%rsp), %rax, %r9 adcxq %rbx, %r12 adcxq %rax, %r13 adcxq %r9, %r14 adcxq %rbp, %r15 # Double with Carry Flag xorq %rbp, %rbp # A[0] * A[0] movq (%rsp), %rdx mulxq %rdx, %r9, %rax adcxq %r10, %r10 # A[1] * A[1] movq 8(%rsp), %rdx mulxq %rdx, %rcx, %rbx adcxq %r11, %r11 adoxq %rax, %r10 adcxq %r12, %r12 adoxq %rcx, %r11 # A[2] * A[2] movq 16(%rsp), %rdx mulxq %rdx, %rax, %rcx adcxq %r13, %r13 adoxq %rbx, %r12 adcxq %r14, %r14 adoxq %rax, %r13 # A[3] * A[3] movq 24(%rsp), %rdx mulxq %rdx, %rax, %rbx adcxq %r15, %r15 adoxq %rcx, %r14 adcxq %rbp, %rbp adoxq %rax, %r15 adoxq %rbx, %rbp # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r15, %rbp shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 andq %rcx, %r12 # Multiply top half by 19 movq $19, %rdx xorq %rcx, %rcx mulxq %r13, %rax, %r13 adcxq %rax, %r9 adoxq %r13, %r10 mulxq %r14, %rax, %r14 adcxq %rax, %r10 adoxq %r14, %r11 mulxq %r15, %rax, %r15 adcxq %rax, %r11 adoxq %r15, %r12 mulxq %rbp, %rbp, %rdx adcxq %rbp, %r12 adoxq %rcx, %rdx adcxq %rcx, %rdx # Overflow shldq $0x01, %r12, %rdx movq $0x7fffffffffffffff, %rcx imulq $19, %rdx, %rax andq %rcx, %r12 addq %rax, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 # Reduce if top bit set movq %r12, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r12 addq %rdx, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 # Store movq %r9, (%rsp) movq %r10, 8(%rsp) movq %r11, 16(%rsp) movq %r12, 24(%rsp) movq $0x1db42, %rdx mulxq 128(%rsp), %r9, %rbp mulxq 136(%rsp), %r10, %r15 mulxq 144(%rsp), %r11, %r14 mulxq 152(%rsp), %r12, %r13 addq %rbp, %r10 adcq %r15, %r11 adcq %r14, %r12 adcq $0x00, %r13 movq $0x7fffffffffffffff, %rbp shldq $0x01, %r12, %r13 andq %rbp, %r12 imulq $19, %r13, %r13 addq %r13, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 movq %r9, 32(%rsp) movq %r10, 40(%rsp) movq %r11, 48(%rsp) movq %r12, 56(%rsp) # Square # A[0] * A[1] movq 64(%rsp), %rdx mulxq 72(%rsp), %r10, %r11 # A[0] * A[3] mulxq 88(%rsp), %r12, %r13 # A[2] * A[1] movq 80(%rsp), %rdx mulxq 72(%rsp), %rcx, %rbx xorq %rbp, %rbp adoxq %rcx, %r12 # A[2] * A[3] mulxq 88(%rsp), %r14, %r15 adoxq %rbx, %r13 # A[2] * A[0] mulxq 64(%rsp), %rcx, %rbx adoxq %rbp, %r14 adcxq %rcx, %r11 adoxq %rbp, %r15 # A[1] * A[3] movq 72(%rsp), %rdx mulxq 88(%rsp), %rax, %r9 adcxq %rbx, %r12 adcxq %rax, %r13 adcxq %r9, %r14 adcxq %rbp, %r15 # Double with Carry Flag xorq %rbp, %rbp # A[0] * A[0] movq 64(%rsp), %rdx mulxq %rdx, %r9, %rax adcxq %r10, %r10 # A[1] * A[1] movq 72(%rsp), %rdx mulxq %rdx, %rcx, %rbx adcxq %r11, %r11 adoxq %rax, %r10 adcxq %r12, %r12 adoxq %rcx, %r11 # A[2] * A[2] movq 80(%rsp), %rdx mulxq %rdx, %rax, %rcx adcxq %r13, %r13 adoxq %rbx, %r12 adcxq %r14, %r14 adoxq %rax, %r13 # A[3] * A[3] movq 88(%rsp), %rdx mulxq %rdx, %rax, %rbx adcxq %r15, %r15 adoxq %rcx, %r14 adcxq %rbp, %rbp adoxq %rax, %r15 adoxq %rbx, %rbp # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r15, %rbp shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 andq %rcx, %r12 # Multiply top half by 19 movq $19, %rdx xorq %rcx, %rcx mulxq %r13, %rax, %r13 adcxq %rax, %r9 adoxq %r13, %r10 mulxq %r14, %rax, %r14 adcxq %rax, %r10 adoxq %r14, %r11 mulxq %r15, %rax, %r15 adcxq %rax, %r11 adoxq %r15, %r12 mulxq %rbp, %rbp, %rdx adcxq %rbp, %r12 adoxq %rcx, %rdx adcxq %rcx, %rdx # Overflow shldq $0x01, %r12, %rdx movq 
$0x7fffffffffffffff, %rcx imulq $19, %rdx, %rax andq %rcx, %r12 addq %rax, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 # Reduce if top bit set movq %r12, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r12 addq %rdx, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 # Store movq %r9, 64(%rsp) movq %r10, 72(%rsp) movq %r11, 80(%rsp) movq %r12, 88(%rsp) # Add movq 96(%rsp), %r9 movq 104(%rsp), %r10 addq 32(%rsp), %r9 movq 112(%rsp), %r11 adcq 40(%rsp), %r10 movq 120(%rsp), %rax adcq 48(%rsp), %r11 movq $-19, %rcx adcq 56(%rsp), %rax movq $0x7fffffffffffffff, %rbx movq %rax, %r12 sarq $63, %rax # Mask the modulus andq %rax, %rcx andq %rax, %rbx # Sub modulus (if overflow) subq %rcx, %r9 sbbq %rax, %r10 sbbq %rax, %r11 sbbq %rbx, %r12 movq %r9, 96(%rsp) movq %r10, 104(%rsp) movq %r11, 112(%rsp) movq %r12, 120(%rsp) # Multiply # A[0] * B[0] movq (%rsp), %rdx mulxq (%r8), %r9, %r10 # A[2] * B[0] mulxq 16(%r8), %r11, %r12 # A[1] * B[0] mulxq 8(%r8), %rcx, %rbx xorq %rbp, %rbp adcxq %rcx, %r10 # A[1] * B[3] movq 24(%rsp), %rdx mulxq 8(%r8), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] movq 8(%rsp), %rdx mulxq (%r8), %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] mulxq 16(%r8), %rcx, %r15 adoxq %rbx, %r11 adcxq %rcx, %r12 # A[1] * B[2] movq 16(%rsp), %rdx mulxq 8(%r8), %rcx, %rbx adcxq %r15, %r13 adoxq %rcx, %r12 adcxq %rbp, %r14 adoxq %rbx, %r13 # A[0] * B[2] mulxq (%r8), %rcx, %rbx adoxq %rbp, %r14 xorq %r15, %r15 adcxq %rcx, %r11 # A[1] * B[1] movq 8(%rsp), %rdx mulxq 8(%r8), %rdx, %rcx adcxq %rbx, %r12 adoxq %rdx, %r11 # A[3] * B[1] movq 8(%rsp), %rdx adoxq %rcx, %r12 mulxq 24(%r8), %rcx, %rbx adcxq %rcx, %r13 # A[2] * B[2] movq 16(%rsp), %rdx mulxq 16(%r8), %rdx, %rcx adcxq %rbx, %r14 adoxq %rdx, %r13 # A[3] * B[3] movq 24(%rsp), %rdx adoxq %rcx, %r14 mulxq 24(%r8), %rcx, %rbx adoxq %rbp, %r15 adcxq %rcx, %r15 # A[0] * B[3] mulxq (%r8), %rdx, %rcx adcxq %rbx, %rbp xorq %rbx, %rbx adcxq %rdx, %r12 # A[3] * B[0] movq (%rsp), %rdx adcxq %rcx, %r13 mulxq 24(%r8), %rdx, %rcx adoxq %rdx, %r12 adoxq %rcx, %r13 # A[2] * B[3] movq 24(%rsp), %rdx mulxq 16(%r8), %rdx, %rcx adcxq %rdx, %r14 # A[3] * B[2] movq 16(%rsp), %rdx adcxq %rcx, %r15 mulxq 24(%r8), %rcx, %rdx adcxq %rbx, %rbp adoxq %rcx, %r14 adoxq %rdx, %r15 adoxq %rbx, %rbp # Reduce movq $0x7fffffffffffffff, %rbx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r15, %rbp shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 andq %rbx, %r12 # Multiply top half by 19 movq $19, %rdx xorq %rbx, %rbx mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %rcx, %r15 adcxq %rcx, %r11 adoxq %r15, %r12 mulxq %rbp, %rbp, %rdx adcxq %rbp, %r12 adoxq %rbx, %rdx adcxq %rbx, %rdx # Overflow shldq $0x01, %r12, %rdx movq $0x7fffffffffffffff, %rbx imulq $19, %rdx, %rcx andq %rbx, %r12 addq %rcx, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 # Reduce if top bit set movq %r12, %rdx sarq $63, %rdx andq $19, %rdx andq %rbx, %r12 addq %rdx, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 # Store movq %r9, 32(%rsp) movq %r10, 40(%rsp) movq %r11, 48(%rsp) movq %r12, 56(%rsp) # Multiply # A[0] * B[0] movq 96(%rsp), %rdx mulxq 128(%rsp), %r9, %r10 # A[2] * B[0] mulxq 144(%rsp), %r11, %r12 # A[1] * B[0] mulxq 136(%rsp), %rcx, %rbx xorq %rbp, %rbp adcxq %rcx, %r10 # A[1] * B[3] movq 120(%rsp), %rdx mulxq 136(%rsp), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] movq 104(%rsp), %rdx mulxq 128(%rsp), %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] mulxq 
144(%rsp), %rcx, %r15 adoxq %rbx, %r11 adcxq %rcx, %r12 # A[1] * B[2] movq 112(%rsp), %rdx mulxq 136(%rsp), %rcx, %rbx adcxq %r15, %r13 adoxq %rcx, %r12 adcxq %rbp, %r14 adoxq %rbx, %r13 # A[0] * B[2] mulxq 128(%rsp), %rcx, %rbx adoxq %rbp, %r14 xorq %r15, %r15 adcxq %rcx, %r11 # A[1] * B[1] movq 104(%rsp), %rdx mulxq 136(%rsp), %rdx, %rcx adcxq %rbx, %r12 adoxq %rdx, %r11 # A[3] * B[1] movq 104(%rsp), %rdx adoxq %rcx, %r12 mulxq 152(%rsp), %rcx, %rbx adcxq %rcx, %r13 # A[2] * B[2] movq 112(%rsp), %rdx mulxq 144(%rsp), %rdx, %rcx adcxq %rbx, %r14 adoxq %rdx, %r13 # A[3] * B[3] movq 120(%rsp), %rdx adoxq %rcx, %r14 mulxq 152(%rsp), %rcx, %rbx adoxq %rbp, %r15 adcxq %rcx, %r15 # A[0] * B[3] mulxq 128(%rsp), %rdx, %rcx adcxq %rbx, %rbp xorq %rbx, %rbx adcxq %rdx, %r12 # A[3] * B[0] movq 96(%rsp), %rdx adcxq %rcx, %r13 mulxq 152(%rsp), %rdx, %rcx adoxq %rdx, %r12 adoxq %rcx, %r13 # A[2] * B[3] movq 120(%rsp), %rdx mulxq 144(%rsp), %rdx, %rcx adcxq %rdx, %r14 # A[3] * B[2] movq 112(%rsp), %rdx adcxq %rcx, %r15 mulxq 152(%rsp), %rcx, %rdx adcxq %rbx, %rbp adoxq %rcx, %r14 adoxq %rdx, %r15 adoxq %rbx, %rbp # Reduce movq $0x7fffffffffffffff, %rbx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r15, %rbp shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 andq %rbx, %r12 # Multiply top half by 19 movq $19, %rdx xorq %rbx, %rbx mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %rcx, %r15 adcxq %rcx, %r11 adoxq %r15, %r12 mulxq %rbp, %rbp, %rdx adcxq %rbp, %r12 adoxq %rbx, %rdx adcxq %rbx, %rdx # Overflow shldq $0x01, %r12, %rdx movq $0x7fffffffffffffff, %rbx imulq $19, %rdx, %rcx andq %rbx, %r12 addq %rcx, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 # Reduce if top bit set movq %r12, %rdx sarq $63, %rdx andq $19, %rdx andq %rbx, %r12 addq %rdx, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 # Store movq %r9, (%rsp) movq %r10, 8(%rsp) movq %r11, 16(%rsp) movq %r12, 24(%rsp) decb 168(%rsp) jge L_curve25519_avx2_bits movq $63, 168(%rsp) decb 160(%rsp) jge L_curve25519_avx2_words # Invert leaq 32(%rsp), %rdi movq %rsp, %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi movq %rsp, %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi leaq 96(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $4, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef 
__APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $9, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 128(%rsp), %rdi leaq 96(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 128(%rsp), %rdi leaq 128(%rsp), %rsi movq $19, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 128(%rsp), %rsi leaq 96(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $9, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $49, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 128(%rsp), %rdi leaq 96(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 128(%rsp), %rdi leaq 128(%rsp), %rsi movq $0x63, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 128(%rsp), %rsi leaq 96(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 96(%rsp), %rdi leaq 96(%rsp), %rsi movq $49, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 96(%rsp), %rsi leaq 64(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi movq $4, %rdx #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ movq %rsp, %rdi leaq 64(%rsp), %rsi leaq 32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ movq 176(%rsp), %rdi # Multiply # A[0] * B[0] movq (%rsp), %rdx mulxq (%rdi), %r9, %r10 # A[2] * B[0] mulxq 16(%rdi), %r11, %r12 # A[1] * B[0] mulxq 8(%rdi), %rcx, %rbx xorq %rbp, %rbp adcxq %rcx, %r10 # A[1] * B[3] movq 24(%rsp), %rdx mulxq 8(%rdi), %r13, %r14 adcxq %rbx, %r11 # A[0] * B[1] movq 8(%rsp), %rdx mulxq (%rdi), %rcx, %rbx adoxq %rcx, %r10 # A[2] * B[1] mulxq 16(%rdi), %rcx, %r15 adoxq %rbx, %r11 adcxq %rcx, %r12 # A[1] * B[2] movq 16(%rsp), %rdx mulxq 8(%rdi), %rcx, %rbx adcxq %r15, %r13 adoxq %rcx, %r12 adcxq %rbp, %r14 adoxq %rbx, %r13 # A[0] * B[2] mulxq (%rdi), %rcx, %rbx adoxq %rbp, %r14 xorq %r15, %r15 adcxq %rcx, %r11 # A[1] * B[1] movq 
8(%rsp), %rdx mulxq 8(%rdi), %rdx, %rcx adcxq %rbx, %r12 adoxq %rdx, %r11 # A[3] * B[1] movq 8(%rsp), %rdx adoxq %rcx, %r12 mulxq 24(%rdi), %rcx, %rbx adcxq %rcx, %r13 # A[2] * B[2] movq 16(%rsp), %rdx mulxq 16(%rdi), %rdx, %rcx adcxq %rbx, %r14 adoxq %rdx, %r13 # A[3] * B[3] movq 24(%rsp), %rdx adoxq %rcx, %r14 mulxq 24(%rdi), %rcx, %rbx adoxq %rbp, %r15 adcxq %rcx, %r15 # A[0] * B[3] mulxq (%rdi), %rdx, %rcx adcxq %rbx, %rbp xorq %rbx, %rbx adcxq %rdx, %r12 # A[3] * B[0] movq (%rsp), %rdx adcxq %rcx, %r13 mulxq 24(%rdi), %rdx, %rcx adoxq %rdx, %r12 adoxq %rcx, %r13 # A[2] * B[3] movq 24(%rsp), %rdx mulxq 16(%rdi), %rdx, %rcx adcxq %rdx, %r14 # A[3] * B[2] movq 16(%rsp), %rdx adcxq %rcx, %r15 mulxq 24(%rdi), %rcx, %rdx adcxq %rbx, %rbp adoxq %rcx, %r14 adoxq %rdx, %r15 adoxq %rbx, %rbp # Reduce movq $0x7fffffffffffffff, %rbx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r15, %rbp shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 andq %rbx, %r12 # Multiply top half by 19 movq $19, %rdx xorq %rbx, %rbx mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %rcx, %r15 adcxq %rcx, %r11 adoxq %r15, %r12 mulxq %rbp, %rbp, %rdx adcxq %rbp, %r12 adoxq %rbx, %rdx adcxq %rbx, %rdx # Overflow shldq $0x01, %r12, %rdx movq $0x7fffffffffffffff, %rbx imulq $19, %rdx, %rcx andq %rbx, %r12 addq %rcx, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 # Reduce if top bit set movq %r12, %rdx sarq $63, %rdx andq $19, %rdx andq %rbx, %r12 addq %rdx, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 movq $0x7fffffffffffffff, %rbx movq %r9, %rdx addq $19, %rdx movq %r10, %rdx adcq $0x00, %rdx movq %r11, %rdx adcq $0x00, %rdx movq %r12, %rdx adcq $0x00, %rdx sarq $63, %rdx andq $19, %rdx addq %rdx, %r9 adcq $0x00, %r10 adcq $0x00, %r11 adcq $0x00, %r12 andq %rbx, %r12 # Store movq %r9, (%rdi) movq %r10, 8(%rdi) movq %r11, 16(%rdi) movq %r12, 24(%rdi) xorq %rax, %rax addq $0xc0, %rsp popq %rbp popq %r15 popq %r14 popq %r13 popq %r12 popq %rbx repz retq #ifndef __APPLE__ .size curve25519_avx2,.-curve25519_avx2 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_pow22523_avx2 .type fe_pow22523_avx2,@function .align 16 fe_pow22523_avx2: #else .section __TEXT,__text .globl _fe_pow22523_avx2 .p2align 4 _fe_pow22523_avx2: #endif /* __APPLE__ */ subq $0x70, %rsp # pow22523 movq %rdi, 96(%rsp) movq %rsi, 104(%rsp) movq %rsp, %rdi movq 104(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi movq %rsp, %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi movq 104(%rsp), %rsi leaq 32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ movq %rsp, %rdi movq %rsp, %rsi leaq 32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ movq %rsp, %rdi movq %rsp, %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ movq %rsp, %rdi leaq 32(%rsp), %rsi movq %rsp, %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi movq %rsp, %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi movb $4, %dl #ifndef 
__APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ movq %rsp, %rdi leaq 32(%rsp), %rsi movq %rsp, %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi movq %rsp, %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi movb $9, %dl #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi movq %rsp, %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi movb $19, %dl #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 64(%rsp), %rsi leaq 32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi movb $9, %dl #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ movq %rsp, %rdi leaq 32(%rsp), %rsi movq %rsp, %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi movq %rsp, %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi movb $49, %dl #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi movq %rsp, %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 64(%rsp), %rdi leaq 64(%rsp), %rsi movb $0x63, %dl #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 64(%rsp), %rsi leaq 32(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ leaq 32(%rsp), %rdi leaq 32(%rsp), %rsi movb $49, %dl #ifndef __APPLE__ callq fe_sq_n_avx2@plt #else callq _fe_sq_n_avx2 #endif /* __APPLE__ */ movq %rsp, %rdi leaq 32(%rsp), %rsi movq %rsp, %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ movq %rsp, %rdi movq %rsp, %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ movq %rsp, %rdi movq %rsp, %rsi #ifndef __APPLE__ callq fe_sq_avx2@plt #else callq _fe_sq_avx2 #endif /* __APPLE__ */ movq 96(%rsp), %rdi movq %rsp, %rsi movq 104(%rsp), %rdx #ifndef __APPLE__ callq fe_mul_avx2@plt #else callq _fe_mul_avx2 #endif /* __APPLE__ */ movq 104(%rsp), %rsi movq 96(%rsp), %rdi addq $0x70, %rsp repz retq #ifndef __APPLE__ .text .globl fe_ge_to_p2_avx2 .type fe_ge_to_p2_avx2,@function .align 16 fe_ge_to_p2_avx2: #else .section __TEXT,__text .globl _fe_ge_to_p2_avx2 .p2align 4 _fe_ge_to_p2_avx2: #endif /* __APPLE__ */ pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $40, %rsp movq %rsi, (%rsp) movq %rdx, 8(%rsp) movq %rcx, 16(%rsp) movq %r8, 24(%rsp) movq %r9, 32(%rsp) movq 16(%rsp), %rsi movq 
88(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rdx mulxq (%rsi), %r8, %r9 # A[2] * B[0] mulxq 16(%rsi), %r10, %r11 # A[1] * B[0] mulxq 8(%rsi), %rcx, %rax xorq %r15, %r15 adcxq %rcx, %r9 # A[1] * B[3] movq 24(%rbx), %rdx mulxq 8(%rsi), %r12, %r13 adcxq %rax, %r10 # A[0] * B[1] movq 8(%rbx), %rdx mulxq (%rsi), %rcx, %rax adoxq %rcx, %r9 # A[2] * B[1] mulxq 16(%rsi), %rcx, %r14 adoxq %rax, %r10 adcxq %rcx, %r11 # A[1] * B[2] movq 16(%rbx), %rdx mulxq 8(%rsi), %rcx, %rax adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq %rax, %r12 # A[0] * B[2] mulxq (%rsi), %rcx, %rax adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 8(%rbx), %rdx mulxq 8(%rsi), %rdx, %rcx adcxq %rax, %r11 adoxq %rdx, %r10 # A[3] * B[1] movq 8(%rbx), %rdx adoxq %rcx, %r11 mulxq 24(%rsi), %rcx, %rax adcxq %rcx, %r12 # A[2] * B[2] movq 16(%rbx), %rdx mulxq 16(%rsi), %rdx, %rcx adcxq %rax, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 24(%rbx), %rdx adoxq %rcx, %r13 mulxq 24(%rsi), %rcx, %rax adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq (%rsi), %rdx, %rcx adcxq %rax, %r15 xorq %rax, %rax adcxq %rdx, %r11 # A[3] * B[0] movq (%rbx), %rdx adcxq %rcx, %r12 mulxq 24(%rsi), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[2] * B[3] movq 24(%rbx), %rdx mulxq 16(%rsi), %rdx, %rcx adcxq %rdx, %r13 # A[3] * B[2] movq 16(%rbx), %rdx adcxq %rcx, %r14 mulxq 24(%rsi), %rcx, %rdx adcxq %rax, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rax, %r15 # Reduce movq $0x7fffffffffffffff, %rax # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rax, %r11 # Multiply top half by 19 movq $19, %rdx xorq %rax, %rax mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rax, %rdx adcxq %rax, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, %rax imulq $19, %rdx, %rcx andq %rax, %r11 addq %rcx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rax, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq (%rsp), %rdi movq 24(%rsp), %rsi movq 32(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rdx mulxq (%rsi), %r8, %r9 # A[2] * B[0] mulxq 16(%rsi), %r10, %r11 # A[1] * B[0] mulxq 8(%rsi), %rcx, %rax xorq %r15, %r15 adcxq %rcx, %r9 # A[1] * B[3] movq 24(%rbx), %rdx mulxq 8(%rsi), %r12, %r13 adcxq %rax, %r10 # A[0] * B[1] movq 8(%rbx), %rdx mulxq (%rsi), %rcx, %rax adoxq %rcx, %r9 # A[2] * B[1] mulxq 16(%rsi), %rcx, %r14 adoxq %rax, %r10 adcxq %rcx, %r11 # A[1] * B[2] movq 16(%rbx), %rdx mulxq 8(%rsi), %rcx, %rax adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq %rax, %r12 # A[0] * B[2] mulxq (%rsi), %rcx, %rax adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 8(%rbx), %rdx mulxq 8(%rsi), %rdx, %rcx adcxq %rax, %r11 adoxq %rdx, %r10 # A[3] * B[1] movq 8(%rbx), %rdx adoxq %rcx, %r11 mulxq 24(%rsi), %rcx, %rax adcxq %rcx, %r12 # A[2] * B[2] movq 16(%rbx), %rdx mulxq 16(%rsi), %rdx, %rcx adcxq %rax, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 24(%rbx), %rdx adoxq %rcx, %r13 mulxq 24(%rsi), %rcx, %rax adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq (%rsi), %rdx, %rcx adcxq %rax, %r15 xorq %rax, %rax adcxq %rdx, %r11 # A[3] * B[0] movq (%rbx), 
%rdx adcxq %rcx, %r12 mulxq 24(%rsi), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[2] * B[3] movq 24(%rbx), %rdx mulxq 16(%rsi), %rdx, %rcx adcxq %rdx, %r13 # A[3] * B[2] movq 16(%rbx), %rdx adcxq %rcx, %r14 mulxq 24(%rsi), %rcx, %rdx adcxq %rax, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rax, %r15 # Reduce movq $0x7fffffffffffffff, %rax # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rax, %r11 # Multiply top half by 19 movq $19, %rdx xorq %rax, %rax mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rax, %rdx adcxq %rax, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, %rax imulq $19, %rdx, %rcx andq %rax, %r11 addq %rcx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rax, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 8(%rsp), %rdi movq 88(%rsp), %rsi # Multiply # A[0] * B[0] movq (%rsi), %rdx mulxq (%rbx), %r8, %r9 # A[2] * B[0] mulxq 16(%rbx), %r10, %r11 # A[1] * B[0] mulxq 8(%rbx), %rcx, %rax xorq %r15, %r15 adcxq %rcx, %r9 # A[1] * B[3] movq 24(%rsi), %rdx mulxq 8(%rbx), %r12, %r13 adcxq %rax, %r10 # A[0] * B[1] movq 8(%rsi), %rdx mulxq (%rbx), %rcx, %rax adoxq %rcx, %r9 # A[2] * B[1] mulxq 16(%rbx), %rcx, %r14 adoxq %rax, %r10 adcxq %rcx, %r11 # A[1] * B[2] movq 16(%rsi), %rdx mulxq 8(%rbx), %rcx, %rax adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq %rax, %r12 # A[0] * B[2] mulxq (%rbx), %rcx, %rax adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 8(%rsi), %rdx mulxq 8(%rbx), %rdx, %rcx adcxq %rax, %r11 adoxq %rdx, %r10 # A[3] * B[1] movq 8(%rsi), %rdx adoxq %rcx, %r11 mulxq 24(%rbx), %rcx, %rax adcxq %rcx, %r12 # A[2] * B[2] movq 16(%rsi), %rdx mulxq 16(%rbx), %rdx, %rcx adcxq %rax, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 24(%rsi), %rdx adoxq %rcx, %r13 mulxq 24(%rbx), %rcx, %rax adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq (%rbx), %rdx, %rcx adcxq %rax, %r15 xorq %rax, %rax adcxq %rdx, %r11 # A[3] * B[0] movq (%rsi), %rdx adcxq %rcx, %r12 mulxq 24(%rbx), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[2] * B[3] movq 24(%rsi), %rdx mulxq 16(%rbx), %rdx, %rcx adcxq %rdx, %r13 # A[3] * B[2] movq 16(%rsi), %rdx adcxq %rcx, %r14 mulxq 24(%rbx), %rcx, %rdx adcxq %rax, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rax, %r15 # Reduce movq $0x7fffffffffffffff, %rax # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rax, %r11 # Multiply top half by 19 movq $19, %rdx xorq %rax, %rax mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rax, %rdx adcxq %rax, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, %rax imulq $19, %rdx, %rcx andq %rax, %r11 addq %rcx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rax, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 
8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) addq $40, %rsp popq %r15 popq %r14 popq %r13 popq %r12 popq %rbx repz retq #ifndef __APPLE__ .size fe_ge_to_p2_avx2,.-fe_ge_to_p2_avx2 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_ge_to_p3_avx2 .type fe_ge_to_p3_avx2,@function .align 16 fe_ge_to_p3_avx2: #else .section __TEXT,__text .globl _fe_ge_to_p3_avx2 .p2align 4 _fe_ge_to_p3_avx2: #endif /* __APPLE__ */ pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $40, %rsp movq %rsi, (%rsp) movq %rdx, 8(%rsp) movq %rcx, 16(%rsp) movq %r8, 24(%rsp) movq %r9, 32(%rsp) movq 24(%rsp), %rsi movq 96(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rdx mulxq (%rsi), %r8, %r9 # A[2] * B[0] mulxq 16(%rsi), %r10, %r11 # A[1] * B[0] mulxq 8(%rsi), %rcx, %rax xorq %r15, %r15 adcxq %rcx, %r9 # A[1] * B[3] movq 24(%rbx), %rdx mulxq 8(%rsi), %r12, %r13 adcxq %rax, %r10 # A[0] * B[1] movq 8(%rbx), %rdx mulxq (%rsi), %rcx, %rax adoxq %rcx, %r9 # A[2] * B[1] mulxq 16(%rsi), %rcx, %r14 adoxq %rax, %r10 adcxq %rcx, %r11 # A[1] * B[2] movq 16(%rbx), %rdx mulxq 8(%rsi), %rcx, %rax adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq %rax, %r12 # A[0] * B[2] mulxq (%rsi), %rcx, %rax adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 8(%rbx), %rdx mulxq 8(%rsi), %rdx, %rcx adcxq %rax, %r11 adoxq %rdx, %r10 # A[3] * B[1] movq 8(%rbx), %rdx adoxq %rcx, %r11 mulxq 24(%rsi), %rcx, %rax adcxq %rcx, %r12 # A[2] * B[2] movq 16(%rbx), %rdx mulxq 16(%rsi), %rdx, %rcx adcxq %rax, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 24(%rbx), %rdx adoxq %rcx, %r13 mulxq 24(%rsi), %rcx, %rax adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq (%rsi), %rdx, %rcx adcxq %rax, %r15 xorq %rax, %rax adcxq %rdx, %r11 # A[3] * B[0] movq (%rbx), %rdx adcxq %rcx, %r12 mulxq 24(%rsi), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[2] * B[3] movq 24(%rbx), %rdx mulxq 16(%rsi), %rdx, %rcx adcxq %rdx, %r13 # A[3] * B[2] movq 16(%rbx), %rdx adcxq %rcx, %r14 mulxq 24(%rsi), %rcx, %rdx adcxq %rax, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rax, %r15 # Reduce movq $0x7fffffffffffffff, %rax # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rax, %r11 # Multiply top half by 19 movq $19, %rdx xorq %rax, %rax mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rax, %rdx adcxq %rax, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, %rax imulq $19, %rdx, %rcx andq %rax, %r11 addq %rcx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rax, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq (%rsp), %rdi movq 32(%rsp), %rsi movq 88(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rdx mulxq (%rsi), %r8, %r9 # A[2] * B[0] mulxq 16(%rsi), %r10, %r11 # A[1] * B[0] mulxq 8(%rsi), %rcx, %rax xorq %r15, %r15 adcxq %rcx, %r9 # A[1] * B[3] movq 24(%rbx), %rdx mulxq 8(%rsi), %r12, %r13 adcxq %rax, %r10 # A[0] * B[1] movq 8(%rbx), %rdx mulxq (%rsi), %rcx, %rax adoxq %rcx, %r9 # A[2] * B[1] mulxq 16(%rsi), %rcx, %r14 adoxq %rax, %r10 adcxq %rcx, %r11 # A[1] * B[2] movq 16(%rbx), %rdx mulxq 8(%rsi), %rcx, %rax adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq 
%rax, %r12 # A[0] * B[2] mulxq (%rsi), %rcx, %rax adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 8(%rbx), %rdx mulxq 8(%rsi), %rdx, %rcx adcxq %rax, %r11 adoxq %rdx, %r10 # A[3] * B[1] movq 8(%rbx), %rdx adoxq %rcx, %r11 mulxq 24(%rsi), %rcx, %rax adcxq %rcx, %r12 # A[2] * B[2] movq 16(%rbx), %rdx mulxq 16(%rsi), %rdx, %rcx adcxq %rax, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 24(%rbx), %rdx adoxq %rcx, %r13 mulxq 24(%rsi), %rcx, %rax adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq (%rsi), %rdx, %rcx adcxq %rax, %r15 xorq %rax, %rax adcxq %rdx, %r11 # A[3] * B[0] movq (%rbx), %rdx adcxq %rcx, %r12 mulxq 24(%rsi), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[2] * B[3] movq 24(%rbx), %rdx mulxq 16(%rsi), %rdx, %rcx adcxq %rdx, %r13 # A[3] * B[2] movq 16(%rbx), %rdx adcxq %rcx, %r14 mulxq 24(%rsi), %rcx, %rdx adcxq %rax, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rax, %r15 # Reduce movq $0x7fffffffffffffff, %rax # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rax, %r11 # Multiply top half by 19 movq $19, %rdx xorq %rax, %rax mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rax, %rdx adcxq %rax, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, %rax imulq $19, %rdx, %rcx andq %rax, %r11 addq %rcx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rax, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 8(%rsp), %rdi movq 96(%rsp), %rsi # Multiply # A[0] * B[0] movq (%rsi), %rdx mulxq (%rbx), %r8, %r9 # A[2] * B[0] mulxq 16(%rbx), %r10, %r11 # A[1] * B[0] mulxq 8(%rbx), %rcx, %rax xorq %r15, %r15 adcxq %rcx, %r9 # A[1] * B[3] movq 24(%rsi), %rdx mulxq 8(%rbx), %r12, %r13 adcxq %rax, %r10 # A[0] * B[1] movq 8(%rsi), %rdx mulxq (%rbx), %rcx, %rax adoxq %rcx, %r9 # A[2] * B[1] mulxq 16(%rbx), %rcx, %r14 adoxq %rax, %r10 adcxq %rcx, %r11 # A[1] * B[2] movq 16(%rsi), %rdx mulxq 8(%rbx), %rcx, %rax adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq %rax, %r12 # A[0] * B[2] mulxq (%rbx), %rcx, %rax adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 8(%rsi), %rdx mulxq 8(%rbx), %rdx, %rcx adcxq %rax, %r11 adoxq %rdx, %r10 # A[3] * B[1] movq 8(%rsi), %rdx adoxq %rcx, %r11 mulxq 24(%rbx), %rcx, %rax adcxq %rcx, %r12 # A[2] * B[2] movq 16(%rsi), %rdx mulxq 16(%rbx), %rdx, %rcx adcxq %rax, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 24(%rsi), %rdx adoxq %rcx, %r13 mulxq 24(%rbx), %rcx, %rax adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq (%rbx), %rdx, %rcx adcxq %rax, %r15 xorq %rax, %rax adcxq %rdx, %r11 # A[3] * B[0] movq (%rsi), %rdx adcxq %rcx, %r12 mulxq 24(%rbx), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[2] * B[3] movq 24(%rsi), %rdx mulxq 16(%rbx), %rdx, %rcx adcxq %rdx, %r13 # A[3] * B[2] movq 16(%rsi), %rdx adcxq %rcx, %r14 mulxq 24(%rbx), %rcx, %rdx adcxq %rax, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rax, %r15 # Reduce movq $0x7fffffffffffffff, %rax # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rax, %r11 # Multiply top half by 19 movq $19, %rdx xorq 
%rax, %rax mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rax, %rdx adcxq %rax, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, %rax imulq $19, %rdx, %rcx andq %rax, %r11 addq %rcx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rax, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 16(%rsp), %rdi movq 24(%rsp), %rsi movq 32(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rdx mulxq (%rsi), %r8, %r9 # A[2] * B[0] mulxq 16(%rsi), %r10, %r11 # A[1] * B[0] mulxq 8(%rsi), %rcx, %rax xorq %r15, %r15 adcxq %rcx, %r9 # A[1] * B[3] movq 24(%rbx), %rdx mulxq 8(%rsi), %r12, %r13 adcxq %rax, %r10 # A[0] * B[1] movq 8(%rbx), %rdx mulxq (%rsi), %rcx, %rax adoxq %rcx, %r9 # A[2] * B[1] mulxq 16(%rsi), %rcx, %r14 adoxq %rax, %r10 adcxq %rcx, %r11 # A[1] * B[2] movq 16(%rbx), %rdx mulxq 8(%rsi), %rcx, %rax adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq %rax, %r12 # A[0] * B[2] mulxq (%rsi), %rcx, %rax adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 8(%rbx), %rdx mulxq 8(%rsi), %rdx, %rcx adcxq %rax, %r11 adoxq %rdx, %r10 # A[3] * B[1] movq 8(%rbx), %rdx adoxq %rcx, %r11 mulxq 24(%rsi), %rcx, %rax adcxq %rcx, %r12 # A[2] * B[2] movq 16(%rbx), %rdx mulxq 16(%rsi), %rdx, %rcx adcxq %rax, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 24(%rbx), %rdx adoxq %rcx, %r13 mulxq 24(%rsi), %rcx, %rax adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq (%rsi), %rdx, %rcx adcxq %rax, %r15 xorq %rax, %rax adcxq %rdx, %r11 # A[3] * B[0] movq (%rbx), %rdx adcxq %rcx, %r12 mulxq 24(%rsi), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[2] * B[3] movq 24(%rbx), %rdx mulxq 16(%rsi), %rdx, %rcx adcxq %rdx, %r13 # A[3] * B[2] movq 16(%rbx), %rdx adcxq %rcx, %r14 mulxq 24(%rsi), %rcx, %rdx adcxq %rax, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rax, %r15 # Reduce movq $0x7fffffffffffffff, %rax # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rax, %r11 # Multiply top half by 19 movq $19, %rdx xorq %rax, %rax mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rax, %rdx adcxq %rax, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, %rax imulq $19, %rdx, %rcx andq %rax, %r11 addq %rcx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rax, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) addq $40, %rsp popq %r15 popq %r14 popq %r13 popq %r12 popq %rbx repz retq #ifndef __APPLE__ .size fe_ge_to_p3_avx2,.-fe_ge_to_p3_avx2 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_ge_dbl_avx2 .type fe_ge_dbl_avx2,@function .align 16 fe_ge_dbl_avx2: #else .section __TEXT,__text .globl _fe_ge_dbl_avx2 .p2align 4 _fe_ge_dbl_avx2: #endif /* __APPLE__ */ pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $48, %rsp movq %rdi, (%rsp) movq %rsi, 8(%rsp) movq 
%rdx, 16(%rsp) movq %rcx, 24(%rsp) movq %r8, 32(%rsp) movq %r9, 40(%rsp) movq 32(%rsp), %rsi # Square # A[0] * A[1] movq (%rsi), %rdx mulxq 8(%rsi), %r9, %r10 # A[0] * A[3] mulxq 24(%rsi), %r11, %r12 # A[2] * A[1] movq 16(%rsi), %rdx mulxq 8(%rsi), %rcx, %rax xorq %r15, %r15 adoxq %rcx, %r11 # A[2] * A[3] mulxq 24(%rsi), %r13, %r14 adoxq %rax, %r12 # A[2] * A[0] mulxq (%rsi), %rcx, %rax adoxq %r15, %r13 adcxq %rcx, %r10 adoxq %r15, %r14 # A[1] * A[3] movq 8(%rsi), %rdx mulxq 24(%rsi), %rbp, %r8 adcxq %rax, %r11 adcxq %rbp, %r12 adcxq %r8, %r13 adcxq %r15, %r14 # Double with Carry Flag xorq %r15, %r15 # A[0] * A[0] movq (%rsi), %rdx mulxq %rdx, %r8, %rbp adcxq %r9, %r9 # A[1] * A[1] movq 8(%rsi), %rdx mulxq %rdx, %rcx, %rax adcxq %r10, %r10 adoxq %rbp, %r9 adcxq %r11, %r11 adoxq %rcx, %r10 # A[2] * A[2] movq 16(%rsi), %rdx mulxq %rdx, %rbp, %rcx adcxq %r12, %r12 adoxq %rax, %r11 adcxq %r13, %r13 adoxq %rbp, %r12 # A[3] * A[3] movq 24(%rsi), %rdx mulxq %rdx, %rbp, %rax adcxq %r14, %r14 adoxq %rcx, %r13 adcxq %r15, %r15 adoxq %rbp, %r14 adoxq %rax, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rdx xorq %rcx, %rcx mulxq %r12, %rbp, %r12 adcxq %rbp, %r8 adoxq %r12, %r9 mulxq %r13, %rbp, %r13 adcxq %rbp, %r9 adoxq %r13, %r10 mulxq %r14, %rbp, %r14 adcxq %rbp, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rcx, %rdx adcxq %rcx, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, %rcx imulq $19, %rdx, %rbp andq %rcx, %r11 addq %rbp, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 16(%rsp), %rdi movq 40(%rsp), %rbx # Square # A[0] * A[1] movq (%rbx), %rdx mulxq 8(%rbx), %r9, %r10 # A[0] * A[3] mulxq 24(%rbx), %r11, %r12 # A[2] * A[1] movq 16(%rbx), %rdx mulxq 8(%rbx), %rcx, %rax xorq %r15, %r15 adoxq %rcx, %r11 # A[2] * A[3] mulxq 24(%rbx), %r13, %r14 adoxq %rax, %r12 # A[2] * A[0] mulxq (%rbx), %rcx, %rax adoxq %r15, %r13 adcxq %rcx, %r10 adoxq %r15, %r14 # A[1] * A[3] movq 8(%rbx), %rdx mulxq 24(%rbx), %rbp, %r8 adcxq %rax, %r11 adcxq %rbp, %r12 adcxq %r8, %r13 adcxq %r15, %r14 # Double with Carry Flag xorq %r15, %r15 # A[0] * A[0] movq (%rbx), %rdx mulxq %rdx, %r8, %rbp adcxq %r9, %r9 # A[1] * A[1] movq 8(%rbx), %rdx mulxq %rdx, %rcx, %rax adcxq %r10, %r10 adoxq %rbp, %r9 adcxq %r11, %r11 adoxq %rcx, %r10 # A[2] * A[2] movq 16(%rbx), %rdx mulxq %rdx, %rbp, %rcx adcxq %r12, %r12 adoxq %rax, %r11 adcxq %r13, %r13 adoxq %rbp, %r12 # A[3] * A[3] movq 24(%rbx), %rdx mulxq %rdx, %rbp, %rax adcxq %r14, %r14 adoxq %rcx, %r13 adcxq %r15, %r15 adoxq %rbp, %r14 adoxq %rax, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rdx xorq %rcx, %rcx mulxq %r12, %rbp, %r12 adcxq %rbp, %r8 adoxq %r12, %r9 mulxq %r13, %rbp, %r13 adcxq %rbp, %r9 adoxq %r13, %r10 mulxq %r14, %rbp, %r14 adcxq %rbp, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rcx, %rdx adcxq %rcx, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, 
%rcx imulq $19, %rdx, %rbp andq %rcx, %r11 addq %rbp, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 8(%rsp), %rdi # Add movq (%rsi), %r8 movq 8(%rsi), %r9 addq (%rbx), %r8 movq 16(%rsi), %r10 adcq 8(%rbx), %r9 movq 24(%rsi), %rdx adcq 16(%rbx), %r10 movq $-19, %rcx adcq 24(%rbx), %rdx movq $0x7fffffffffffffff, %rax movq %rdx, %r11 sarq $63, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Sub modulus (if overflow) subq %rcx, %r8 sbbq %rdx, %r9 sbbq %rdx, %r10 sbbq %rax, %r11 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 24(%rsp), %rsi # Square # A[0] * A[1] movq (%rdi), %rdx mulxq 8(%rdi), %r9, %r10 # A[0] * A[3] mulxq 24(%rdi), %r11, %r12 # A[2] * A[1] movq 16(%rdi), %rdx mulxq 8(%rdi), %rcx, %rax xorq %r15, %r15 adoxq %rcx, %r11 # A[2] * A[3] mulxq 24(%rdi), %r13, %r14 adoxq %rax, %r12 # A[2] * A[0] mulxq (%rdi), %rcx, %rax adoxq %r15, %r13 adcxq %rcx, %r10 adoxq %r15, %r14 # A[1] * A[3] movq 8(%rdi), %rdx mulxq 24(%rdi), %rbp, %r8 adcxq %rax, %r11 adcxq %rbp, %r12 adcxq %r8, %r13 adcxq %r15, %r14 # Double with Carry Flag xorq %r15, %r15 # A[0] * A[0] movq (%rdi), %rdx mulxq %rdx, %r8, %rbp adcxq %r9, %r9 # A[1] * A[1] movq 8(%rdi), %rdx mulxq %rdx, %rcx, %rax adcxq %r10, %r10 adoxq %rbp, %r9 adcxq %r11, %r11 adoxq %rcx, %r10 # A[2] * A[2] movq 16(%rdi), %rdx mulxq %rdx, %rbp, %rcx adcxq %r12, %r12 adoxq %rax, %r11 adcxq %r13, %r13 adoxq %rbp, %r12 # A[3] * A[3] movq 24(%rdi), %rdx mulxq %rdx, %rbp, %rax adcxq %r14, %r14 adoxq %rcx, %r13 adcxq %r15, %r15 adoxq %rbp, %r14 adoxq %rax, %r15 # Reduce movq $0x7fffffffffffffff, %rcx # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rcx, %r11 # Multiply top half by 19 movq $19, %rdx xorq %rcx, %rcx mulxq %r12, %rbp, %r12 adcxq %rbp, %r8 adoxq %r12, %r9 mulxq %r13, %rbp, %r13 adcxq %rbp, %r9 adoxq %r13, %r10 mulxq %r14, %rbp, %r14 adcxq %rbp, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rcx, %rdx adcxq %rcx, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, %rcx imulq $19, %rdx, %rbp andq %rcx, %r11 addq %rbp, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rcx, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rsi) movq %r9, 8(%rsi) movq %r10, 16(%rsi) movq %r11, 24(%rsi) movq 16(%rsp), %rsi movq (%rsp), %rbx # Add movq (%rsi), %r8 movq 8(%rsi), %r9 movq 16(%rsi), %r10 movq 24(%rsi), %rdx movq %r8, %r12 addq (%rbx), %r8 movq %r9, %r13 adcq 8(%rbx), %r9 movq %r10, %r14 adcq 16(%rbx), %r10 movq %rdx, %r15 adcq 24(%rbx), %rdx movq $-19, %rcx movq %rdx, %r11 movq $0x7fffffffffffffff, %rax sarq $63, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Sub modulus (if overflow) subq %rcx, %r8 sbbq %rdx, %r9 sbbq %rdx, %r10 sbbq %rax, %r11 # Sub subq (%rbx), %r12 movq $0x00, %rdx sbbq 8(%rbx), %r13 movq $-19, %rcx sbbq 16(%rbx), %r14 movq $0x7fffffffffffffff, %rax sbbq 24(%rbx), %r15 sbbq $0x00, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Add modulus (if underflow) addq %rcx, %r12 adcq %rdx, %r13 adcq %rdx, %r14 adcq %rax, %r15 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) 
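# The add/sub block above produces both results in a single pass over the
# limbs: r8-r11 hold the sum with the modulus subtracted if it overflowed
# past bit 255, and r12-r15 hold the difference with the modulus added back
# if it underflowed.  Both corrections come from masks built out of the sign
# of the top word (sarq $63), so the doubling step stays branch-free and
# constant time.  The remaining stores write the sum and the difference out
# to their destination buffers for the steps that follow.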
movq %r11, 24(%rdi) movq %r12, (%rsi) movq %r13, 8(%rsi) movq %r14, 16(%rsi) movq %r15, 24(%rsi) movq 24(%rsp), %rsi # Sub movq (%rsi), %r8 movq 8(%rsi), %r9 movq 16(%rsi), %r10 movq 24(%rsi), %r11 subq (%rdi), %r8 movq $0x00, %rdx sbbq 8(%rdi), %r9 movq $-19, %rcx sbbq 16(%rdi), %r10 movq $0x7fffffffffffffff, %rax sbbq 24(%rdi), %r11 sbbq $0x00, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Add modulus (if underflow) addq %rcx, %r8 adcq %rdx, %r9 adcq %rdx, %r10 adcq %rax, %r11 movq %r8, (%rbx) movq %r9, 8(%rbx) movq %r10, 16(%rbx) movq %r11, 24(%rbx) movq 104(%rsp), %rdi # Square * 2 # A[0] * A[1] movq (%rdi), %rdx mulxq 8(%rdi), %r9, %r10 # A[0] * A[3] mulxq 24(%rdi), %r11, %r12 # A[2] * A[1] movq 16(%rdi), %rdx mulxq 8(%rdi), %rcx, %rax xorq %r15, %r15 adoxq %rcx, %r11 # A[2] * A[3] mulxq 24(%rdi), %r13, %r14 adoxq %rax, %r12 # A[2] * A[0] mulxq (%rdi), %rcx, %rax adoxq %r15, %r13 adcxq %rcx, %r10 adoxq %r15, %r14 # A[1] * A[3] movq 8(%rdi), %rdx mulxq 24(%rdi), %rbp, %r8 adcxq %rax, %r11 adcxq %rbp, %r12 adcxq %r8, %r13 adcxq %r15, %r14 # Double with Carry Flag xorq %r15, %r15 # A[0] * A[0] movq (%rdi), %rdx mulxq %rdx, %r8, %rbp adcxq %r9, %r9 # A[1] * A[1] movq 8(%rdi), %rdx mulxq %rdx, %rcx, %rax adcxq %r10, %r10 adoxq %rbp, %r9 adcxq %r11, %r11 adoxq %rcx, %r10 # A[2] * A[2] movq 16(%rdi), %rdx mulxq %rdx, %rbp, %rcx adcxq %r12, %r12 adoxq %rax, %r11 adcxq %r13, %r13 adoxq %rbp, %r12 # A[3] * A[3] movq 24(%rdi), %rdx mulxq %rdx, %rbp, %rax adcxq %r14, %r14 adoxq %rcx, %r13 adcxq %r15, %r15 adoxq %rbp, %r14 adoxq %rax, %r15 # Reduce movq $0x7fffffffffffffff, %rax xorq %rbp, %rbp # Move top half into t4-t7 and remove top bit from t3 and double shldq $3, %r15, %rbp shldq $2, %r14, %r15 shldq $2, %r13, %r14 shldq $2, %r12, %r13 shldq $2, %r11, %r12 shldq $0x01, %r10, %r11 shldq $0x01, %r9, %r10 shldq $0x01, %r8, %r9 shlq $0x01, %r8 andq %rax, %r11 # Two out left, one in right andq %rax, %r15 # Multiply top bits by 19*19 imulq $0x169, %rbp, %rcx xorq %rax, %rax # Multiply top half by 19 movq $19, %rdx adoxq %rcx, %r8 mulxq %r12, %rbp, %r12 adcxq %rbp, %r8 adoxq %r12, %r9 mulxq %r13, %rbp, %r13 adcxq %rbp, %r9 adoxq %r13, %r10 mulxq %r14, %rbp, %r14 adcxq %rbp, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rax, %rdx adcxq %rax, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, %rax imulq $19, %rdx, %rbp andq %rax, %r11 addq %rbp, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rax, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rsi) movq %r9, 8(%rsi) movq %r10, 16(%rsi) movq %r11, 24(%rsi) movq 16(%rsp), %rdi # Sub movq (%rsi), %r8 movq 8(%rsi), %r9 movq 16(%rsi), %r10 movq 24(%rsi), %r11 subq (%rdi), %r8 movq $0x00, %rdx sbbq 8(%rdi), %r9 movq $-19, %rcx sbbq 16(%rdi), %r10 movq $0x7fffffffffffffff, %rax sbbq 24(%rdi), %r11 sbbq $0x00, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Add modulus (if underflow) addq %rcx, %r8 adcq %rdx, %r9 adcq %rdx, %r10 adcq %rax, %r11 movq %r8, (%rsi) movq %r9, 8(%rsi) movq %r10, 16(%rsi) movq %r11, 24(%rsi) addq $48, %rsp popq %r15 popq %r14 popq %r13 popq %r12 popq %rbx popq %rbp repz retq #ifndef __APPLE__ .size fe_ge_dbl_avx2,.-fe_ge_dbl_avx2 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_ge_madd_avx2 .type fe_ge_madd_avx2,@function .align 16 fe_ge_madd_avx2: #else .section __TEXT,__text .globl _fe_ge_madd_avx2 .p2align 4 _fe_ge_madd_avx2: #endif /* 
__APPLE__ */ pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $48, %rsp movq %rdi, (%rsp) movq %rsi, 8(%rsp) movq %rdx, 16(%rsp) movq %rcx, 24(%rsp) movq %r8, 32(%rsp) movq %r9, 40(%rsp) movq 8(%rsp), %rsi movq 40(%rsp), %rbx movq 32(%rsp), %rbp # Add movq (%rbx), %r8 movq 8(%rbx), %r9 movq 16(%rbx), %r10 movq 24(%rbx), %rdx movq %r8, %r12 addq (%rbp), %r8 movq %r9, %r13 adcq 8(%rbp), %r9 movq %r10, %r14 adcq 16(%rbp), %r10 movq %rdx, %r15 adcq 24(%rbp), %rdx movq $-19, %rcx movq %rdx, %r11 movq $0x7fffffffffffffff, %rax sarq $63, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Sub modulus (if overflow) subq %rcx, %r8 sbbq %rdx, %r9 sbbq %rdx, %r10 sbbq %rax, %r11 # Sub subq (%rbp), %r12 movq $0x00, %rdx sbbq 8(%rbp), %r13 movq $-19, %rcx sbbq 16(%rbp), %r14 movq $0x7fffffffffffffff, %rax sbbq 24(%rbp), %r15 sbbq $0x00, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Add modulus (if underflow) addq %rcx, %r12 adcq %rdx, %r13 adcq %rdx, %r14 adcq %rax, %r15 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq %r12, (%rsi) movq %r13, 8(%rsi) movq %r14, 16(%rsi) movq %r15, 24(%rsi) movq 16(%rsp), %rbx movq 128(%rsp), %rbp # Multiply # A[0] * B[0] movq (%rbp), %rdx mulxq (%rdi), %r8, %r9 # A[2] * B[0] mulxq 16(%rdi), %r10, %r11 # A[1] * B[0] mulxq 8(%rdi), %rcx, %rax xorq %r15, %r15 adcxq %rcx, %r9 # A[1] * B[3] movq 24(%rbp), %rdx mulxq 8(%rdi), %r12, %r13 adcxq %rax, %r10 # A[0] * B[1] movq 8(%rbp), %rdx mulxq (%rdi), %rcx, %rax adoxq %rcx, %r9 # A[2] * B[1] mulxq 16(%rdi), %rcx, %r14 adoxq %rax, %r10 adcxq %rcx, %r11 # A[1] * B[2] movq 16(%rbp), %rdx mulxq 8(%rdi), %rcx, %rax adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq %rax, %r12 # A[0] * B[2] mulxq (%rdi), %rcx, %rax adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 8(%rbp), %rdx mulxq 8(%rdi), %rdx, %rcx adcxq %rax, %r11 adoxq %rdx, %r10 # A[3] * B[1] movq 8(%rbp), %rdx adoxq %rcx, %r11 mulxq 24(%rdi), %rcx, %rax adcxq %rcx, %r12 # A[2] * B[2] movq 16(%rbp), %rdx mulxq 16(%rdi), %rdx, %rcx adcxq %rax, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 24(%rbp), %rdx adoxq %rcx, %r13 mulxq 24(%rdi), %rcx, %rax adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq (%rdi), %rdx, %rcx adcxq %rax, %r15 xorq %rax, %rax adcxq %rdx, %r11 # A[3] * B[0] movq (%rbp), %rdx adcxq %rcx, %r12 mulxq 24(%rdi), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[2] * B[3] movq 24(%rbp), %rdx mulxq 16(%rdi), %rdx, %rcx adcxq %rdx, %r13 # A[3] * B[2] movq 16(%rbp), %rdx adcxq %rcx, %r14 mulxq 24(%rdi), %rcx, %rdx adcxq %rax, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rax, %r15 # Reduce movq $0x7fffffffffffffff, %rax # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rax, %r11 # Multiply top half by 19 movq $19, %rdx xorq %rax, %rax mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rax, %rdx adcxq %rax, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, %rax imulq $19, %rdx, %rcx andq %rax, %r11 addq %rcx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rax, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rbx) movq %r9, 8(%rbx) movq %r10, 16(%rbx) movq %r11, 24(%rbx) 
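# Note (added comment): the first six pointer arguments arrive in
# rdi/rsi/rdx/rcx/r8/r9 and were spilled to 0..40(%rsp) above. Under the
# System V AMD64 ABI any further arguments are passed on the caller's
# stack, so after the six register pushes (48 bytes) and the 48-byte local
# area the seventh argument sits at 104(%rsp), the eighth at 112(%rsp),
# and so on. Loads such as the 128(%rsp) above and the 136(%rsp) below
# fetch those stack-passed field-element pointers.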
movq 136(%rsp), %rdi # Multiply # A[0] * B[0] movq (%rdi), %rdx mulxq (%rsi), %r8, %r9 # A[2] * B[0] mulxq 16(%rsi), %r10, %r11 # A[1] * B[0] mulxq 8(%rsi), %rcx, %rax xorq %r15, %r15 adcxq %rcx, %r9 # A[1] * B[3] movq 24(%rdi), %rdx mulxq 8(%rsi), %r12, %r13 adcxq %rax, %r10 # A[0] * B[1] movq 8(%rdi), %rdx mulxq (%rsi), %rcx, %rax adoxq %rcx, %r9 # A[2] * B[1] mulxq 16(%rsi), %rcx, %r14 adoxq %rax, %r10 adcxq %rcx, %r11 # A[1] * B[2] movq 16(%rdi), %rdx mulxq 8(%rsi), %rcx, %rax adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq %rax, %r12 # A[0] * B[2] mulxq (%rsi), %rcx, %rax adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 8(%rdi), %rdx mulxq 8(%rsi), %rdx, %rcx adcxq %rax, %r11 adoxq %rdx, %r10 # A[3] * B[1] movq 8(%rdi), %rdx adoxq %rcx, %r11 mulxq 24(%rsi), %rcx, %rax adcxq %rcx, %r12 # A[2] * B[2] movq 16(%rdi), %rdx mulxq 16(%rsi), %rdx, %rcx adcxq %rax, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 24(%rdi), %rdx adoxq %rcx, %r13 mulxq 24(%rsi), %rcx, %rax adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq (%rsi), %rdx, %rcx adcxq %rax, %r15 xorq %rax, %rax adcxq %rdx, %r11 # A[3] * B[0] movq (%rdi), %rdx adcxq %rcx, %r12 mulxq 24(%rsi), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[2] * B[3] movq 24(%rdi), %rdx mulxq 16(%rsi), %rdx, %rcx adcxq %rdx, %r13 # A[3] * B[2] movq 16(%rdi), %rdx adcxq %rcx, %r14 mulxq 24(%rsi), %rcx, %rdx adcxq %rax, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rax, %r15 # Reduce movq $0x7fffffffffffffff, %rax # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rax, %r11 # Multiply top half by 19 movq $19, %rdx xorq %rax, %rax mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rax, %rdx adcxq %rax, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, %rax imulq $19, %rdx, %rcx andq %rax, %r11 addq %rcx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rax, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rsi) movq %r9, 8(%rsi) movq %r10, 16(%rsi) movq %r11, 24(%rsi) movq 24(%rsp), %rdi movq 120(%rsp), %rsi movq 112(%rsp), %rbp # Multiply # A[0] * B[0] movq (%rbp), %rdx mulxq (%rsi), %r8, %r9 # A[2] * B[0] mulxq 16(%rsi), %r10, %r11 # A[1] * B[0] mulxq 8(%rsi), %rcx, %rax xorq %r15, %r15 adcxq %rcx, %r9 # A[1] * B[3] movq 24(%rbp), %rdx mulxq 8(%rsi), %r12, %r13 adcxq %rax, %r10 # A[0] * B[1] movq 8(%rbp), %rdx mulxq (%rsi), %rcx, %rax adoxq %rcx, %r9 # A[2] * B[1] mulxq 16(%rsi), %rcx, %r14 adoxq %rax, %r10 adcxq %rcx, %r11 # A[1] * B[2] movq 16(%rbp), %rdx mulxq 8(%rsi), %rcx, %rax adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq %rax, %r12 # A[0] * B[2] mulxq (%rsi), %rcx, %rax adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 8(%rbp), %rdx mulxq 8(%rsi), %rdx, %rcx adcxq %rax, %r11 adoxq %rdx, %r10 # A[3] * B[1] movq 8(%rbp), %rdx adoxq %rcx, %r11 mulxq 24(%rsi), %rcx, %rax adcxq %rcx, %r12 # A[2] * B[2] movq 16(%rbp), %rdx mulxq 16(%rsi), %rdx, %rcx adcxq %rax, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 24(%rbp), %rdx adoxq %rcx, %r13 mulxq 24(%rsi), %rcx, %rax adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq (%rsi), %rdx, %rcx adcxq %rax, %r15 xorq %rax, %rax adcxq %rdx, %r11 # A[3] * B[0] movq 
(%rbp), %rdx adcxq %rcx, %r12 mulxq 24(%rsi), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[2] * B[3] movq 24(%rbp), %rdx mulxq 16(%rsi), %rdx, %rcx adcxq %rdx, %r13 # A[3] * B[2] movq 16(%rbp), %rdx adcxq %rcx, %r14 mulxq 24(%rsi), %rcx, %rdx adcxq %rax, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rax, %r15 # Reduce movq $0x7fffffffffffffff, %rax # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rax, %r11 # Multiply top half by 19 movq $19, %rdx xorq %rax, %rax mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rax, %rdx adcxq %rax, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, %rax imulq $19, %rdx, %rcx andq %rax, %r11 addq %rcx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rax, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 8(%rsp), %rdi movq (%rsp), %rsi # Add movq (%rbx), %r8 movq 8(%rbx), %r9 movq 16(%rbx), %r10 movq 24(%rbx), %rdx movq %r8, %r12 addq (%rdi), %r8 movq %r9, %r13 adcq 8(%rdi), %r9 movq %r10, %r14 adcq 16(%rdi), %r10 movq %rdx, %r15 adcq 24(%rdi), %rdx movq $-19, %rcx movq %rdx, %r11 movq $0x7fffffffffffffff, %rax sarq $63, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Sub modulus (if overflow) subq %rcx, %r8 sbbq %rdx, %r9 sbbq %rdx, %r10 sbbq %rax, %r11 # Sub subq (%rdi), %r12 movq $0x00, %rdx sbbq 8(%rdi), %r13 movq $-19, %rcx sbbq 16(%rdi), %r14 movq $0x7fffffffffffffff, %rax sbbq 24(%rdi), %r15 sbbq $0x00, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Add modulus (if underflow) addq %rcx, %r12 adcq %rdx, %r13 adcq %rdx, %r14 adcq %rax, %r15 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq %r12, (%rsi) movq %r13, 8(%rsi) movq %r14, 16(%rsi) movq %r15, 24(%rsi) movq 104(%rsp), %rdi # Double movq (%rdi), %r8 movq 8(%rdi), %r9 addq %r8, %r8 movq 16(%rdi), %r10 adcq %r9, %r9 movq 24(%rdi), %rdx adcq %r10, %r10 movq $-19, %rcx adcq %rdx, %rdx movq $0x7fffffffffffffff, %rax movq %rdx, %r11 sarq $63, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Sub modulus (if overflow) subq %rcx, %r8 sbbq %rdx, %r9 sbbq %rdx, %r10 sbbq %rax, %r11 movq %r8, (%rbx) movq %r9, 8(%rbx) movq %r10, 16(%rbx) movq %r11, 24(%rbx) movq 24(%rsp), %rdi # Add movq (%rbx), %r8 movq 8(%rbx), %r9 movq 16(%rbx), %r10 movq 24(%rbx), %rdx movq %r8, %r12 addq (%rdi), %r8 movq %r9, %r13 adcq 8(%rdi), %r9 movq %r10, %r14 adcq 16(%rdi), %r10 movq %rdx, %r15 adcq 24(%rdi), %rdx movq $-19, %rcx movq %rdx, %r11 movq $0x7fffffffffffffff, %rax sarq $63, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Sub modulus (if overflow) subq %rcx, %r8 sbbq %rdx, %r9 sbbq %rdx, %r10 sbbq %rax, %r11 # Sub subq (%rdi), %r12 movq $0x00, %rdx sbbq 8(%rdi), %r13 movq $-19, %rcx sbbq 16(%rdi), %r14 movq $0x7fffffffffffffff, %rax sbbq 24(%rdi), %r15 sbbq $0x00, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Add modulus (if underflow) addq %rcx, %r12 adcq %rdx, %r13 adcq %rdx, %r14 adcq %rax, %r15 movq %r8, (%rbx) movq %r9, 8(%rbx) movq %r10, 16(%rbx) movq %r11, 24(%rbx) movq %r12, (%rdi) movq %r13, 8(%rdi) movq %r14, 16(%rdi) movq %r15, 24(%rdi) addq $48, %rsp popq 
%r15 popq %r14 popq %r13 popq %r12 popq %rbx popq %rbp repz retq #ifndef __APPLE__ .size fe_ge_madd_avx2,.-fe_ge_madd_avx2 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_ge_msub_avx2 .type fe_ge_msub_avx2,@function .align 16 fe_ge_msub_avx2: #else .section __TEXT,__text .globl _fe_ge_msub_avx2 .p2align 4 _fe_ge_msub_avx2: #endif /* __APPLE__ */ pushq %rbp pushq %rbx pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $48, %rsp movq %rdi, (%rsp) movq %rsi, 8(%rsp) movq %rdx, 16(%rsp) movq %rcx, 24(%rsp) movq %r8, 32(%rsp) movq %r9, 40(%rsp) movq 8(%rsp), %rsi movq 40(%rsp), %rbx movq 32(%rsp), %rbp # Add movq (%rbx), %r8 movq 8(%rbx), %r9 movq 16(%rbx), %r10 movq 24(%rbx), %rdx movq %r8, %r12 addq (%rbp), %r8 movq %r9, %r13 adcq 8(%rbp), %r9 movq %r10, %r14 adcq 16(%rbp), %r10 movq %rdx, %r15 adcq 24(%rbp), %rdx movq $-19, %rcx movq %rdx, %r11 movq $0x7fffffffffffffff, %rax sarq $63, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Sub modulus (if overflow) subq %rcx, %r8 sbbq %rdx, %r9 sbbq %rdx, %r10 sbbq %rax, %r11 # Sub subq (%rbp), %r12 movq $0x00, %rdx sbbq 8(%rbp), %r13 movq $-19, %rcx sbbq 16(%rbp), %r14 movq $0x7fffffffffffffff, %rax sbbq 24(%rbp), %r15 sbbq $0x00, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Add modulus (if underflow) addq %rcx, %r12 adcq %rdx, %r13 adcq %rdx, %r14 adcq %rax, %r15 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq %r12, (%rsi) movq %r13, 8(%rsi) movq %r14, 16(%rsi) movq %r15, 24(%rsi) movq 16(%rsp), %rbx movq 136(%rsp), %rbp # Multiply # A[0] * B[0] movq (%rbp), %rdx mulxq (%rdi), %r8, %r9 # A[2] * B[0] mulxq 16(%rdi), %r10, %r11 # A[1] * B[0] mulxq 8(%rdi), %rcx, %rax xorq %r15, %r15 adcxq %rcx, %r9 # A[1] * B[3] movq 24(%rbp), %rdx mulxq 8(%rdi), %r12, %r13 adcxq %rax, %r10 # A[0] * B[1] movq 8(%rbp), %rdx mulxq (%rdi), %rcx, %rax adoxq %rcx, %r9 # A[2] * B[1] mulxq 16(%rdi), %rcx, %r14 adoxq %rax, %r10 adcxq %rcx, %r11 # A[1] * B[2] movq 16(%rbp), %rdx mulxq 8(%rdi), %rcx, %rax adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq %rax, %r12 # A[0] * B[2] mulxq (%rdi), %rcx, %rax adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 8(%rbp), %rdx mulxq 8(%rdi), %rdx, %rcx adcxq %rax, %r11 adoxq %rdx, %r10 # A[3] * B[1] movq 8(%rbp), %rdx adoxq %rcx, %r11 mulxq 24(%rdi), %rcx, %rax adcxq %rcx, %r12 # A[2] * B[2] movq 16(%rbp), %rdx mulxq 16(%rdi), %rdx, %rcx adcxq %rax, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 24(%rbp), %rdx adoxq %rcx, %r13 mulxq 24(%rdi), %rcx, %rax adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq (%rdi), %rdx, %rcx adcxq %rax, %r15 xorq %rax, %rax adcxq %rdx, %r11 # A[3] * B[0] movq (%rbp), %rdx adcxq %rcx, %r12 mulxq 24(%rdi), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[2] * B[3] movq 24(%rbp), %rdx mulxq 16(%rdi), %rdx, %rcx adcxq %rdx, %r13 # A[3] * B[2] movq 16(%rbp), %rdx adcxq %rcx, %r14 mulxq 24(%rdi), %rcx, %rdx adcxq %rax, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rax, %r15 # Reduce movq $0x7fffffffffffffff, %rax # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rax, %r11 # Multiply top half by 19 movq $19, %rdx xorq %rax, %rax mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rax, %rdx adcxq %rax, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, 
%rax imulq $19, %rdx, %rcx andq %rax, %r11 addq %rcx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rax, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rbx) movq %r9, 8(%rbx) movq %r10, 16(%rbx) movq %r11, 24(%rbx) movq 128(%rsp), %rdi # Multiply # A[0] * B[0] movq (%rdi), %rdx mulxq (%rsi), %r8, %r9 # A[2] * B[0] mulxq 16(%rsi), %r10, %r11 # A[1] * B[0] mulxq 8(%rsi), %rcx, %rax xorq %r15, %r15 adcxq %rcx, %r9 # A[1] * B[3] movq 24(%rdi), %rdx mulxq 8(%rsi), %r12, %r13 adcxq %rax, %r10 # A[0] * B[1] movq 8(%rdi), %rdx mulxq (%rsi), %rcx, %rax adoxq %rcx, %r9 # A[2] * B[1] mulxq 16(%rsi), %rcx, %r14 adoxq %rax, %r10 adcxq %rcx, %r11 # A[1] * B[2] movq 16(%rdi), %rdx mulxq 8(%rsi), %rcx, %rax adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq %rax, %r12 # A[0] * B[2] mulxq (%rsi), %rcx, %rax adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 8(%rdi), %rdx mulxq 8(%rsi), %rdx, %rcx adcxq %rax, %r11 adoxq %rdx, %r10 # A[3] * B[1] movq 8(%rdi), %rdx adoxq %rcx, %r11 mulxq 24(%rsi), %rcx, %rax adcxq %rcx, %r12 # A[2] * B[2] movq 16(%rdi), %rdx mulxq 16(%rsi), %rdx, %rcx adcxq %rax, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 24(%rdi), %rdx adoxq %rcx, %r13 mulxq 24(%rsi), %rcx, %rax adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq (%rsi), %rdx, %rcx adcxq %rax, %r15 xorq %rax, %rax adcxq %rdx, %r11 # A[3] * B[0] movq (%rdi), %rdx adcxq %rcx, %r12 mulxq 24(%rsi), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[2] * B[3] movq 24(%rdi), %rdx mulxq 16(%rsi), %rdx, %rcx adcxq %rdx, %r13 # A[3] * B[2] movq 16(%rdi), %rdx adcxq %rcx, %r14 mulxq 24(%rsi), %rcx, %rdx adcxq %rax, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rax, %r15 # Reduce movq $0x7fffffffffffffff, %rax # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rax, %r11 # Multiply top half by 19 movq $19, %rdx xorq %rax, %rax mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rax, %rdx adcxq %rax, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, %rax imulq $19, %rdx, %rcx andq %rax, %r11 addq %rcx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rax, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rsi) movq %r9, 8(%rsi) movq %r10, 16(%rsi) movq %r11, 24(%rsi) movq 24(%rsp), %rdi movq 120(%rsp), %rsi movq 112(%rsp), %rbp # Multiply # A[0] * B[0] movq (%rbp), %rdx mulxq (%rsi), %r8, %r9 # A[2] * B[0] mulxq 16(%rsi), %r10, %r11 # A[1] * B[0] mulxq 8(%rsi), %rcx, %rax xorq %r15, %r15 adcxq %rcx, %r9 # A[1] * B[3] movq 24(%rbp), %rdx mulxq 8(%rsi), %r12, %r13 adcxq %rax, %r10 # A[0] * B[1] movq 8(%rbp), %rdx mulxq (%rsi), %rcx, %rax adoxq %rcx, %r9 # A[2] * B[1] mulxq 16(%rsi), %rcx, %r14 adoxq %rax, %r10 adcxq %rcx, %r11 # A[1] * B[2] movq 16(%rbp), %rdx mulxq 8(%rsi), %rcx, %rax adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq %rax, %r12 # A[0] * B[2] mulxq (%rsi), %rcx, %rax adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 8(%rbp), %rdx mulxq 8(%rsi), %rdx, %rcx adcxq %rax, %r11 adoxq %rdx, %r10 # A[3] * B[1] movq 8(%rbp), %rdx adoxq %rcx, %r11 mulxq 24(%rsi), 
%rcx, %rax adcxq %rcx, %r12 # A[2] * B[2] movq 16(%rbp), %rdx mulxq 16(%rsi), %rdx, %rcx adcxq %rax, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 24(%rbp), %rdx adoxq %rcx, %r13 mulxq 24(%rsi), %rcx, %rax adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq (%rsi), %rdx, %rcx adcxq %rax, %r15 xorq %rax, %rax adcxq %rdx, %r11 # A[3] * B[0] movq (%rbp), %rdx adcxq %rcx, %r12 mulxq 24(%rsi), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[2] * B[3] movq 24(%rbp), %rdx mulxq 16(%rsi), %rdx, %rcx adcxq %rdx, %r13 # A[3] * B[2] movq 16(%rbp), %rdx adcxq %rcx, %r14 mulxq 24(%rsi), %rcx, %rdx adcxq %rax, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rax, %r15 # Reduce movq $0x7fffffffffffffff, %rax # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rax, %r11 # Multiply top half by 19 movq $19, %rdx xorq %rax, %rax mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rax, %rdx adcxq %rax, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, %rax imulq $19, %rdx, %rcx andq %rax, %r11 addq %rcx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rax, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq 8(%rsp), %rsi movq (%rsp), %rbp # Add movq (%rbx), %r8 movq 8(%rbx), %r9 movq 16(%rbx), %r10 movq 24(%rbx), %rdx movq %r8, %r12 addq (%rsi), %r8 movq %r9, %r13 adcq 8(%rsi), %r9 movq %r10, %r14 adcq 16(%rsi), %r10 movq %rdx, %r15 adcq 24(%rsi), %rdx movq $-19, %rcx movq %rdx, %r11 movq $0x7fffffffffffffff, %rax sarq $63, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Sub modulus (if overflow) subq %rcx, %r8 sbbq %rdx, %r9 sbbq %rdx, %r10 sbbq %rax, %r11 # Sub subq (%rsi), %r12 movq $0x00, %rdx sbbq 8(%rsi), %r13 movq $-19, %rcx sbbq 16(%rsi), %r14 movq $0x7fffffffffffffff, %rax sbbq 24(%rsi), %r15 sbbq $0x00, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Add modulus (if underflow) addq %rcx, %r12 adcq %rdx, %r13 adcq %rdx, %r14 adcq %rax, %r15 movq %r8, (%rsi) movq %r9, 8(%rsi) movq %r10, 16(%rsi) movq %r11, 24(%rsi) movq %r12, (%rbp) movq %r13, 8(%rbp) movq %r14, 16(%rbp) movq %r15, 24(%rbp) movq 104(%rsp), %rsi # Double movq (%rsi), %r8 movq 8(%rsi), %r9 addq %r8, %r8 movq 16(%rsi), %r10 adcq %r9, %r9 movq 24(%rsi), %rdx adcq %r10, %r10 movq $-19, %rcx adcq %rdx, %rdx movq $0x7fffffffffffffff, %rax movq %rdx, %r11 sarq $63, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Sub modulus (if overflow) subq %rcx, %r8 sbbq %rdx, %r9 sbbq %rdx, %r10 sbbq %rax, %r11 movq %r8, (%rbx) movq %r9, 8(%rbx) movq %r10, 16(%rbx) movq %r11, 24(%rbx) # Add movq (%rbx), %r8 movq 8(%rbx), %r9 movq 16(%rbx), %r10 movq 24(%rbx), %rdx movq %r8, %r12 addq (%rdi), %r8 movq %r9, %r13 adcq 8(%rdi), %r9 movq %r10, %r14 adcq 16(%rdi), %r10 movq %rdx, %r15 adcq 24(%rdi), %rdx movq $-19, %rcx movq %rdx, %r11 movq $0x7fffffffffffffff, %rax sarq $63, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Sub modulus (if overflow) subq %rcx, %r8 sbbq %rdx, %r9 sbbq %rdx, %r10 sbbq %rax, %r11 # Sub subq (%rdi), %r12 movq $0x00, %rdx sbbq 8(%rdi), %r13 movq $-19, %rcx sbbq 16(%rdi), %r14 movq $0x7fffffffffffffff, %rax sbbq 24(%rdi), %r15 sbbq $0x00, 
%rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Add modulus (if underflow) addq %rcx, %r12 adcq %rdx, %r13 adcq %rdx, %r14 adcq %rax, %r15 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq %r12, (%rbx) movq %r13, 8(%rbx) movq %r14, 16(%rbx) movq %r15, 24(%rbx) addq $48, %rsp popq %r15 popq %r14 popq %r13 popq %r12 popq %rbx popq %rbp repz retq #ifndef __APPLE__ .size fe_ge_msub_avx2,.-fe_ge_msub_avx2 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_ge_add_avx2 .type fe_ge_add_avx2,@function .align 16 fe_ge_add_avx2: #else .section __TEXT,__text .globl _fe_ge_add_avx2 .p2align 4 _fe_ge_add_avx2: #endif /* __APPLE__ */ pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $0x50, %rsp movq %rdi, (%rsp) movq %rsi, 8(%rsp) movq %rdx, 16(%rsp) movq %rcx, 24(%rsp) movq %r8, 32(%rsp) movq %r9, 40(%rsp) movq 8(%rsp), %rsi movq 40(%rsp), %rbx movq 32(%rsp), %rbp # Add movq (%rbx), %r8 movq 8(%rbx), %r9 movq 16(%rbx), %r10 movq 24(%rbx), %rdx movq %r8, %r12 addq (%rbp), %r8 movq %r9, %r13 adcq 8(%rbp), %r9 movq %r10, %r14 adcq 16(%rbp), %r10 movq %rdx, %r15 adcq 24(%rbp), %rdx movq $-19, %rcx movq %rdx, %r11 movq $0x7fffffffffffffff, %rax sarq $63, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Sub modulus (if overflow) subq %rcx, %r8 sbbq %rdx, %r9 sbbq %rdx, %r10 sbbq %rax, %r11 # Sub subq (%rbp), %r12 movq $0x00, %rdx sbbq 8(%rbp), %r13 movq $-19, %rcx sbbq 16(%rbp), %r14 movq $0x7fffffffffffffff, %rax sbbq 24(%rbp), %r15 sbbq $0x00, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Add modulus (if underflow) addq %rcx, %r12 adcq %rdx, %r13 adcq %rdx, %r14 adcq %rax, %r15 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq %r12, (%rsi) movq %r13, 8(%rsi) movq %r14, 16(%rsi) movq %r15, 24(%rsi) movq 16(%rsp), %rbx movq 168(%rsp), %rbp # Multiply # A[0] * B[0] movq (%rbp), %rdx mulxq (%rdi), %r8, %r9 # A[2] * B[0] mulxq 16(%rdi), %r10, %r11 # A[1] * B[0] mulxq 8(%rdi), %rcx, %rax xorq %r15, %r15 adcxq %rcx, %r9 # A[1] * B[3] movq 24(%rbp), %rdx mulxq 8(%rdi), %r12, %r13 adcxq %rax, %r10 # A[0] * B[1] movq 8(%rbp), %rdx mulxq (%rdi), %rcx, %rax adoxq %rcx, %r9 # A[2] * B[1] mulxq 16(%rdi), %rcx, %r14 adoxq %rax, %r10 adcxq %rcx, %r11 # A[1] * B[2] movq 16(%rbp), %rdx mulxq 8(%rdi), %rcx, %rax adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq %rax, %r12 # A[0] * B[2] mulxq (%rdi), %rcx, %rax adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 8(%rbp), %rdx mulxq 8(%rdi), %rdx, %rcx adcxq %rax, %r11 adoxq %rdx, %r10 # A[3] * B[1] movq 8(%rbp), %rdx adoxq %rcx, %r11 mulxq 24(%rdi), %rcx, %rax adcxq %rcx, %r12 # A[2] * B[2] movq 16(%rbp), %rdx mulxq 16(%rdi), %rdx, %rcx adcxq %rax, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 24(%rbp), %rdx adoxq %rcx, %r13 mulxq 24(%rdi), %rcx, %rax adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq (%rdi), %rdx, %rcx adcxq %rax, %r15 xorq %rax, %rax adcxq %rdx, %r11 # A[3] * B[0] movq (%rbp), %rdx adcxq %rcx, %r12 mulxq 24(%rdi), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[2] * B[3] movq 24(%rbp), %rdx mulxq 16(%rdi), %rdx, %rcx adcxq %rdx, %r13 # A[3] * B[2] movq 16(%rbp), %rdx adcxq %rcx, %r14 mulxq 24(%rdi), %rcx, %rdx adcxq %rax, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rax, %r15 # Reduce movq $0x7fffffffffffffff, %rax # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rax, %r11 # Multiply top half by 19 movq $19, %rdx 
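# Note (added comment): 2^255 = 19 (mod p) for p = 2^255 - 19, so the bits
# at position 255 and above of the 512-bit product (shifted down into
# r12..r15 by the shld chain above) are folded back in by multiplying them
# by 19 and adding the result to the low limbs r8..r11 via the adcx/adox
# carry chains that follow.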
xorq %rax, %rax mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rax, %rdx adcxq %rax, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, %rax imulq $19, %rdx, %rcx andq %rax, %r11 addq %rcx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rax, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rbx) movq %r9, 8(%rbx) movq %r10, 16(%rbx) movq %r11, 24(%rbx) movq 176(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rdx mulxq (%rsi), %r8, %r9 # A[2] * B[0] mulxq 16(%rsi), %r10, %r11 # A[1] * B[0] mulxq 8(%rsi), %rcx, %rax xorq %r15, %r15 adcxq %rcx, %r9 # A[1] * B[3] movq 24(%rbx), %rdx mulxq 8(%rsi), %r12, %r13 adcxq %rax, %r10 # A[0] * B[1] movq 8(%rbx), %rdx mulxq (%rsi), %rcx, %rax adoxq %rcx, %r9 # A[2] * B[1] mulxq 16(%rsi), %rcx, %r14 adoxq %rax, %r10 adcxq %rcx, %r11 # A[1] * B[2] movq 16(%rbx), %rdx mulxq 8(%rsi), %rcx, %rax adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq %rax, %r12 # A[0] * B[2] mulxq (%rsi), %rcx, %rax adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 8(%rbx), %rdx mulxq 8(%rsi), %rdx, %rcx adcxq %rax, %r11 adoxq %rdx, %r10 # A[3] * B[1] movq 8(%rbx), %rdx adoxq %rcx, %r11 mulxq 24(%rsi), %rcx, %rax adcxq %rcx, %r12 # A[2] * B[2] movq 16(%rbx), %rdx mulxq 16(%rsi), %rdx, %rcx adcxq %rax, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 24(%rbx), %rdx adoxq %rcx, %r13 mulxq 24(%rsi), %rcx, %rax adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq (%rsi), %rdx, %rcx adcxq %rax, %r15 xorq %rax, %rax adcxq %rdx, %r11 # A[3] * B[0] movq (%rbx), %rdx adcxq %rcx, %r12 mulxq 24(%rsi), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[2] * B[3] movq 24(%rbx), %rdx mulxq 16(%rsi), %rdx, %rcx adcxq %rdx, %r13 # A[3] * B[2] movq 16(%rbx), %rdx adcxq %rcx, %r14 mulxq 24(%rsi), %rcx, %rdx adcxq %rax, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rax, %r15 # Reduce movq $0x7fffffffffffffff, %rax # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rax, %r11 # Multiply top half by 19 movq $19, %rdx xorq %rax, %rax mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rax, %rdx adcxq %rax, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, %rax imulq $19, %rdx, %rcx andq %rax, %r11 addq %rcx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rax, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rsi) movq %r9, 8(%rsi) movq %r10, 16(%rsi) movq %r11, 24(%rsi) movq 24(%rsp), %rsi movq 160(%rsp), %rbx movq 144(%rsp), %rbp # Multiply # A[0] * B[0] movq (%rbp), %rdx mulxq (%rbx), %r8, %r9 # A[2] * B[0] mulxq 16(%rbx), %r10, %r11 # A[1] * B[0] mulxq 8(%rbx), %rcx, %rax xorq %r15, %r15 adcxq %rcx, %r9 # A[1] * B[3] movq 24(%rbp), %rdx mulxq 8(%rbx), %r12, %r13 adcxq %rax, %r10 # A[0] * B[1] movq 8(%rbp), %rdx mulxq (%rbx), %rcx, %rax adoxq %rcx, %r9 # A[2] * B[1] mulxq 16(%rbx), %rcx, %r14 adoxq %rax, %r10 adcxq %rcx, %r11 # A[1] * B[2] movq 16(%rbp), %rdx mulxq 8(%rbx), %rcx, 
%rax adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq %rax, %r12 # A[0] * B[2] mulxq (%rbx), %rcx, %rax adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 8(%rbp), %rdx mulxq 8(%rbx), %rdx, %rcx adcxq %rax, %r11 adoxq %rdx, %r10 # A[3] * B[1] movq 8(%rbp), %rdx adoxq %rcx, %r11 mulxq 24(%rbx), %rcx, %rax adcxq %rcx, %r12 # A[2] * B[2] movq 16(%rbp), %rdx mulxq 16(%rbx), %rdx, %rcx adcxq %rax, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 24(%rbp), %rdx adoxq %rcx, %r13 mulxq 24(%rbx), %rcx, %rax adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq (%rbx), %rdx, %rcx adcxq %rax, %r15 xorq %rax, %rax adcxq %rdx, %r11 # A[3] * B[0] movq (%rbp), %rdx adcxq %rcx, %r12 mulxq 24(%rbx), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[2] * B[3] movq 24(%rbp), %rdx mulxq 16(%rbx), %rdx, %rcx adcxq %rdx, %r13 # A[3] * B[2] movq 16(%rbp), %rdx adcxq %rcx, %r14 mulxq 24(%rbx), %rcx, %rdx adcxq %rax, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rax, %r15 # Reduce movq $0x7fffffffffffffff, %rax # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rax, %r11 # Multiply top half by 19 movq $19, %rdx xorq %rax, %rax mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rax, %rdx adcxq %rax, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, %rax imulq $19, %rdx, %rcx andq %rax, %r11 addq %rcx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rax, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rsi) movq %r9, 8(%rsi) movq %r10, 16(%rsi) movq %r11, 24(%rsi) movq 136(%rsp), %rsi movq 152(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rdx mulxq (%rsi), %r8, %r9 # A[2] * B[0] mulxq 16(%rsi), %r10, %r11 # A[1] * B[0] mulxq 8(%rsi), %rcx, %rax xorq %r15, %r15 adcxq %rcx, %r9 # A[1] * B[3] movq 24(%rbx), %rdx mulxq 8(%rsi), %r12, %r13 adcxq %rax, %r10 # A[0] * B[1] movq 8(%rbx), %rdx mulxq (%rsi), %rcx, %rax adoxq %rcx, %r9 # A[2] * B[1] mulxq 16(%rsi), %rcx, %r14 adoxq %rax, %r10 adcxq %rcx, %r11 # A[1] * B[2] movq 16(%rbx), %rdx mulxq 8(%rsi), %rcx, %rax adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq %rax, %r12 # A[0] * B[2] mulxq (%rsi), %rcx, %rax adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 8(%rbx), %rdx mulxq 8(%rsi), %rdx, %rcx adcxq %rax, %r11 adoxq %rdx, %r10 # A[3] * B[1] movq 8(%rbx), %rdx adoxq %rcx, %r11 mulxq 24(%rsi), %rcx, %rax adcxq %rcx, %r12 # A[2] * B[2] movq 16(%rbx), %rdx mulxq 16(%rsi), %rdx, %rcx adcxq %rax, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 24(%rbx), %rdx adoxq %rcx, %r13 mulxq 24(%rsi), %rcx, %rax adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq (%rsi), %rdx, %rcx adcxq %rax, %r15 xorq %rax, %rax adcxq %rdx, %r11 # A[3] * B[0] movq (%rbx), %rdx adcxq %rcx, %r12 mulxq 24(%rsi), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[2] * B[3] movq 24(%rbx), %rdx mulxq 16(%rsi), %rdx, %rcx adcxq %rdx, %r13 # A[3] * B[2] movq 16(%rbx), %rdx adcxq %rcx, %r14 mulxq 24(%rsi), %rcx, %rdx adcxq %rax, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rax, %r15 # Reduce movq $0x7fffffffffffffff, %rax # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, 
%r12 andq %rax, %r11 # Multiply top half by 19 movq $19, %rdx xorq %rax, %rax mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rax, %rdx adcxq %rax, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, %rax imulq $19, %rdx, %rcx andq %rax, %r11 addq %rcx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rax, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) leaq 48(%rsp), %rsi # Double movq (%rdi), %r8 movq 8(%rdi), %r9 addq %r8, %r8 movq 16(%rdi), %r10 adcq %r9, %r9 movq 24(%rdi), %rdx adcq %r10, %r10 movq $-19, %rcx adcq %rdx, %rdx movq $0x7fffffffffffffff, %rax movq %rdx, %r11 sarq $63, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Sub modulus (if overflow) subq %rcx, %r8 sbbq %rdx, %r9 sbbq %rdx, %r10 sbbq %rax, %r11 movq %r8, (%rsi) movq %r9, 8(%rsi) movq %r10, 16(%rsi) movq %r11, 24(%rsi) movq 8(%rsp), %rbx movq 16(%rsp), %rbp # Add movq (%rbp), %r8 movq 8(%rbp), %r9 movq 16(%rbp), %r10 movq 24(%rbp), %rdx movq %r8, %r12 addq (%rbx), %r8 movq %r9, %r13 adcq 8(%rbx), %r9 movq %r10, %r14 adcq 16(%rbx), %r10 movq %rdx, %r15 adcq 24(%rbx), %rdx movq $-19, %rcx movq %rdx, %r11 movq $0x7fffffffffffffff, %rax sarq $63, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Sub modulus (if overflow) subq %rcx, %r8 sbbq %rdx, %r9 sbbq %rdx, %r10 sbbq %rax, %r11 # Sub subq (%rbx), %r12 movq $0x00, %rdx sbbq 8(%rbx), %r13 movq $-19, %rcx sbbq 16(%rbx), %r14 movq $0x7fffffffffffffff, %rax sbbq 24(%rbx), %r15 sbbq $0x00, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Add modulus (if underflow) addq %rcx, %r12 adcq %rdx, %r13 adcq %rdx, %r14 adcq %rax, %r15 movq %r8, (%rbx) movq %r9, 8(%rbx) movq %r10, 16(%rbx) movq %r11, 24(%rbx) movq %r12, (%rdi) movq %r13, 8(%rdi) movq %r14, 16(%rdi) movq %r15, 24(%rdi) movq 24(%rsp), %rdi # Add movq (%rsi), %r8 movq 8(%rsi), %r9 movq 16(%rsi), %r10 movq 24(%rsi), %rdx movq %r8, %r12 addq (%rdi), %r8 movq %r9, %r13 adcq 8(%rdi), %r9 movq %r10, %r14 adcq 16(%rdi), %r10 movq %rdx, %r15 adcq 24(%rdi), %rdx movq $-19, %rcx movq %rdx, %r11 movq $0x7fffffffffffffff, %rax sarq $63, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Sub modulus (if overflow) subq %rcx, %r8 sbbq %rdx, %r9 sbbq %rdx, %r10 sbbq %rax, %r11 # Sub subq (%rdi), %r12 movq $0x00, %rdx sbbq 8(%rdi), %r13 movq $-19, %rcx sbbq 16(%rdi), %r14 movq $0x7fffffffffffffff, %rax sbbq 24(%rdi), %r15 sbbq $0x00, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Add modulus (if underflow) addq %rcx, %r12 adcq %rdx, %r13 adcq %rdx, %r14 adcq %rax, %r15 movq %r8, (%rbp) movq %r9, 8(%rbp) movq %r10, 16(%rbp) movq %r11, 24(%rbp) movq %r12, (%rdi) movq %r13, 8(%rdi) movq %r14, 16(%rdi) movq %r15, 24(%rdi) addq $0x50, %rsp popq %r15 popq %r14 popq %r13 popq %r12 popq %rbp popq %rbx repz retq #ifndef __APPLE__ .size fe_ge_add_avx2,.-fe_ge_add_avx2 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_ge_sub_avx2 .type fe_ge_sub_avx2,@function .align 16 fe_ge_sub_avx2: #else .section __TEXT,__text .globl _fe_ge_sub_avx2 .p2align 4 _fe_ge_sub_avx2: #endif /* __APPLE__ */ pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 subq $0x50, %rsp movq %rdi, (%rsp) movq %rsi, 8(%rsp) movq %rdx, 16(%rsp) movq 
%rcx, 24(%rsp) movq %r8, 32(%rsp) movq %r9, 40(%rsp) movq 8(%rsp), %rsi movq 40(%rsp), %rbx movq 32(%rsp), %rbp # Add movq (%rbx), %r8 movq 8(%rbx), %r9 movq 16(%rbx), %r10 movq 24(%rbx), %rdx movq %r8, %r12 addq (%rbp), %r8 movq %r9, %r13 adcq 8(%rbp), %r9 movq %r10, %r14 adcq 16(%rbp), %r10 movq %rdx, %r15 adcq 24(%rbp), %rdx movq $-19, %rcx movq %rdx, %r11 movq $0x7fffffffffffffff, %rax sarq $63, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Sub modulus (if overflow) subq %rcx, %r8 sbbq %rdx, %r9 sbbq %rdx, %r10 sbbq %rax, %r11 # Sub subq (%rbp), %r12 movq $0x00, %rdx sbbq 8(%rbp), %r13 movq $-19, %rcx sbbq 16(%rbp), %r14 movq $0x7fffffffffffffff, %rax sbbq 24(%rbp), %r15 sbbq $0x00, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Add modulus (if underflow) addq %rcx, %r12 adcq %rdx, %r13 adcq %rdx, %r14 adcq %rax, %r15 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq %r12, (%rsi) movq %r13, 8(%rsi) movq %r14, 16(%rsi) movq %r15, 24(%rsi) movq 16(%rsp), %rbx movq 176(%rsp), %rbp # Multiply # A[0] * B[0] movq (%rbp), %rdx mulxq (%rdi), %r8, %r9 # A[2] * B[0] mulxq 16(%rdi), %r10, %r11 # A[1] * B[0] mulxq 8(%rdi), %rcx, %rax xorq %r15, %r15 adcxq %rcx, %r9 # A[1] * B[3] movq 24(%rbp), %rdx mulxq 8(%rdi), %r12, %r13 adcxq %rax, %r10 # A[0] * B[1] movq 8(%rbp), %rdx mulxq (%rdi), %rcx, %rax adoxq %rcx, %r9 # A[2] * B[1] mulxq 16(%rdi), %rcx, %r14 adoxq %rax, %r10 adcxq %rcx, %r11 # A[1] * B[2] movq 16(%rbp), %rdx mulxq 8(%rdi), %rcx, %rax adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq %rax, %r12 # A[0] * B[2] mulxq (%rdi), %rcx, %rax adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 8(%rbp), %rdx mulxq 8(%rdi), %rdx, %rcx adcxq %rax, %r11 adoxq %rdx, %r10 # A[3] * B[1] movq 8(%rbp), %rdx adoxq %rcx, %r11 mulxq 24(%rdi), %rcx, %rax adcxq %rcx, %r12 # A[2] * B[2] movq 16(%rbp), %rdx mulxq 16(%rdi), %rdx, %rcx adcxq %rax, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 24(%rbp), %rdx adoxq %rcx, %r13 mulxq 24(%rdi), %rcx, %rax adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq (%rdi), %rdx, %rcx adcxq %rax, %r15 xorq %rax, %rax adcxq %rdx, %r11 # A[3] * B[0] movq (%rbp), %rdx adcxq %rcx, %r12 mulxq 24(%rdi), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[2] * B[3] movq 24(%rbp), %rdx mulxq 16(%rdi), %rdx, %rcx adcxq %rdx, %r13 # A[3] * B[2] movq 16(%rbp), %rdx adcxq %rcx, %r14 mulxq 24(%rdi), %rcx, %rdx adcxq %rax, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rax, %r15 # Reduce movq $0x7fffffffffffffff, %rax # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rax, %r11 # Multiply top half by 19 movq $19, %rdx xorq %rax, %rax mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rax, %rdx adcxq %rax, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, %rax imulq $19, %rdx, %rcx andq %rax, %r11 addq %rcx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rax, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rbx) movq %r9, 8(%rbx) movq %r10, 16(%rbx) movq %r11, 24(%rbx) movq 168(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rdx mulxq (%rsi), %r8, %r9 # A[2] * B[0] mulxq 16(%rsi), %r10, %r11 # A[1] * B[0] mulxq 8(%rsi), 
%rcx, %rax xorq %r15, %r15 adcxq %rcx, %r9 # A[1] * B[3] movq 24(%rbx), %rdx mulxq 8(%rsi), %r12, %r13 adcxq %rax, %r10 # A[0] * B[1] movq 8(%rbx), %rdx mulxq (%rsi), %rcx, %rax adoxq %rcx, %r9 # A[2] * B[1] mulxq 16(%rsi), %rcx, %r14 adoxq %rax, %r10 adcxq %rcx, %r11 # A[1] * B[2] movq 16(%rbx), %rdx mulxq 8(%rsi), %rcx, %rax adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq %rax, %r12 # A[0] * B[2] mulxq (%rsi), %rcx, %rax adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 8(%rbx), %rdx mulxq 8(%rsi), %rdx, %rcx adcxq %rax, %r11 adoxq %rdx, %r10 # A[3] * B[1] movq 8(%rbx), %rdx adoxq %rcx, %r11 mulxq 24(%rsi), %rcx, %rax adcxq %rcx, %r12 # A[2] * B[2] movq 16(%rbx), %rdx mulxq 16(%rsi), %rdx, %rcx adcxq %rax, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 24(%rbx), %rdx adoxq %rcx, %r13 mulxq 24(%rsi), %rcx, %rax adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq (%rsi), %rdx, %rcx adcxq %rax, %r15 xorq %rax, %rax adcxq %rdx, %r11 # A[3] * B[0] movq (%rbx), %rdx adcxq %rcx, %r12 mulxq 24(%rsi), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[2] * B[3] movq 24(%rbx), %rdx mulxq 16(%rsi), %rdx, %rcx adcxq %rdx, %r13 # A[3] * B[2] movq 16(%rbx), %rdx adcxq %rcx, %r14 mulxq 24(%rsi), %rcx, %rdx adcxq %rax, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rax, %r15 # Reduce movq $0x7fffffffffffffff, %rax # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rax, %r11 # Multiply top half by 19 movq $19, %rdx xorq %rax, %rax mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rax, %rdx adcxq %rax, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, %rax imulq $19, %rdx, %rcx andq %rax, %r11 addq %rcx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rax, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rsi) movq %r9, 8(%rsi) movq %r10, 16(%rsi) movq %r11, 24(%rsi) movq 24(%rsp), %rsi movq 160(%rsp), %rbx movq 144(%rsp), %rbp # Multiply # A[0] * B[0] movq (%rbp), %rdx mulxq (%rbx), %r8, %r9 # A[2] * B[0] mulxq 16(%rbx), %r10, %r11 # A[1] * B[0] mulxq 8(%rbx), %rcx, %rax xorq %r15, %r15 adcxq %rcx, %r9 # A[1] * B[3] movq 24(%rbp), %rdx mulxq 8(%rbx), %r12, %r13 adcxq %rax, %r10 # A[0] * B[1] movq 8(%rbp), %rdx mulxq (%rbx), %rcx, %rax adoxq %rcx, %r9 # A[2] * B[1] mulxq 16(%rbx), %rcx, %r14 adoxq %rax, %r10 adcxq %rcx, %r11 # A[1] * B[2] movq 16(%rbp), %rdx mulxq 8(%rbx), %rcx, %rax adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq %rax, %r12 # A[0] * B[2] mulxq (%rbx), %rcx, %rax adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 8(%rbp), %rdx mulxq 8(%rbx), %rdx, %rcx adcxq %rax, %r11 adoxq %rdx, %r10 # A[3] * B[1] movq 8(%rbp), %rdx adoxq %rcx, %r11 mulxq 24(%rbx), %rcx, %rax adcxq %rcx, %r12 # A[2] * B[2] movq 16(%rbp), %rdx mulxq 16(%rbx), %rdx, %rcx adcxq %rax, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 24(%rbp), %rdx adoxq %rcx, %r13 mulxq 24(%rbx), %rcx, %rax adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq (%rbx), %rdx, %rcx adcxq %rax, %r15 xorq %rax, %rax adcxq %rdx, %r11 # A[3] * B[0] movq (%rbp), %rdx adcxq %rcx, %r12 mulxq 24(%rbx), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[2] * B[3] movq 24(%rbp), %rdx mulxq 16(%rbx), %rdx, %rcx adcxq 
%rdx, %r13 # A[3] * B[2] movq 16(%rbp), %rdx adcxq %rcx, %r14 mulxq 24(%rbx), %rcx, %rdx adcxq %rax, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rax, %r15 # Reduce movq $0x7fffffffffffffff, %rax # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rax, %r11 # Multiply top half by 19 movq $19, %rdx xorq %rax, %rax mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rax, %rdx adcxq %rax, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, %rax imulq $19, %rdx, %rcx andq %rax, %r11 addq %rcx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rax, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rsi) movq %r9, 8(%rsi) movq %r10, 16(%rsi) movq %r11, 24(%rsi) movq 136(%rsp), %rsi movq 152(%rsp), %rbx # Multiply # A[0] * B[0] movq (%rbx), %rdx mulxq (%rsi), %r8, %r9 # A[2] * B[0] mulxq 16(%rsi), %r10, %r11 # A[1] * B[0] mulxq 8(%rsi), %rcx, %rax xorq %r15, %r15 adcxq %rcx, %r9 # A[1] * B[3] movq 24(%rbx), %rdx mulxq 8(%rsi), %r12, %r13 adcxq %rax, %r10 # A[0] * B[1] movq 8(%rbx), %rdx mulxq (%rsi), %rcx, %rax adoxq %rcx, %r9 # A[2] * B[1] mulxq 16(%rsi), %rcx, %r14 adoxq %rax, %r10 adcxq %rcx, %r11 # A[1] * B[2] movq 16(%rbx), %rdx mulxq 8(%rsi), %rcx, %rax adcxq %r14, %r12 adoxq %rcx, %r11 adcxq %r15, %r13 adoxq %rax, %r12 # A[0] * B[2] mulxq (%rsi), %rcx, %rax adoxq %r15, %r13 xorq %r14, %r14 adcxq %rcx, %r10 # A[1] * B[1] movq 8(%rbx), %rdx mulxq 8(%rsi), %rdx, %rcx adcxq %rax, %r11 adoxq %rdx, %r10 # A[3] * B[1] movq 8(%rbx), %rdx adoxq %rcx, %r11 mulxq 24(%rsi), %rcx, %rax adcxq %rcx, %r12 # A[2] * B[2] movq 16(%rbx), %rdx mulxq 16(%rsi), %rdx, %rcx adcxq %rax, %r13 adoxq %rdx, %r12 # A[3] * B[3] movq 24(%rbx), %rdx adoxq %rcx, %r13 mulxq 24(%rsi), %rcx, %rax adoxq %r15, %r14 adcxq %rcx, %r14 # A[0] * B[3] mulxq (%rsi), %rdx, %rcx adcxq %rax, %r15 xorq %rax, %rax adcxq %rdx, %r11 # A[3] * B[0] movq (%rbx), %rdx adcxq %rcx, %r12 mulxq 24(%rsi), %rdx, %rcx adoxq %rdx, %r11 adoxq %rcx, %r12 # A[2] * B[3] movq 24(%rbx), %rdx mulxq 16(%rsi), %rdx, %rcx adcxq %rdx, %r13 # A[3] * B[2] movq 16(%rbx), %rdx adcxq %rcx, %r14 mulxq 24(%rsi), %rcx, %rdx adcxq %rax, %r15 adoxq %rcx, %r13 adoxq %rdx, %r14 adoxq %rax, %r15 # Reduce movq $0x7fffffffffffffff, %rax # Move top half into t4-t7 and remove top bit from t3 shldq $0x01, %r14, %r15 shldq $0x01, %r13, %r14 shldq $0x01, %r12, %r13 shldq $0x01, %r11, %r12 andq %rax, %r11 # Multiply top half by 19 movq $19, %rdx xorq %rax, %rax mulxq %r12, %rcx, %r12 adcxq %rcx, %r8 adoxq %r12, %r9 mulxq %r13, %rcx, %r13 adcxq %rcx, %r9 adoxq %r13, %r10 mulxq %r14, %rcx, %r14 adcxq %rcx, %r10 adoxq %r14, %r11 mulxq %r15, %r15, %rdx adcxq %r15, %r11 adoxq %rax, %rdx adcxq %rax, %rdx # Overflow shldq $0x01, %r11, %rdx movq $0x7fffffffffffffff, %rax imulq $19, %rdx, %rcx andq %rax, %r11 addq %rcx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Reduce if top bit set movq %r11, %rdx sarq $63, %rdx andq $19, %rdx andq %rax, %r11 addq %rdx, %r8 adcq $0x00, %r9 adcq $0x00, %r10 adcq $0x00, %r11 # Store movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) leaq 48(%rsp), %rsi # Double movq (%rdi), %r8 movq 8(%rdi), %r9 addq %r8, %r8 movq 16(%rdi), %r10 
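# Note (added comment): this Double block computes 2*a with an
# add-with-carry chain and then reduces in constant time: sarq $63 smears
# the result's top bit into an all-ones/zero mask that selects the limbs of
# p = 2^255 - 19 (-19, -1, -1, 0x7fffffffffffffff) for the conditional
# subtract, so no data-dependent branch is needed.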
adcq %r9, %r9 movq 24(%rdi), %rdx adcq %r10, %r10 movq $-19, %rcx adcq %rdx, %rdx movq $0x7fffffffffffffff, %rax movq %rdx, %r11 sarq $63, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Sub modulus (if overflow) subq %rcx, %r8 sbbq %rdx, %r9 sbbq %rdx, %r10 sbbq %rax, %r11 movq %r8, (%rsi) movq %r9, 8(%rsi) movq %r10, 16(%rsi) movq %r11, 24(%rsi) movq 8(%rsp), %rbx movq 16(%rsp), %rbp # Add movq (%rbp), %r8 movq 8(%rbp), %r9 movq 16(%rbp), %r10 movq 24(%rbp), %rdx movq %r8, %r12 addq (%rbx), %r8 movq %r9, %r13 adcq 8(%rbx), %r9 movq %r10, %r14 adcq 16(%rbx), %r10 movq %rdx, %r15 adcq 24(%rbx), %rdx movq $-19, %rcx movq %rdx, %r11 movq $0x7fffffffffffffff, %rax sarq $63, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Sub modulus (if overflow) subq %rcx, %r8 sbbq %rdx, %r9 sbbq %rdx, %r10 sbbq %rax, %r11 # Sub subq (%rbx), %r12 movq $0x00, %rdx sbbq 8(%rbx), %r13 movq $-19, %rcx sbbq 16(%rbx), %r14 movq $0x7fffffffffffffff, %rax sbbq 24(%rbx), %r15 sbbq $0x00, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Add modulus (if underflow) addq %rcx, %r12 adcq %rdx, %r13 adcq %rdx, %r14 adcq %rax, %r15 movq %r8, (%rbx) movq %r9, 8(%rbx) movq %r10, 16(%rbx) movq %r11, 24(%rbx) movq %r12, (%rdi) movq %r13, 8(%rdi) movq %r14, 16(%rdi) movq %r15, 24(%rdi) movq 24(%rsp), %rdi # Add movq (%rsi), %r8 movq 8(%rsi), %r9 movq 16(%rsi), %r10 movq 24(%rsi), %rdx movq %r8, %r12 addq (%rdi), %r8 movq %r9, %r13 adcq 8(%rdi), %r9 movq %r10, %r14 adcq 16(%rdi), %r10 movq %rdx, %r15 adcq 24(%rdi), %rdx movq $-19, %rcx movq %rdx, %r11 movq $0x7fffffffffffffff, %rax sarq $63, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Sub modulus (if overflow) subq %rcx, %r8 sbbq %rdx, %r9 sbbq %rdx, %r10 sbbq %rax, %r11 # Sub subq (%rdi), %r12 movq $0x00, %rdx sbbq 8(%rdi), %r13 movq $-19, %rcx sbbq 16(%rdi), %r14 movq $0x7fffffffffffffff, %rax sbbq 24(%rdi), %r15 sbbq $0x00, %rdx # Mask the modulus andq %rdx, %rcx andq %rdx, %rax # Add modulus (if underflow) addq %rcx, %r12 adcq %rdx, %r13 adcq %rdx, %r14 adcq %rax, %r15 movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq %r12, (%rbp) movq %r13, 8(%rbp) movq %r14, 16(%rbp) movq %r15, 24(%rbp) addq $0x50, %rsp popq %r15 popq %r14 popq %r13 popq %r12 popq %rbp popq %rbx repz retq #ifndef __APPLE__ .size fe_ge_sub_avx2,.-fe_ge_sub_avx2 #endif /* __APPLE__ */ #endif /* HAVE_INTEL_AVX2 */ #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif
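# Note (added comment): the empty .note.GNU-stack section above tells
# GNU/Linux linkers that this object file does not require an executable
# stack.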