/* armv8-curve25519
 *
 * Copyright (C) 2006-2023 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */
#ifdef HAVE_CONFIG_H
    #include <config.h>
#endif /* HAVE_CONFIG_H */
#include <wolfssl/wolfcrypt/settings.h>

/* Generated using (from wolfssl):
 *   cd ../scripts
 *   ruby ./x25519/x25519.rb arm64 ../wolfssl/wolfcrypt/src/port/arm/armv8-curve25519.S
 */
#ifdef WOLFSSL_ARMASM
#ifdef __aarch64__
#ifndef WOLFSSL_ARMASM_INLINE
#if defined(HAVE_CURVE25519) || defined(HAVE_ED25519)
#if !defined(CURVE25519_SMALL) || !defined(ED25519_SMALL)
#ifndef __APPLE__
.text
.globl fe_init
.type fe_init,@function
.align 2
fe_init:
#else
.section __TEXT,__text
.globl _fe_init
.p2align 2
_fe_init:
#endif /* __APPLE__ */
    ret
#ifndef __APPLE__
.size fe_init,.-fe_init
#endif /* __APPLE__ */
#ifdef HAVE_ED25519
#ifndef __APPLE__
.text
.globl fe_frombytes
.type fe_frombytes,@function
.align 2
fe_frombytes:
#else
.section __TEXT,__text
.globl _fe_frombytes
.p2align 2
_fe_frombytes:
#endif /* __APPLE__ */
    ldp x2, x3, [x1]
    ldp x4, x5, [x1, #16]
    and x5, x5, #0x7fffffffffffffff
    stp x2, x3, [x0]
    stp x4, x5, [x0, #16]
    ret
#ifndef __APPLE__
.size fe_frombytes,.-fe_frombytes
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_tobytes
.type fe_tobytes,@function
.align 2
fe_tobytes:
#else
.section __TEXT,__text
.globl _fe_tobytes
.p2align 2
_fe_tobytes:
#endif /* __APPLE__ */
    mov x7, #19
    ldp x2, x3, [x1]
    ldp x4, x5, [x1, #16]
    adds x6, x2, x7
    adcs x6, x3, xzr
    adcs x6, x4, xzr
    adc x6, x5, xzr
    and x6, x7, x6, asr 63
    adds x2, x2, x6
    adcs x3, x3, xzr
    adcs x4, x4, xzr
    adc x5, x5, xzr
    and x5, x5, #0x7fffffffffffffff
    stp x2, x3, [x0]
    stp x4, x5, [x0, #16]
    ret
#ifndef __APPLE__
.size fe_tobytes,.-fe_tobytes
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_1
.type fe_1,@function
.align 2
fe_1:
#else
.section __TEXT,__text
.globl _fe_1
.p2align 2
_fe_1:
#endif /* __APPLE__ */
    # Set one
    mov x1, #1
    stp x1, xzr, [x0]
    stp xzr, xzr, [x0, #16]
    ret
#ifndef __APPLE__
.size fe_1,.-fe_1
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_0
.type fe_0,@function
.align 2
fe_0:
#else
.section __TEXT,__text
.globl _fe_0
.p2align 2
_fe_0:
#endif /* __APPLE__ */
    # Set zero
    stp xzr, xzr, [x0]
    stp xzr, xzr, [x0, #16]
    ret
#ifndef __APPLE__
.size fe_0,.-fe_0
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_copy
.type fe_copy,@function
.align 2
fe_copy:
#else
.section __TEXT,__text
.globl _fe_copy
.p2align 2
_fe_copy:
#endif /* __APPLE__ */
    # Copy
    ldp x2, x3, [x1]
    ldp x4, x5, [x1, #16]
    stp x2, x3, [x0]
    stp x4, x5, [x0, #16]
    ret
#ifndef __APPLE__
.size fe_copy,.-fe_copy
#endif /* __APPLE__ */
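/* Note on representation (editor's comment, not part of the generated
 * output): the fe_* routines in this file operate on GF(2^255 - 19)
 * elements stored as four 64-bit little-endian limbs,
 * a = a[0] + 2^64*a[1] + 2^128*a[2] + 2^192*a[3].
 * A rough C sketch of the final reduction that fe_tobytes above performs,
 * assuming a little-endian target, GCC/Clang __int128, and made-up names
 * (illustrative only, not how the routine is actually built):
 *
 *     #include <stdint.h>
 *     static void fe_tobytes_sketch(uint64_t r[4], const uint64_t a[4])
 *     {
 *         unsigned __int128 c = (unsigned __int128)a[0] + 19;   // probe a + 19
 *         c = (c >> 64) + a[1];
 *         c = (c >> 64) + a[2];
 *         c = (c >> 64) + a[3];
 *         // bit 255 of (a + 19) set  =>  a >= 2^255 - 19, so fold p out once
 *         uint64_t add = 19 & (uint64_t)((int64_t)(uint64_t)c >> 63);
 *         c = (unsigned __int128)a[0] + add;  r[0] = (uint64_t)c;
 *         c = (c >> 64) + a[1];               r[1] = (uint64_t)c;
 *         c = (c >> 64) + a[2];               r[2] = (uint64_t)c;
 *         c = (c >> 64) + a[3];               r[3] = (uint64_t)c & 0x7fffffffffffffffULL;
 *     }
 */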
#ifndef __APPLE__
.text
.globl fe_sub
.type fe_sub,@function
.align 2
fe_sub:
#else
.section __TEXT,__text
.globl _fe_sub
.p2align 2
_fe_sub:
#endif /* __APPLE__ */
    # Sub
    ldp x3, x4, [x1]
    ldp x5, x6, [x1, #16]
    ldp x7, x8, [x2]
    ldp x9, x10, [x2, #16]
    subs x3, x3, x7
    sbcs x4, x4, x8
    sbcs x5, x5, x9
    sbcs x6, x6, x10
    csetm x11, cc
    mov x12, #-19
    # Mask the modulus
    extr x11, x11, x6, #63
    mul x12, x11, x12
    # Add modulus (if underflow)
    subs x3, x3, x12
    sbcs x4, x4, xzr
    and x6, x6, #0x7fffffffffffffff
    sbcs x5, x5, xzr
    sbc x6, x6, xzr
    stp x3, x4, [x0]
    stp x5, x6, [x0, #16]
    ret
#ifndef __APPLE__
.size fe_sub,.-fe_sub
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_add
.type fe_add,@function
.align 2
fe_add:
#else
.section __TEXT,__text
.globl _fe_add
.p2align 2
_fe_add:
#endif /* __APPLE__ */
    # Add
    ldp x3, x4, [x1]
    ldp x5, x6, [x1, #16]
    ldp x7, x8, [x2]
    ldp x9, x10, [x2, #16]
    adds x3, x3, x7
    adcs x4, x4, x8
    adcs x5, x5, x9
    adcs x6, x6, x10
    cset x11, cs
    mov x12, #19
    # Mask the modulus
    extr x11, x11, x6, #63
    mul x12, x11, x12
    # Sub modulus (if overflow)
    adds x3, x3, x12
    adcs x4, x4, xzr
    and x6, x6, #0x7fffffffffffffff
    adcs x5, x5, xzr
    adc x6, x6, xzr
    stp x3, x4, [x0]
    stp x5, x6, [x0, #16]
    ret
#ifndef __APPLE__
.size fe_add,.-fe_add
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_neg
.type fe_neg,@function
.align 2
fe_neg:
#else
.section __TEXT,__text
.globl _fe_neg
.p2align 2
_fe_neg:
#endif /* __APPLE__ */
    ldp x2, x3, [x1]
    ldp x4, x5, [x1, #16]
    mov x6, #-19
    mov x7, #-1
    mov x8, #-1
    mov x9, #0x7fffffffffffffff
    subs x6, x6, x2
    sbcs x7, x7, x3
    sbcs x8, x8, x4
    sbc x9, x9, x5
    stp x6, x7, [x0]
    stp x8, x9, [x0, #16]
    ret
#ifndef __APPLE__
.size fe_neg,.-fe_neg
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_isnonzero
.type fe_isnonzero,@function
.align 2
fe_isnonzero:
#else
.section __TEXT,__text
.globl _fe_isnonzero
.p2align 2
_fe_isnonzero:
#endif /* __APPLE__ */
    mov x6, #19
    ldp x1, x2, [x0]
    ldp x3, x4, [x0, #16]
    adds x5, x1, x6
    adcs x5, x2, xzr
    adcs x5, x3, xzr
    adc x5, x4, xzr
    and x5, x6, x5, asr 63
    adds x1, x1, x5
    adcs x2, x2, xzr
    adcs x3, x3, xzr
    adc x4, x4, xzr
    and x4, x4, #0x7fffffffffffffff
    orr x0, x1, x2
    orr x3, x3, x4
    orr x0, x0, x3
    ret
#ifndef __APPLE__
.size fe_isnonzero,.-fe_isnonzero
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_isnegative
.type fe_isnegative,@function
.align 2
fe_isnegative:
#else
.section __TEXT,__text
.globl _fe_isnegative
.p2align 2
_fe_isnegative:
#endif /* __APPLE__ */
    mov x6, #19
    ldp x1, x2, [x0]
    ldp x3, x4, [x0, #16]
    adds x5, x1, x6
    adcs x5, x2, xzr
    adcs x5, x3, xzr
    adc x5, x4, xzr
    and x0, x1, #1
    eor x0, x0, x5, lsr 63
    ret
#ifndef __APPLE__
.size fe_isnegative,.-fe_isnegative
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl fe_cmov_table
.type fe_cmov_table,@function
.align 2
fe_cmov_table:
#else
.section __TEXT,__text
.globl _fe_cmov_table
.p2align 2
_fe_cmov_table:
#endif /* __APPLE__ */
    stp x29, x30, [sp, #-128]!
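/* fe_cmov_table note (editor's comment, not part of the generated output):
 * the body that follows selects one of eight precomputed entries, each
 * three field elements (12 limbs), in constant time.  The signed index in
 * w2 is replaced by its absolute value, every entry is loaded
 * unconditionally, and csel keeps the one whose position matches; an index
 * of 0 keeps the initial (1, 1, 0) entry, and a negative index swaps the
 * first two field elements and negates the third at the end.  A rough C
 * equivalent of the selection idiom, with made-up names:
 *
 *     for (int i = 1; i <= 8; i++) {
 *         uint64_t mask = (uint64_t)0 - (uint64_t)(idx_abs == i); // 0 or all-ones
 *         for (int j = 0; j < 12; j++)
 *             r[j] = (table[i - 1][j] & mask) | (r[j] & ~mask);
 *     }
 */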
add x29, sp, #0 str x17, [x29, #40] str x19, [x29, #48] stp x20, x21, [x29, #56] stp x22, x23, [x29, #72] stp x24, x25, [x29, #88] stp x26, x27, [x29, #104] str x28, [x29, #120] str x0, [x29, #16] sxtb x2, w2 sbfx x3, x2, #7, #1 eor x0, x2, x3 sub x0, x0, x3 mov x4, #1 mov x5, xzr mov x6, xzr mov x7, xzr mov x8, #1 mov x9, xzr mov x10, xzr mov x11, xzr mov x12, xzr mov x13, xzr mov x14, xzr mov x15, xzr cmp x0, #1 ldp x16, x17, [x1] ldp x19, x20, [x1, #16] ldp x21, x22, [x1, #32] ldp x23, x24, [x1, #48] ldp x25, x26, [x1, #64] ldp x27, x28, [x1, #80] csel x4, x16, x4, eq csel x5, x17, x5, eq csel x6, x19, x6, eq csel x7, x20, x7, eq csel x8, x21, x8, eq csel x9, x22, x9, eq csel x10, x23, x10, eq csel x11, x24, x11, eq csel x12, x25, x12, eq csel x13, x26, x13, eq csel x14, x27, x14, eq csel x15, x28, x15, eq cmp x0, #2 ldp x16, x17, [x1, #96] ldp x19, x20, [x1, #112] ldp x21, x22, [x1, #128] ldp x23, x24, [x1, #144] ldp x25, x26, [x1, #160] ldp x27, x28, [x1, #176] csel x4, x16, x4, eq csel x5, x17, x5, eq csel x6, x19, x6, eq csel x7, x20, x7, eq csel x8, x21, x8, eq csel x9, x22, x9, eq csel x10, x23, x10, eq csel x11, x24, x11, eq csel x12, x25, x12, eq csel x13, x26, x13, eq csel x14, x27, x14, eq csel x15, x28, x15, eq cmp x0, #3 ldp x16, x17, [x1, #192] ldp x19, x20, [x1, #208] ldp x21, x22, [x1, #224] ldp x23, x24, [x1, #240] ldp x25, x26, [x1, #256] ldp x27, x28, [x1, #272] csel x4, x16, x4, eq csel x5, x17, x5, eq csel x6, x19, x6, eq csel x7, x20, x7, eq csel x8, x21, x8, eq csel x9, x22, x9, eq csel x10, x23, x10, eq csel x11, x24, x11, eq csel x12, x25, x12, eq csel x13, x26, x13, eq csel x14, x27, x14, eq csel x15, x28, x15, eq cmp x0, #4 ldp x16, x17, [x1, #288] ldp x19, x20, [x1, #304] ldp x21, x22, [x1, #320] ldp x23, x24, [x1, #336] ldp x25, x26, [x1, #352] ldp x27, x28, [x1, #368] csel x4, x16, x4, eq csel x5, x17, x5, eq csel x6, x19, x6, eq csel x7, x20, x7, eq csel x8, x21, x8, eq csel x9, x22, x9, eq csel x10, x23, x10, eq csel x11, x24, x11, eq csel x12, x25, x12, eq csel x13, x26, x13, eq csel x14, x27, x14, eq csel x15, x28, x15, eq add x1, x1, #0x180 cmp x0, #5 ldp x16, x17, [x1] ldp x19, x20, [x1, #16] ldp x21, x22, [x1, #32] ldp x23, x24, [x1, #48] ldp x25, x26, [x1, #64] ldp x27, x28, [x1, #80] csel x4, x16, x4, eq csel x5, x17, x5, eq csel x6, x19, x6, eq csel x7, x20, x7, eq csel x8, x21, x8, eq csel x9, x22, x9, eq csel x10, x23, x10, eq csel x11, x24, x11, eq csel x12, x25, x12, eq csel x13, x26, x13, eq csel x14, x27, x14, eq csel x15, x28, x15, eq cmp x0, #6 ldp x16, x17, [x1, #96] ldp x19, x20, [x1, #112] ldp x21, x22, [x1, #128] ldp x23, x24, [x1, #144] ldp x25, x26, [x1, #160] ldp x27, x28, [x1, #176] csel x4, x16, x4, eq csel x5, x17, x5, eq csel x6, x19, x6, eq csel x7, x20, x7, eq csel x8, x21, x8, eq csel x9, x22, x9, eq csel x10, x23, x10, eq csel x11, x24, x11, eq csel x12, x25, x12, eq csel x13, x26, x13, eq csel x14, x27, x14, eq csel x15, x28, x15, eq cmp x0, #7 ldp x16, x17, [x1, #192] ldp x19, x20, [x1, #208] ldp x21, x22, [x1, #224] ldp x23, x24, [x1, #240] ldp x25, x26, [x1, #256] ldp x27, x28, [x1, #272] csel x4, x16, x4, eq csel x5, x17, x5, eq csel x6, x19, x6, eq csel x7, x20, x7, eq csel x8, x21, x8, eq csel x9, x22, x9, eq csel x10, x23, x10, eq csel x11, x24, x11, eq csel x12, x25, x12, eq csel x13, x26, x13, eq csel x14, x27, x14, eq csel x15, x28, x15, eq cmp x0, #8 ldp x16, x17, [x1, #288] ldp x19, x20, [x1, #304] ldp x21, x22, [x1, #320] ldp x23, x24, [x1, #336] ldp x25, x26, [x1, #352] ldp x27, x28, [x1, #368] csel x4, x16, 
x4, eq csel x5, x17, x5, eq csel x6, x19, x6, eq csel x7, x20, x7, eq csel x8, x21, x8, eq csel x9, x22, x9, eq csel x10, x23, x10, eq csel x11, x24, x11, eq csel x12, x25, x12, eq csel x13, x26, x13, eq csel x14, x27, x14, eq csel x15, x28, x15, eq mov x16, #-19 mov x17, #-1 mov x19, #-1 mov x20, #0x7fffffffffffffff subs x16, x16, x12 sbcs x17, x17, x13 sbcs x19, x19, x14 sbc x20, x20, x15 cmp x2, #0 mov x3, x4 csel x4, x8, x4, lt csel x8, x3, x8, lt mov x3, x5 csel x5, x9, x5, lt csel x9, x3, x9, lt mov x3, x6 csel x6, x10, x6, lt csel x10, x3, x10, lt mov x3, x7 csel x7, x11, x7, lt csel x11, x3, x11, lt csel x12, x16, x12, lt csel x13, x17, x13, lt csel x14, x19, x14, lt csel x15, x20, x15, lt ldr x0, [x29, #16] stp x4, x5, [x0] stp x6, x7, [x0, #16] stp x8, x9, [x0, #32] stp x10, x11, [x0, #48] stp x12, x13, [x0, #64] stp x14, x15, [x0, #80] ldr x17, [x29, #40] ldr x19, [x29, #48] ldp x20, x21, [x29, #56] ldp x22, x23, [x29, #72] ldp x24, x25, [x29, #88] ldp x26, x27, [x29, #104] ldr x28, [x29, #120] ldp x29, x30, [sp], #0x80 ret #ifndef __APPLE__ .size fe_cmov_table,.-fe_cmov_table #endif /* __APPLE__ */ #endif /* HAVE_ED25519 */ #ifndef __APPLE__ .text .globl fe_mul .type fe_mul,@function .align 2 fe_mul: #else .section __TEXT,__text .globl _fe_mul .p2align 2 _fe_mul: #endif /* __APPLE__ */ stp x29, x30, [sp, #-64]! add x29, sp, #0 str x17, [x29, #24] str x19, [x29, #32] stp x20, x21, [x29, #40] str x22, [x29, #56] # Multiply ldp x14, x15, [x1] ldp x16, x17, [x1, #16] ldp x19, x20, [x2] ldp x21, x22, [x2, #16] # A[0] * B[0] umulh x7, x14, x19 mul x6, x14, x19 # A[2] * B[0] umulh x9, x16, x19 mul x8, x16, x19 # A[1] * B[0] mul x3, x15, x19 adds x7, x7, x3 umulh x4, x15, x19 adcs x8, x8, x4 # A[1] * B[3] umulh x11, x15, x22 adc x9, x9, xzr mul x10, x15, x22 # A[0] * B[1] mul x3, x14, x20 adds x7, x7, x3 umulh x4, x14, x20 adcs x8, x8, x4 # A[2] * B[1] mul x3, x16, x20 adcs x9, x9, x3 umulh x4, x16, x20 adcs x10, x10, x4 adc x11, x11, xzr # A[1] * B[2] mul x3, x15, x21 adds x9, x9, x3 umulh x4, x15, x21 adcs x10, x10, x4 adcs x11, x11, xzr adc x12, xzr, xzr # A[0] * B[2] mul x3, x14, x21 adds x8, x8, x3 umulh x4, x14, x21 adcs x9, x9, x4 adcs x10, x10, xzr adcs x11, x11, xzr adc x12, x12, xzr # A[1] * B[1] mul x3, x15, x20 adds x8, x8, x3 umulh x4, x15, x20 adcs x9, x9, x4 # A[3] * B[1] mul x3, x17, x20 adcs x10, x10, x3 umulh x4, x17, x20 adcs x11, x11, x4 adc x12, x12, xzr # A[2] * B[2] mul x3, x16, x21 adds x10, x10, x3 umulh x4, x16, x21 adcs x11, x11, x4 # A[3] * B[3] mul x3, x17, x22 adcs x12, x12, x3 umulh x13, x17, x22 adc x13, x13, xzr # A[0] * B[3] mul x3, x14, x22 adds x9, x9, x3 umulh x4, x14, x22 adcs x10, x10, x4 # A[2] * B[3] mul x3, x16, x22 adcs x11, x11, x3 umulh x4, x16, x22 adcs x12, x12, x4 adc x13, x13, xzr # A[3] * B[0] mul x3, x17, x19 adds x9, x9, x3 umulh x4, x17, x19 adcs x10, x10, x4 # A[3] * B[2] mul x3, x17, x21 adcs x11, x11, x3 umulh x4, x17, x21 adcs x12, x12, x4 adc x13, x13, xzr # Reduce mov x3, #38 mul x4, x3, x13 adds x9, x9, x4 umulh x5, x3, x13 adc x5, x5, xzr mov x3, #19 extr x5, x5, x9, #63 mul x5, x5, x3 and x9, x9, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x10 adds x6, x6, x4 umulh x10, x3, x10 mul x4, x3, x11 adcs x7, x7, x4 umulh x11, x3, x11 mul x4, x3, x12 adcs x8, x8, x4 umulh x12, x3, x12 adc x9, x9, xzr # Add high product results in adds x6, x6, x5 adcs x7, x7, x10 adcs x8, x8, x11 adc x9, x9, x12 # Reduce if top bit set mov x3, #19 and x4, x3, x9, asr 63 adds x6, x6, x4 adcs x7, x7, xzr and x9, x9, #0x7fffffffffffffff adcs x8, x8, 
xzr adc x9, x9, xzr # Store stp x6, x7, [x0] stp x8, x9, [x0, #16] ldr x17, [x29, #24] ldr x19, [x29, #32] ldp x20, x21, [x29, #40] ldr x22, [x29, #56] ldp x29, x30, [sp], #0x40 ret #ifndef __APPLE__ .size fe_mul,.-fe_mul #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_sq .type fe_sq,@function .align 2 fe_sq: #else .section __TEXT,__text .globl _fe_sq .p2align 2 _fe_sq: #endif /* __APPLE__ */ # Square ldp x13, x14, [x1] ldp x15, x16, [x1, #16] # A[0] * A[1] umulh x7, x13, x14 mul x6, x13, x14 # A[0] * A[3] umulh x9, x13, x16 mul x8, x13, x16 # A[0] * A[2] mul x2, x13, x15 adds x7, x7, x2 umulh x3, x13, x15 adcs x8, x8, x3 # A[1] * A[3] mul x2, x14, x16 adcs x9, x9, x2 umulh x10, x14, x16 adc x10, x10, xzr # A[1] * A[2] mul x2, x14, x15 adds x8, x8, x2 umulh x3, x14, x15 adcs x9, x9, x3 # A[2] * A[3] mul x2, x15, x16 adcs x10, x10, x2 umulh x11, x15, x16 adc x11, x11, xzr # Double adds x6, x6, x6 adcs x7, x7, x7 adcs x8, x8, x8 adcs x9, x9, x9 adcs x10, x10, x10 adcs x11, x11, x11 adc x12, xzr, xzr # A[0] * A[0] umulh x3, x13, x13 mul x5, x13, x13 # A[1] * A[1] mul x2, x14, x14 adds x6, x6, x3 umulh x3, x14, x14 adcs x7, x7, x2 # A[2] * A[2] mul x2, x15, x15 adcs x8, x8, x3 umulh x3, x15, x15 adcs x9, x9, x2 # A[3] * A[3] mul x2, x16, x16 adcs x10, x10, x3 umulh x3, x16, x16 adcs x11, x11, x2 adc x12, x12, x3 # Reduce mov x2, #38 mul x3, x2, x12 adds x8, x8, x3 umulh x4, x2, x12 adc x4, x4, xzr mov x2, #19 extr x4, x4, x8, #63 mul x4, x4, x2 and x8, x8, #0x7fffffffffffffff mov x2, #38 mul x3, x2, x9 adds x5, x5, x3 umulh x9, x2, x9 mul x3, x2, x10 adcs x6, x6, x3 umulh x10, x2, x10 mul x3, x2, x11 adcs x7, x7, x3 umulh x11, x2, x11 adc x8, x8, xzr # Add high product results in adds x5, x5, x4 adcs x6, x6, x9 adcs x7, x7, x10 adc x8, x8, x11 # Reduce if top bit set mov x2, #19 and x3, x2, x8, asr 63 adds x5, x5, x3 adcs x6, x6, xzr and x8, x8, #0x7fffffffffffffff adcs x7, x7, xzr adc x8, x8, xzr # Store stp x5, x6, [x0] stp x7, x8, [x0, #16] ret #ifndef __APPLE__ .size fe_sq,.-fe_sq #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl fe_invert .type fe_invert,@function .align 2 fe_invert: #else .section __TEXT,__text .globl _fe_invert .p2align 2 _fe_invert: #endif /* __APPLE__ */ stp x29, x30, [sp, #-176]! 
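/* fe_invert note (editor's comment, not part of the generated output): the
 * code below raises the input to the power p - 2 = 2^255 - 21, so the
 * result is 1/z by Fermat's little theorem.  The unrolled loops
 * ("Loop: 5/10/20/10/50/100/50/5 times") are runs of in-register squarings,
 * each followed by a fe_mul, implementing the usual Curve25519 addition
 * chain:
 *
 *     z^(2^5-1) -> z^(2^10-1) -> z^(2^20-1) -> z^(2^40-1) -> z^(2^50-1)
 *       -> z^(2^100-1) -> z^(2^200-1) -> z^(2^250-1) -> z^(2^255-21)
 *
 * where each step squares the previous value the stated number of times and
 * multiplies by an earlier power of z.
 */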
add x29, sp, #0 str x17, [x29, #160] str x20, [x29, #168] # Invert str x0, [x29, #144] str x1, [x29, #152] add x0, x29, #16 #ifndef NDEBUG ldr x1, [x29, #152] #endif /* !NDEBUG */ #ifndef __APPLE__ bl fe_sq #else bl _fe_sq #endif /* __APPLE__ */ add x0, x29, #48 add x1, x29, #16 #ifndef __APPLE__ bl fe_sq #else bl _fe_sq #endif /* __APPLE__ */ #ifndef NDEBUG add x0, x29, #48 #endif /* !NDEBUG */ add x1, x29, #48 #ifndef __APPLE__ bl fe_sq #else bl _fe_sq #endif /* __APPLE__ */ #ifndef NDEBUG add x0, x29, #48 #endif /* !NDEBUG */ ldr x1, [x29, #152] add x2, x29, #48 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ add x0, x29, #16 add x1, x29, #16 add x2, x29, #48 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ add x0, x29, #0x50 #ifndef NDEBUG add x1, x29, #16 #endif /* !NDEBUG */ #ifndef __APPLE__ bl fe_sq #else bl _fe_sq #endif /* __APPLE__ */ add x0, x29, #48 add x1, x29, #48 add x2, x29, #0x50 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ # Loop: 5 times mov x20, #5 ldp x6, x7, [x29, #48] ldp x8, x9, [x29, #64] L_fe_invert1: # Square # A[0] * A[1] umulh x12, x6, x7 mul x11, x6, x7 # A[0] * A[3] umulh x14, x6, x9 mul x13, x6, x9 # A[0] * A[2] mul x3, x6, x8 adds x12, x12, x3 umulh x4, x6, x8 adcs x13, x13, x4 # A[1] * A[3] mul x3, x7, x9 adcs x14, x14, x3 umulh x15, x7, x9 adc x15, x15, xzr # A[1] * A[2] mul x3, x7, x8 adds x13, x13, x3 umulh x4, x7, x8 adcs x14, x14, x4 # A[2] * A[3] mul x3, x8, x9 adcs x15, x15, x3 umulh x16, x8, x9 adc x16, x16, xzr # Double adds x11, x11, x11 adcs x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 adcs x15, x15, x15 adcs x16, x16, x16 adc x17, xzr, xzr # A[0] * A[0] umulh x4, x6, x6 mul x10, x6, x6 # A[1] * A[1] mul x3, x7, x7 adds x11, x11, x4 umulh x4, x7, x7 adcs x12, x12, x3 # A[2] * A[2] mul x3, x8, x8 adcs x13, x13, x4 umulh x4, x8, x8 adcs x14, x14, x3 # A[3] * A[3] mul x3, x9, x9 adcs x15, x15, x4 umulh x4, x9, x9 adcs x16, x16, x3 adc x17, x17, x4 # Reduce mov x3, #38 mul x4, x3, x17 adds x13, x13, x4 umulh x5, x3, x17 adc x5, x5, xzr mov x3, #19 extr x5, x5, x13, #63 mul x5, x5, x3 and x13, x13, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x14 adds x10, x10, x4 umulh x14, x3, x14 mul x4, x3, x15 adcs x11, x11, x4 umulh x15, x3, x15 mul x4, x3, x16 adcs x12, x12, x4 umulh x16, x3, x16 adc x13, x13, xzr # Add high product results in adds x6, x10, x5 adcs x7, x11, x14 adcs x8, x12, x15 adc x9, x13, x16 subs x20, x20, #1 bne L_fe_invert1 # Store stp x6, x7, [x29, #80] stp x8, x9, [x29, #96] #ifndef NDEBUG add x0, x29, #48 #endif /* !NDEBUG */ add x1, x29, #0x50 add x2, x29, #48 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ # Loop: 10 times mov x20, #10 ldp x6, x7, [x29, #48] ldp x8, x9, [x29, #64] L_fe_invert2: # Square # A[0] * A[1] umulh x12, x6, x7 mul x11, x6, x7 # A[0] * A[3] umulh x14, x6, x9 mul x13, x6, x9 # A[0] * A[2] mul x3, x6, x8 adds x12, x12, x3 umulh x4, x6, x8 adcs x13, x13, x4 # A[1] * A[3] mul x3, x7, x9 adcs x14, x14, x3 umulh x15, x7, x9 adc x15, x15, xzr # A[1] * A[2] mul x3, x7, x8 adds x13, x13, x3 umulh x4, x7, x8 adcs x14, x14, x4 # A[2] * A[3] mul x3, x8, x9 adcs x15, x15, x3 umulh x16, x8, x9 adc x16, x16, xzr # Double adds x11, x11, x11 adcs x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 adcs x15, x15, x15 adcs x16, x16, x16 adc x17, xzr, xzr # A[0] * A[0] umulh x4, x6, x6 mul x10, x6, x6 # A[1] * A[1] mul x3, x7, x7 adds x11, x11, x4 umulh x4, x7, x7 adcs x12, x12, x3 # A[2] * A[2] mul x3, x8, x8 adcs x13, x13, x4 umulh x4, x8, 
x8 adcs x14, x14, x3 # A[3] * A[3] mul x3, x9, x9 adcs x15, x15, x4 umulh x4, x9, x9 adcs x16, x16, x3 adc x17, x17, x4 # Reduce mov x3, #38 mul x4, x3, x17 adds x13, x13, x4 umulh x5, x3, x17 adc x5, x5, xzr mov x3, #19 extr x5, x5, x13, #63 mul x5, x5, x3 and x13, x13, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x14 adds x10, x10, x4 umulh x14, x3, x14 mul x4, x3, x15 adcs x11, x11, x4 umulh x15, x3, x15 mul x4, x3, x16 adcs x12, x12, x4 umulh x16, x3, x16 adc x13, x13, xzr # Add high product results in adds x6, x10, x5 adcs x7, x11, x14 adcs x8, x12, x15 adc x9, x13, x16 subs x20, x20, #1 bne L_fe_invert2 # Store stp x6, x7, [x29, #80] stp x8, x9, [x29, #96] add x0, x29, #0x50 #ifndef NDEBUG add x1, x29, #0x50 #endif /* !NDEBUG */ add x2, x29, #48 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ # Loop: 20 times mov x20, #20 ldp x6, x7, [x29, #80] ldp x8, x9, [x29, #96] L_fe_invert3: # Square # A[0] * A[1] umulh x12, x6, x7 mul x11, x6, x7 # A[0] * A[3] umulh x14, x6, x9 mul x13, x6, x9 # A[0] * A[2] mul x3, x6, x8 adds x12, x12, x3 umulh x4, x6, x8 adcs x13, x13, x4 # A[1] * A[3] mul x3, x7, x9 adcs x14, x14, x3 umulh x15, x7, x9 adc x15, x15, xzr # A[1] * A[2] mul x3, x7, x8 adds x13, x13, x3 umulh x4, x7, x8 adcs x14, x14, x4 # A[2] * A[3] mul x3, x8, x9 adcs x15, x15, x3 umulh x16, x8, x9 adc x16, x16, xzr # Double adds x11, x11, x11 adcs x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 adcs x15, x15, x15 adcs x16, x16, x16 adc x17, xzr, xzr # A[0] * A[0] umulh x4, x6, x6 mul x10, x6, x6 # A[1] * A[1] mul x3, x7, x7 adds x11, x11, x4 umulh x4, x7, x7 adcs x12, x12, x3 # A[2] * A[2] mul x3, x8, x8 adcs x13, x13, x4 umulh x4, x8, x8 adcs x14, x14, x3 # A[3] * A[3] mul x3, x9, x9 adcs x15, x15, x4 umulh x4, x9, x9 adcs x16, x16, x3 adc x17, x17, x4 # Reduce mov x3, #38 mul x4, x3, x17 adds x13, x13, x4 umulh x5, x3, x17 adc x5, x5, xzr mov x3, #19 extr x5, x5, x13, #63 mul x5, x5, x3 and x13, x13, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x14 adds x10, x10, x4 umulh x14, x3, x14 mul x4, x3, x15 adcs x11, x11, x4 umulh x15, x3, x15 mul x4, x3, x16 adcs x12, x12, x4 umulh x16, x3, x16 adc x13, x13, xzr # Add high product results in adds x6, x10, x5 adcs x7, x11, x14 adcs x8, x12, x15 adc x9, x13, x16 subs x20, x20, #1 bne L_fe_invert3 # Store stp x6, x7, [x29, #112] stp x8, x9, [x29, #128] #ifndef NDEBUG add x0, x29, #0x50 #endif /* !NDEBUG */ add x1, x29, #0x70 add x2, x29, #0x50 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ # Loop: 10 times mov x20, #10 ldp x6, x7, [x29, #80] ldp x8, x9, [x29, #96] L_fe_invert4: # Square # A[0] * A[1] umulh x12, x6, x7 mul x11, x6, x7 # A[0] * A[3] umulh x14, x6, x9 mul x13, x6, x9 # A[0] * A[2] mul x3, x6, x8 adds x12, x12, x3 umulh x4, x6, x8 adcs x13, x13, x4 # A[1] * A[3] mul x3, x7, x9 adcs x14, x14, x3 umulh x15, x7, x9 adc x15, x15, xzr # A[1] * A[2] mul x3, x7, x8 adds x13, x13, x3 umulh x4, x7, x8 adcs x14, x14, x4 # A[2] * A[3] mul x3, x8, x9 adcs x15, x15, x3 umulh x16, x8, x9 adc x16, x16, xzr # Double adds x11, x11, x11 adcs x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 adcs x15, x15, x15 adcs x16, x16, x16 adc x17, xzr, xzr # A[0] * A[0] umulh x4, x6, x6 mul x10, x6, x6 # A[1] * A[1] mul x3, x7, x7 adds x11, x11, x4 umulh x4, x7, x7 adcs x12, x12, x3 # A[2] * A[2] mul x3, x8, x8 adcs x13, x13, x4 umulh x4, x8, x8 adcs x14, x14, x3 # A[3] * A[3] mul x3, x9, x9 adcs x15, x15, x4 umulh x4, x9, x9 adcs x16, x16, x3 adc x17, x17, x4 # Reduce mov x3, #38 mul x4, x3, x17 adds x13, x13, x4 umulh 
x5, x3, x17 adc x5, x5, xzr mov x3, #19 extr x5, x5, x13, #63 mul x5, x5, x3 and x13, x13, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x14 adds x10, x10, x4 umulh x14, x3, x14 mul x4, x3, x15 adcs x11, x11, x4 umulh x15, x3, x15 mul x4, x3, x16 adcs x12, x12, x4 umulh x16, x3, x16 adc x13, x13, xzr # Add high product results in adds x6, x10, x5 adcs x7, x11, x14 adcs x8, x12, x15 adc x9, x13, x16 subs x20, x20, #1 bne L_fe_invert4 # Store stp x6, x7, [x29, #80] stp x8, x9, [x29, #96] add x0, x29, #48 add x1, x29, #0x50 add x2, x29, #48 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ # Loop: 50 times mov x20, #50 ldp x6, x7, [x29, #48] ldp x8, x9, [x29, #64] L_fe_invert5: # Square # A[0] * A[1] umulh x12, x6, x7 mul x11, x6, x7 # A[0] * A[3] umulh x14, x6, x9 mul x13, x6, x9 # A[0] * A[2] mul x3, x6, x8 adds x12, x12, x3 umulh x4, x6, x8 adcs x13, x13, x4 # A[1] * A[3] mul x3, x7, x9 adcs x14, x14, x3 umulh x15, x7, x9 adc x15, x15, xzr # A[1] * A[2] mul x3, x7, x8 adds x13, x13, x3 umulh x4, x7, x8 adcs x14, x14, x4 # A[2] * A[3] mul x3, x8, x9 adcs x15, x15, x3 umulh x16, x8, x9 adc x16, x16, xzr # Double adds x11, x11, x11 adcs x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 adcs x15, x15, x15 adcs x16, x16, x16 adc x17, xzr, xzr # A[0] * A[0] umulh x4, x6, x6 mul x10, x6, x6 # A[1] * A[1] mul x3, x7, x7 adds x11, x11, x4 umulh x4, x7, x7 adcs x12, x12, x3 # A[2] * A[2] mul x3, x8, x8 adcs x13, x13, x4 umulh x4, x8, x8 adcs x14, x14, x3 # A[3] * A[3] mul x3, x9, x9 adcs x15, x15, x4 umulh x4, x9, x9 adcs x16, x16, x3 adc x17, x17, x4 # Reduce mov x3, #38 mul x4, x3, x17 adds x13, x13, x4 umulh x5, x3, x17 adc x5, x5, xzr mov x3, #19 extr x5, x5, x13, #63 mul x5, x5, x3 and x13, x13, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x14 adds x10, x10, x4 umulh x14, x3, x14 mul x4, x3, x15 adcs x11, x11, x4 umulh x15, x3, x15 mul x4, x3, x16 adcs x12, x12, x4 umulh x16, x3, x16 adc x13, x13, xzr # Add high product results in adds x6, x10, x5 adcs x7, x11, x14 adcs x8, x12, x15 adc x9, x13, x16 subs x20, x20, #1 bne L_fe_invert5 # Store stp x6, x7, [x29, #80] stp x8, x9, [x29, #96] add x0, x29, #0x50 #ifndef NDEBUG add x1, x29, #0x50 #endif /* !NDEBUG */ add x2, x29, #48 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ # Loop: 100 times mov x20, #0x64 ldp x6, x7, [x29, #80] ldp x8, x9, [x29, #96] L_fe_invert6: # Square # A[0] * A[1] umulh x12, x6, x7 mul x11, x6, x7 # A[0] * A[3] umulh x14, x6, x9 mul x13, x6, x9 # A[0] * A[2] mul x3, x6, x8 adds x12, x12, x3 umulh x4, x6, x8 adcs x13, x13, x4 # A[1] * A[3] mul x3, x7, x9 adcs x14, x14, x3 umulh x15, x7, x9 adc x15, x15, xzr # A[1] * A[2] mul x3, x7, x8 adds x13, x13, x3 umulh x4, x7, x8 adcs x14, x14, x4 # A[2] * A[3] mul x3, x8, x9 adcs x15, x15, x3 umulh x16, x8, x9 adc x16, x16, xzr # Double adds x11, x11, x11 adcs x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 adcs x15, x15, x15 adcs x16, x16, x16 adc x17, xzr, xzr # A[0] * A[0] umulh x4, x6, x6 mul x10, x6, x6 # A[1] * A[1] mul x3, x7, x7 adds x11, x11, x4 umulh x4, x7, x7 adcs x12, x12, x3 # A[2] * A[2] mul x3, x8, x8 adcs x13, x13, x4 umulh x4, x8, x8 adcs x14, x14, x3 # A[3] * A[3] mul x3, x9, x9 adcs x15, x15, x4 umulh x4, x9, x9 adcs x16, x16, x3 adc x17, x17, x4 # Reduce mov x3, #38 mul x4, x3, x17 adds x13, x13, x4 umulh x5, x3, x17 adc x5, x5, xzr mov x3, #19 extr x5, x5, x13, #63 mul x5, x5, x3 and x13, x13, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x14 adds x10, x10, x4 umulh x14, x3, x14 mul x4, x3, x15 adcs x11, x11, x4 umulh 
x15, x3, x15 mul x4, x3, x16 adcs x12, x12, x4 umulh x16, x3, x16 adc x13, x13, xzr # Add high product results in adds x6, x10, x5 adcs x7, x11, x14 adcs x8, x12, x15 adc x9, x13, x16 subs x20, x20, #1 bne L_fe_invert6 # Store stp x6, x7, [x29, #112] stp x8, x9, [x29, #128] #ifndef NDEBUG add x0, x29, #0x50 #endif /* !NDEBUG */ add x1, x29, #0x70 add x2, x29, #0x50 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ # Loop: 50 times mov x20, #50 ldp x6, x7, [x29, #80] ldp x8, x9, [x29, #96] L_fe_invert7: # Square # A[0] * A[1] umulh x12, x6, x7 mul x11, x6, x7 # A[0] * A[3] umulh x14, x6, x9 mul x13, x6, x9 # A[0] * A[2] mul x3, x6, x8 adds x12, x12, x3 umulh x4, x6, x8 adcs x13, x13, x4 # A[1] * A[3] mul x3, x7, x9 adcs x14, x14, x3 umulh x15, x7, x9 adc x15, x15, xzr # A[1] * A[2] mul x3, x7, x8 adds x13, x13, x3 umulh x4, x7, x8 adcs x14, x14, x4 # A[2] * A[3] mul x3, x8, x9 adcs x15, x15, x3 umulh x16, x8, x9 adc x16, x16, xzr # Double adds x11, x11, x11 adcs x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 adcs x15, x15, x15 adcs x16, x16, x16 adc x17, xzr, xzr # A[0] * A[0] umulh x4, x6, x6 mul x10, x6, x6 # A[1] * A[1] mul x3, x7, x7 adds x11, x11, x4 umulh x4, x7, x7 adcs x12, x12, x3 # A[2] * A[2] mul x3, x8, x8 adcs x13, x13, x4 umulh x4, x8, x8 adcs x14, x14, x3 # A[3] * A[3] mul x3, x9, x9 adcs x15, x15, x4 umulh x4, x9, x9 adcs x16, x16, x3 adc x17, x17, x4 # Reduce mov x3, #38 mul x4, x3, x17 adds x13, x13, x4 umulh x5, x3, x17 adc x5, x5, xzr mov x3, #19 extr x5, x5, x13, #63 mul x5, x5, x3 and x13, x13, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x14 adds x10, x10, x4 umulh x14, x3, x14 mul x4, x3, x15 adcs x11, x11, x4 umulh x15, x3, x15 mul x4, x3, x16 adcs x12, x12, x4 umulh x16, x3, x16 adc x13, x13, xzr # Add high product results in adds x6, x10, x5 adcs x7, x11, x14 adcs x8, x12, x15 adc x9, x13, x16 subs x20, x20, #1 bne L_fe_invert7 # Store stp x6, x7, [x29, #80] stp x8, x9, [x29, #96] add x0, x29, #48 add x1, x29, #0x50 add x2, x29, #48 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ # Loop: 5 times mov x20, #5 ldp x6, x7, [x29, #48] ldp x8, x9, [x29, #64] L_fe_invert8: # Square # A[0] * A[1] umulh x12, x6, x7 mul x11, x6, x7 # A[0] * A[3] umulh x14, x6, x9 mul x13, x6, x9 # A[0] * A[2] mul x3, x6, x8 adds x12, x12, x3 umulh x4, x6, x8 adcs x13, x13, x4 # A[1] * A[3] mul x3, x7, x9 adcs x14, x14, x3 umulh x15, x7, x9 adc x15, x15, xzr # A[1] * A[2] mul x3, x7, x8 adds x13, x13, x3 umulh x4, x7, x8 adcs x14, x14, x4 # A[2] * A[3] mul x3, x8, x9 adcs x15, x15, x3 umulh x16, x8, x9 adc x16, x16, xzr # Double adds x11, x11, x11 adcs x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 adcs x15, x15, x15 adcs x16, x16, x16 adc x17, xzr, xzr # A[0] * A[0] umulh x4, x6, x6 mul x10, x6, x6 # A[1] * A[1] mul x3, x7, x7 adds x11, x11, x4 umulh x4, x7, x7 adcs x12, x12, x3 # A[2] * A[2] mul x3, x8, x8 adcs x13, x13, x4 umulh x4, x8, x8 adcs x14, x14, x3 # A[3] * A[3] mul x3, x9, x9 adcs x15, x15, x4 umulh x4, x9, x9 adcs x16, x16, x3 adc x17, x17, x4 # Reduce mov x3, #38 mul x4, x3, x17 adds x13, x13, x4 umulh x5, x3, x17 adc x5, x5, xzr mov x3, #19 extr x5, x5, x13, #63 mul x5, x5, x3 and x13, x13, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x14 adds x10, x10, x4 umulh x14, x3, x14 mul x4, x3, x15 adcs x11, x11, x4 umulh x15, x3, x15 mul x4, x3, x16 adcs x12, x12, x4 umulh x16, x3, x16 adc x13, x13, xzr # Add high product results in adds x6, x10, x5 adcs x7, x11, x14 adcs x8, x12, x15 adc x9, x13, x16 subs x20, x20, #1 bne L_fe_invert8 # 
Store stp x6, x7, [x29, #48] stp x8, x9, [x29, #64] ldr x0, [x29, #144] add x1, x29, #48 add x2, x29, #16 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ ldr x17, [x29, #160] ldr x20, [x29, #168] ldp x29, x30, [sp], #0xb0 ret #ifndef __APPLE__ .size fe_invert,.-fe_invert #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl curve25519 .type curve25519,@function .align 2 curve25519: #else .section __TEXT,__text .globl _curve25519 .p2align 2 _curve25519: #endif /* __APPLE__ */ stp x29, x30, [sp, #-288]! add x29, sp, #0 str x17, [x29, #200] str x19, [x29, #208] stp x20, x21, [x29, #216] stp x22, x23, [x29, #232] stp x24, x25, [x29, #248] stp x26, x27, [x29, #264] str x28, [x29, #280] mov x23, xzr str x0, [x29, #176] str x2, [x29, #184] ldp x6, x7, [x2] ldp x8, x9, [x2, #16] mov x10, #1 mov x11, xzr mov x12, xzr mov x13, xzr stp x10, x11, [x0] stp x12, x13, [x0, #16] # Set zero stp xzr, xzr, [x29, #16] stp xzr, xzr, [x29, #32] mov x24, #0xfe L_curve25519_bits: lsr x3, x24, #6 and x4, x24, #63 ldr x5, [x1, x3, LSL 3] lsr x5, x5, x4 eor x23, x23, x5 # Conditional Swap subs xzr, xzr, x23, lsl 63 ldp x25, x26, [x29, #16] ldp x27, x28, [x29, #32] csel x19, x25, x10, ne csel x25, x10, x25, ne csel x20, x26, x11, ne csel x26, x11, x26, ne csel x21, x27, x12, ne csel x27, x12, x27, ne csel x22, x28, x13, ne csel x28, x13, x28, ne # Conditional Swap subs xzr, xzr, x23, lsl 63 ldp x10, x11, [x0] ldp x12, x13, [x0, #16] csel x14, x10, x6, ne csel x10, x6, x10, ne csel x15, x11, x7, ne csel x11, x7, x11, ne csel x16, x12, x8, ne csel x12, x8, x12, ne csel x17, x13, x9, ne csel x13, x9, x13, ne mov x23, x5 # Add adds x6, x10, x25 adcs x7, x11, x26 adcs x8, x12, x27 adcs x9, x13, x28 cset x5, cs mov x3, #19 extr x5, x5, x9, #63 mul x3, x5, x3 # Sub modulus (if overflow) adds x6, x6, x3 adcs x7, x7, xzr and x9, x9, #0x7fffffffffffffff adcs x8, x8, xzr adc x9, x9, xzr # Sub subs x25, x10, x25 sbcs x26, x11, x26 sbcs x27, x12, x27 sbcs x28, x13, x28 csetm x5, cc mov x3, #-19 extr x5, x5, x28, #63 mul x3, x5, x3 # Add modulus (if underflow) subs x25, x25, x3 sbcs x26, x26, xzr and x28, x28, #0x7fffffffffffffff sbcs x27, x27, xzr sbc x28, x28, xzr stp x25, x26, [x29, #80] stp x27, x28, [x29, #96] # Add adds x10, x14, x19 adcs x11, x15, x20 adcs x12, x16, x21 adcs x13, x17, x22 cset x5, cs mov x3, #19 extr x5, x5, x13, #63 mul x3, x5, x3 # Sub modulus (if overflow) adds x10, x10, x3 adcs x11, x11, xzr and x13, x13, #0x7fffffffffffffff adcs x12, x12, xzr adc x13, x13, xzr # Sub subs x14, x14, x19 sbcs x15, x15, x20 sbcs x16, x16, x21 sbcs x17, x17, x22 csetm x5, cc mov x3, #-19 extr x5, x5, x17, #63 mul x3, x5, x3 # Add modulus (if underflow) subs x14, x14, x3 sbcs x15, x15, xzr and x17, x17, #0x7fffffffffffffff sbcs x16, x16, xzr sbc x17, x17, xzr # Multiply # A[0] * B[0] umulh x20, x14, x6 mul x19, x14, x6 # A[2] * B[0] umulh x22, x16, x6 mul x21, x16, x6 # A[1] * B[0] mul x3, x15, x6 adds x20, x20, x3 umulh x4, x15, x6 adcs x21, x21, x4 # A[1] * B[3] umulh x26, x15, x9 adc x22, x22, xzr mul x25, x15, x9 # A[0] * B[1] mul x3, x14, x7 adds x20, x20, x3 umulh x4, x14, x7 adcs x21, x21, x4 # A[2] * B[1] mul x3, x16, x7 adcs x22, x22, x3 umulh x4, x16, x7 adcs x25, x25, x4 adc x26, x26, xzr # A[1] * B[2] mul x3, x15, x8 adds x22, x22, x3 umulh x4, x15, x8 adcs x25, x25, x4 adcs x26, x26, xzr adc x27, xzr, xzr # A[0] * B[2] mul x3, x14, x8 adds x21, x21, x3 umulh x4, x14, x8 adcs x22, x22, x4 adcs x25, x25, xzr adcs x26, x26, xzr adc x27, x27, xzr # A[1] * B[1] mul x3, x15, x7 adds x21, x21, x3 
umulh x4, x15, x7 adcs x22, x22, x4 # A[3] * B[1] mul x3, x17, x7 adcs x25, x25, x3 umulh x4, x17, x7 adcs x26, x26, x4 adc x27, x27, xzr # A[2] * B[2] mul x3, x16, x8 adds x25, x25, x3 umulh x4, x16, x8 adcs x26, x26, x4 # A[3] * B[3] mul x3, x17, x9 adcs x27, x27, x3 umulh x28, x17, x9 adc x28, x28, xzr # A[0] * B[3] mul x3, x14, x9 adds x22, x22, x3 umulh x4, x14, x9 adcs x25, x25, x4 # A[2] * B[3] mul x3, x16, x9 adcs x26, x26, x3 umulh x4, x16, x9 adcs x27, x27, x4 adc x28, x28, xzr # A[3] * B[0] mul x3, x17, x6 adds x22, x22, x3 umulh x4, x17, x6 adcs x25, x25, x4 # A[3] * B[2] mul x3, x17, x8 adcs x26, x26, x3 umulh x4, x17, x8 adcs x27, x27, x4 adc x28, x28, xzr # Reduce mov x3, #38 mul x4, x3, x28 adds x22, x22, x4 umulh x5, x3, x28 adc x5, x5, xzr mov x3, #19 extr x5, x5, x22, #63 mul x5, x5, x3 and x22, x22, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x25 adds x19, x19, x4 umulh x25, x3, x25 mul x4, x3, x26 adcs x20, x20, x4 umulh x26, x3, x26 mul x4, x3, x27 adcs x21, x21, x4 umulh x27, x3, x27 adc x22, x22, xzr # Add high product results in adds x19, x19, x5 adcs x20, x20, x25 adcs x21, x21, x26 adc x22, x22, x27 # Store stp x19, x20, [x29, #48] stp x21, x22, [x29, #64] # Multiply ldp x25, x26, [x29, #80] ldp x27, x28, [x29, #96] # A[0] * B[0] umulh x20, x10, x25 mul x19, x10, x25 # A[2] * B[0] umulh x22, x12, x25 mul x21, x12, x25 # A[1] * B[0] mul x3, x11, x25 adds x20, x20, x3 umulh x4, x11, x25 adcs x21, x21, x4 # A[1] * B[3] umulh x15, x11, x28 adc x22, x22, xzr mul x14, x11, x28 # A[0] * B[1] mul x3, x10, x26 adds x20, x20, x3 umulh x4, x10, x26 adcs x21, x21, x4 # A[2] * B[1] mul x3, x12, x26 adcs x22, x22, x3 umulh x4, x12, x26 adcs x14, x14, x4 adc x15, x15, xzr # A[1] * B[2] mul x3, x11, x27 adds x22, x22, x3 umulh x4, x11, x27 adcs x14, x14, x4 adcs x15, x15, xzr adc x16, xzr, xzr # A[0] * B[2] mul x3, x10, x27 adds x21, x21, x3 umulh x4, x10, x27 adcs x22, x22, x4 adcs x14, x14, xzr adcs x15, x15, xzr adc x16, x16, xzr # A[1] * B[1] mul x3, x11, x26 adds x21, x21, x3 umulh x4, x11, x26 adcs x22, x22, x4 # A[3] * B[1] mul x3, x13, x26 adcs x14, x14, x3 umulh x4, x13, x26 adcs x15, x15, x4 adc x16, x16, xzr # A[2] * B[2] mul x3, x12, x27 adds x14, x14, x3 umulh x4, x12, x27 adcs x15, x15, x4 # A[3] * B[3] mul x3, x13, x28 adcs x16, x16, x3 umulh x17, x13, x28 adc x17, x17, xzr # A[0] * B[3] mul x3, x10, x28 adds x22, x22, x3 umulh x4, x10, x28 adcs x14, x14, x4 # A[2] * B[3] mul x3, x12, x28 adcs x15, x15, x3 umulh x4, x12, x28 adcs x16, x16, x4 adc x17, x17, xzr # A[3] * B[0] mul x3, x13, x25 adds x22, x22, x3 umulh x4, x13, x25 adcs x14, x14, x4 # A[3] * B[2] mul x3, x13, x27 adcs x15, x15, x3 umulh x4, x13, x27 adcs x16, x16, x4 adc x17, x17, xzr # Reduce mov x3, #38 mul x4, x3, x17 adds x22, x22, x4 umulh x5, x3, x17 adc x5, x5, xzr mov x3, #19 extr x5, x5, x22, #63 mul x5, x5, x3 and x22, x22, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x14 adds x19, x19, x4 umulh x14, x3, x14 mul x4, x3, x15 adcs x20, x20, x4 umulh x15, x3, x15 mul x4, x3, x16 adcs x21, x21, x4 umulh x16, x3, x16 adc x22, x22, xzr # Add high product results in adds x19, x19, x5 adcs x20, x20, x14 adcs x21, x21, x15 adc x22, x22, x16 # Square # A[0] * A[1] umulh x12, x25, x26 mul x11, x25, x26 # A[0] * A[3] umulh x14, x25, x28 mul x13, x25, x28 # A[0] * A[2] mul x3, x25, x27 adds x12, x12, x3 umulh x4, x25, x27 adcs x13, x13, x4 # A[1] * A[3] mul x3, x26, x28 adcs x14, x14, x3 umulh x15, x26, x28 adc x15, x15, xzr # A[1] * A[2] mul x3, x26, x27 adds x13, x13, x3 umulh x4, x26, x27 adcs x14, x14, 
x4 # A[2] * A[3] mul x3, x27, x28 adcs x15, x15, x3 umulh x16, x27, x28 adc x16, x16, xzr # Double adds x11, x11, x11 adcs x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 adcs x15, x15, x15 adcs x16, x16, x16 adc x17, xzr, xzr # A[0] * A[0] umulh x4, x25, x25 mul x10, x25, x25 # A[1] * A[1] mul x3, x26, x26 adds x11, x11, x4 umulh x4, x26, x26 adcs x12, x12, x3 # A[2] * A[2] mul x3, x27, x27 adcs x13, x13, x4 umulh x4, x27, x27 adcs x14, x14, x3 # A[3] * A[3] mul x3, x28, x28 adcs x15, x15, x4 umulh x4, x28, x28 adcs x16, x16, x3 adc x17, x17, x4 # Reduce mov x3, #38 mul x4, x3, x17 adds x13, x13, x4 umulh x5, x3, x17 adc x5, x5, xzr mov x3, #19 extr x5, x5, x13, #63 mul x5, x5, x3 and x13, x13, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x14 adds x10, x10, x4 umulh x14, x3, x14 mul x4, x3, x15 adcs x11, x11, x4 umulh x15, x3, x15 mul x4, x3, x16 adcs x12, x12, x4 umulh x16, x3, x16 adc x13, x13, xzr # Add high product results in adds x10, x10, x5 adcs x11, x11, x14 adcs x12, x12, x15 adc x13, x13, x16 # Square # A[0] * A[1] umulh x16, x6, x7 mul x15, x6, x7 # A[0] * A[3] umulh x25, x6, x9 mul x17, x6, x9 # A[0] * A[2] mul x3, x6, x8 adds x16, x16, x3 umulh x4, x6, x8 adcs x17, x17, x4 # A[1] * A[3] mul x3, x7, x9 adcs x25, x25, x3 umulh x26, x7, x9 adc x26, x26, xzr # A[1] * A[2] mul x3, x7, x8 adds x17, x17, x3 umulh x4, x7, x8 adcs x25, x25, x4 # A[2] * A[3] mul x3, x8, x9 adcs x26, x26, x3 umulh x27, x8, x9 adc x27, x27, xzr # Double adds x15, x15, x15 adcs x16, x16, x16 adcs x17, x17, x17 adcs x25, x25, x25 adcs x26, x26, x26 adcs x27, x27, x27 adc x28, xzr, xzr # A[0] * A[0] umulh x4, x6, x6 mul x14, x6, x6 # A[1] * A[1] mul x3, x7, x7 adds x15, x15, x4 umulh x4, x7, x7 adcs x16, x16, x3 # A[2] * A[2] mul x3, x8, x8 adcs x17, x17, x4 umulh x4, x8, x8 adcs x25, x25, x3 # A[3] * A[3] mul x3, x9, x9 adcs x26, x26, x4 umulh x4, x9, x9 adcs x27, x27, x3 adc x28, x28, x4 # Reduce mov x3, #38 mul x4, x3, x28 adds x17, x17, x4 umulh x5, x3, x28 adc x5, x5, xzr mov x3, #19 extr x5, x5, x17, #63 mul x5, x5, x3 and x17, x17, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x25 adds x14, x14, x4 umulh x25, x3, x25 mul x4, x3, x26 adcs x15, x15, x4 umulh x26, x3, x26 mul x4, x3, x27 adcs x16, x16, x4 umulh x27, x3, x27 adc x17, x17, xzr # Add high product results in adds x14, x14, x5 adcs x15, x15, x25 adcs x16, x16, x26 adc x17, x17, x27 # Multiply # A[0] * B[0] umulh x7, x14, x10 mul x6, x14, x10 # A[2] * B[0] umulh x9, x16, x10 mul x8, x16, x10 # A[1] * B[0] mul x3, x15, x10 adds x7, x7, x3 umulh x4, x15, x10 adcs x8, x8, x4 # A[1] * B[3] umulh x26, x15, x13 adc x9, x9, xzr mul x25, x15, x13 # A[0] * B[1] mul x3, x14, x11 adds x7, x7, x3 umulh x4, x14, x11 adcs x8, x8, x4 # A[2] * B[1] mul x3, x16, x11 adcs x9, x9, x3 umulh x4, x16, x11 adcs x25, x25, x4 adc x26, x26, xzr # A[1] * B[2] mul x3, x15, x12 adds x9, x9, x3 umulh x4, x15, x12 adcs x25, x25, x4 adcs x26, x26, xzr adc x27, xzr, xzr # A[0] * B[2] mul x3, x14, x12 adds x8, x8, x3 umulh x4, x14, x12 adcs x9, x9, x4 adcs x25, x25, xzr adcs x26, x26, xzr adc x27, x27, xzr # A[1] * B[1] mul x3, x15, x11 adds x8, x8, x3 umulh x4, x15, x11 adcs x9, x9, x4 # A[3] * B[1] mul x3, x17, x11 adcs x25, x25, x3 umulh x4, x17, x11 adcs x26, x26, x4 adc x27, x27, xzr # A[2] * B[2] mul x3, x16, x12 adds x25, x25, x3 umulh x4, x16, x12 adcs x26, x26, x4 # A[3] * B[3] mul x3, x17, x13 adcs x27, x27, x3 umulh x28, x17, x13 adc x28, x28, xzr # A[0] * B[3] mul x3, x14, x13 adds x9, x9, x3 umulh x4, x14, x13 adcs x25, x25, x4 # A[2] * B[3] mul x3, x16, x13 
adcs x26, x26, x3 umulh x4, x16, x13 adcs x27, x27, x4 adc x28, x28, xzr # A[3] * B[0] mul x3, x17, x10 adds x9, x9, x3 umulh x4, x17, x10 adcs x25, x25, x4 # A[3] * B[2] mul x3, x17, x12 adcs x26, x26, x3 umulh x4, x17, x12 adcs x27, x27, x4 adc x28, x28, xzr # Reduce mov x3, #38 mul x4, x3, x28 adds x9, x9, x4 umulh x5, x3, x28 adc x5, x5, xzr mov x3, #19 extr x5, x5, x9, #63 mul x5, x5, x3 and x9, x9, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x25 adds x6, x6, x4 umulh x25, x3, x25 mul x4, x3, x26 adcs x7, x7, x4 umulh x26, x3, x26 mul x4, x3, x27 adcs x8, x8, x4 umulh x27, x3, x27 adc x9, x9, xzr # Add high product results in adds x6, x6, x5 adcs x7, x7, x25 adcs x8, x8, x26 adc x9, x9, x27 # Store stp x6, x7, [x0] stp x8, x9, [x0, #16] # Sub subs x14, x14, x10 sbcs x15, x15, x11 sbcs x16, x16, x12 sbcs x17, x17, x13 csetm x5, cc mov x3, #-19 # Mask the modulus extr x5, x5, x17, #63 mul x3, x5, x3 # Add modulus (if underflow) subs x14, x14, x3 sbcs x15, x15, xzr and x17, x17, #0x7fffffffffffffff sbcs x16, x16, xzr sbc x17, x17, xzr # Multiply by 121666 mov x5, #0xdb42 movk x5, #1, lsl 16 mul x6, x14, x5 umulh x7, x14, x5 mul x3, x15, x5 umulh x8, x15, x5 adds x7, x7, x3 adc x8, x8, xzr mul x3, x16, x5 umulh x9, x16, x5 adds x8, x8, x3 adc x9, x9, xzr mul x3, x17, x5 umulh x4, x17, x5 adds x9, x9, x3 adc x4, x4, xzr mov x5, #19 extr x4, x4, x9, #63 mul x4, x4, x5 adds x6, x6, x4 adcs x7, x7, xzr and x9, x9, #0x7fffffffffffffff adcs x8, x8, xzr adc x9, x9, xzr # Add adds x10, x10, x6 adcs x11, x11, x7 adcs x12, x12, x8 adcs x13, x13, x9 cset x5, cs mov x3, #19 # Mask the modulus extr x5, x5, x13, #63 mul x3, x5, x3 # Sub modulus (if overflow) adds x10, x10, x3 adcs x11, x11, xzr and x13, x13, #0x7fffffffffffffff adcs x12, x12, xzr adc x13, x13, xzr # Multiply # A[0] * B[0] umulh x7, x14, x10 mul x6, x14, x10 # A[2] * B[0] umulh x9, x16, x10 mul x8, x16, x10 # A[1] * B[0] mul x3, x15, x10 adds x7, x7, x3 umulh x4, x15, x10 adcs x8, x8, x4 # A[1] * B[3] umulh x26, x15, x13 adc x9, x9, xzr mul x25, x15, x13 # A[0] * B[1] mul x3, x14, x11 adds x7, x7, x3 umulh x4, x14, x11 adcs x8, x8, x4 # A[2] * B[1] mul x3, x16, x11 adcs x9, x9, x3 umulh x4, x16, x11 adcs x25, x25, x4 adc x26, x26, xzr # A[1] * B[2] mul x3, x15, x12 adds x9, x9, x3 umulh x4, x15, x12 adcs x25, x25, x4 adcs x26, x26, xzr adc x27, xzr, xzr # A[0] * B[2] mul x3, x14, x12 adds x8, x8, x3 umulh x4, x14, x12 adcs x9, x9, x4 adcs x25, x25, xzr adcs x26, x26, xzr adc x27, x27, xzr # A[1] * B[1] mul x3, x15, x11 adds x8, x8, x3 umulh x4, x15, x11 adcs x9, x9, x4 # A[3] * B[1] mul x3, x17, x11 adcs x25, x25, x3 umulh x4, x17, x11 adcs x26, x26, x4 adc x27, x27, xzr # A[2] * B[2] mul x3, x16, x12 adds x25, x25, x3 umulh x4, x16, x12 adcs x26, x26, x4 # A[3] * B[3] mul x3, x17, x13 adcs x27, x27, x3 umulh x28, x17, x13 adc x28, x28, xzr # A[0] * B[3] mul x3, x14, x13 adds x9, x9, x3 umulh x4, x14, x13 adcs x25, x25, x4 # A[2] * B[3] mul x3, x16, x13 adcs x26, x26, x3 umulh x4, x16, x13 adcs x27, x27, x4 adc x28, x28, xzr # A[3] * B[0] mul x3, x17, x10 adds x9, x9, x3 umulh x4, x17, x10 adcs x25, x25, x4 # A[3] * B[2] mul x3, x17, x12 adcs x26, x26, x3 umulh x4, x17, x12 adcs x27, x27, x4 adc x28, x28, xzr # Reduce mov x3, #38 mul x4, x3, x28 adds x9, x9, x4 umulh x5, x3, x28 adc x5, x5, xzr mov x3, #19 extr x5, x5, x9, #63 mul x5, x5, x3 and x9, x9, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x25 adds x6, x6, x4 umulh x25, x3, x25 mul x4, x3, x26 adcs x7, x7, x4 umulh x26, x3, x26 mul x4, x3, x27 adcs x8, x8, x4 umulh x27, x3, x27 
adc x9, x9, xzr # Add high product results in adds x6, x6, x5 adcs x7, x7, x25 adcs x8, x8, x26 adc x9, x9, x27 # Store stp x6, x7, [x29, #16] stp x8, x9, [x29, #32] # Add ldp x25, x26, [x29, #48] ldp x27, x28, [x29, #64] adds x10, x25, x19 adcs x11, x26, x20 adcs x12, x27, x21 adcs x13, x28, x22 cset x5, cs mov x3, #19 extr x5, x5, x13, #63 mul x3, x5, x3 # Sub modulus (if overflow) adds x10, x10, x3 adcs x11, x11, xzr and x13, x13, #0x7fffffffffffffff adcs x12, x12, xzr adc x13, x13, xzr # Sub subs x19, x25, x19 sbcs x20, x26, x20 sbcs x21, x27, x21 sbcs x22, x28, x22 csetm x5, cc mov x3, #-19 extr x5, x5, x22, #63 mul x3, x5, x3 # Add modulus (if underflow) subs x19, x19, x3 sbcs x20, x20, xzr and x22, x22, #0x7fffffffffffffff sbcs x21, x21, xzr sbc x22, x22, xzr # Square # A[0] * A[1] umulh x8, x10, x11 mul x7, x10, x11 # A[0] * A[3] umulh x25, x10, x13 mul x9, x10, x13 # A[0] * A[2] mul x3, x10, x12 adds x8, x8, x3 umulh x4, x10, x12 adcs x9, x9, x4 # A[1] * A[3] mul x3, x11, x13 adcs x25, x25, x3 umulh x26, x11, x13 adc x26, x26, xzr # A[1] * A[2] mul x3, x11, x12 adds x9, x9, x3 umulh x4, x11, x12 adcs x25, x25, x4 # A[2] * A[3] mul x3, x12, x13 adcs x26, x26, x3 umulh x27, x12, x13 adc x27, x27, xzr # Double adds x7, x7, x7 adcs x8, x8, x8 adcs x9, x9, x9 adcs x25, x25, x25 adcs x26, x26, x26 adcs x27, x27, x27 adc x28, xzr, xzr # A[0] * A[0] umulh x4, x10, x10 mul x6, x10, x10 # A[1] * A[1] mul x3, x11, x11 adds x7, x7, x4 umulh x4, x11, x11 adcs x8, x8, x3 # A[2] * A[2] mul x3, x12, x12 adcs x9, x9, x4 umulh x4, x12, x12 adcs x25, x25, x3 # A[3] * A[3] mul x3, x13, x13 adcs x26, x26, x4 umulh x4, x13, x13 adcs x27, x27, x3 adc x28, x28, x4 # Reduce mov x3, #38 mul x4, x3, x28 adds x9, x9, x4 umulh x5, x3, x28 adc x5, x5, xzr mov x3, #19 extr x5, x5, x9, #63 mul x5, x5, x3 and x9, x9, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x25 adds x6, x6, x4 umulh x25, x3, x25 mul x4, x3, x26 adcs x7, x7, x4 umulh x26, x3, x26 mul x4, x3, x27 adcs x8, x8, x4 umulh x27, x3, x27 adc x9, x9, xzr # Add high product results in adds x6, x6, x5 adcs x7, x7, x25 adcs x8, x8, x26 adc x9, x9, x27 # Square # A[0] * A[1] umulh x16, x19, x20 mul x15, x19, x20 # A[0] * A[3] umulh x25, x19, x22 mul x17, x19, x22 # A[0] * A[2] mul x3, x19, x21 adds x16, x16, x3 umulh x4, x19, x21 adcs x17, x17, x4 # A[1] * A[3] mul x3, x20, x22 adcs x25, x25, x3 umulh x26, x20, x22 adc x26, x26, xzr # A[1] * A[2] mul x3, x20, x21 adds x17, x17, x3 umulh x4, x20, x21 adcs x25, x25, x4 # A[2] * A[3] mul x3, x21, x22 adcs x26, x26, x3 umulh x27, x21, x22 adc x27, x27, xzr # Double adds x15, x15, x15 adcs x16, x16, x16 adcs x17, x17, x17 adcs x25, x25, x25 adcs x26, x26, x26 adcs x27, x27, x27 adc x28, xzr, xzr # A[0] * A[0] umulh x4, x19, x19 mul x14, x19, x19 # A[1] * A[1] mul x3, x20, x20 adds x15, x15, x4 umulh x4, x20, x20 adcs x16, x16, x3 # A[2] * A[2] mul x3, x21, x21 adcs x17, x17, x4 umulh x4, x21, x21 adcs x25, x25, x3 # A[3] * A[3] mul x3, x22, x22 adcs x26, x26, x4 umulh x4, x22, x22 adcs x27, x27, x3 adc x28, x28, x4 # Reduce mov x3, #38 mul x4, x3, x28 adds x17, x17, x4 umulh x5, x3, x28 adc x5, x5, xzr mov x3, #19 extr x5, x5, x17, #63 mul x5, x5, x3 and x17, x17, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x25 adds x14, x14, x4 umulh x25, x3, x25 mul x4, x3, x26 adcs x15, x15, x4 umulh x26, x3, x26 mul x4, x3, x27 adcs x16, x16, x4 umulh x27, x3, x27 adc x17, x17, xzr # Add high product results in adds x14, x14, x5 adcs x15, x15, x25 adcs x16, x16, x26 adc x17, x17, x27 # Multiply ldp x19, x20, [x2] ldp x21, 
x22, [x2, #16] # A[0] * B[0] umulh x11, x19, x14 mul x10, x19, x14 # A[2] * B[0] umulh x13, x21, x14 mul x12, x21, x14 # A[1] * B[0] mul x3, x20, x14 adds x11, x11, x3 umulh x4, x20, x14 adcs x12, x12, x4 # A[1] * B[3] umulh x26, x20, x17 adc x13, x13, xzr mul x25, x20, x17 # A[0] * B[1] mul x3, x19, x15 adds x11, x11, x3 umulh x4, x19, x15 adcs x12, x12, x4 # A[2] * B[1] mul x3, x21, x15 adcs x13, x13, x3 umulh x4, x21, x15 adcs x25, x25, x4 adc x26, x26, xzr # A[1] * B[2] mul x3, x20, x16 adds x13, x13, x3 umulh x4, x20, x16 adcs x25, x25, x4 adcs x26, x26, xzr adc x27, xzr, xzr # A[0] * B[2] mul x3, x19, x16 adds x12, x12, x3 umulh x4, x19, x16 adcs x13, x13, x4 adcs x25, x25, xzr adcs x26, x26, xzr adc x27, x27, xzr # A[1] * B[1] mul x3, x20, x15 adds x12, x12, x3 umulh x4, x20, x15 adcs x13, x13, x4 # A[3] * B[1] mul x3, x22, x15 adcs x25, x25, x3 umulh x4, x22, x15 adcs x26, x26, x4 adc x27, x27, xzr # A[2] * B[2] mul x3, x21, x16 adds x25, x25, x3 umulh x4, x21, x16 adcs x26, x26, x4 # A[3] * B[3] mul x3, x22, x17 adcs x27, x27, x3 umulh x28, x22, x17 adc x28, x28, xzr # A[0] * B[3] mul x3, x19, x17 adds x13, x13, x3 umulh x4, x19, x17 adcs x25, x25, x4 # A[2] * B[3] mul x3, x21, x17 adcs x26, x26, x3 umulh x4, x21, x17 adcs x27, x27, x4 adc x28, x28, xzr # A[3] * B[0] mul x3, x22, x14 adds x13, x13, x3 umulh x4, x22, x14 adcs x25, x25, x4 # A[3] * B[2] mul x3, x22, x16 adcs x26, x26, x3 umulh x4, x22, x16 adcs x27, x27, x4 adc x28, x28, xzr # Reduce mov x3, #38 mul x4, x3, x28 adds x13, x13, x4 umulh x5, x3, x28 adc x5, x5, xzr mov x3, #19 extr x5, x5, x13, #63 mul x5, x5, x3 and x13, x13, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x25 adds x10, x10, x4 umulh x25, x3, x25 mul x4, x3, x26 adcs x11, x11, x4 umulh x26, x3, x26 mul x4, x3, x27 adcs x12, x12, x4 umulh x27, x3, x27 adc x13, x13, xzr # Add high product results in adds x10, x10, x5 adcs x11, x11, x25 adcs x12, x12, x26 adc x13, x13, x27 subs x24, x24, #1 bge L_curve25519_bits # Invert add x0, x29, #48 add x1, x29, #16 #ifndef __APPLE__ bl fe_sq #else bl _fe_sq #endif /* __APPLE__ */ add x0, x29, #0x50 add x1, x29, #48 #ifndef __APPLE__ bl fe_sq #else bl _fe_sq #endif /* __APPLE__ */ #ifndef NDEBUG add x0, x29, #0x50 #endif /* !NDEBUG */ add x1, x29, #0x50 #ifndef __APPLE__ bl fe_sq #else bl _fe_sq #endif /* __APPLE__ */ #ifndef NDEBUG add x0, x29, #0x50 #endif /* !NDEBUG */ add x1, x29, #16 add x2, x29, #0x50 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ add x0, x29, #48 add x1, x29, #48 add x2, x29, #0x50 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ add x0, x29, #0x70 #ifndef NDEBUG add x1, x29, #48 #endif /* !NDEBUG */ #ifndef __APPLE__ bl fe_sq #else bl _fe_sq #endif /* __APPLE__ */ add x0, x29, #0x50 add x1, x29, #0x50 add x2, x29, #0x70 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ # Loop: 5 times mov x24, #5 ldp x6, x7, [x29, #80] ldp x8, x9, [x29, #96] L_curve25519_inv_1: # Square # A[0] * A[1] umulh x12, x6, x7 mul x11, x6, x7 # A[0] * A[3] umulh x14, x6, x9 mul x13, x6, x9 # A[0] * A[2] mul x3, x6, x8 adds x12, x12, x3 umulh x4, x6, x8 adcs x13, x13, x4 # A[1] * A[3] mul x3, x7, x9 adcs x14, x14, x3 umulh x15, x7, x9 adc x15, x15, xzr # A[1] * A[2] mul x3, x7, x8 adds x13, x13, x3 umulh x4, x7, x8 adcs x14, x14, x4 # A[2] * A[3] mul x3, x8, x9 adcs x15, x15, x3 umulh x16, x8, x9 adc x16, x16, xzr # Double adds x11, x11, x11 adcs x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 adcs x15, x15, x15 adcs x16, x16, x16 adc x17, xzr, xzr # A[0] * 
A[0] umulh x4, x6, x6 mul x10, x6, x6 # A[1] * A[1] mul x3, x7, x7 adds x11, x11, x4 umulh x4, x7, x7 adcs x12, x12, x3 # A[2] * A[2] mul x3, x8, x8 adcs x13, x13, x4 umulh x4, x8, x8 adcs x14, x14, x3 # A[3] * A[3] mul x3, x9, x9 adcs x15, x15, x4 umulh x4, x9, x9 adcs x16, x16, x3 adc x17, x17, x4 # Reduce mov x3, #38 mul x4, x3, x17 adds x13, x13, x4 umulh x5, x3, x17 adc x5, x5, xzr mov x3, #19 extr x5, x5, x13, #63 mul x5, x5, x3 and x13, x13, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x14 adds x10, x10, x4 umulh x14, x3, x14 mul x4, x3, x15 adcs x11, x11, x4 umulh x15, x3, x15 mul x4, x3, x16 adcs x12, x12, x4 umulh x16, x3, x16 adc x13, x13, xzr # Add high product results in adds x6, x10, x5 adcs x7, x11, x14 adcs x8, x12, x15 adc x9, x13, x16 subs x24, x24, #1 bne L_curve25519_inv_1 # Store stp x6, x7, [x29, #112] stp x8, x9, [x29, #128] #ifndef NDEBUG add x0, x29, #0x50 #endif /* !NDEBUG */ add x1, x29, #0x70 add x2, x29, #0x50 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ # Loop: 10 times mov x24, #10 ldp x6, x7, [x29, #80] ldp x8, x9, [x29, #96] L_curve25519_inv_2: # Square # A[0] * A[1] umulh x12, x6, x7 mul x11, x6, x7 # A[0] * A[3] umulh x14, x6, x9 mul x13, x6, x9 # A[0] * A[2] mul x3, x6, x8 adds x12, x12, x3 umulh x4, x6, x8 adcs x13, x13, x4 # A[1] * A[3] mul x3, x7, x9 adcs x14, x14, x3 umulh x15, x7, x9 adc x15, x15, xzr # A[1] * A[2] mul x3, x7, x8 adds x13, x13, x3 umulh x4, x7, x8 adcs x14, x14, x4 # A[2] * A[3] mul x3, x8, x9 adcs x15, x15, x3 umulh x16, x8, x9 adc x16, x16, xzr # Double adds x11, x11, x11 adcs x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 adcs x15, x15, x15 adcs x16, x16, x16 adc x17, xzr, xzr # A[0] * A[0] umulh x4, x6, x6 mul x10, x6, x6 # A[1] * A[1] mul x3, x7, x7 adds x11, x11, x4 umulh x4, x7, x7 adcs x12, x12, x3 # A[2] * A[2] mul x3, x8, x8 adcs x13, x13, x4 umulh x4, x8, x8 adcs x14, x14, x3 # A[3] * A[3] mul x3, x9, x9 adcs x15, x15, x4 umulh x4, x9, x9 adcs x16, x16, x3 adc x17, x17, x4 # Reduce mov x3, #38 mul x4, x3, x17 adds x13, x13, x4 umulh x5, x3, x17 adc x5, x5, xzr mov x3, #19 extr x5, x5, x13, #63 mul x5, x5, x3 and x13, x13, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x14 adds x10, x10, x4 umulh x14, x3, x14 mul x4, x3, x15 adcs x11, x11, x4 umulh x15, x3, x15 mul x4, x3, x16 adcs x12, x12, x4 umulh x16, x3, x16 adc x13, x13, xzr # Add high product results in adds x6, x10, x5 adcs x7, x11, x14 adcs x8, x12, x15 adc x9, x13, x16 subs x24, x24, #1 bne L_curve25519_inv_2 # Store stp x6, x7, [x29, #112] stp x8, x9, [x29, #128] add x0, x29, #0x70 #ifndef NDEBUG add x1, x29, #0x70 #endif /* !NDEBUG */ add x2, x29, #0x50 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ # Loop: 20 times mov x24, #20 ldp x6, x7, [x29, #112] ldp x8, x9, [x29, #128] L_curve25519_inv_3: # Square # A[0] * A[1] umulh x12, x6, x7 mul x11, x6, x7 # A[0] * A[3] umulh x14, x6, x9 mul x13, x6, x9 # A[0] * A[2] mul x3, x6, x8 adds x12, x12, x3 umulh x4, x6, x8 adcs x13, x13, x4 # A[1] * A[3] mul x3, x7, x9 adcs x14, x14, x3 umulh x15, x7, x9 adc x15, x15, xzr # A[1] * A[2] mul x3, x7, x8 adds x13, x13, x3 umulh x4, x7, x8 adcs x14, x14, x4 # A[2] * A[3] mul x3, x8, x9 adcs x15, x15, x3 umulh x16, x8, x9 adc x16, x16, xzr # Double adds x11, x11, x11 adcs x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 adcs x15, x15, x15 adcs x16, x16, x16 adc x17, xzr, xzr # A[0] * A[0] umulh x4, x6, x6 mul x10, x6, x6 # A[1] * A[1] mul x3, x7, x7 adds x11, x11, x4 umulh x4, x7, x7 adcs x12, x12, x3 # A[2] * A[2] mul x3, x8, x8 
adcs x13, x13, x4 umulh x4, x8, x8 adcs x14, x14, x3 # A[3] * A[3] mul x3, x9, x9 adcs x15, x15, x4 umulh x4, x9, x9 adcs x16, x16, x3 adc x17, x17, x4 # Reduce mov x3, #38 mul x4, x3, x17 adds x13, x13, x4 umulh x5, x3, x17 adc x5, x5, xzr mov x3, #19 extr x5, x5, x13, #63 mul x5, x5, x3 and x13, x13, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x14 adds x10, x10, x4 umulh x14, x3, x14 mul x4, x3, x15 adcs x11, x11, x4 umulh x15, x3, x15 mul x4, x3, x16 adcs x12, x12, x4 umulh x16, x3, x16 adc x13, x13, xzr # Add high product results in adds x6, x10, x5 adcs x7, x11, x14 adcs x8, x12, x15 adc x9, x13, x16 subs x24, x24, #1 bne L_curve25519_inv_3 # Store stp x6, x7, [x29, #144] stp x8, x9, [x29, #160] #ifndef NDEBUG add x0, x29, #0x70 #endif /* !NDEBUG */ add x1, x29, #0x90 add x2, x29, #0x70 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ # Loop: 10 times mov x24, #10 ldp x6, x7, [x29, #112] ldp x8, x9, [x29, #128] L_curve25519_inv_4: # Square # A[0] * A[1] umulh x12, x6, x7 mul x11, x6, x7 # A[0] * A[3] umulh x14, x6, x9 mul x13, x6, x9 # A[0] * A[2] mul x3, x6, x8 adds x12, x12, x3 umulh x4, x6, x8 adcs x13, x13, x4 # A[1] * A[3] mul x3, x7, x9 adcs x14, x14, x3 umulh x15, x7, x9 adc x15, x15, xzr # A[1] * A[2] mul x3, x7, x8 adds x13, x13, x3 umulh x4, x7, x8 adcs x14, x14, x4 # A[2] * A[3] mul x3, x8, x9 adcs x15, x15, x3 umulh x16, x8, x9 adc x16, x16, xzr # Double adds x11, x11, x11 adcs x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 adcs x15, x15, x15 adcs x16, x16, x16 adc x17, xzr, xzr # A[0] * A[0] umulh x4, x6, x6 mul x10, x6, x6 # A[1] * A[1] mul x3, x7, x7 adds x11, x11, x4 umulh x4, x7, x7 adcs x12, x12, x3 # A[2] * A[2] mul x3, x8, x8 adcs x13, x13, x4 umulh x4, x8, x8 adcs x14, x14, x3 # A[3] * A[3] mul x3, x9, x9 adcs x15, x15, x4 umulh x4, x9, x9 adcs x16, x16, x3 adc x17, x17, x4 # Reduce mov x3, #38 mul x4, x3, x17 adds x13, x13, x4 umulh x5, x3, x17 adc x5, x5, xzr mov x3, #19 extr x5, x5, x13, #63 mul x5, x5, x3 and x13, x13, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x14 adds x10, x10, x4 umulh x14, x3, x14 mul x4, x3, x15 adcs x11, x11, x4 umulh x15, x3, x15 mul x4, x3, x16 adcs x12, x12, x4 umulh x16, x3, x16 adc x13, x13, xzr # Add high product results in adds x6, x10, x5 adcs x7, x11, x14 adcs x8, x12, x15 adc x9, x13, x16 subs x24, x24, #1 bne L_curve25519_inv_4 # Store stp x6, x7, [x29, #112] stp x8, x9, [x29, #128] add x0, x29, #0x50 add x1, x29, #0x70 add x2, x29, #0x50 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ # Loop: 50 times mov x24, #50 ldp x6, x7, [x29, #80] ldp x8, x9, [x29, #96] L_curve25519_inv_5: # Square # A[0] * A[1] umulh x12, x6, x7 mul x11, x6, x7 # A[0] * A[3] umulh x14, x6, x9 mul x13, x6, x9 # A[0] * A[2] mul x3, x6, x8 adds x12, x12, x3 umulh x4, x6, x8 adcs x13, x13, x4 # A[1] * A[3] mul x3, x7, x9 adcs x14, x14, x3 umulh x15, x7, x9 adc x15, x15, xzr # A[1] * A[2] mul x3, x7, x8 adds x13, x13, x3 umulh x4, x7, x8 adcs x14, x14, x4 # A[2] * A[3] mul x3, x8, x9 adcs x15, x15, x3 umulh x16, x8, x9 adc x16, x16, xzr # Double adds x11, x11, x11 adcs x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 adcs x15, x15, x15 adcs x16, x16, x16 adc x17, xzr, xzr # A[0] * A[0] umulh x4, x6, x6 mul x10, x6, x6 # A[1] * A[1] mul x3, x7, x7 adds x11, x11, x4 umulh x4, x7, x7 adcs x12, x12, x3 # A[2] * A[2] mul x3, x8, x8 adcs x13, x13, x4 umulh x4, x8, x8 adcs x14, x14, x3 # A[3] * A[3] mul x3, x9, x9 adcs x15, x15, x4 umulh x4, x9, x9 adcs x16, x16, x3 adc x17, x17, x4 # Reduce mov x3, #38 mul x4, x3, 
x17 adds x13, x13, x4 umulh x5, x3, x17 adc x5, x5, xzr mov x3, #19 extr x5, x5, x13, #63 mul x5, x5, x3 and x13, x13, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x14 adds x10, x10, x4 umulh x14, x3, x14 mul x4, x3, x15 adcs x11, x11, x4 umulh x15, x3, x15 mul x4, x3, x16 adcs x12, x12, x4 umulh x16, x3, x16 adc x13, x13, xzr # Add high product results in adds x6, x10, x5 adcs x7, x11, x14 adcs x8, x12, x15 adc x9, x13, x16 subs x24, x24, #1 bne L_curve25519_inv_5 # Store stp x6, x7, [x29, #112] stp x8, x9, [x29, #128] add x0, x29, #0x70 #ifndef NDEBUG add x1, x29, #0x70 #endif /* !NDEBUG */ add x2, x29, #0x50 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ # Loop: 100 times mov x24, #0x64 ldp x6, x7, [x29, #112] ldp x8, x9, [x29, #128] L_curve25519_inv_6: # Square # A[0] * A[1] umulh x12, x6, x7 mul x11, x6, x7 # A[0] * A[3] umulh x14, x6, x9 mul x13, x6, x9 # A[0] * A[2] mul x3, x6, x8 adds x12, x12, x3 umulh x4, x6, x8 adcs x13, x13, x4 # A[1] * A[3] mul x3, x7, x9 adcs x14, x14, x3 umulh x15, x7, x9 adc x15, x15, xzr # A[1] * A[2] mul x3, x7, x8 adds x13, x13, x3 umulh x4, x7, x8 adcs x14, x14, x4 # A[2] * A[3] mul x3, x8, x9 adcs x15, x15, x3 umulh x16, x8, x9 adc x16, x16, xzr # Double adds x11, x11, x11 adcs x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 adcs x15, x15, x15 adcs x16, x16, x16 adc x17, xzr, xzr # A[0] * A[0] umulh x4, x6, x6 mul x10, x6, x6 # A[1] * A[1] mul x3, x7, x7 adds x11, x11, x4 umulh x4, x7, x7 adcs x12, x12, x3 # A[2] * A[2] mul x3, x8, x8 adcs x13, x13, x4 umulh x4, x8, x8 adcs x14, x14, x3 # A[3] * A[3] mul x3, x9, x9 adcs x15, x15, x4 umulh x4, x9, x9 adcs x16, x16, x3 adc x17, x17, x4 # Reduce mov x3, #38 mul x4, x3, x17 adds x13, x13, x4 umulh x5, x3, x17 adc x5, x5, xzr mov x3, #19 extr x5, x5, x13, #63 mul x5, x5, x3 and x13, x13, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x14 adds x10, x10, x4 umulh x14, x3, x14 mul x4, x3, x15 adcs x11, x11, x4 umulh x15, x3, x15 mul x4, x3, x16 adcs x12, x12, x4 umulh x16, x3, x16 adc x13, x13, xzr # Add high product results in adds x6, x10, x5 adcs x7, x11, x14 adcs x8, x12, x15 adc x9, x13, x16 subs x24, x24, #1 bne L_curve25519_inv_6 # Store stp x6, x7, [x29, #144] stp x8, x9, [x29, #160] #ifndef NDEBUG add x0, x29, #0x70 #endif /* !NDEBUG */ add x1, x29, #0x90 add x2, x29, #0x70 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ # Loop: 50 times mov x24, #50 ldp x6, x7, [x29, #112] ldp x8, x9, [x29, #128] L_curve25519_inv_7: # Square # A[0] * A[1] umulh x12, x6, x7 mul x11, x6, x7 # A[0] * A[3] umulh x14, x6, x9 mul x13, x6, x9 # A[0] * A[2] mul x3, x6, x8 adds x12, x12, x3 umulh x4, x6, x8 adcs x13, x13, x4 # A[1] * A[3] mul x3, x7, x9 adcs x14, x14, x3 umulh x15, x7, x9 adc x15, x15, xzr # A[1] * A[2] mul x3, x7, x8 adds x13, x13, x3 umulh x4, x7, x8 adcs x14, x14, x4 # A[2] * A[3] mul x3, x8, x9 adcs x15, x15, x3 umulh x16, x8, x9 adc x16, x16, xzr # Double adds x11, x11, x11 adcs x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 adcs x15, x15, x15 adcs x16, x16, x16 adc x17, xzr, xzr # A[0] * A[0] umulh x4, x6, x6 mul x10, x6, x6 # A[1] * A[1] mul x3, x7, x7 adds x11, x11, x4 umulh x4, x7, x7 adcs x12, x12, x3 # A[2] * A[2] mul x3, x8, x8 adcs x13, x13, x4 umulh x4, x8, x8 adcs x14, x14, x3 # A[3] * A[3] mul x3, x9, x9 adcs x15, x15, x4 umulh x4, x9, x9 adcs x16, x16, x3 adc x17, x17, x4 # Reduce mov x3, #38 mul x4, x3, x17 adds x13, x13, x4 umulh x5, x3, x17 adc x5, x5, xzr mov x3, #19 extr x5, x5, x13, #63 mul x5, x5, x3 and x13, x13, #0x7fffffffffffffff mov 
x3, #38 mul x4, x3, x14 adds x10, x10, x4 umulh x14, x3, x14 mul x4, x3, x15 adcs x11, x11, x4 umulh x15, x3, x15 mul x4, x3, x16 adcs x12, x12, x4 umulh x16, x3, x16 adc x13, x13, xzr # Add high product results in adds x6, x10, x5 adcs x7, x11, x14 adcs x8, x12, x15 adc x9, x13, x16 subs x24, x24, #1 bne L_curve25519_inv_7 # Store stp x6, x7, [x29, #112] stp x8, x9, [x29, #128] add x0, x29, #0x50 add x1, x29, #0x70 add x2, x29, #0x50 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ # Loop: 5 times mov x24, #5 ldp x6, x7, [x29, #80] ldp x8, x9, [x29, #96] L_curve25519_inv_8: # Square # A[0] * A[1] umulh x12, x6, x7 mul x11, x6, x7 # A[0] * A[3] umulh x14, x6, x9 mul x13, x6, x9 # A[0] * A[2] mul x3, x6, x8 adds x12, x12, x3 umulh x4, x6, x8 adcs x13, x13, x4 # A[1] * A[3] mul x3, x7, x9 adcs x14, x14, x3 umulh x15, x7, x9 adc x15, x15, xzr # A[1] * A[2] mul x3, x7, x8 adds x13, x13, x3 umulh x4, x7, x8 adcs x14, x14, x4 # A[2] * A[3] mul x3, x8, x9 adcs x15, x15, x3 umulh x16, x8, x9 adc x16, x16, xzr # Double adds x11, x11, x11 adcs x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 adcs x15, x15, x15 adcs x16, x16, x16 adc x17, xzr, xzr # A[0] * A[0] umulh x4, x6, x6 mul x10, x6, x6 # A[1] * A[1] mul x3, x7, x7 adds x11, x11, x4 umulh x4, x7, x7 adcs x12, x12, x3 # A[2] * A[2] mul x3, x8, x8 adcs x13, x13, x4 umulh x4, x8, x8 adcs x14, x14, x3 # A[3] * A[3] mul x3, x9, x9 adcs x15, x15, x4 umulh x4, x9, x9 adcs x16, x16, x3 adc x17, x17, x4 # Reduce mov x3, #38 mul x4, x3, x17 adds x13, x13, x4 umulh x5, x3, x17 adc x5, x5, xzr mov x3, #19 extr x5, x5, x13, #63 mul x5, x5, x3 and x13, x13, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x14 adds x10, x10, x4 umulh x14, x3, x14 mul x4, x3, x15 adcs x11, x11, x4 umulh x15, x3, x15 mul x4, x3, x16 adcs x12, x12, x4 umulh x16, x3, x16 adc x13, x13, xzr # Add high product results in adds x6, x10, x5 adcs x7, x11, x14 adcs x8, x12, x15 adc x9, x13, x16 subs x24, x24, #1 bne L_curve25519_inv_8 # Store stp x6, x7, [x29, #80] stp x8, x9, [x29, #96] add x0, x29, #16 add x1, x29, #0x50 add x2, x29, #48 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ ldr x0, [x29, #176] # Multiply ldp x6, x7, [x0] ldp x8, x9, [x0, #16] ldp x10, x11, [x29, #16] ldp x12, x13, [x29, #32] # A[0] * B[0] umulh x15, x6, x10 mul x14, x6, x10 # A[2] * B[0] umulh x17, x8, x10 mul x16, x8, x10 # A[1] * B[0] mul x3, x7, x10 adds x15, x15, x3 umulh x4, x7, x10 adcs x16, x16, x4 # A[1] * B[3] umulh x20, x7, x13 adc x17, x17, xzr mul x19, x7, x13 # A[0] * B[1] mul x3, x6, x11 adds x15, x15, x3 umulh x4, x6, x11 adcs x16, x16, x4 # A[2] * B[1] mul x3, x8, x11 adcs x17, x17, x3 umulh x4, x8, x11 adcs x19, x19, x4 adc x20, x20, xzr # A[1] * B[2] mul x3, x7, x12 adds x17, x17, x3 umulh x4, x7, x12 adcs x19, x19, x4 adcs x20, x20, xzr adc x21, xzr, xzr # A[0] * B[2] mul x3, x6, x12 adds x16, x16, x3 umulh x4, x6, x12 adcs x17, x17, x4 adcs x19, x19, xzr adcs x20, x20, xzr adc x21, x21, xzr # A[1] * B[1] mul x3, x7, x11 adds x16, x16, x3 umulh x4, x7, x11 adcs x17, x17, x4 # A[3] * B[1] mul x3, x9, x11 adcs x19, x19, x3 umulh x4, x9, x11 adcs x20, x20, x4 adc x21, x21, xzr # A[2] * B[2] mul x3, x8, x12 adds x19, x19, x3 umulh x4, x8, x12 adcs x20, x20, x4 # A[3] * B[3] mul x3, x9, x13 adcs x21, x21, x3 umulh x22, x9, x13 adc x22, x22, xzr # A[0] * B[3] mul x3, x6, x13 adds x17, x17, x3 umulh x4, x6, x13 adcs x19, x19, x4 # A[2] * B[3] mul x3, x8, x13 adcs x20, x20, x3 umulh x4, x8, x13 adcs x21, x21, x4 adc x22, x22, xzr # A[3] * B[0] mul x3, x9, x10 
adds x17, x17, x3 umulh x4, x9, x10 adcs x19, x19, x4 # A[3] * B[2] mul x3, x9, x12 adcs x20, x20, x3 umulh x4, x9, x12 adcs x21, x21, x4 adc x22, x22, xzr # Reduce mov x3, #38 mul x4, x3, x22 adds x17, x17, x4 umulh x5, x3, x22 adc x5, x5, xzr mov x3, #19 extr x5, x5, x17, #63 mul x5, x5, x3 and x17, x17, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x19 adds x14, x14, x4 umulh x19, x3, x19 mul x4, x3, x20 adcs x15, x15, x4 umulh x20, x3, x20 mul x4, x3, x21 adcs x16, x16, x4 umulh x21, x3, x21 adc x17, x17, xzr # Add high product results in adds x14, x14, x5 adcs x15, x15, x19 adcs x16, x16, x20 adc x17, x17, x21 # Reduce if top bit set mov x3, #19 and x4, x3, x17, asr 63 adds x14, x14, x4 adcs x15, x15, xzr and x17, x17, #0x7fffffffffffffff adcs x16, x16, xzr adc x17, x17, xzr adds x4, x14, x3 adcs x4, x15, xzr adcs x4, x16, xzr adc x4, x17, xzr and x4, x3, x4, asr 63 adds x14, x14, x4 adcs x15, x15, xzr mov x4, #0x7fffffffffffffff adcs x16, x16, xzr adc x17, x17, xzr and x17, x17, x4 # Store stp x14, x15, [x0] stp x16, x17, [x0, #16] mov x0, xzr ldr x17, [x29, #200] ldr x19, [x29, #208] ldp x20, x21, [x29, #216] ldp x22, x23, [x29, #232] ldp x24, x25, [x29, #248] ldp x26, x27, [x29, #264] ldr x28, [x29, #280] ldp x29, x30, [sp], #0x120 ret #ifndef __APPLE__ .size curve25519,.-curve25519 #endif /* __APPLE__ */ #ifdef HAVE_ED25519 #ifndef __APPLE__ .text .globl fe_pow22523 .type fe_pow22523,@function .align 2 fe_pow22523: #else .section __TEXT,__text .globl _fe_pow22523 .p2align 2 _fe_pow22523: #endif /* __APPLE__ */ stp x29, x30, [sp, #-144]! add x29, sp, #0 str x17, [x29, #128] str x23, [x29, #136] # pow22523 str x0, [x29, #112] str x1, [x29, #120] add x0, x29, #16 #ifndef NDEBUG ldr x1, [x29, #120] #endif /* !NDEBUG */ #ifndef __APPLE__ bl fe_sq #else bl _fe_sq #endif /* __APPLE__ */ add x0, x29, #48 add x1, x29, #16 #ifndef __APPLE__ bl fe_sq #else bl _fe_sq #endif /* __APPLE__ */ #ifndef NDEBUG add x0, x29, #48 #endif /* !NDEBUG */ add x1, x29, #48 #ifndef __APPLE__ bl fe_sq #else bl _fe_sq #endif /* __APPLE__ */ #ifndef NDEBUG add x0, x29, #48 #endif /* !NDEBUG */ ldr x1, [x29, #120] add x2, x29, #48 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ add x0, x29, #16 add x1, x29, #16 add x2, x29, #48 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ #ifndef NDEBUG add x0, x29, #16 #endif /* !NDEBUG */ #ifndef NDEBUG add x1, x29, #16 #endif /* !NDEBUG */ #ifndef __APPLE__ bl fe_sq #else bl _fe_sq #endif /* __APPLE__ */ #ifndef NDEBUG add x0, x29, #16 #endif /* !NDEBUG */ add x1, x29, #48 add x2, x29, #16 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ # Loop: 5 times mov x23, #5 ldp x6, x7, [x29, #16] ldp x8, x9, [x29, #32] L_fe_pow22523_1: # Square # A[0] * A[1] umulh x12, x6, x7 mul x11, x6, x7 # A[0] * A[3] umulh x14, x6, x9 mul x13, x6, x9 # A[0] * A[2] mul x3, x6, x8 adds x12, x12, x3 umulh x4, x6, x8 adcs x13, x13, x4 # A[1] * A[3] mul x3, x7, x9 adcs x14, x14, x3 umulh x15, x7, x9 adc x15, x15, xzr # A[1] * A[2] mul x3, x7, x8 adds x13, x13, x3 umulh x4, x7, x8 adcs x14, x14, x4 # A[2] * A[3] mul x3, x8, x9 adcs x15, x15, x3 umulh x16, x8, x9 adc x16, x16, xzr # Double adds x11, x11, x11 adcs x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 adcs x15, x15, x15 adcs x16, x16, x16 adc x17, xzr, xzr # A[0] * A[0] umulh x4, x6, x6 mul x10, x6, x6 # A[1] * A[1] mul x3, x7, x7 adds x11, x11, x4 umulh x4, x7, x7 adcs x12, x12, x3 # A[2] * A[2] mul x3, x8, x8 adcs x13, x13, x4 umulh x4, x8, x8 adcs x14, x14, x3 # A[3] * A[3] mul 
x3, x9, x9 adcs x15, x15, x4 umulh x4, x9, x9 adcs x16, x16, x3 adc x17, x17, x4 # Reduce mov x3, #38 mul x4, x3, x17 adds x13, x13, x4 umulh x5, x3, x17 adc x5, x5, xzr mov x3, #19 extr x5, x5, x13, #63 mul x5, x5, x3 and x13, x13, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x14 adds x10, x10, x4 umulh x14, x3, x14 mul x4, x3, x15 adcs x11, x11, x4 umulh x15, x3, x15 mul x4, x3, x16 adcs x12, x12, x4 umulh x16, x3, x16 adc x13, x13, xzr # Add high product results in adds x6, x10, x5 adcs x7, x11, x14 adcs x8, x12, x15 adc x9, x13, x16 subs x23, x23, #1 bne L_fe_pow22523_1 # Store stp x6, x7, [x29, #48] stp x8, x9, [x29, #64] #ifndef NDEBUG add x0, x29, #16 #endif /* !NDEBUG */ #ifndef NDEBUG add x1, x29, #48 #endif /* !NDEBUG */ add x2, x29, #16 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ # Loop: 10 times mov x23, #10 ldp x6, x7, [x29, #16] ldp x8, x9, [x29, #32] L_fe_pow22523_2: # Square # A[0] * A[1] umulh x12, x6, x7 mul x11, x6, x7 # A[0] * A[3] umulh x14, x6, x9 mul x13, x6, x9 # A[0] * A[2] mul x3, x6, x8 adds x12, x12, x3 umulh x4, x6, x8 adcs x13, x13, x4 # A[1] * A[3] mul x3, x7, x9 adcs x14, x14, x3 umulh x15, x7, x9 adc x15, x15, xzr # A[1] * A[2] mul x3, x7, x8 adds x13, x13, x3 umulh x4, x7, x8 adcs x14, x14, x4 # A[2] * A[3] mul x3, x8, x9 adcs x15, x15, x3 umulh x16, x8, x9 adc x16, x16, xzr # Double adds x11, x11, x11 adcs x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 adcs x15, x15, x15 adcs x16, x16, x16 adc x17, xzr, xzr # A[0] * A[0] umulh x4, x6, x6 mul x10, x6, x6 # A[1] * A[1] mul x3, x7, x7 adds x11, x11, x4 umulh x4, x7, x7 adcs x12, x12, x3 # A[2] * A[2] mul x3, x8, x8 adcs x13, x13, x4 umulh x4, x8, x8 adcs x14, x14, x3 # A[3] * A[3] mul x3, x9, x9 adcs x15, x15, x4 umulh x4, x9, x9 adcs x16, x16, x3 adc x17, x17, x4 # Reduce mov x3, #38 mul x4, x3, x17 adds x13, x13, x4 umulh x5, x3, x17 adc x5, x5, xzr mov x3, #19 extr x5, x5, x13, #63 mul x5, x5, x3 and x13, x13, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x14 adds x10, x10, x4 umulh x14, x3, x14 mul x4, x3, x15 adcs x11, x11, x4 umulh x15, x3, x15 mul x4, x3, x16 adcs x12, x12, x4 umulh x16, x3, x16 adc x13, x13, xzr # Add high product results in adds x6, x10, x5 adcs x7, x11, x14 adcs x8, x12, x15 adc x9, x13, x16 subs x23, x23, #1 bne L_fe_pow22523_2 # Store stp x6, x7, [x29, #48] stp x8, x9, [x29, #64] add x0, x29, #48 #ifndef NDEBUG add x1, x29, #48 #endif /* !NDEBUG */ add x2, x29, #16 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ # Loop: 20 times mov x23, #20 ldp x6, x7, [x29, #48] ldp x8, x9, [x29, #64] L_fe_pow22523_3: # Square # A[0] * A[1] umulh x12, x6, x7 mul x11, x6, x7 # A[0] * A[3] umulh x14, x6, x9 mul x13, x6, x9 # A[0] * A[2] mul x3, x6, x8 adds x12, x12, x3 umulh x4, x6, x8 adcs x13, x13, x4 # A[1] * A[3] mul x3, x7, x9 adcs x14, x14, x3 umulh x15, x7, x9 adc x15, x15, xzr # A[1] * A[2] mul x3, x7, x8 adds x13, x13, x3 umulh x4, x7, x8 adcs x14, x14, x4 # A[2] * A[3] mul x3, x8, x9 adcs x15, x15, x3 umulh x16, x8, x9 adc x16, x16, xzr # Double adds x11, x11, x11 adcs x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 adcs x15, x15, x15 adcs x16, x16, x16 adc x17, xzr, xzr # A[0] * A[0] umulh x4, x6, x6 mul x10, x6, x6 # A[1] * A[1] mul x3, x7, x7 adds x11, x11, x4 umulh x4, x7, x7 adcs x12, x12, x3 # A[2] * A[2] mul x3, x8, x8 adcs x13, x13, x4 umulh x4, x8, x8 adcs x14, x14, x3 # A[3] * A[3] mul x3, x9, x9 adcs x15, x15, x4 umulh x4, x9, x9 adcs x16, x16, x3 adc x17, x17, x4 # Reduce mov x3, #38 mul x4, x3, x17 adds x13, x13, x4 umulh 
x5, x3, x17 adc x5, x5, xzr mov x3, #19 extr x5, x5, x13, #63 mul x5, x5, x3 and x13, x13, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x14 adds x10, x10, x4 umulh x14, x3, x14 mul x4, x3, x15 adcs x11, x11, x4 umulh x15, x3, x15 mul x4, x3, x16 adcs x12, x12, x4 umulh x16, x3, x16 adc x13, x13, xzr # Add high product results in adds x6, x10, x5 adcs x7, x11, x14 adcs x8, x12, x15 adc x9, x13, x16 subs x23, x23, #1 bne L_fe_pow22523_3 # Store stp x6, x7, [x29, #80] stp x8, x9, [x29, #96] #ifndef NDEBUG add x0, x29, #48 #endif /* !NDEBUG */ add x1, x29, #0x50 add x2, x29, #48 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ # Loop: 10 times mov x23, #10 ldp x6, x7, [x29, #48] ldp x8, x9, [x29, #64] L_fe_pow22523_4: # Square # A[0] * A[1] umulh x12, x6, x7 mul x11, x6, x7 # A[0] * A[3] umulh x14, x6, x9 mul x13, x6, x9 # A[0] * A[2] mul x3, x6, x8 adds x12, x12, x3 umulh x4, x6, x8 adcs x13, x13, x4 # A[1] * A[3] mul x3, x7, x9 adcs x14, x14, x3 umulh x15, x7, x9 adc x15, x15, xzr # A[1] * A[2] mul x3, x7, x8 adds x13, x13, x3 umulh x4, x7, x8 adcs x14, x14, x4 # A[2] * A[3] mul x3, x8, x9 adcs x15, x15, x3 umulh x16, x8, x9 adc x16, x16, xzr # Double adds x11, x11, x11 adcs x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 adcs x15, x15, x15 adcs x16, x16, x16 adc x17, xzr, xzr # A[0] * A[0] umulh x4, x6, x6 mul x10, x6, x6 # A[1] * A[1] mul x3, x7, x7 adds x11, x11, x4 umulh x4, x7, x7 adcs x12, x12, x3 # A[2] * A[2] mul x3, x8, x8 adcs x13, x13, x4 umulh x4, x8, x8 adcs x14, x14, x3 # A[3] * A[3] mul x3, x9, x9 adcs x15, x15, x4 umulh x4, x9, x9 adcs x16, x16, x3 adc x17, x17, x4 # Reduce mov x3, #38 mul x4, x3, x17 adds x13, x13, x4 umulh x5, x3, x17 adc x5, x5, xzr mov x3, #19 extr x5, x5, x13, #63 mul x5, x5, x3 and x13, x13, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x14 adds x10, x10, x4 umulh x14, x3, x14 mul x4, x3, x15 adcs x11, x11, x4 umulh x15, x3, x15 mul x4, x3, x16 adcs x12, x12, x4 umulh x16, x3, x16 adc x13, x13, xzr # Add high product results in adds x6, x10, x5 adcs x7, x11, x14 adcs x8, x12, x15 adc x9, x13, x16 subs x23, x23, #1 bne L_fe_pow22523_4 # Store stp x6, x7, [x29, #48] stp x8, x9, [x29, #64] add x0, x29, #16 add x1, x29, #48 add x2, x29, #16 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ # Loop: 50 times mov x23, #50 ldp x6, x7, [x29, #16] ldp x8, x9, [x29, #32] L_fe_pow22523_5: # Square # A[0] * A[1] umulh x12, x6, x7 mul x11, x6, x7 # A[0] * A[3] umulh x14, x6, x9 mul x13, x6, x9 # A[0] * A[2] mul x3, x6, x8 adds x12, x12, x3 umulh x4, x6, x8 adcs x13, x13, x4 # A[1] * A[3] mul x3, x7, x9 adcs x14, x14, x3 umulh x15, x7, x9 adc x15, x15, xzr # A[1] * A[2] mul x3, x7, x8 adds x13, x13, x3 umulh x4, x7, x8 adcs x14, x14, x4 # A[2] * A[3] mul x3, x8, x9 adcs x15, x15, x3 umulh x16, x8, x9 adc x16, x16, xzr # Double adds x11, x11, x11 adcs x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 adcs x15, x15, x15 adcs x16, x16, x16 adc x17, xzr, xzr # A[0] * A[0] umulh x4, x6, x6 mul x10, x6, x6 # A[1] * A[1] mul x3, x7, x7 adds x11, x11, x4 umulh x4, x7, x7 adcs x12, x12, x3 # A[2] * A[2] mul x3, x8, x8 adcs x13, x13, x4 umulh x4, x8, x8 adcs x14, x14, x3 # A[3] * A[3] mul x3, x9, x9 adcs x15, x15, x4 umulh x4, x9, x9 adcs x16, x16, x3 adc x17, x17, x4 # Reduce mov x3, #38 mul x4, x3, x17 adds x13, x13, x4 umulh x5, x3, x17 adc x5, x5, xzr mov x3, #19 extr x5, x5, x13, #63 mul x5, x5, x3 and x13, x13, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x14 adds x10, x10, x4 umulh x14, x3, x14 mul x4, x3, x15 adcs x11, x11, x4 
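# (reduce, continued: the upper limbs of the 512-bit square are multiplied by 38 and folded back into the low four limbs, using 2^256 == 38 (mod 2^255-19))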
umulh x15, x3, x15 mul x4, x3, x16 adcs x12, x12, x4 umulh x16, x3, x16 adc x13, x13, xzr # Add high product results in adds x6, x10, x5 adcs x7, x11, x14 adcs x8, x12, x15 adc x9, x13, x16 subs x23, x23, #1 bne L_fe_pow22523_5 # Store stp x6, x7, [x29, #48] stp x8, x9, [x29, #64] add x0, x29, #48 #ifndef NDEBUG add x1, x29, #48 #endif /* !NDEBUG */ add x2, x29, #16 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ # Loop: 100 times mov x23, #0x64 ldp x6, x7, [x29, #48] ldp x8, x9, [x29, #64] L_fe_pow22523_6: # Square # A[0] * A[1] umulh x12, x6, x7 mul x11, x6, x7 # A[0] * A[3] umulh x14, x6, x9 mul x13, x6, x9 # A[0] * A[2] mul x3, x6, x8 adds x12, x12, x3 umulh x4, x6, x8 adcs x13, x13, x4 # A[1] * A[3] mul x3, x7, x9 adcs x14, x14, x3 umulh x15, x7, x9 adc x15, x15, xzr # A[1] * A[2] mul x3, x7, x8 adds x13, x13, x3 umulh x4, x7, x8 adcs x14, x14, x4 # A[2] * A[3] mul x3, x8, x9 adcs x15, x15, x3 umulh x16, x8, x9 adc x16, x16, xzr # Double adds x11, x11, x11 adcs x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 adcs x15, x15, x15 adcs x16, x16, x16 adc x17, xzr, xzr # A[0] * A[0] umulh x4, x6, x6 mul x10, x6, x6 # A[1] * A[1] mul x3, x7, x7 adds x11, x11, x4 umulh x4, x7, x7 adcs x12, x12, x3 # A[2] * A[2] mul x3, x8, x8 adcs x13, x13, x4 umulh x4, x8, x8 adcs x14, x14, x3 # A[3] * A[3] mul x3, x9, x9 adcs x15, x15, x4 umulh x4, x9, x9 adcs x16, x16, x3 adc x17, x17, x4 # Reduce mov x3, #38 mul x4, x3, x17 adds x13, x13, x4 umulh x5, x3, x17 adc x5, x5, xzr mov x3, #19 extr x5, x5, x13, #63 mul x5, x5, x3 and x13, x13, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x14 adds x10, x10, x4 umulh x14, x3, x14 mul x4, x3, x15 adcs x11, x11, x4 umulh x15, x3, x15 mul x4, x3, x16 adcs x12, x12, x4 umulh x16, x3, x16 adc x13, x13, xzr # Add high product results in adds x6, x10, x5 adcs x7, x11, x14 adcs x8, x12, x15 adc x9, x13, x16 subs x23, x23, #1 bne L_fe_pow22523_6 # Store stp x6, x7, [x29, #80] stp x8, x9, [x29, #96] #ifndef NDEBUG add x0, x29, #48 #endif /* !NDEBUG */ add x1, x29, #0x50 add x2, x29, #48 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ # Loop: 50 times mov x23, #50 ldp x6, x7, [x29, #48] ldp x8, x9, [x29, #64] L_fe_pow22523_7: # Square # A[0] * A[1] umulh x12, x6, x7 mul x11, x6, x7 # A[0] * A[3] umulh x14, x6, x9 mul x13, x6, x9 # A[0] * A[2] mul x3, x6, x8 adds x12, x12, x3 umulh x4, x6, x8 adcs x13, x13, x4 # A[1] * A[3] mul x3, x7, x9 adcs x14, x14, x3 umulh x15, x7, x9 adc x15, x15, xzr # A[1] * A[2] mul x3, x7, x8 adds x13, x13, x3 umulh x4, x7, x8 adcs x14, x14, x4 # A[2] * A[3] mul x3, x8, x9 adcs x15, x15, x3 umulh x16, x8, x9 adc x16, x16, xzr # Double adds x11, x11, x11 adcs x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 adcs x15, x15, x15 adcs x16, x16, x16 adc x17, xzr, xzr # A[0] * A[0] umulh x4, x6, x6 mul x10, x6, x6 # A[1] * A[1] mul x3, x7, x7 adds x11, x11, x4 umulh x4, x7, x7 adcs x12, x12, x3 # A[2] * A[2] mul x3, x8, x8 adcs x13, x13, x4 umulh x4, x8, x8 adcs x14, x14, x3 # A[3] * A[3] mul x3, x9, x9 adcs x15, x15, x4 umulh x4, x9, x9 adcs x16, x16, x3 adc x17, x17, x4 # Reduce mov x3, #38 mul x4, x3, x17 adds x13, x13, x4 umulh x5, x3, x17 adc x5, x5, xzr mov x3, #19 extr x5, x5, x13, #63 mul x5, x5, x3 and x13, x13, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x14 adds x10, x10, x4 umulh x14, x3, x14 mul x4, x3, x15 adcs x11, x11, x4 umulh x15, x3, x15 mul x4, x3, x16 adcs x12, x12, x4 umulh x16, x3, x16 adc x13, x13, xzr # Add high product results in adds x6, x10, x5 adcs x7, x11, x14 adcs x8, x12, x15 
adc x9, x13, x16 subs x23, x23, #1 bne L_fe_pow22523_7 # Store stp x6, x7, [x29, #48] stp x8, x9, [x29, #64] add x0, x29, #16 add x1, x29, #48 add x2, x29, #16 #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ #ifndef NDEBUG add x0, x29, #16 #endif /* !NDEBUG */ add x1, x29, #16 #ifndef __APPLE__ bl fe_sq #else bl _fe_sq #endif /* __APPLE__ */ #ifndef __APPLE__ bl fe_sq #else bl _fe_sq #endif /* __APPLE__ */ ldr x0, [x29, #112] #ifndef NDEBUG add x1, x29, #16 #endif /* !NDEBUG */ ldr x2, [x29, #120] #ifndef __APPLE__ bl fe_mul #else bl _fe_mul #endif /* __APPLE__ */ ldr x17, [x29, #128] ldr x23, [x29, #136] ldp x29, x30, [sp], #0x90 ret #ifndef __APPLE__ .size fe_pow22523,.-fe_pow22523 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl ge_p1p1_to_p2 .type ge_p1p1_to_p2,@function .align 2 ge_p1p1_to_p2: #else .section __TEXT,__text .globl _ge_p1p1_to_p2 .p2align 2 _ge_p1p1_to_p2: #endif /* __APPLE__ */ stp x29, x30, [sp, #-80]! add x29, sp, #0 str x17, [x29, #40] str x19, [x29, #48] stp x20, x21, [x29, #56] str x22, [x29, #72] str x0, [x29, #16] str x1, [x29, #24] mov x2, x1 add x1, x1, #0x60 # Multiply ldp x10, x11, [x1] ldp x12, x13, [x1, #16] ldp x6, x7, [x2] ldp x8, x9, [x2, #16] # A[0] * B[0] umulh x15, x10, x6 mul x14, x10, x6 # A[2] * B[0] umulh x17, x12, x6 mul x16, x12, x6 # A[1] * B[0] mul x3, x11, x6 adds x15, x15, x3 umulh x4, x11, x6 adcs x16, x16, x4 # A[1] * B[3] umulh x20, x11, x9 adc x17, x17, xzr mul x19, x11, x9 # A[0] * B[1] mul x3, x10, x7 adds x15, x15, x3 umulh x4, x10, x7 adcs x16, x16, x4 # A[2] * B[1] mul x3, x12, x7 adcs x17, x17, x3 umulh x4, x12, x7 adcs x19, x19, x4 adc x20, x20, xzr # A[1] * B[2] mul x3, x11, x8 adds x17, x17, x3 umulh x4, x11, x8 adcs x19, x19, x4 adcs x20, x20, xzr adc x21, xzr, xzr # A[0] * B[2] mul x3, x10, x8 adds x16, x16, x3 umulh x4, x10, x8 adcs x17, x17, x4 adcs x19, x19, xzr adcs x20, x20, xzr adc x21, x21, xzr # A[1] * B[1] mul x3, x11, x7 adds x16, x16, x3 umulh x4, x11, x7 adcs x17, x17, x4 # A[3] * B[1] mul x3, x13, x7 adcs x19, x19, x3 umulh x4, x13, x7 adcs x20, x20, x4 adc x21, x21, xzr # A[2] * B[2] mul x3, x12, x8 adds x19, x19, x3 umulh x4, x12, x8 adcs x20, x20, x4 # A[3] * B[3] mul x3, x13, x9 adcs x21, x21, x3 umulh x22, x13, x9 adc x22, x22, xzr # A[0] * B[3] mul x3, x10, x9 adds x17, x17, x3 umulh x4, x10, x9 adcs x19, x19, x4 # A[2] * B[3] mul x3, x12, x9 adcs x20, x20, x3 umulh x4, x12, x9 adcs x21, x21, x4 adc x22, x22, xzr # A[3] * B[0] mul x3, x13, x6 adds x17, x17, x3 umulh x4, x13, x6 adcs x19, x19, x4 # A[3] * B[2] mul x3, x13, x8 adcs x20, x20, x3 umulh x4, x13, x8 adcs x21, x21, x4 adc x22, x22, xzr # Reduce mov x3, #38 mul x4, x3, x22 adds x17, x17, x4 umulh x5, x3, x22 adc x5, x5, xzr mov x3, #19 extr x5, x5, x17, #63 mul x5, x5, x3 and x17, x17, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x19 adds x14, x14, x4 umulh x19, x3, x19 mul x4, x3, x20 adcs x15, x15, x4 umulh x20, x3, x20 mul x4, x3, x21 adcs x16, x16, x4 umulh x21, x3, x21 adc x17, x17, xzr # Add high product results in adds x14, x14, x5 adcs x15, x15, x19 adcs x16, x16, x20 adc x17, x17, x21 # Store stp x14, x15, [x0] stp x16, x17, [x0, #16] sub x2, x1, #32 add x0, x0, #0x40 # Multiply ldp x6, x7, [x2] ldp x8, x9, [x2, #16] # A[0] * B[0] umulh x15, x10, x6 mul x14, x10, x6 # A[2] * B[0] umulh x17, x12, x6 mul x16, x12, x6 # A[1] * B[0] mul x3, x11, x6 adds x15, x15, x3 umulh x4, x11, x6 adcs x16, x16, x4 # A[1] * B[3] umulh x20, x11, x9 adc x17, x17, xzr mul x19, x11, x9 # A[0] * B[1] mul x3, x10, x7 adds x15, x15, x3 umulh 
x4, x10, x7 adcs x16, x16, x4 # A[2] * B[1] mul x3, x12, x7 adcs x17, x17, x3 umulh x4, x12, x7 adcs x19, x19, x4 adc x20, x20, xzr # A[1] * B[2] mul x3, x11, x8 adds x17, x17, x3 umulh x4, x11, x8 adcs x19, x19, x4 adcs x20, x20, xzr adc x21, xzr, xzr # A[0] * B[2] mul x3, x10, x8 adds x16, x16, x3 umulh x4, x10, x8 adcs x17, x17, x4 adcs x19, x19, xzr adcs x20, x20, xzr adc x21, x21, xzr # A[1] * B[1] mul x3, x11, x7 adds x16, x16, x3 umulh x4, x11, x7 adcs x17, x17, x4 # A[3] * B[1] mul x3, x13, x7 adcs x19, x19, x3 umulh x4, x13, x7 adcs x20, x20, x4 adc x21, x21, xzr # A[2] * B[2] mul x3, x12, x8 adds x19, x19, x3 umulh x4, x12, x8 adcs x20, x20, x4 # A[3] * B[3] mul x3, x13, x9 adcs x21, x21, x3 umulh x22, x13, x9 adc x22, x22, xzr # A[0] * B[3] mul x3, x10, x9 adds x17, x17, x3 umulh x4, x10, x9 adcs x19, x19, x4 # A[2] * B[3] mul x3, x12, x9 adcs x20, x20, x3 umulh x4, x12, x9 adcs x21, x21, x4 adc x22, x22, xzr # A[3] * B[0] mul x3, x13, x6 adds x17, x17, x3 umulh x4, x13, x6 adcs x19, x19, x4 # A[3] * B[2] mul x3, x13, x8 adcs x20, x20, x3 umulh x4, x13, x8 adcs x21, x21, x4 adc x22, x22, xzr # Reduce mov x3, #38 mul x4, x3, x22 adds x17, x17, x4 umulh x5, x3, x22 adc x5, x5, xzr mov x3, #19 extr x5, x5, x17, #63 mul x5, x5, x3 and x17, x17, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x19 adds x14, x14, x4 umulh x19, x3, x19 mul x4, x3, x20 adcs x15, x15, x4 umulh x20, x3, x20 mul x4, x3, x21 adcs x16, x16, x4 umulh x21, x3, x21 adc x17, x17, xzr # Add high product results in adds x14, x14, x5 adcs x15, x15, x19 adcs x16, x16, x20 adc x17, x17, x21 # Store stp x14, x15, [x0] stp x16, x17, [x0, #16] sub x1, x1, #0x40 sub x0, x0, #32 # Multiply ldp x10, x11, [x1] ldp x12, x13, [x1, #16] # A[0] * B[0] umulh x15, x10, x6 mul x14, x10, x6 # A[2] * B[0] umulh x17, x12, x6 mul x16, x12, x6 # A[1] * B[0] mul x3, x11, x6 adds x15, x15, x3 umulh x4, x11, x6 adcs x16, x16, x4 # A[1] * B[3] umulh x20, x11, x9 adc x17, x17, xzr mul x19, x11, x9 # A[0] * B[1] mul x3, x10, x7 adds x15, x15, x3 umulh x4, x10, x7 adcs x16, x16, x4 # A[2] * B[1] mul x3, x12, x7 adcs x17, x17, x3 umulh x4, x12, x7 adcs x19, x19, x4 adc x20, x20, xzr # A[1] * B[2] mul x3, x11, x8 adds x17, x17, x3 umulh x4, x11, x8 adcs x19, x19, x4 adcs x20, x20, xzr adc x21, xzr, xzr # A[0] * B[2] mul x3, x10, x8 adds x16, x16, x3 umulh x4, x10, x8 adcs x17, x17, x4 adcs x19, x19, xzr adcs x20, x20, xzr adc x21, x21, xzr # A[1] * B[1] mul x3, x11, x7 adds x16, x16, x3 umulh x4, x11, x7 adcs x17, x17, x4 # A[3] * B[1] mul x3, x13, x7 adcs x19, x19, x3 umulh x4, x13, x7 adcs x20, x20, x4 adc x21, x21, xzr # A[2] * B[2] mul x3, x12, x8 adds x19, x19, x3 umulh x4, x12, x8 adcs x20, x20, x4 # A[3] * B[3] mul x3, x13, x9 adcs x21, x21, x3 umulh x22, x13, x9 adc x22, x22, xzr # A[0] * B[3] mul x3, x10, x9 adds x17, x17, x3 umulh x4, x10, x9 adcs x19, x19, x4 # A[2] * B[3] mul x3, x12, x9 adcs x20, x20, x3 umulh x4, x12, x9 adcs x21, x21, x4 adc x22, x22, xzr # A[3] * B[0] mul x3, x13, x6 adds x17, x17, x3 umulh x4, x13, x6 adcs x19, x19, x4 # A[3] * B[2] mul x3, x13, x8 adcs x20, x20, x3 umulh x4, x13, x8 adcs x21, x21, x4 adc x22, x22, xzr # Reduce mov x3, #38 mul x4, x3, x22 adds x17, x17, x4 umulh x5, x3, x22 adc x5, x5, xzr mov x3, #19 extr x5, x5, x17, #63 mul x5, x5, x3 and x17, x17, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x19 adds x14, x14, x4 umulh x19, x3, x19 mul x4, x3, x20 adcs x15, x15, x4 umulh x20, x3, x20 mul x4, x3, x21 adcs x16, x16, x4 umulh x21, x3, x21 adc x17, x17, xzr # Add high product results in adds x14, 
x14, x5 adcs x15, x15, x19 adcs x16, x16, x20 adc x17, x17, x21 # Store stp x14, x15, [x0] stp x16, x17, [x0, #16] ldr x17, [x29, #40] ldr x19, [x29, #48] ldp x20, x21, [x29, #56] ldr x22, [x29, #72] ldp x29, x30, [sp], #0x50 ret #ifndef __APPLE__ .size ge_p1p1_to_p2,.-ge_p1p1_to_p2 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl ge_p1p1_to_p3 .type ge_p1p1_to_p3,@function .align 2 ge_p1p1_to_p3: #else .section __TEXT,__text .globl _ge_p1p1_to_p3 .p2align 2 _ge_p1p1_to_p3: #endif /* __APPLE__ */ stp x29, x30, [sp, #-112]! add x29, sp, #0 str x17, [x29, #40] str x19, [x29, #48] stp x20, x21, [x29, #56] stp x22, x23, [x29, #72] stp x24, x25, [x29, #88] str x26, [x29, #104] str x0, [x29, #16] str x1, [x29, #24] mov x2, x1 add x1, x1, #0x60 # Multiply ldp x10, x11, [x1] ldp x12, x13, [x1, #16] ldp x6, x7, [x2] ldp x8, x9, [x2, #16] # A[0] * B[0] umulh x15, x10, x6 mul x14, x10, x6 # A[2] * B[0] umulh x17, x12, x6 mul x16, x12, x6 # A[1] * B[0] mul x3, x11, x6 adds x15, x15, x3 umulh x4, x11, x6 adcs x16, x16, x4 # A[1] * B[3] umulh x20, x11, x9 adc x17, x17, xzr mul x19, x11, x9 # A[0] * B[1] mul x3, x10, x7 adds x15, x15, x3 umulh x4, x10, x7 adcs x16, x16, x4 # A[2] * B[1] mul x3, x12, x7 adcs x17, x17, x3 umulh x4, x12, x7 adcs x19, x19, x4 adc x20, x20, xzr # A[1] * B[2] mul x3, x11, x8 adds x17, x17, x3 umulh x4, x11, x8 adcs x19, x19, x4 adcs x20, x20, xzr adc x21, xzr, xzr # A[0] * B[2] mul x3, x10, x8 adds x16, x16, x3 umulh x4, x10, x8 adcs x17, x17, x4 adcs x19, x19, xzr adcs x20, x20, xzr adc x21, x21, xzr # A[1] * B[1] mul x3, x11, x7 adds x16, x16, x3 umulh x4, x11, x7 adcs x17, x17, x4 # A[3] * B[1] mul x3, x13, x7 adcs x19, x19, x3 umulh x4, x13, x7 adcs x20, x20, x4 adc x21, x21, xzr # A[2] * B[2] mul x3, x12, x8 adds x19, x19, x3 umulh x4, x12, x8 adcs x20, x20, x4 # A[3] * B[3] mul x3, x13, x9 adcs x21, x21, x3 umulh x22, x13, x9 adc x22, x22, xzr # A[0] * B[3] mul x3, x10, x9 adds x17, x17, x3 umulh x4, x10, x9 adcs x19, x19, x4 # A[2] * B[3] mul x3, x12, x9 adcs x20, x20, x3 umulh x4, x12, x9 adcs x21, x21, x4 adc x22, x22, xzr # A[3] * B[0] mul x3, x13, x6 adds x17, x17, x3 umulh x4, x13, x6 adcs x19, x19, x4 # A[3] * B[2] mul x3, x13, x8 adcs x20, x20, x3 umulh x4, x13, x8 adcs x21, x21, x4 adc x22, x22, xzr # Reduce mov x3, #38 mul x4, x3, x22 adds x17, x17, x4 umulh x5, x3, x22 adc x5, x5, xzr mov x3, #19 extr x5, x5, x17, #63 mul x5, x5, x3 and x17, x17, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x19 adds x14, x14, x4 umulh x19, x3, x19 mul x4, x3, x20 adcs x15, x15, x4 umulh x20, x3, x20 mul x4, x3, x21 adcs x16, x16, x4 umulh x21, x3, x21 adc x17, x17, xzr # Add high product results in adds x14, x14, x5 adcs x15, x15, x19 adcs x16, x16, x20 adc x17, x17, x21 # Store stp x14, x15, [x0] stp x16, x17, [x0, #16] sub x1, x1, #0x40 add x0, x0, #0x60 # Multiply ldp x23, x24, [x1] ldp x25, x26, [x1, #16] # A[0] * B[0] umulh x15, x23, x6 mul x14, x23, x6 # A[2] * B[0] umulh x17, x25, x6 mul x16, x25, x6 # A[1] * B[0] mul x3, x24, x6 adds x15, x15, x3 umulh x4, x24, x6 adcs x16, x16, x4 # A[1] * B[3] umulh x20, x24, x9 adc x17, x17, xzr mul x19, x24, x9 # A[0] * B[1] mul x3, x23, x7 adds x15, x15, x3 umulh x4, x23, x7 adcs x16, x16, x4 # A[2] * B[1] mul x3, x25, x7 adcs x17, x17, x3 umulh x4, x25, x7 adcs x19, x19, x4 adc x20, x20, xzr # A[1] * B[2] mul x3, x24, x8 adds x17, x17, x3 umulh x4, x24, x8 adcs x19, x19, x4 adcs x20, x20, xzr adc x21, xzr, xzr # A[0] * B[2] mul x3, x23, x8 adds x16, x16, x3 umulh x4, x23, x8 adcs x17, x17, x4 adcs x19, x19, xzr adcs x20, x20, 
xzr adc x21, x21, xzr # A[1] * B[1] mul x3, x24, x7 adds x16, x16, x3 umulh x4, x24, x7 adcs x17, x17, x4 # A[3] * B[1] mul x3, x26, x7 adcs x19, x19, x3 umulh x4, x26, x7 adcs x20, x20, x4 adc x21, x21, xzr # A[2] * B[2] mul x3, x25, x8 adds x19, x19, x3 umulh x4, x25, x8 adcs x20, x20, x4 # A[3] * B[3] mul x3, x26, x9 adcs x21, x21, x3 umulh x22, x26, x9 adc x22, x22, xzr # A[0] * B[3] mul x3, x23, x9 adds x17, x17, x3 umulh x4, x23, x9 adcs x19, x19, x4 # A[2] * B[3] mul x3, x25, x9 adcs x20, x20, x3 umulh x4, x25, x9 adcs x21, x21, x4 adc x22, x22, xzr # A[3] * B[0] mul x3, x26, x6 adds x17, x17, x3 umulh x4, x26, x6 adcs x19, x19, x4 # A[3] * B[2] mul x3, x26, x8 adcs x20, x20, x3 umulh x4, x26, x8 adcs x21, x21, x4 adc x22, x22, xzr # Reduce mov x3, #38 mul x4, x3, x22 adds x17, x17, x4 umulh x5, x3, x22 adc x5, x5, xzr mov x3, #19 extr x5, x5, x17, #63 mul x5, x5, x3 and x17, x17, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x19 adds x14, x14, x4 umulh x19, x3, x19 mul x4, x3, x20 adcs x15, x15, x4 umulh x20, x3, x20 mul x4, x3, x21 adcs x16, x16, x4 umulh x21, x3, x21 adc x17, x17, xzr # Add high product results in adds x14, x14, x5 adcs x15, x15, x19 adcs x16, x16, x20 adc x17, x17, x21 # Store stp x14, x15, [x0] stp x16, x17, [x0, #16] add x2, x1, #32 sub x0, x0, #0x40 # Multiply ldp x6, x7, [x2] ldp x8, x9, [x2, #16] # A[0] * B[0] umulh x15, x23, x6 mul x14, x23, x6 # A[2] * B[0] umulh x17, x25, x6 mul x16, x25, x6 # A[1] * B[0] mul x3, x24, x6 adds x15, x15, x3 umulh x4, x24, x6 adcs x16, x16, x4 # A[1] * B[3] umulh x20, x24, x9 adc x17, x17, xzr mul x19, x24, x9 # A[0] * B[1] mul x3, x23, x7 adds x15, x15, x3 umulh x4, x23, x7 adcs x16, x16, x4 # A[2] * B[1] mul x3, x25, x7 adcs x17, x17, x3 umulh x4, x25, x7 adcs x19, x19, x4 adc x20, x20, xzr # A[1] * B[2] mul x3, x24, x8 adds x17, x17, x3 umulh x4, x24, x8 adcs x19, x19, x4 adcs x20, x20, xzr adc x21, xzr, xzr # A[0] * B[2] mul x3, x23, x8 adds x16, x16, x3 umulh x4, x23, x8 adcs x17, x17, x4 adcs x19, x19, xzr adcs x20, x20, xzr adc x21, x21, xzr # A[1] * B[1] mul x3, x24, x7 adds x16, x16, x3 umulh x4, x24, x7 adcs x17, x17, x4 # A[3] * B[1] mul x3, x26, x7 adcs x19, x19, x3 umulh x4, x26, x7 adcs x20, x20, x4 adc x21, x21, xzr # A[2] * B[2] mul x3, x25, x8 adds x19, x19, x3 umulh x4, x25, x8 adcs x20, x20, x4 # A[3] * B[3] mul x3, x26, x9 adcs x21, x21, x3 umulh x22, x26, x9 adc x22, x22, xzr # A[0] * B[3] mul x3, x23, x9 adds x17, x17, x3 umulh x4, x23, x9 adcs x19, x19, x4 # A[2] * B[3] mul x3, x25, x9 adcs x20, x20, x3 umulh x4, x25, x9 adcs x21, x21, x4 adc x22, x22, xzr # A[3] * B[0] mul x3, x26, x6 adds x17, x17, x3 umulh x4, x26, x6 adcs x19, x19, x4 # A[3] * B[2] mul x3, x26, x8 adcs x20, x20, x3 umulh x4, x26, x8 adcs x21, x21, x4 adc x22, x22, xzr # Reduce mov x3, #38 mul x4, x3, x22 adds x17, x17, x4 umulh x5, x3, x22 adc x5, x5, xzr mov x3, #19 extr x5, x5, x17, #63 mul x5, x5, x3 and x17, x17, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x19 adds x14, x14, x4 umulh x19, x3, x19 mul x4, x3, x20 adcs x15, x15, x4 umulh x20, x3, x20 mul x4, x3, x21 adcs x16, x16, x4 umulh x21, x3, x21 adc x17, x17, xzr # Add high product results in adds x14, x14, x5 adcs x15, x15, x19 adcs x16, x16, x20 adc x17, x17, x21 # Store stp x14, x15, [x0] stp x16, x17, [x0, #16] add x1, x1, #0x40 add x0, x0, #32 # Multiply # A[0] * B[0] umulh x15, x10, x6 mul x14, x10, x6 # A[2] * B[0] umulh x17, x12, x6 mul x16, x12, x6 # A[1] * B[0] mul x3, x11, x6 adds x15, x15, x3 umulh x4, x11, x6 adcs x16, x16, x4 # A[1] * B[3] umulh x20, x11, x9 
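# (multiply, continued: the remaining 64x64-bit partial products are accumulated with carries and the 512-bit result is then reduced mod 2^255-19, as in the multiplies above)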
adc x17, x17, xzr mul x19, x11, x9 # A[0] * B[1] mul x3, x10, x7 adds x15, x15, x3 umulh x4, x10, x7 adcs x16, x16, x4 # A[2] * B[1] mul x3, x12, x7 adcs x17, x17, x3 umulh x4, x12, x7 adcs x19, x19, x4 adc x20, x20, xzr # A[1] * B[2] mul x3, x11, x8 adds x17, x17, x3 umulh x4, x11, x8 adcs x19, x19, x4 adcs x20, x20, xzr adc x21, xzr, xzr # A[0] * B[2] mul x3, x10, x8 adds x16, x16, x3 umulh x4, x10, x8 adcs x17, x17, x4 adcs x19, x19, xzr adcs x20, x20, xzr adc x21, x21, xzr # A[1] * B[1] mul x3, x11, x7 adds x16, x16, x3 umulh x4, x11, x7 adcs x17, x17, x4 # A[3] * B[1] mul x3, x13, x7 adcs x19, x19, x3 umulh x4, x13, x7 adcs x20, x20, x4 adc x21, x21, xzr # A[2] * B[2] mul x3, x12, x8 adds x19, x19, x3 umulh x4, x12, x8 adcs x20, x20, x4 # A[3] * B[3] mul x3, x13, x9 adcs x21, x21, x3 umulh x22, x13, x9 adc x22, x22, xzr # A[0] * B[3] mul x3, x10, x9 adds x17, x17, x3 umulh x4, x10, x9 adcs x19, x19, x4 # A[2] * B[3] mul x3, x12, x9 adcs x20, x20, x3 umulh x4, x12, x9 adcs x21, x21, x4 adc x22, x22, xzr # A[3] * B[0] mul x3, x13, x6 adds x17, x17, x3 umulh x4, x13, x6 adcs x19, x19, x4 # A[3] * B[2] mul x3, x13, x8 adcs x20, x20, x3 umulh x4, x13, x8 adcs x21, x21, x4 adc x22, x22, xzr # Reduce mov x3, #38 mul x4, x3, x22 adds x17, x17, x4 umulh x5, x3, x22 adc x5, x5, xzr mov x3, #19 extr x5, x5, x17, #63 mul x5, x5, x3 and x17, x17, #0x7fffffffffffffff mov x3, #38 mul x4, x3, x19 adds x14, x14, x4 umulh x19, x3, x19 mul x4, x3, x20 adcs x15, x15, x4 umulh x20, x3, x20 mul x4, x3, x21 adcs x16, x16, x4 umulh x21, x3, x21 adc x17, x17, xzr # Add high product results in adds x14, x14, x5 adcs x15, x15, x19 adcs x16, x16, x20 adc x17, x17, x21 # Store stp x14, x15, [x0] stp x16, x17, [x0, #16] ldr x17, [x29, #40] ldr x19, [x29, #48] ldp x20, x21, [x29, #56] ldp x22, x23, [x29, #72] ldp x24, x25, [x29, #88] ldr x26, [x29, #104] ldp x29, x30, [sp], #0x70 ret #ifndef __APPLE__ .size ge_p1p1_to_p3,.-ge_p1p1_to_p3 #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl ge_p2_dbl .type ge_p2_dbl,@function .align 2 ge_p2_dbl: #else .section __TEXT,__text .globl _ge_p2_dbl .p2align 2 _ge_p2_dbl: #endif /* __APPLE__ */ stp x29, x30, [sp, #-128]! 
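# ge_p2_dbl: point doubling in projective (P2) coordinates, with the result left in completed (P1P1) form; the body below is built from field squarings, an add/sub pair and a doubled square, each reduced mod 2^255-19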
add x29, sp, #0 str x17, [x29, #40] str x19, [x29, #48] stp x20, x21, [x29, #56] stp x22, x23, [x29, #72] stp x24, x25, [x29, #88] stp x26, x27, [x29, #104] str x28, [x29, #120] str x0, [x29, #16] str x1, [x29, #24] add x0, x0, #0x40 # Square ldp x4, x5, [x1] ldp x6, x7, [x1, #16] # A[0] * A[1] umulh x10, x4, x5 mul x9, x4, x5 # A[0] * A[3] umulh x12, x4, x7 mul x11, x4, x7 # A[0] * A[2] mul x25, x4, x6 adds x10, x10, x25 umulh x26, x4, x6 adcs x11, x11, x26 # A[1] * A[3] mul x25, x5, x7 adcs x12, x12, x25 umulh x13, x5, x7 adc x13, x13, xzr # A[1] * A[2] mul x25, x5, x6 adds x11, x11, x25 umulh x26, x5, x6 adcs x12, x12, x26 # A[2] * A[3] mul x25, x6, x7 adcs x13, x13, x25 umulh x14, x6, x7 adc x14, x14, xzr # Double adds x9, x9, x9 adcs x10, x10, x10 adcs x11, x11, x11 adcs x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 adc x15, xzr, xzr # A[0] * A[0] umulh x26, x4, x4 mul x8, x4, x4 # A[1] * A[1] mul x25, x5, x5 adds x9, x9, x26 umulh x26, x5, x5 adcs x10, x10, x25 # A[2] * A[2] mul x25, x6, x6 adcs x11, x11, x26 umulh x26, x6, x6 adcs x12, x12, x25 # A[3] * A[3] mul x25, x7, x7 adcs x13, x13, x26 umulh x26, x7, x7 adcs x14, x14, x25 adc x15, x15, x26 # Reduce mov x25, #38 mul x26, x25, x15 adds x11, x11, x26 umulh x27, x25, x15 adc x27, x27, xzr mov x25, #19 extr x27, x27, x11, #63 mul x27, x27, x25 and x11, x11, #0x7fffffffffffffff mov x25, #38 mul x26, x25, x12 adds x8, x8, x26 umulh x12, x25, x12 mul x26, x25, x13 adcs x9, x9, x26 umulh x13, x25, x13 mul x26, x25, x14 adcs x10, x10, x26 umulh x14, x25, x14 adc x11, x11, xzr # Add high product results in adds x8, x8, x27 adcs x9, x9, x12 adcs x10, x10, x13 adc x11, x11, x14 # Store stp x8, x9, [x0] stp x10, x11, [x0, #16] add x2, x1, #32 sub x0, x0, #32 # Square ldp x16, x17, [x2] ldp x19, x20, [x2, #16] # A[0] * A[1] umulh x23, x16, x17 mul x22, x16, x17 # A[0] * A[3] umulh x4, x16, x20 mul x24, x16, x20 # A[0] * A[2] mul x25, x16, x19 adds x23, x23, x25 umulh x26, x16, x19 adcs x24, x24, x26 # A[1] * A[3] mul x25, x17, x20 adcs x4, x4, x25 umulh x5, x17, x20 adc x5, x5, xzr # A[1] * A[2] mul x25, x17, x19 adds x24, x24, x25 umulh x26, x17, x19 adcs x4, x4, x26 # A[2] * A[3] mul x25, x19, x20 adcs x5, x5, x25 umulh x6, x19, x20 adc x6, x6, xzr # Double adds x22, x22, x22 adcs x23, x23, x23 adcs x24, x24, x24 adcs x4, x4, x4 adcs x5, x5, x5 adcs x6, x6, x6 adc x7, xzr, xzr # A[0] * A[0] umulh x26, x16, x16 mul x21, x16, x16 # A[1] * A[1] mul x25, x17, x17 adds x22, x22, x26 umulh x26, x17, x17 adcs x23, x23, x25 # A[2] * A[2] mul x25, x19, x19 adcs x24, x24, x26 umulh x26, x19, x19 adcs x4, x4, x25 # A[3] * A[3] mul x25, x20, x20 adcs x5, x5, x26 umulh x26, x20, x20 adcs x6, x6, x25 adc x7, x7, x26 # Reduce mov x25, #38 mul x26, x25, x7 adds x24, x24, x26 umulh x27, x25, x7 adc x27, x27, xzr mov x25, #19 extr x27, x27, x24, #63 mul x27, x27, x25 and x24, x24, #0x7fffffffffffffff mov x25, #38 mul x26, x25, x4 adds x21, x21, x26 umulh x4, x25, x4 mul x26, x25, x5 adcs x22, x22, x26 umulh x5, x25, x5 mul x26, x25, x6 adcs x23, x23, x26 umulh x6, x25, x6 adc x24, x24, xzr # Add high product results in adds x21, x21, x27 adcs x22, x22, x4 adcs x23, x23, x5 adc x24, x24, x6 add x3, x0, #32 mov x2, x0 add x1, x0, #32 # Add adds x4, x21, x8 adcs x5, x22, x9 adcs x6, x23, x10 adcs x7, x24, x11 cset x28, cs mov x25, #19 extr x28, x28, x7, #63 mul x25, x28, x25 # Sub modulus (if overflow) adds x4, x4, x25 adcs x5, x5, xzr and x7, x7, #0x7fffffffffffffff adcs x6, x6, xzr adc x7, x7, xzr # Sub subs x12, x21, x8 sbcs x13, x22, x9 sbcs x14, 
x23, x10 sbcs x15, x24, x11 csetm x28, cc mov x25, #-19 extr x28, x28, x15, #63 mul x25, x28, x25 # Add modulus (if underflow) subs x12, x12, x25 sbcs x13, x13, xzr and x15, x15, #0x7fffffffffffffff sbcs x14, x14, xzr sbc x15, x15, xzr stp x4, x5, [x0] stp x6, x7, [x0, #16] stp x12, x13, [x1] stp x14, x15, [x1, #16] ldr x1, [x29, #24] add x2, x1, #32 sub x0, x0, #32 # Add ldp x8, x9, [x1] ldp x10, x11, [x1, #16] adds x8, x8, x16 adcs x9, x9, x17 adcs x10, x10, x19 adcs x11, x11, x20 cset x28, cs mov x25, #19 # Mask the modulus extr x28, x28, x11, #63 mul x25, x28, x25 # Sub modulus (if overflow) adds x8, x8, x25 adcs x9, x9, xzr and x11, x11, #0x7fffffffffffffff adcs x10, x10, xzr adc x11, x11, xzr mov x1, x0 # Square # A[0] * A[1] umulh x23, x8, x9 mul x22, x8, x9 # A[0] * A[3] umulh x4, x8, x11 mul x24, x8, x11 # A[0] * A[2] mul x25, x8, x10 adds x23, x23, x25 umulh x26, x8, x10 adcs x24, x24, x26 # A[1] * A[3] mul x25, x9, x11 adcs x4, x4, x25 umulh x5, x9, x11 adc x5, x5, xzr # A[1] * A[2] mul x25, x9, x10 adds x24, x24, x25 umulh x26, x9, x10 adcs x4, x4, x26 # A[2] * A[3] mul x25, x10, x11 adcs x5, x5, x25 umulh x6, x10, x11 adc x6, x6, xzr # Double adds x22, x22, x22 adcs x23, x23, x23 adcs x24, x24, x24 adcs x4, x4, x4 adcs x5, x5, x5 adcs x6, x6, x6 adc x7, xzr, xzr # A[0] * A[0] umulh x26, x8, x8 mul x21, x8, x8 # A[1] * A[1] mul x25, x9, x9 adds x22, x22, x26 umulh x26, x9, x9 adcs x23, x23, x25 # A[2] * A[2] mul x25, x10, x10 adcs x24, x24, x26 umulh x26, x10, x10 adcs x4, x4, x25 # A[3] * A[3] mul x25, x11, x11 adcs x5, x5, x26 umulh x26, x11, x11 adcs x6, x6, x25 adc x7, x7, x26 # Reduce mov x25, #38 mul x26, x25, x7 adds x24, x24, x26 umulh x27, x25, x7 adc x27, x27, xzr mov x25, #19 extr x27, x27, x24, #63 mul x27, x27, x25 and x24, x24, #0x7fffffffffffffff mov x25, #38 mul x26, x25, x4 adds x21, x21, x26 umulh x4, x25, x4 mul x26, x25, x5 adcs x22, x22, x26 umulh x5, x25, x5 mul x26, x25, x6 adcs x23, x23, x26 umulh x6, x25, x6 adc x24, x24, xzr # Add high product results in adds x21, x21, x27 adcs x22, x22, x4 adcs x23, x23, x5 adc x24, x24, x6 add x2, x0, #32 # Sub ldp x8, x9, [x2] ldp x10, x11, [x2, #16] subs x21, x21, x8 sbcs x22, x22, x9 sbcs x23, x23, x10 sbcs x24, x24, x11 csetm x28, cc mov x25, #-19 # Mask the modulus extr x28, x28, x24, #63 mul x25, x28, x25 # Add modulus (if underflow) subs x21, x21, x25 sbcs x22, x22, xzr and x24, x24, #0x7fffffffffffffff sbcs x23, x23, xzr sbc x24, x24, xzr stp x21, x22, [x0] stp x23, x24, [x0, #16] ldr x2, [x29, #24] add x2, x2, #0x40 add x0, x0, #0x60 # Square * 2 ldp x16, x17, [x2] ldp x19, x20, [x2, #16] # A[0] * A[1] umulh x6, x16, x17 mul x5, x16, x17 # A[0] * A[3] umulh x8, x16, x20 mul x7, x16, x20 # A[0] * A[2] mul x25, x16, x19 adds x6, x6, x25 umulh x26, x16, x19 adcs x7, x7, x26 # A[1] * A[3] mul x25, x17, x20 adcs x8, x8, x25 umulh x9, x17, x20 adc x9, x9, xzr # A[1] * A[2] mul x25, x17, x19 adds x7, x7, x25 umulh x26, x17, x19 adcs x8, x8, x26 # A[2] * A[3] mul x25, x19, x20 adcs x9, x9, x25 umulh x10, x19, x20 adc x10, x10, xzr # Double adds x5, x5, x5 adcs x6, x6, x6 adcs x7, x7, x7 adcs x8, x8, x8 adcs x9, x9, x9 adcs x10, x10, x10 adc x11, xzr, xzr # A[0] * A[0] umulh x26, x16, x16 mul x4, x16, x16 # A[1] * A[1] mul x25, x17, x17 adds x5, x5, x26 umulh x26, x17, x17 adcs x6, x6, x25 # A[2] * A[2] mul x25, x19, x19 adcs x7, x7, x26 umulh x26, x19, x19 adcs x8, x8, x25 # A[3] * A[3] mul x25, x20, x20 adcs x9, x9, x26 umulh x26, x20, x20 adcs x10, x10, x25 adc x11, x11, x26 # Reduce mov x25, #38 mul x26, x25, x11 
adds x7, x7, x26 umulh x27, x25, x11 adc x27, x27, xzr mov x25, #19 extr x27, x27, x7, #63 mul x27, x27, x25 and x7, x7, #0x7fffffffffffffff mov x25, #38 mul x26, x25, x8 adds x4, x4, x26 umulh x8, x25, x8 mul x26, x25, x9 adcs x5, x5, x26 umulh x9, x25, x9 mul x26, x25, x10 adcs x6, x6, x26 umulh x10, x25, x10 adc x7, x7, xzr # Add high product results in adds x4, x4, x27 adcs x5, x5, x8 adcs x6, x6, x9 adc x7, x7, x10 mov x25, #19 lsr x26, x7, #62 extr x7, x7, x6, #63 extr x6, x6, x5, #63 extr x5, x5, x4, #63 lsl x4, x4, #1 mul x26, x26, x25 adds x4, x4, x26 adcs x5, x5, xzr and x7, x7, #0x7fffffffffffffff adcs x6, x6, xzr adc x7, x7, xzr # Store sub x1, x0, #32 # Sub subs x4, x4, x12 sbcs x5, x5, x13 sbcs x6, x6, x14 sbcs x7, x7, x15 csetm x28, cc mov x25, #-19 # Mask the modulus extr x28, x28, x7, #63 mul x25, x28, x25 # Add modulus (if underflow) subs x4, x4, x25 sbcs x5, x5, xzr and x7, x7, #0x7fffffffffffffff sbcs x6, x6, xzr sbc x7, x7, xzr stp x4, x5, [x0] stp x6, x7, [x0, #16] ldr x17, [x29, #40] ldr x19, [x29, #48] ldp x20, x21, [x29, #56] ldp x22, x23, [x29, #72] ldp x24, x25, [x29, #88] ldp x26, x27, [x29, #104] ldr x28, [x29, #120] ldp x29, x30, [sp], #0x80 ret #ifndef __APPLE__ .size ge_p2_dbl,.-ge_p2_dbl #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl ge_madd .type ge_madd,@function .align 2 ge_madd: #else .section __TEXT,__text .globl _ge_madd .p2align 2 _ge_madd: #endif /* __APPLE__ */ stp x29, x30, [sp, #-144]! add x29, sp, #0 str x17, [x29, #56] str x19, [x29, #64] stp x20, x21, [x29, #72] stp x22, x23, [x29, #88] stp x24, x25, [x29, #104] stp x26, x27, [x29, #120] str x28, [x29, #136] str x0, [x29, #16] str x1, [x29, #24] str x2, [x29, #32] mov x3, x1 add x2, x1, #32 add x1, x0, #32 # Add ldp x8, x9, [x2] ldp x10, x11, [x2, #16] ldp x4, x5, [x3] ldp x6, x7, [x3, #16] adds x16, x8, x4 adcs x17, x9, x5 adcs x19, x10, x6 adcs x20, x11, x7 cset x28, cs mov x25, #19 extr x28, x28, x20, #63 mul x25, x28, x25 # Sub modulus (if overflow) adds x16, x16, x25 adcs x17, x17, xzr and x20, x20, #0x7fffffffffffffff adcs x19, x19, xzr adc x20, x20, xzr # Sub subs x12, x8, x4 sbcs x13, x9, x5 sbcs x14, x10, x6 sbcs x15, x11, x7 csetm x28, cc mov x25, #-19 extr x28, x28, x15, #63 mul x25, x28, x25 # Add modulus (if underflow) subs x12, x12, x25 sbcs x13, x13, xzr and x15, x15, #0x7fffffffffffffff sbcs x14, x14, xzr sbc x15, x15, xzr ldr x2, [x29, #32] mov x1, x0 # Multiply ldp x8, x9, [x2] ldp x10, x11, [x2, #16] # A[0] * B[0] umulh x22, x16, x8 mul x21, x16, x8 # A[2] * B[0] umulh x24, x19, x8 mul x23, x19, x8 # A[1] * B[0] mul x25, x17, x8 adds x22, x22, x25 umulh x26, x17, x8 adcs x23, x23, x26 # A[1] * B[3] umulh x5, x17, x11 adc x24, x24, xzr mul x4, x17, x11 # A[0] * B[1] mul x25, x16, x9 adds x22, x22, x25 umulh x26, x16, x9 adcs x23, x23, x26 # A[2] * B[1] mul x25, x19, x9 adcs x24, x24, x25 umulh x26, x19, x9 adcs x4, x4, x26 adc x5, x5, xzr # A[1] * B[2] mul x25, x17, x10 adds x24, x24, x25 umulh x26, x17, x10 adcs x4, x4, x26 adcs x5, x5, xzr adc x6, xzr, xzr # A[0] * B[2] mul x25, x16, x10 adds x23, x23, x25 umulh x26, x16, x10 adcs x24, x24, x26 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr # A[1] * B[1] mul x25, x17, x9 adds x23, x23, x25 umulh x26, x17, x9 adcs x24, x24, x26 # A[3] * B[1] mul x25, x20, x9 adcs x4, x4, x25 umulh x26, x20, x9 adcs x5, x5, x26 adc x6, x6, xzr # A[2] * B[2] mul x25, x19, x10 adds x4, x4, x25 umulh x26, x19, x10 adcs x5, x5, x26 # A[3] * B[3] mul x25, x20, x11 adcs x6, x6, x25 umulh x7, x20, x11 adc x7, x7, xzr # A[0] * B[3] mul x25, 
x16, x11 adds x24, x24, x25 umulh x26, x16, x11 adcs x4, x4, x26 # A[2] * B[3] mul x25, x19, x11 adcs x5, x5, x25 umulh x26, x19, x11 adcs x6, x6, x26 adc x7, x7, xzr # A[3] * B[0] mul x25, x20, x8 adds x24, x24, x25 umulh x26, x20, x8 adcs x4, x4, x26 # A[3] * B[2] mul x25, x20, x10 adcs x5, x5, x25 umulh x26, x20, x10 adcs x6, x6, x26 adc x7, x7, xzr # Reduce mov x25, #38 mul x26, x25, x7 adds x24, x24, x26 umulh x27, x25, x7 adc x27, x27, xzr mov x25, #19 extr x27, x27, x24, #63 mul x27, x27, x25 and x24, x24, #0x7fffffffffffffff mov x25, #38 mul x26, x25, x4 adds x21, x21, x26 umulh x4, x25, x4 mul x26, x25, x5 adcs x22, x22, x26 umulh x5, x25, x5 mul x26, x25, x6 adcs x23, x23, x26 umulh x6, x25, x6 adc x24, x24, xzr # Add high product results in adds x21, x21, x27 adcs x22, x22, x4 adcs x23, x23, x5 adc x24, x24, x6 add x2, x2, #32 add x1, x0, #32 add x0, x0, #32 # Multiply ldp x16, x17, [x2] ldp x19, x20, [x2, #16] # A[0] * B[0] umulh x5, x12, x16 mul x4, x12, x16 # A[2] * B[0] umulh x7, x14, x16 mul x6, x14, x16 # A[1] * B[0] mul x25, x13, x16 adds x5, x5, x25 umulh x26, x13, x16 adcs x6, x6, x26 # A[1] * B[3] umulh x9, x13, x20 adc x7, x7, xzr mul x8, x13, x20 # A[0] * B[1] mul x25, x12, x17 adds x5, x5, x25 umulh x26, x12, x17 adcs x6, x6, x26 # A[2] * B[1] mul x25, x14, x17 adcs x7, x7, x25 umulh x26, x14, x17 adcs x8, x8, x26 adc x9, x9, xzr # A[1] * B[2] mul x25, x13, x19 adds x7, x7, x25 umulh x26, x13, x19 adcs x8, x8, x26 adcs x9, x9, xzr adc x10, xzr, xzr # A[0] * B[2] mul x25, x12, x19 adds x6, x6, x25 umulh x26, x12, x19 adcs x7, x7, x26 adcs x8, x8, xzr adcs x9, x9, xzr adc x10, x10, xzr # A[1] * B[1] mul x25, x13, x17 adds x6, x6, x25 umulh x26, x13, x17 adcs x7, x7, x26 # A[3] * B[1] mul x25, x15, x17 adcs x8, x8, x25 umulh x26, x15, x17 adcs x9, x9, x26 adc x10, x10, xzr # A[2] * B[2] mul x25, x14, x19 adds x8, x8, x25 umulh x26, x14, x19 adcs x9, x9, x26 # A[3] * B[3] mul x25, x15, x20 adcs x10, x10, x25 umulh x11, x15, x20 adc x11, x11, xzr # A[0] * B[3] mul x25, x12, x20 adds x7, x7, x25 umulh x26, x12, x20 adcs x8, x8, x26 # A[2] * B[3] mul x25, x14, x20 adcs x9, x9, x25 umulh x26, x14, x20 adcs x10, x10, x26 adc x11, x11, xzr # A[3] * B[0] mul x25, x15, x16 adds x7, x7, x25 umulh x26, x15, x16 adcs x8, x8, x26 # A[3] * B[2] mul x25, x15, x19 adcs x9, x9, x25 umulh x26, x15, x19 adcs x10, x10, x26 adc x11, x11, xzr # Reduce mov x25, #38 mul x26, x25, x11 adds x7, x7, x26 umulh x27, x25, x11 adc x27, x27, xzr mov x25, #19 extr x27, x27, x7, #63 mul x27, x27, x25 and x7, x7, #0x7fffffffffffffff mov x25, #38 mul x26, x25, x8 adds x4, x4, x26 umulh x8, x25, x8 mul x26, x25, x9 adcs x5, x5, x26 umulh x9, x25, x9 mul x26, x25, x10 adcs x6, x6, x26 umulh x10, x25, x10 adc x7, x7, xzr # Add high product results in adds x4, x4, x27 adcs x5, x5, x8 adcs x6, x6, x9 adc x7, x7, x10 mov x3, x0 sub x2, x0, #32 sub x1, x0, #32 # Add adds x8, x21, x4 adcs x9, x22, x5 adcs x10, x23, x6 adcs x11, x24, x7 cset x28, cs mov x25, #19 extr x28, x28, x11, #63 mul x25, x28, x25 # Sub modulus (if overflow) adds x8, x8, x25 adcs x9, x9, xzr and x11, x11, #0x7fffffffffffffff adcs x10, x10, xzr adc x11, x11, xzr # Sub subs x12, x21, x4 sbcs x13, x22, x5 sbcs x14, x23, x6 sbcs x15, x24, x7 csetm x28, cc mov x25, #-19 extr x28, x28, x15, #63 mul x25, x28, x25 # Add modulus (if underflow) subs x12, x12, x25 sbcs x13, x13, xzr and x15, x15, #0x7fffffffffffffff sbcs x14, x14, xzr sbc x15, x15, xzr stp x8, x9, [x0] stp x10, x11, [x0, #16] stp x12, x13, [x1] stp x14, x15, [x1, #16] ldr x1, [x29, 
#24] ldr x2, [x29, #32] add x2, x2, #0x40 add x1, x1, #0x60 add x0, x0, #0x40 # Multiply ldp x21, x22, [x1] ldp x23, x24, [x1, #16] ldp x4, x5, [x2] ldp x6, x7, [x2, #16] # A[0] * B[0] umulh x17, x21, x4 mul x16, x21, x4 # A[2] * B[0] umulh x20, x23, x4 mul x19, x23, x4 # A[1] * B[0] mul x25, x22, x4 adds x17, x17, x25 umulh x26, x22, x4 adcs x19, x19, x26 # A[1] * B[3] umulh x9, x22, x7 adc x20, x20, xzr mul x8, x22, x7 # A[0] * B[1] mul x25, x21, x5 adds x17, x17, x25 umulh x26, x21, x5 adcs x19, x19, x26 # A[2] * B[1] mul x25, x23, x5 adcs x20, x20, x25 umulh x26, x23, x5 adcs x8, x8, x26 adc x9, x9, xzr # A[1] * B[2] mul x25, x22, x6 adds x20, x20, x25 umulh x26, x22, x6 adcs x8, x8, x26 adcs x9, x9, xzr adc x10, xzr, xzr # A[0] * B[2] mul x25, x21, x6 adds x19, x19, x25 umulh x26, x21, x6 adcs x20, x20, x26 adcs x8, x8, xzr adcs x9, x9, xzr adc x10, x10, xzr # A[1] * B[1] mul x25, x22, x5 adds x19, x19, x25 umulh x26, x22, x5 adcs x20, x20, x26 # A[3] * B[1] mul x25, x24, x5 adcs x8, x8, x25 umulh x26, x24, x5 adcs x9, x9, x26 adc x10, x10, xzr # A[2] * B[2] mul x25, x23, x6 adds x8, x8, x25 umulh x26, x23, x6 adcs x9, x9, x26 # A[3] * B[3] mul x25, x24, x7 adcs x10, x10, x25 umulh x11, x24, x7 adc x11, x11, xzr # A[0] * B[3] mul x25, x21, x7 adds x20, x20, x25 umulh x26, x21, x7 adcs x8, x8, x26 # A[2] * B[3] mul x25, x23, x7 adcs x9, x9, x25 umulh x26, x23, x7 adcs x10, x10, x26 adc x11, x11, xzr # A[3] * B[0] mul x25, x24, x4 adds x20, x20, x25 umulh x26, x24, x4 adcs x8, x8, x26 # A[3] * B[2] mul x25, x24, x6 adcs x9, x9, x25 umulh x26, x24, x6 adcs x10, x10, x26 adc x11, x11, xzr # Reduce mov x25, #38 mul x26, x25, x11 adds x20, x20, x26 umulh x27, x25, x11 adc x27, x27, xzr mov x25, #19 extr x27, x27, x20, #63 mul x27, x27, x25 and x20, x20, #0x7fffffffffffffff mov x25, #38 mul x26, x25, x8 adds x16, x16, x26 umulh x8, x25, x8 mul x26, x25, x9 adcs x17, x17, x26 umulh x9, x25, x9 mul x26, x25, x10 adcs x19, x19, x26 umulh x10, x25, x10 adc x20, x20, xzr # Add high product results in adds x16, x16, x27 adcs x17, x17, x8 adcs x19, x19, x9 adc x20, x20, x10 sub x1, x1, #32 # Double ldp x12, x13, [x1] ldp x14, x15, [x1, #16] adds x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 adc x15, x15, x15 mov x25, #-19 asr x28, x15, #63 # Mask the modulus and x25, x28, x25 and x26, x28, #0x7fffffffffffffff # Sub modulus (if overflow) subs x12, x12, x25 sbcs x13, x13, x28 sbcs x14, x14, x28 sbc x15, x15, x26 mov x3, x0 sub x2, x0, #32 mov x1, x0 sub x0, x0, #32 # Add adds x8, x12, x16 adcs x9, x13, x17 adcs x10, x14, x19 adcs x11, x15, x20 cset x28, cs mov x25, #19 extr x28, x28, x11, #63 mul x25, x28, x25 # Sub modulus (if overflow) adds x8, x8, x25 adcs x9, x9, xzr and x11, x11, #0x7fffffffffffffff adcs x10, x10, xzr adc x11, x11, xzr # Sub subs x4, x12, x16 sbcs x5, x13, x17 sbcs x6, x14, x19 sbcs x7, x15, x20 csetm x28, cc mov x25, #-19 extr x28, x28, x7, #63 mul x25, x28, x25 # Add modulus (if underflow) subs x4, x4, x25 sbcs x5, x5, xzr and x7, x7, #0x7fffffffffffffff sbcs x6, x6, xzr sbc x7, x7, xzr stp x8, x9, [x0] stp x10, x11, [x0, #16] stp x4, x5, [x1] stp x6, x7, [x1, #16] ldr x17, [x29, #56] ldr x19, [x29, #64] ldp x20, x21, [x29, #72] ldp x22, x23, [x29, #88] ldp x24, x25, [x29, #104] ldp x26, x27, [x29, #120] ldr x28, [x29, #136] ldp x29, x30, [sp], #0x90 ret #ifndef __APPLE__ .size ge_madd,.-ge_madd #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl ge_msub .type ge_msub,@function .align 2 ge_msub: #else .section __TEXT,__text .globl _ge_msub .p2align 2 _ge_msub: #endif 
/* __APPLE__ */ stp x29, x30, [sp, #-144]! add x29, sp, #0 str x17, [x29, #56] str x19, [x29, #64] stp x20, x21, [x29, #72] stp x22, x23, [x29, #88] stp x24, x25, [x29, #104] stp x26, x27, [x29, #120] str x28, [x29, #136] str x0, [x29, #16] str x1, [x29, #24] str x2, [x29, #32] mov x3, x1 add x2, x1, #32 add x1, x0, #32 # Add ldp x8, x9, [x2] ldp x10, x11, [x2, #16] ldp x4, x5, [x3] ldp x6, x7, [x3, #16] adds x16, x8, x4 adcs x17, x9, x5 adcs x19, x10, x6 adcs x20, x11, x7 cset x28, cs mov x25, #19 extr x28, x28, x20, #63 mul x25, x28, x25 # Sub modulus (if overflow) adds x16, x16, x25 adcs x17, x17, xzr and x20, x20, #0x7fffffffffffffff adcs x19, x19, xzr adc x20, x20, xzr # Sub subs x12, x8, x4 sbcs x13, x9, x5 sbcs x14, x10, x6 sbcs x15, x11, x7 csetm x28, cc mov x25, #-19 extr x28, x28, x15, #63 mul x25, x28, x25 # Add modulus (if underflow) subs x12, x12, x25 sbcs x13, x13, xzr and x15, x15, #0x7fffffffffffffff sbcs x14, x14, xzr sbc x15, x15, xzr ldr x2, [x29, #32] add x2, x2, #32 mov x1, x0 # Multiply ldp x8, x9, [x2] ldp x10, x11, [x2, #16] # A[0] * B[0] umulh x22, x16, x8 mul x21, x16, x8 # A[2] * B[0] umulh x24, x19, x8 mul x23, x19, x8 # A[1] * B[0] mul x25, x17, x8 adds x22, x22, x25 umulh x26, x17, x8 adcs x23, x23, x26 # A[1] * B[3] umulh x5, x17, x11 adc x24, x24, xzr mul x4, x17, x11 # A[0] * B[1] mul x25, x16, x9 adds x22, x22, x25 umulh x26, x16, x9 adcs x23, x23, x26 # A[2] * B[1] mul x25, x19, x9 adcs x24, x24, x25 umulh x26, x19, x9 adcs x4, x4, x26 adc x5, x5, xzr # A[1] * B[2] mul x25, x17, x10 adds x24, x24, x25 umulh x26, x17, x10 adcs x4, x4, x26 adcs x5, x5, xzr adc x6, xzr, xzr # A[0] * B[2] mul x25, x16, x10 adds x23, x23, x25 umulh x26, x16, x10 adcs x24, x24, x26 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr # A[1] * B[1] mul x25, x17, x9 adds x23, x23, x25 umulh x26, x17, x9 adcs x24, x24, x26 # A[3] * B[1] mul x25, x20, x9 adcs x4, x4, x25 umulh x26, x20, x9 adcs x5, x5, x26 adc x6, x6, xzr # A[2] * B[2] mul x25, x19, x10 adds x4, x4, x25 umulh x26, x19, x10 adcs x5, x5, x26 # A[3] * B[3] mul x25, x20, x11 adcs x6, x6, x25 umulh x7, x20, x11 adc x7, x7, xzr # A[0] * B[3] mul x25, x16, x11 adds x24, x24, x25 umulh x26, x16, x11 adcs x4, x4, x26 # A[2] * B[3] mul x25, x19, x11 adcs x5, x5, x25 umulh x26, x19, x11 adcs x6, x6, x26 adc x7, x7, xzr # A[3] * B[0] mul x25, x20, x8 adds x24, x24, x25 umulh x26, x20, x8 adcs x4, x4, x26 # A[3] * B[2] mul x25, x20, x10 adcs x5, x5, x25 umulh x26, x20, x10 adcs x6, x6, x26 adc x7, x7, xzr # Reduce mov x25, #38 mul x26, x25, x7 adds x24, x24, x26 umulh x27, x25, x7 adc x27, x27, xzr mov x25, #19 extr x27, x27, x24, #63 mul x27, x27, x25 and x24, x24, #0x7fffffffffffffff mov x25, #38 mul x26, x25, x4 adds x21, x21, x26 umulh x4, x25, x4 mul x26, x25, x5 adcs x22, x22, x26 umulh x5, x25, x5 mul x26, x25, x6 adcs x23, x23, x26 umulh x6, x25, x6 adc x24, x24, xzr # Add high product results in adds x21, x21, x27 adcs x22, x22, x4 adcs x23, x23, x5 adc x24, x24, x6 sub x2, x2, #32 add x1, x0, #32 add x0, x0, #32 # Multiply ldp x16, x17, [x2] ldp x19, x20, [x2, #16] # A[0] * B[0] umulh x5, x12, x16 mul x4, x12, x16 # A[2] * B[0] umulh x7, x14, x16 mul x6, x14, x16 # A[1] * B[0] mul x25, x13, x16 adds x5, x5, x25 umulh x26, x13, x16 adcs x6, x6, x26 # A[1] * B[3] umulh x9, x13, x20 adc x7, x7, xzr mul x8, x13, x20 # A[0] * B[1] mul x25, x12, x17 adds x5, x5, x25 umulh x26, x12, x17 adcs x6, x6, x26 # A[2] * B[1] mul x25, x14, x17 adcs x7, x7, x25 umulh x26, x14, x17 adcs x8, x8, x26 adc x9, x9, xzr # A[1] * B[2] mul x25, 
x13, x19 adds x7, x7, x25 umulh x26, x13, x19 adcs x8, x8, x26 adcs x9, x9, xzr adc x10, xzr, xzr # A[0] * B[2] mul x25, x12, x19 adds x6, x6, x25 umulh x26, x12, x19 adcs x7, x7, x26 adcs x8, x8, xzr adcs x9, x9, xzr adc x10, x10, xzr # A[1] * B[1] mul x25, x13, x17 adds x6, x6, x25 umulh x26, x13, x17 adcs x7, x7, x26 # A[3] * B[1] mul x25, x15, x17 adcs x8, x8, x25 umulh x26, x15, x17 adcs x9, x9, x26 adc x10, x10, xzr # A[2] * B[2] mul x25, x14, x19 adds x8, x8, x25 umulh x26, x14, x19 adcs x9, x9, x26 # A[3] * B[3] mul x25, x15, x20 adcs x10, x10, x25 umulh x11, x15, x20 adc x11, x11, xzr # A[0] * B[3] mul x25, x12, x20 adds x7, x7, x25 umulh x26, x12, x20 adcs x8, x8, x26 # A[2] * B[3] mul x25, x14, x20 adcs x9, x9, x25 umulh x26, x14, x20 adcs x10, x10, x26 adc x11, x11, xzr # A[3] * B[0] mul x25, x15, x16 adds x7, x7, x25 umulh x26, x15, x16 adcs x8, x8, x26 # A[3] * B[2] mul x25, x15, x19 adcs x9, x9, x25 umulh x26, x15, x19 adcs x10, x10, x26 adc x11, x11, xzr # Reduce mov x25, #38 mul x26, x25, x11 adds x7, x7, x26 umulh x27, x25, x11 adc x27, x27, xzr mov x25, #19 extr x27, x27, x7, #63 mul x27, x27, x25 and x7, x7, #0x7fffffffffffffff mov x25, #38 mul x26, x25, x8 adds x4, x4, x26 umulh x8, x25, x8 mul x26, x25, x9 adcs x5, x5, x26 umulh x9, x25, x9 mul x26, x25, x10 adcs x6, x6, x26 umulh x10, x25, x10 adc x7, x7, xzr # Add high product results in adds x4, x4, x27 adcs x5, x5, x8 adcs x6, x6, x9 adc x7, x7, x10 mov x3, x0 sub x2, x0, #32 sub x1, x0, #32 # Add adds x8, x21, x4 adcs x9, x22, x5 adcs x10, x23, x6 adcs x11, x24, x7 cset x28, cs mov x25, #19 extr x28, x28, x11, #63 mul x25, x28, x25 # Sub modulus (if overflow) adds x8, x8, x25 adcs x9, x9, xzr and x11, x11, #0x7fffffffffffffff adcs x10, x10, xzr adc x11, x11, xzr # Sub subs x12, x21, x4 sbcs x13, x22, x5 sbcs x14, x23, x6 sbcs x15, x24, x7 csetm x28, cc mov x25, #-19 extr x28, x28, x15, #63 mul x25, x28, x25 # Add modulus (if underflow) subs x12, x12, x25 sbcs x13, x13, xzr and x15, x15, #0x7fffffffffffffff sbcs x14, x14, xzr sbc x15, x15, xzr stp x8, x9, [x0] stp x10, x11, [x0, #16] stp x12, x13, [x1] stp x14, x15, [x1, #16] ldr x1, [x29, #24] ldr x2, [x29, #32] add x2, x2, #0x40 add x1, x1, #0x60 add x0, x0, #0x40 # Multiply ldp x21, x22, [x1] ldp x23, x24, [x1, #16] ldp x4, x5, [x2] ldp x6, x7, [x2, #16] # A[0] * B[0] umulh x17, x21, x4 mul x16, x21, x4 # A[2] * B[0] umulh x20, x23, x4 mul x19, x23, x4 # A[1] * B[0] mul x25, x22, x4 adds x17, x17, x25 umulh x26, x22, x4 adcs x19, x19, x26 # A[1] * B[3] umulh x9, x22, x7 adc x20, x20, xzr mul x8, x22, x7 # A[0] * B[1] mul x25, x21, x5 adds x17, x17, x25 umulh x26, x21, x5 adcs x19, x19, x26 # A[2] * B[1] mul x25, x23, x5 adcs x20, x20, x25 umulh x26, x23, x5 adcs x8, x8, x26 adc x9, x9, xzr # A[1] * B[2] mul x25, x22, x6 adds x20, x20, x25 umulh x26, x22, x6 adcs x8, x8, x26 adcs x9, x9, xzr adc x10, xzr, xzr # A[0] * B[2] mul x25, x21, x6 adds x19, x19, x25 umulh x26, x21, x6 adcs x20, x20, x26 adcs x8, x8, xzr adcs x9, x9, xzr adc x10, x10, xzr # A[1] * B[1] mul x25, x22, x5 adds x19, x19, x25 umulh x26, x22, x5 adcs x20, x20, x26 # A[3] * B[1] mul x25, x24, x5 adcs x8, x8, x25 umulh x26, x24, x5 adcs x9, x9, x26 adc x10, x10, xzr # A[2] * B[2] mul x25, x23, x6 adds x8, x8, x25 umulh x26, x23, x6 adcs x9, x9, x26 # A[3] * B[3] mul x25, x24, x7 adcs x10, x10, x25 umulh x11, x24, x7 adc x11, x11, xzr # A[0] * B[3] mul x25, x21, x7 adds x20, x20, x25 umulh x26, x21, x7 adcs x8, x8, x26 # A[2] * B[3] mul x25, x23, x7 adcs x9, x9, x25 umulh x26, x23, x7 adcs x10, 
x10, x26 adc x11, x11, xzr # A[3] * B[0] mul x25, x24, x4 adds x20, x20, x25 umulh x26, x24, x4 adcs x8, x8, x26 # A[3] * B[2] mul x25, x24, x6 adcs x9, x9, x25 umulh x26, x24, x6 adcs x10, x10, x26 adc x11, x11, xzr # Reduce mov x25, #38 mul x26, x25, x11 adds x20, x20, x26 umulh x27, x25, x11 adc x27, x27, xzr mov x25, #19 extr x27, x27, x20, #63 mul x27, x27, x25 and x20, x20, #0x7fffffffffffffff mov x25, #38 mul x26, x25, x8 adds x16, x16, x26 umulh x8, x25, x8 mul x26, x25, x9 adcs x17, x17, x26 umulh x9, x25, x9 mul x26, x25, x10 adcs x19, x19, x26 umulh x10, x25, x10 adc x20, x20, xzr # Add high product results in adds x16, x16, x27 adcs x17, x17, x8 adcs x19, x19, x9 adc x20, x20, x10 sub x1, x1, #32 # Double ldp x12, x13, [x1] ldp x14, x15, [x1, #16] adds x12, x12, x12 adcs x13, x13, x13 adcs x14, x14, x14 adc x15, x15, x15 mov x25, #-19 asr x28, x15, #63 # Mask the modulus and x25, x28, x25 and x26, x28, #0x7fffffffffffffff # Sub modulus (if overflow) subs x12, x12, x25 sbcs x13, x13, x28 sbcs x14, x14, x28 sbc x15, x15, x26 mov x3, x0 sub x2, x0, #32 sub x1, x0, #32 # Add adds x8, x12, x16 adcs x9, x13, x17 adcs x10, x14, x19 adcs x11, x15, x20 cset x28, cs mov x25, #19 extr x28, x28, x11, #63 mul x25, x28, x25 # Sub modulus (if overflow) adds x8, x8, x25 adcs x9, x9, xzr and x11, x11, #0x7fffffffffffffff adcs x10, x10, xzr adc x11, x11, xzr # Sub subs x4, x12, x16 sbcs x5, x13, x17 sbcs x6, x14, x19 sbcs x7, x15, x20 csetm x28, cc mov x25, #-19 extr x28, x28, x7, #63 mul x25, x28, x25 # Add modulus (if underflow) subs x4, x4, x25 sbcs x5, x5, xzr and x7, x7, #0x7fffffffffffffff sbcs x6, x6, xzr sbc x7, x7, xzr stp x8, x9, [x0] stp x10, x11, [x0, #16] stp x4, x5, [x1] stp x6, x7, [x1, #16] ldr x17, [x29, #56] ldr x19, [x29, #64] ldp x20, x21, [x29, #72] ldp x22, x23, [x29, #88] ldp x24, x25, [x29, #104] ldp x26, x27, [x29, #120] ldr x28, [x29, #136] ldp x29, x30, [sp], #0x90 ret #ifndef __APPLE__ .size ge_msub,.-ge_msub #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl ge_add .type ge_add,@function .align 2 ge_add: #else .section __TEXT,__text .globl _ge_add .p2align 2 _ge_add: #endif /* __APPLE__ */ stp x29, x30, [sp, #-144]! 
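# ge_add computes r = p + q in completed (P1P1-style) coordinates; p is an
# extended (ge_p3) point and q is a cached point stored as (Y+X, Y-X, Z, 2*d*T).
# ge_sub, further below, is the same sequence with q's Y+X and Y-X operands
# exchanged and the final Z/T add and sub swapped.  A rough C-level sketch in
# ref10-style notation (helper and field names are illustrative):
#   fe_add(A, p->Y, p->X);     fe_sub(B, p->Y, p->X);
#   fe_mul(A, A, q->YplusX);   fe_mul(B, B, q->YminusX);
#   fe_mul(C, q->T2d, p->T);   fe_mul(D, p->Z, q->Z);   fe_add(D, D, D);
#   r->X = A - B;  r->Y = A + B;  r->Z = D + C;  r->T = D - C;
# The 144-byte (0x90) frame opened here saves x17, x19-x28 and copies of the
# three pointer arguments, and is released by the matching
# "ldp x29, x30, [sp], #0x90" before ret.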
add x29, sp, #0 str x17, [x29, #56] str x19, [x29, #64] stp x20, x21, [x29, #72] stp x22, x23, [x29, #88] stp x24, x25, [x29, #104] stp x26, x27, [x29, #120] str x28, [x29, #136] str x0, [x29, #16] str x1, [x29, #24] str x2, [x29, #32] mov x3, x1 add x2, x1, #32 add x1, x0, #32 # Add ldp x8, x9, [x2] ldp x10, x11, [x2, #16] ldp x4, x5, [x3] ldp x6, x7, [x3, #16] adds x16, x8, x4 adcs x17, x9, x5 adcs x19, x10, x6 adcs x20, x11, x7 cset x28, cs mov x25, #19 extr x28, x28, x20, #63 mul x25, x28, x25 # Sub modulus (if overflow) adds x16, x16, x25 adcs x17, x17, xzr and x20, x20, #0x7fffffffffffffff adcs x19, x19, xzr adc x20, x20, xzr # Sub subs x12, x8, x4 sbcs x13, x9, x5 sbcs x14, x10, x6 sbcs x15, x11, x7 csetm x28, cc mov x25, #-19 extr x28, x28, x15, #63 mul x25, x28, x25 # Add modulus (if underflow) subs x12, x12, x25 sbcs x13, x13, xzr and x15, x15, #0x7fffffffffffffff sbcs x14, x14, xzr sbc x15, x15, xzr ldr x2, [x29, #32] mov x1, x0 # Multiply ldp x8, x9, [x2] ldp x10, x11, [x2, #16] # A[0] * B[0] umulh x22, x16, x8 mul x21, x16, x8 # A[2] * B[0] umulh x24, x19, x8 mul x23, x19, x8 # A[1] * B[0] mul x25, x17, x8 adds x22, x22, x25 umulh x26, x17, x8 adcs x23, x23, x26 # A[1] * B[3] umulh x5, x17, x11 adc x24, x24, xzr mul x4, x17, x11 # A[0] * B[1] mul x25, x16, x9 adds x22, x22, x25 umulh x26, x16, x9 adcs x23, x23, x26 # A[2] * B[1] mul x25, x19, x9 adcs x24, x24, x25 umulh x26, x19, x9 adcs x4, x4, x26 adc x5, x5, xzr # A[1] * B[2] mul x25, x17, x10 adds x24, x24, x25 umulh x26, x17, x10 adcs x4, x4, x26 adcs x5, x5, xzr adc x6, xzr, xzr # A[0] * B[2] mul x25, x16, x10 adds x23, x23, x25 umulh x26, x16, x10 adcs x24, x24, x26 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr # A[1] * B[1] mul x25, x17, x9 adds x23, x23, x25 umulh x26, x17, x9 adcs x24, x24, x26 # A[3] * B[1] mul x25, x20, x9 adcs x4, x4, x25 umulh x26, x20, x9 adcs x5, x5, x26 adc x6, x6, xzr # A[2] * B[2] mul x25, x19, x10 adds x4, x4, x25 umulh x26, x19, x10 adcs x5, x5, x26 # A[3] * B[3] mul x25, x20, x11 adcs x6, x6, x25 umulh x7, x20, x11 adc x7, x7, xzr # A[0] * B[3] mul x25, x16, x11 adds x24, x24, x25 umulh x26, x16, x11 adcs x4, x4, x26 # A[2] * B[3] mul x25, x19, x11 adcs x5, x5, x25 umulh x26, x19, x11 adcs x6, x6, x26 adc x7, x7, xzr # A[3] * B[0] mul x25, x20, x8 adds x24, x24, x25 umulh x26, x20, x8 adcs x4, x4, x26 # A[3] * B[2] mul x25, x20, x10 adcs x5, x5, x25 umulh x26, x20, x10 adcs x6, x6, x26 adc x7, x7, xzr # Reduce mov x25, #38 mul x26, x25, x7 adds x24, x24, x26 umulh x27, x25, x7 adc x27, x27, xzr mov x25, #19 extr x27, x27, x24, #63 mul x27, x27, x25 and x24, x24, #0x7fffffffffffffff mov x25, #38 mul x26, x25, x4 adds x21, x21, x26 umulh x4, x25, x4 mul x26, x25, x5 adcs x22, x22, x26 umulh x5, x25, x5 mul x26, x25, x6 adcs x23, x23, x26 umulh x6, x25, x6 adc x24, x24, xzr # Add high product results in adds x21, x21, x27 adcs x22, x22, x4 adcs x23, x23, x5 adc x24, x24, x6 # Store stp x21, x22, [x0] stp x23, x24, [x0, #16] add x2, x2, #32 add x1, x0, #32 add x0, x0, #32 # Multiply ldp x16, x17, [x2] ldp x19, x20, [x2, #16] # A[0] * B[0] umulh x5, x12, x16 mul x4, x12, x16 # A[2] * B[0] umulh x7, x14, x16 mul x6, x14, x16 # A[1] * B[0] mul x25, x13, x16 adds x5, x5, x25 umulh x26, x13, x16 adcs x6, x6, x26 # A[1] * B[3] umulh x9, x13, x20 adc x7, x7, xzr mul x8, x13, x20 # A[0] * B[1] mul x25, x12, x17 adds x5, x5, x25 umulh x26, x12, x17 adcs x6, x6, x26 # A[2] * B[1] mul x25, x14, x17 adcs x7, x7, x25 umulh x26, x14, x17 adcs x8, x8, x26 adc x9, x9, xzr # A[1] * B[2] mul x25, x13, x19 
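# (Each "# Multiply" block in these routines is a 4x4-limb schoolbook product:
#  mul/umulh produce the 64x64->128-bit partial products and the adds/adcs
#  chains accumulate them into an eight-limb result.  The "# Reduce" step that
#  follows folds the high 256 bits back using 2^256 == 38 and 2^255 == 19
#  modulo p = 2^255 - 19, leaving a partially reduced four-limb value.)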
adds x7, x7, x25 umulh x26, x13, x19 adcs x8, x8, x26 adcs x9, x9, xzr adc x10, xzr, xzr # A[0] * B[2] mul x25, x12, x19 adds x6, x6, x25 umulh x26, x12, x19 adcs x7, x7, x26 adcs x8, x8, xzr adcs x9, x9, xzr adc x10, x10, xzr # A[1] * B[1] mul x25, x13, x17 adds x6, x6, x25 umulh x26, x13, x17 adcs x7, x7, x26 # A[3] * B[1] mul x25, x15, x17 adcs x8, x8, x25 umulh x26, x15, x17 adcs x9, x9, x26 adc x10, x10, xzr # A[2] * B[2] mul x25, x14, x19 adds x8, x8, x25 umulh x26, x14, x19 adcs x9, x9, x26 # A[3] * B[3] mul x25, x15, x20 adcs x10, x10, x25 umulh x11, x15, x20 adc x11, x11, xzr # A[0] * B[3] mul x25, x12, x20 adds x7, x7, x25 umulh x26, x12, x20 adcs x8, x8, x26 # A[2] * B[3] mul x25, x14, x20 adcs x9, x9, x25 umulh x26, x14, x20 adcs x10, x10, x26 adc x11, x11, xzr # A[3] * B[0] mul x25, x15, x16 adds x7, x7, x25 umulh x26, x15, x16 adcs x8, x8, x26 # A[3] * B[2] mul x25, x15, x19 adcs x9, x9, x25 umulh x26, x15, x19 adcs x10, x10, x26 adc x11, x11, xzr # Reduce mov x25, #38 mul x26, x25, x11 adds x7, x7, x26 umulh x27, x25, x11 adc x27, x27, xzr mov x25, #19 extr x27, x27, x7, #63 mul x27, x27, x25 and x7, x7, #0x7fffffffffffffff mov x25, #38 mul x26, x25, x8 adds x4, x4, x26 umulh x8, x25, x8 mul x26, x25, x9 adcs x5, x5, x26 umulh x9, x25, x9 mul x26, x25, x10 adcs x6, x6, x26 umulh x10, x25, x10 adc x7, x7, xzr # Add high product results in adds x4, x4, x27 adcs x5, x5, x8 adcs x6, x6, x9 adc x7, x7, x10 # Store stp x4, x5, [x0] stp x6, x7, [x0, #16] mov x3, x0 sub x2, x0, #32 sub x1, x0, #32 # Add adds x8, x21, x4 adcs x9, x22, x5 adcs x10, x23, x6 adcs x11, x24, x7 cset x28, cs mov x25, #19 extr x28, x28, x11, #63 mul x25, x28, x25 # Sub modulus (if overflow) adds x8, x8, x25 adcs x9, x9, xzr and x11, x11, #0x7fffffffffffffff adcs x10, x10, xzr adc x11, x11, xzr # Sub subs x12, x21, x4 sbcs x13, x22, x5 sbcs x14, x23, x6 sbcs x15, x24, x7 csetm x28, cc mov x25, #-19 extr x28, x28, x15, #63 mul x25, x28, x25 # Add modulus (if underflow) subs x12, x12, x25 sbcs x13, x13, xzr and x15, x15, #0x7fffffffffffffff sbcs x14, x14, xzr sbc x15, x15, xzr stp x8, x9, [x0] stp x10, x11, [x0, #16] stp x12, x13, [x1] stp x14, x15, [x1, #16] ldr x1, [x29, #24] ldr x2, [x29, #32] add x2, x2, #0x60 add x1, x1, #0x60 add x0, x0, #0x40 # Multiply ldp x21, x22, [x1] ldp x23, x24, [x1, #16] ldp x4, x5, [x2] ldp x6, x7, [x2, #16] # A[0] * B[0] umulh x17, x21, x4 mul x16, x21, x4 # A[2] * B[0] umulh x20, x23, x4 mul x19, x23, x4 # A[1] * B[0] mul x25, x22, x4 adds x17, x17, x25 umulh x26, x22, x4 adcs x19, x19, x26 # A[1] * B[3] umulh x9, x22, x7 adc x20, x20, xzr mul x8, x22, x7 # A[0] * B[1] mul x25, x21, x5 adds x17, x17, x25 umulh x26, x21, x5 adcs x19, x19, x26 # A[2] * B[1] mul x25, x23, x5 adcs x20, x20, x25 umulh x26, x23, x5 adcs x8, x8, x26 adc x9, x9, xzr # A[1] * B[2] mul x25, x22, x6 adds x20, x20, x25 umulh x26, x22, x6 adcs x8, x8, x26 adcs x9, x9, xzr adc x10, xzr, xzr # A[0] * B[2] mul x25, x21, x6 adds x19, x19, x25 umulh x26, x21, x6 adcs x20, x20, x26 adcs x8, x8, xzr adcs x9, x9, xzr adc x10, x10, xzr # A[1] * B[1] mul x25, x22, x5 adds x19, x19, x25 umulh x26, x22, x5 adcs x20, x20, x26 # A[3] * B[1] mul x25, x24, x5 adcs x8, x8, x25 umulh x26, x24, x5 adcs x9, x9, x26 adc x10, x10, xzr # A[2] * B[2] mul x25, x23, x6 adds x8, x8, x25 umulh x26, x23, x6 adcs x9, x9, x26 # A[3] * B[3] mul x25, x24, x7 adcs x10, x10, x25 umulh x11, x24, x7 adc x11, x11, xzr # A[0] * B[3] mul x25, x21, x7 adds x20, x20, x25 umulh x26, x21, x7 adcs x8, x8, x26 # A[2] * B[3] mul x25, x23, x7 adcs x9, 
x9, x25 umulh x26, x23, x7 adcs x10, x10, x26 adc x11, x11, xzr # A[3] * B[0] mul x25, x24, x4 adds x20, x20, x25 umulh x26, x24, x4 adcs x8, x8, x26 # A[3] * B[2] mul x25, x24, x6 adcs x9, x9, x25 umulh x26, x24, x6 adcs x10, x10, x26 adc x11, x11, xzr # Reduce mov x25, #38 mul x26, x25, x11 adds x20, x20, x26 umulh x27, x25, x11 adc x27, x27, xzr mov x25, #19 extr x27, x27, x20, #63 mul x27, x27, x25 and x20, x20, #0x7fffffffffffffff mov x25, #38 mul x26, x25, x8 adds x16, x16, x26 umulh x8, x25, x8 mul x26, x25, x9 adcs x17, x17, x26 umulh x9, x25, x9 mul x26, x25, x10 adcs x19, x19, x26 umulh x10, x25, x10 adc x20, x20, xzr # Add high product results in adds x16, x16, x27 adcs x17, x17, x8 adcs x19, x19, x9 adc x20, x20, x10 # Store stp x16, x17, [x0] stp x19, x20, [x0, #16] sub x3, x2, #32 sub x2, x1, #32 sub x1, x0, #32 # Multiply ldp x4, x5, [x2] ldp x6, x7, [x2, #16] ldp x12, x13, [x3] ldp x14, x15, [x3, #16] # A[0] * B[0] umulh x9, x4, x12 mul x8, x4, x12 # A[2] * B[0] umulh x11, x6, x12 mul x10, x6, x12 # A[1] * B[0] mul x25, x5, x12 adds x9, x9, x25 umulh x26, x5, x12 adcs x10, x10, x26 # A[1] * B[3] umulh x17, x5, x15 adc x11, x11, xzr mul x16, x5, x15 # A[0] * B[1] mul x25, x4, x13 adds x9, x9, x25 umulh x26, x4, x13 adcs x10, x10, x26 # A[2] * B[1] mul x25, x6, x13 adcs x11, x11, x25 umulh x26, x6, x13 adcs x16, x16, x26 adc x17, x17, xzr # A[1] * B[2] mul x25, x5, x14 adds x11, x11, x25 umulh x26, x5, x14 adcs x16, x16, x26 adcs x17, x17, xzr adc x19, xzr, xzr # A[0] * B[2] mul x25, x4, x14 adds x10, x10, x25 umulh x26, x4, x14 adcs x11, x11, x26 adcs x16, x16, xzr adcs x17, x17, xzr adc x19, x19, xzr # A[1] * B[1] mul x25, x5, x13 adds x10, x10, x25 umulh x26, x5, x13 adcs x11, x11, x26 # A[3] * B[1] mul x25, x7, x13 adcs x16, x16, x25 umulh x26, x7, x13 adcs x17, x17, x26 adc x19, x19, xzr # A[2] * B[2] mul x25, x6, x14 adds x16, x16, x25 umulh x26, x6, x14 adcs x17, x17, x26 # A[3] * B[3] mul x25, x7, x15 adcs x19, x19, x25 umulh x20, x7, x15 adc x20, x20, xzr # A[0] * B[3] mul x25, x4, x15 adds x11, x11, x25 umulh x26, x4, x15 adcs x16, x16, x26 # A[2] * B[3] mul x25, x6, x15 adcs x17, x17, x25 umulh x26, x6, x15 adcs x19, x19, x26 adc x20, x20, xzr # A[3] * B[0] mul x25, x7, x12 adds x11, x11, x25 umulh x26, x7, x12 adcs x16, x16, x26 # A[3] * B[2] mul x25, x7, x14 adcs x17, x17, x25 umulh x26, x7, x14 adcs x19, x19, x26 adc x20, x20, xzr # Reduce mov x25, #38 mul x26, x25, x20 adds x11, x11, x26 umulh x27, x25, x20 adc x27, x27, xzr mov x25, #19 extr x27, x27, x11, #63 mul x27, x27, x25 and x11, x11, #0x7fffffffffffffff mov x25, #38 mul x26, x25, x16 adds x8, x8, x26 umulh x16, x25, x16 mul x26, x25, x17 adcs x9, x9, x26 umulh x17, x25, x17 mul x26, x25, x19 adcs x10, x10, x26 umulh x19, x25, x19 adc x11, x11, xzr # Add high product results in adds x8, x8, x27 adcs x9, x9, x16 adcs x10, x10, x17 adc x11, x11, x19 # Double adds x8, x8, x8 adcs x9, x9, x9 adcs x10, x10, x10 adc x11, x11, x11 mov x25, #-19 asr x28, x11, #63 # Mask the modulus and x25, x28, x25 and x26, x28, #0x7fffffffffffffff # Sub modulus (if overflow) subs x8, x8, x25 sbcs x9, x9, x28 sbcs x10, x10, x28 sbc x11, x11, x26 mov x3, x0 sub x2, x0, #32 mov x1, x0 sub x0, x0, #32 # Add ldp x4, x5, [x3] ldp x6, x7, [x3, #16] adds x21, x8, x4 adcs x22, x9, x5 adcs x23, x10, x6 adcs x24, x11, x7 cset x28, cs mov x25, #19 extr x28, x28, x24, #63 mul x25, x28, x25 # Sub modulus (if overflow) adds x21, x21, x25 adcs x22, x22, xzr and x24, x24, #0x7fffffffffffffff adcs x23, x23, xzr adc x24, x24, xzr # Sub subs 
x12, x8, x4 sbcs x13, x9, x5 sbcs x14, x10, x6 sbcs x15, x11, x7 csetm x28, cc mov x25, #-19 extr x28, x28, x15, #63 mul x25, x28, x25 # Add modulus (if underflow) subs x12, x12, x25 sbcs x13, x13, xzr and x15, x15, #0x7fffffffffffffff sbcs x14, x14, xzr sbc x15, x15, xzr stp x21, x22, [x0] stp x23, x24, [x0, #16] stp x12, x13, [x1] stp x14, x15, [x1, #16] ldr x17, [x29, #56] ldr x19, [x29, #64] ldp x20, x21, [x29, #72] ldp x22, x23, [x29, #88] ldp x24, x25, [x29, #104] ldp x26, x27, [x29, #120] ldr x28, [x29, #136] ldp x29, x30, [sp], #0x90 ret #ifndef __APPLE__ .size ge_add,.-ge_add #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl ge_sub .type ge_sub,@function .align 2 ge_sub: #else .section __TEXT,__text .globl _ge_sub .p2align 2 _ge_sub: #endif /* __APPLE__ */ stp x29, x30, [sp, #-144]! add x29, sp, #0 str x17, [x29, #56] str x19, [x29, #64] stp x20, x21, [x29, #72] stp x22, x23, [x29, #88] stp x24, x25, [x29, #104] stp x26, x27, [x29, #120] str x28, [x29, #136] str x0, [x29, #16] str x1, [x29, #24] str x2, [x29, #32] mov x3, x1 add x2, x1, #32 add x1, x0, #32 # Add ldp x8, x9, [x2] ldp x10, x11, [x2, #16] ldp x4, x5, [x3] ldp x6, x7, [x3, #16] adds x16, x8, x4 adcs x17, x9, x5 adcs x19, x10, x6 adcs x20, x11, x7 cset x28, cs mov x25, #19 extr x28, x28, x20, #63 mul x25, x28, x25 # Sub modulus (if overflow) adds x16, x16, x25 adcs x17, x17, xzr and x20, x20, #0x7fffffffffffffff adcs x19, x19, xzr adc x20, x20, xzr # Sub subs x12, x8, x4 sbcs x13, x9, x5 sbcs x14, x10, x6 sbcs x15, x11, x7 csetm x28, cc mov x25, #-19 extr x28, x28, x15, #63 mul x25, x28, x25 # Add modulus (if underflow) subs x12, x12, x25 sbcs x13, x13, xzr and x15, x15, #0x7fffffffffffffff sbcs x14, x14, xzr sbc x15, x15, xzr ldr x2, [x29, #32] add x2, x2, #32 mov x1, x0 # Multiply ldp x8, x9, [x2] ldp x10, x11, [x2, #16] # A[0] * B[0] umulh x22, x16, x8 mul x21, x16, x8 # A[2] * B[0] umulh x24, x19, x8 mul x23, x19, x8 # A[1] * B[0] mul x25, x17, x8 adds x22, x22, x25 umulh x26, x17, x8 adcs x23, x23, x26 # A[1] * B[3] umulh x5, x17, x11 adc x24, x24, xzr mul x4, x17, x11 # A[0] * B[1] mul x25, x16, x9 adds x22, x22, x25 umulh x26, x16, x9 adcs x23, x23, x26 # A[2] * B[1] mul x25, x19, x9 adcs x24, x24, x25 umulh x26, x19, x9 adcs x4, x4, x26 adc x5, x5, xzr # A[1] * B[2] mul x25, x17, x10 adds x24, x24, x25 umulh x26, x17, x10 adcs x4, x4, x26 adcs x5, x5, xzr adc x6, xzr, xzr # A[0] * B[2] mul x25, x16, x10 adds x23, x23, x25 umulh x26, x16, x10 adcs x24, x24, x26 adcs x4, x4, xzr adcs x5, x5, xzr adc x6, x6, xzr # A[1] * B[1] mul x25, x17, x9 adds x23, x23, x25 umulh x26, x17, x9 adcs x24, x24, x26 # A[3] * B[1] mul x25, x20, x9 adcs x4, x4, x25 umulh x26, x20, x9 adcs x5, x5, x26 adc x6, x6, xzr # A[2] * B[2] mul x25, x19, x10 adds x4, x4, x25 umulh x26, x19, x10 adcs x5, x5, x26 # A[3] * B[3] mul x25, x20, x11 adcs x6, x6, x25 umulh x7, x20, x11 adc x7, x7, xzr # A[0] * B[3] mul x25, x16, x11 adds x24, x24, x25 umulh x26, x16, x11 adcs x4, x4, x26 # A[2] * B[3] mul x25, x19, x11 adcs x5, x5, x25 umulh x26, x19, x11 adcs x6, x6, x26 adc x7, x7, xzr # A[3] * B[0] mul x25, x20, x8 adds x24, x24, x25 umulh x26, x20, x8 adcs x4, x4, x26 # A[3] * B[2] mul x25, x20, x10 adcs x5, x5, x25 umulh x26, x20, x10 adcs x6, x6, x26 adc x7, x7, xzr # Reduce mov x25, #38 mul x26, x25, x7 adds x24, x24, x26 umulh x27, x25, x7 adc x27, x27, xzr mov x25, #19 extr x27, x27, x24, #63 mul x27, x27, x25 and x24, x24, #0x7fffffffffffffff mov x25, #38 mul x26, x25, x4 adds x21, x21, x26 umulh x4, x25, x4 mul x26, x25, x5 adcs x22, 
x22, x26 umulh x5, x25, x5 mul x26, x25, x6 adcs x23, x23, x26 umulh x6, x25, x6 adc x24, x24, xzr # Add high product results in adds x21, x21, x27 adcs x22, x22, x4 adcs x23, x23, x5 adc x24, x24, x6 # Reduce if top bit set mov x25, #19 and x26, x25, x24, asr 63 adds x21, x21, x26 adcs x22, x22, xzr and x24, x24, #0x7fffffffffffffff adcs x23, x23, xzr adc x24, x24, xzr # Store stp x21, x22, [x0] stp x23, x24, [x0, #16] sub x2, x2, #32 add x1, x0, #32 add x0, x0, #32 # Multiply ldp x16, x17, [x2] ldp x19, x20, [x2, #16] # A[0] * B[0] umulh x5, x12, x16 mul x4, x12, x16 # A[2] * B[0] umulh x7, x14, x16 mul x6, x14, x16 # A[1] * B[0] mul x25, x13, x16 adds x5, x5, x25 umulh x26, x13, x16 adcs x6, x6, x26 # A[1] * B[3] umulh x9, x13, x20 adc x7, x7, xzr mul x8, x13, x20 # A[0] * B[1] mul x25, x12, x17 adds x5, x5, x25 umulh x26, x12, x17 adcs x6, x6, x26 # A[2] * B[1] mul x25, x14, x17 adcs x7, x7, x25 umulh x26, x14, x17 adcs x8, x8, x26 adc x9, x9, xzr # A[1] * B[2] mul x25, x13, x19 adds x7, x7, x25 umulh x26, x13, x19 adcs x8, x8, x26 adcs x9, x9, xzr adc x10, xzr, xzr # A[0] * B[2] mul x25, x12, x19 adds x6, x6, x25 umulh x26, x12, x19 adcs x7, x7, x26 adcs x8, x8, xzr adcs x9, x9, xzr adc x10, x10, xzr # A[1] * B[1] mul x25, x13, x17 adds x6, x6, x25 umulh x26, x13, x17 adcs x7, x7, x26 # A[3] * B[1] mul x25, x15, x17 adcs x8, x8, x25 umulh x26, x15, x17 adcs x9, x9, x26 adc x10, x10, xzr # A[2] * B[2] mul x25, x14, x19 adds x8, x8, x25 umulh x26, x14, x19 adcs x9, x9, x26 # A[3] * B[3] mul x25, x15, x20 adcs x10, x10, x25 umulh x11, x15, x20 adc x11, x11, xzr # A[0] * B[3] mul x25, x12, x20 adds x7, x7, x25 umulh x26, x12, x20 adcs x8, x8, x26 # A[2] * B[3] mul x25, x14, x20 adcs x9, x9, x25 umulh x26, x14, x20 adcs x10, x10, x26 adc x11, x11, xzr # A[3] * B[0] mul x25, x15, x16 adds x7, x7, x25 umulh x26, x15, x16 adcs x8, x8, x26 # A[3] * B[2] mul x25, x15, x19 adcs x9, x9, x25 umulh x26, x15, x19 adcs x10, x10, x26 adc x11, x11, xzr # Reduce mov x25, #38 mul x26, x25, x11 adds x7, x7, x26 umulh x27, x25, x11 adc x27, x27, xzr mov x25, #19 extr x27, x27, x7, #63 mul x27, x27, x25 and x7, x7, #0x7fffffffffffffff mov x25, #38 mul x26, x25, x8 adds x4, x4, x26 umulh x8, x25, x8 mul x26, x25, x9 adcs x5, x5, x26 umulh x9, x25, x9 mul x26, x25, x10 adcs x6, x6, x26 umulh x10, x25, x10 adc x7, x7, xzr # Add high product results in adds x4, x4, x27 adcs x5, x5, x8 adcs x6, x6, x9 adc x7, x7, x10 # Store stp x4, x5, [x0] stp x6, x7, [x0, #16] mov x3, x0 sub x2, x0, #32 sub x1, x0, #32 # Add adds x8, x21, x4 adcs x9, x22, x5 adcs x10, x23, x6 adcs x11, x24, x7 cset x28, cs mov x25, #19 extr x28, x28, x11, #63 mul x25, x28, x25 # Sub modulus (if overflow) adds x8, x8, x25 adcs x9, x9, xzr and x11, x11, #0x7fffffffffffffff adcs x10, x10, xzr adc x11, x11, xzr # Sub subs x12, x21, x4 sbcs x13, x22, x5 sbcs x14, x23, x6 sbcs x15, x24, x7 csetm x28, cc mov x25, #-19 extr x28, x28, x15, #63 mul x25, x28, x25 # Add modulus (if underflow) subs x12, x12, x25 sbcs x13, x13, xzr and x15, x15, #0x7fffffffffffffff sbcs x14, x14, xzr sbc x15, x15, xzr stp x8, x9, [x0] stp x10, x11, [x0, #16] stp x12, x13, [x1] stp x14, x15, [x1, #16] ldr x1, [x29, #24] ldr x2, [x29, #32] add x2, x2, #0x60 add x1, x1, #0x60 add x0, x0, #0x40 # Multiply ldp x21, x22, [x1] ldp x23, x24, [x1, #16] ldp x4, x5, [x2] ldp x6, x7, [x2, #16] # A[0] * B[0] umulh x17, x21, x4 mul x16, x21, x4 # A[2] * B[0] umulh x20, x23, x4 mul x19, x23, x4 # A[1] * B[0] mul x25, x22, x4 adds x17, x17, x25 umulh x26, x22, x4 adcs x19, x19, x26 # A[1] 
* B[3] umulh x9, x22, x7 adc x20, x20, xzr mul x8, x22, x7 # A[0] * B[1] mul x25, x21, x5 adds x17, x17, x25 umulh x26, x21, x5 adcs x19, x19, x26 # A[2] * B[1] mul x25, x23, x5 adcs x20, x20, x25 umulh x26, x23, x5 adcs x8, x8, x26 adc x9, x9, xzr # A[1] * B[2] mul x25, x22, x6 adds x20, x20, x25 umulh x26, x22, x6 adcs x8, x8, x26 adcs x9, x9, xzr adc x10, xzr, xzr # A[0] * B[2] mul x25, x21, x6 adds x19, x19, x25 umulh x26, x21, x6 adcs x20, x20, x26 adcs x8, x8, xzr adcs x9, x9, xzr adc x10, x10, xzr # A[1] * B[1] mul x25, x22, x5 adds x19, x19, x25 umulh x26, x22, x5 adcs x20, x20, x26 # A[3] * B[1] mul x25, x24, x5 adcs x8, x8, x25 umulh x26, x24, x5 adcs x9, x9, x26 adc x10, x10, xzr # A[2] * B[2] mul x25, x23, x6 adds x8, x8, x25 umulh x26, x23, x6 adcs x9, x9, x26 # A[3] * B[3] mul x25, x24, x7 adcs x10, x10, x25 umulh x11, x24, x7 adc x11, x11, xzr # A[0] * B[3] mul x25, x21, x7 adds x20, x20, x25 umulh x26, x21, x7 adcs x8, x8, x26 # A[2] * B[3] mul x25, x23, x7 adcs x9, x9, x25 umulh x26, x23, x7 adcs x10, x10, x26 adc x11, x11, xzr # A[3] * B[0] mul x25, x24, x4 adds x20, x20, x25 umulh x26, x24, x4 adcs x8, x8, x26 # A[3] * B[2] mul x25, x24, x6 adcs x9, x9, x25 umulh x26, x24, x6 adcs x10, x10, x26 adc x11, x11, xzr # Reduce mov x25, #38 mul x26, x25, x11 adds x20, x20, x26 umulh x27, x25, x11 adc x27, x27, xzr mov x25, #19 extr x27, x27, x20, #63 mul x27, x27, x25 and x20, x20, #0x7fffffffffffffff mov x25, #38 mul x26, x25, x8 adds x16, x16, x26 umulh x8, x25, x8 mul x26, x25, x9 adcs x17, x17, x26 umulh x9, x25, x9 mul x26, x25, x10 adcs x19, x19, x26 umulh x10, x25, x10 adc x20, x20, xzr # Add high product results in adds x16, x16, x27 adcs x17, x17, x8 adcs x19, x19, x9 adc x20, x20, x10 # Reduce if top bit set mov x25, #19 and x26, x25, x20, asr 63 adds x16, x16, x26 adcs x17, x17, xzr and x20, x20, #0x7fffffffffffffff adcs x19, x19, xzr adc x20, x20, xzr # Store stp x16, x17, [x0] stp x19, x20, [x0, #16] sub x3, x2, #32 sub x2, x1, #32 sub x1, x0, #32 # Multiply ldp x4, x5, [x2] ldp x6, x7, [x2, #16] ldp x12, x13, [x3] ldp x14, x15, [x3, #16] # A[0] * B[0] umulh x9, x4, x12 mul x8, x4, x12 # A[2] * B[0] umulh x11, x6, x12 mul x10, x6, x12 # A[1] * B[0] mul x25, x5, x12 adds x9, x9, x25 umulh x26, x5, x12 adcs x10, x10, x26 # A[1] * B[3] umulh x17, x5, x15 adc x11, x11, xzr mul x16, x5, x15 # A[0] * B[1] mul x25, x4, x13 adds x9, x9, x25 umulh x26, x4, x13 adcs x10, x10, x26 # A[2] * B[1] mul x25, x6, x13 adcs x11, x11, x25 umulh x26, x6, x13 adcs x16, x16, x26 adc x17, x17, xzr # A[1] * B[2] mul x25, x5, x14 adds x11, x11, x25 umulh x26, x5, x14 adcs x16, x16, x26 adcs x17, x17, xzr adc x19, xzr, xzr # A[0] * B[2] mul x25, x4, x14 adds x10, x10, x25 umulh x26, x4, x14 adcs x11, x11, x26 adcs x16, x16, xzr adcs x17, x17, xzr adc x19, x19, xzr # A[1] * B[1] mul x25, x5, x13 adds x10, x10, x25 umulh x26, x5, x13 adcs x11, x11, x26 # A[3] * B[1] mul x25, x7, x13 adcs x16, x16, x25 umulh x26, x7, x13 adcs x17, x17, x26 adc x19, x19, xzr # A[2] * B[2] mul x25, x6, x14 adds x16, x16, x25 umulh x26, x6, x14 adcs x17, x17, x26 # A[3] * B[3] mul x25, x7, x15 adcs x19, x19, x25 umulh x20, x7, x15 adc x20, x20, xzr # A[0] * B[3] mul x25, x4, x15 adds x11, x11, x25 umulh x26, x4, x15 adcs x16, x16, x26 # A[2] * B[3] mul x25, x6, x15 adcs x17, x17, x25 umulh x26, x6, x15 adcs x19, x19, x26 adc x20, x20, xzr # A[3] * B[0] mul x25, x7, x12 adds x11, x11, x25 umulh x26, x7, x12 adcs x16, x16, x26 # A[3] * B[2] mul x25, x7, x14 adcs x17, x17, x25 umulh x26, x7, x14 adcs x19, x19, x26 
adc x20, x20, xzr # Reduce mov x25, #38 mul x26, x25, x20 adds x11, x11, x26 umulh x27, x25, x20 adc x27, x27, xzr mov x25, #19 extr x27, x27, x11, #63 mul x27, x27, x25 and x11, x11, #0x7fffffffffffffff mov x25, #38 mul x26, x25, x16 adds x8, x8, x26 umulh x16, x25, x16 mul x26, x25, x17 adcs x9, x9, x26 umulh x17, x25, x17 mul x26, x25, x19 adcs x10, x10, x26 umulh x19, x25, x19 adc x11, x11, xzr # Add high product results in adds x8, x8, x27 adcs x9, x9, x16 adcs x10, x10, x17 adc x11, x11, x19 # Double adds x8, x8, x8 adcs x9, x9, x9 adcs x10, x10, x10 adc x11, x11, x11 mov x25, #-19 asr x28, x11, #63 # Mask the modulus and x25, x28, x25 and x26, x28, #0x7fffffffffffffff # Sub modulus (if overflow) subs x8, x8, x25 sbcs x9, x9, x28 sbcs x10, x10, x28 sbc x11, x11, x26 mov x3, x0 sub x2, x0, #32 # Add ldp x4, x5, [x3] ldp x6, x7, [x3, #16] adds x12, x8, x4 adcs x13, x9, x5 adcs x14, x10, x6 adcs x15, x11, x7 cset x28, cs mov x25, #19 extr x28, x28, x15, #63 mul x25, x28, x25 # Sub modulus (if overflow) adds x12, x12, x25 adcs x13, x13, xzr and x15, x15, #0x7fffffffffffffff adcs x14, x14, xzr adc x15, x15, xzr # Sub subs x21, x8, x4 sbcs x22, x9, x5 sbcs x23, x10, x6 sbcs x24, x11, x7 csetm x28, cc mov x25, #-19 extr x28, x28, x24, #63 mul x25, x28, x25 # Add modulus (if underflow) subs x21, x21, x25 sbcs x22, x22, xzr and x24, x24, #0x7fffffffffffffff sbcs x23, x23, xzr sbc x24, x24, xzr stp x12, x13, [x0] stp x14, x15, [x0, #16] stp x21, x22, [x1] stp x23, x24, [x1, #16] ldr x17, [x29, #56] ldr x19, [x29, #64] ldp x20, x21, [x29, #72] ldp x22, x23, [x29, #88] ldp x24, x25, [x29, #104] ldp x26, x27, [x29, #120] ldr x28, [x29, #136] ldp x29, x30, [sp], #0x90 ret #ifndef __APPLE__ .size ge_sub,.-ge_sub #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl sc_reduce .type sc_reduce,@function .align 2 sc_reduce: #else .section __TEXT,__text .globl _sc_reduce .p2align 2 _sc_reduce: #endif /* __APPLE__ */ stp x29, x30, [sp, #-64]! 
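# sc_reduce reduces the 512-bit scalar at [x0] (64 bytes, little-endian)
# modulo the Ed25519 group order
#   n = 2^252 + 27742317777372353535851937790883648493
# and writes the 256-bit result back over the first 32 bytes at [x0].
# The mov/movk constants below encode limbs of n, of its two's-complement
# negation and of shifted copies of n; the high words are folded back with a
# few multiply/accumulate passes and a final conditional subtraction of n.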
add x29, sp, #0 str x17, [x29, #16] str x19, [x29, #24] stp x20, x21, [x29, #32] stp x22, x23, [x29, #48] ldp x2, x3, [x0] ldp x4, x5, [x0, #16] ldp x6, x7, [x0, #32] ldp x8, x9, [x0, #48] lsr x23, x9, #56 lsl x9, x9, #4 orr x9, x9, x8, lsr 60 lsl x8, x8, #4 orr x8, x8, x7, lsr 60 lsl x7, x7, #4 orr x7, x7, x6, lsr 60 lsl x6, x6, #4 mov x1, #15 orr x6, x6, x5, lsr 60 bic x5, x5, x1, lsl 60 bic x9, x9, x1, lsl 60 # Add order times bits 504..511 mov x11, #0x2c13 movk x11, #0xa30a, lsl 16 movk x11, #0x9ce5, lsl 32 movk x11, #0xa7ed, lsl 48 mov x13, #0x6329 movk x13, #0x5d08, lsl 16 movk x13, #0x621, lsl 32 movk x13, #0xeb21, lsl 48 mul x10, x23, x11 umulh x11, x23, x11 mul x12, x23, x13 umulh x13, x23, x13 adds x6, x6, x10 adcs x7, x7, x11 adcs x8, x8, xzr adc x9, x9, xzr adds x7, x7, x12 adcs x8, x8, x13 adc x9, x9, xzr subs x8, x8, x23 sbc x9, x9, xzr # Sub product of top 4 words and order mov x1, #0x2c13 movk x1, #0xa30a, lsl 16 movk x1, #0x9ce5, lsl 32 movk x1, #0xa7ed, lsl 48 mul x10, x6, x1 umulh x11, x6, x1 mul x12, x7, x1 umulh x13, x7, x1 mul x14, x8, x1 umulh x15, x8, x1 mul x16, x9, x1 umulh x17, x9, x1 adds x2, x2, x10 adcs x3, x3, x11 adcs x4, x4, x14 adcs x5, x5, x15 adc x19, xzr, xzr adds x3, x3, x12 adcs x4, x4, x13 adcs x5, x5, x16 adc x19, x19, x17 mov x1, #0x6329 movk x1, #0x5d08, lsl 16 movk x1, #0x621, lsl 32 movk x1, #0xeb21, lsl 48 mul x10, x6, x1 umulh x11, x6, x1 mul x12, x7, x1 umulh x13, x7, x1 mul x14, x8, x1 umulh x15, x8, x1 mul x16, x9, x1 umulh x17, x9, x1 adds x3, x3, x10 adcs x4, x4, x11 adcs x5, x5, x14 adcs x19, x19, x15 adc x20, xzr, xzr adds x4, x4, x12 adcs x5, x5, x13 adcs x19, x19, x16 adc x20, x20, x17 subs x4, x4, x6 sbcs x5, x5, x7 sbcs x6, x19, x8 sbc x7, x20, x9 asr x23, x7, #57 # Conditionally subtract order starting at bit 125 mov x10, xzr mov x13, xzr mov x11, #0xba7d movk x11, #0x4b9e, lsl 16 movk x11, #0x4c63, lsl 32 movk x11, #0xcb02, lsl 48 mov x12, #0xf39a movk x12, #0xd45e, lsl 16 movk x12, #0xdf3b, lsl 32 movk x12, #0x29b, lsl 48 movk x10, #0xa000, lsl 48 movk x13, #0x200, lsl 48 and x10, x10, x23 and x11, x11, x23 and x12, x12, x23 and x13, x13, x23 adds x3, x3, x10 adcs x4, x4, x11 adcs x5, x5, x12 adcs x6, x6, xzr adc x7, x7, x13 # Move bits 252-376 to own registers lsl x7, x7, #4 orr x7, x7, x6, lsr 60 lsl x6, x6, #4 mov x23, #15 orr x6, x6, x5, lsr 60 bic x5, x5, x23, lsl 60 # Sub product of top 2 words and order # * -5812631a5cf5d3ed mov x1, #0x2c13 movk x1, #0xa30a, lsl 16 movk x1, #0x9ce5, lsl 32 movk x1, #0xa7ed, lsl 48 mul x10, x6, x1 umulh x11, x6, x1 mul x12, x7, x1 umulh x13, x7, x1 adds x2, x2, x10 adcs x3, x3, x11 adc x19, xzr, xzr adds x3, x3, x12 adc x19, x19, x13 # * -14def9dea2f79cd7 mov x1, #0x6329 movk x1, #0x5d08, lsl 16 movk x1, #0x621, lsl 32 movk x1, #0xeb21, lsl 48 mul x10, x6, x1 umulh x11, x6, x1 mul x12, x7, x1 umulh x13, x7, x1 adds x3, x3, x10 adcs x4, x4, x11 adc x20, xzr, xzr adds x4, x4, x12 adc x20, x20, x13 # Add overflows at 2 * 64 mov x1, #15 bic x5, x5, x1, lsl 60 adds x4, x4, x19 adc x5, x5, x20 # Subtract top at 2 * 64 subs x4, x4, x6 sbcs x5, x5, x7 sbc x1, x1, x1 # Conditional sub order mov x10, #0xd3ed movk x10, #0x5cf5, lsl 16 movk x10, #0x631a, lsl 32 movk x10, #0x5812, lsl 48 mov x11, #0x9cd6 movk x11, #0xa2f7, lsl 16 movk x11, #0xf9de, lsl 32 movk x11, #0x14de, lsl 48 and x10, x10, x1 and x11, x11, x1 adds x2, x2, x10 adcs x3, x3, x11 and x1, x1, #0x1000000000000000 adcs x4, x4, xzr mov x23, #15 adc x5, x5, x1 bic x5, x5, x23, lsl 60 # Store result stp x2, x3, [x0] stp x4, x5, [x0, #16] ldr 
x17, [x29, #16] ldr x19, [x29, #24] ldp x20, x21, [x29, #32] ldp x22, x23, [x29, #48] ldp x29, x30, [sp], #0x40 ret #ifndef __APPLE__ .size sc_reduce,.-sc_reduce #endif /* __APPLE__ */ #ifndef __APPLE__ .text .globl sc_muladd .type sc_muladd,@function .align 2 sc_muladd: #else .section __TEXT,__text .globl _sc_muladd .p2align 2 _sc_muladd: #endif /* __APPLE__ */ stp x29, x30, [sp, #-96]! add x29, sp, #0 str x17, [x29, #24] str x19, [x29, #32] stp x20, x21, [x29, #40] stp x22, x23, [x29, #56] stp x24, x25, [x29, #72] str x26, [x29, #88] # Multiply ldp x12, x13, [x1] ldp x14, x15, [x1, #16] ldp x16, x17, [x2] ldp x19, x20, [x2, #16] # A[0] * B[0] umulh x5, x12, x16 mul x4, x12, x16 # A[2] * B[0] umulh x7, x14, x16 mul x6, x14, x16 # A[1] * B[0] mul x21, x13, x16 adds x5, x5, x21 umulh x22, x13, x16 adcs x6, x6, x22 # A[1] * B[3] umulh x9, x13, x20 adc x7, x7, xzr mul x8, x13, x20 # A[0] * B[1] mul x21, x12, x17 adds x5, x5, x21 umulh x22, x12, x17 adcs x6, x6, x22 # A[2] * B[1] mul x21, x14, x17 adcs x7, x7, x21 umulh x22, x14, x17 adcs x8, x8, x22 adc x9, x9, xzr # A[1] * B[2] mul x21, x13, x19 adds x7, x7, x21 umulh x22, x13, x19 adcs x8, x8, x22 adcs x9, x9, xzr adc x10, xzr, xzr # A[0] * B[2] mul x21, x12, x19 adds x6, x6, x21 umulh x22, x12, x19 adcs x7, x7, x22 adcs x8, x8, xzr adcs x9, x9, xzr adc x10, x10, xzr # A[1] * B[1] mul x21, x13, x17 adds x6, x6, x21 umulh x22, x13, x17 adcs x7, x7, x22 # A[3] * B[1] mul x21, x15, x17 adcs x8, x8, x21 umulh x22, x15, x17 adcs x9, x9, x22 adc x10, x10, xzr # A[2] * B[2] mul x21, x14, x19 adds x8, x8, x21 umulh x22, x14, x19 adcs x9, x9, x22 # A[3] * B[3] mul x21, x15, x20 adcs x10, x10, x21 umulh x11, x15, x20 adc x11, x11, xzr # A[0] * B[3] mul x21, x12, x20 adds x7, x7, x21 umulh x22, x12, x20 adcs x8, x8, x22 # A[2] * B[3] mul x21, x14, x20 adcs x9, x9, x21 umulh x22, x14, x20 adcs x10, x10, x22 adc x11, x11, xzr # A[3] * B[0] mul x21, x15, x16 adds x7, x7, x21 umulh x22, x15, x16 adcs x8, x8, x22 # A[3] * B[2] mul x21, x15, x19 adcs x9, x9, x21 umulh x22, x15, x19 adcs x10, x10, x22 adc x11, x11, xzr # Add c to a * b ldp x12, x13, [x3] ldp x14, x15, [x3, #16] adds x4, x4, x12 adcs x5, x5, x13 adcs x6, x6, x14 adcs x7, x7, x15 adcs x8, x8, xzr adcs x9, x9, xzr adcs x10, x10, xzr adc x11, x11, xzr lsr x25, x11, #56 lsl x11, x11, #4 orr x11, x11, x10, lsr 60 lsl x10, x10, #4 orr x10, x10, x9, lsr 60 lsl x9, x9, #4 orr x9, x9, x8, lsr 60 lsl x8, x8, #4 mov x26, #15 orr x8, x8, x7, lsr 60 bic x7, x7, x26, lsl 60 bic x11, x11, x26, lsl 60 # Add order times bits 504..507 mov x22, #0x2c13 movk x22, #0xa30a, lsl 16 movk x22, #0x9ce5, lsl 32 movk x22, #0xa7ed, lsl 48 mov x24, #0x6329 movk x24, #0x5d08, lsl 16 movk x24, #0x621, lsl 32 movk x24, #0xeb21, lsl 48 mul x21, x25, x22 umulh x22, x25, x22 mul x23, x25, x24 umulh x24, x25, x24 adds x8, x8, x21 adcs x9, x9, x22 adcs x10, x10, xzr adc x11, x11, xzr adds x9, x9, x23 adcs x10, x10, x24 adc x11, x11, xzr subs x10, x10, x25 sbc x11, x11, xzr # Sub product of top 4 words and order mov x26, #0x2c13 movk x26, #0xa30a, lsl 16 movk x26, #0x9ce5, lsl 32 movk x26, #0xa7ed, lsl 48 mul x16, x8, x26 umulh x17, x8, x26 mul x19, x9, x26 umulh x20, x9, x26 mul x21, x10, x26 umulh x22, x10, x26 mul x23, x11, x26 umulh x24, x11, x26 adds x4, x4, x16 adcs x5, x5, x17 adcs x6, x6, x21 adcs x7, x7, x22 adc x12, xzr, xzr adds x5, x5, x19 adcs x6, x6, x20 adcs x7, x7, x23 adc x12, x12, x24 mov x26, #0x6329 movk x26, #0x5d08, lsl 16 movk x26, #0x621, lsl 32 movk x26, #0xeb21, lsl 48 mul x16, x8, x26 umulh x17, x8, x26 
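# (This block is part of sc_muladd, which computes s = (a*b + c) mod n for the
#  256-bit little-endian scalars a at [x1], b at [x2] and c at [x3], writing
#  the result to [x0]; n is the same group order used by sc_reduce above.  The
#  roughly 512-bit value a*b + c is reduced with the same fold-and-
#  conditionally-subtract scheme as sc_reduce.)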
mul x19, x9, x26 umulh x20, x9, x26 mul x21, x10, x26 umulh x22, x10, x26 mul x23, x11, x26 umulh x24, x11, x26 adds x5, x5, x16 adcs x6, x6, x17 adcs x7, x7, x21 adcs x12, x12, x22 adc x13, xzr, xzr adds x6, x6, x19 adcs x7, x7, x20 adcs x12, x12, x23 adc x13, x13, x24 subs x6, x6, x8 sbcs x7, x7, x9 sbcs x8, x12, x10 sbc x9, x13, x11 asr x25, x9, #57 # Conditionally subtract order starting at bit 125 mov x16, xzr mov x20, xzr mov x17, #0xba7d movk x17, #0x4b9e, lsl 16 movk x17, #0x4c63, lsl 32 movk x17, #0xcb02, lsl 48 mov x19, #0xf39a movk x19, #0xd45e, lsl 16 movk x19, #0xdf3b, lsl 32 movk x19, #0x29b, lsl 48 movk x16, #0xa000, lsl 48 movk x20, #0x200, lsl 48 and x16, x16, x25 and x17, x17, x25 and x19, x19, x25 and x20, x20, x25 adds x5, x5, x16 adcs x6, x6, x17 adcs x7, x7, x19 adcs x8, x8, xzr adc x9, x9, x20 # Move bits 252-376 to own registers lsl x9, x9, #4 orr x9, x9, x8, lsr 60 lsl x8, x8, #4 mov x25, #15 orr x8, x8, x7, lsr 60 bic x7, x7, x25, lsl 60 # Sub product of top 2 words and order # * -5812631a5cf5d3ed mov x26, #0x2c13 movk x26, #0xa30a, lsl 16 movk x26, #0x9ce5, lsl 32 movk x26, #0xa7ed, lsl 48 mul x16, x8, x26 umulh x17, x8, x26 mul x19, x9, x26 umulh x20, x9, x26 adds x4, x4, x16 adcs x5, x5, x17 adc x12, xzr, xzr adds x5, x5, x19 adc x12, x12, x20 # * -14def9dea2f79cd7 mov x26, #0x6329 movk x26, #0x5d08, lsl 16 movk x26, #0x621, lsl 32 movk x26, #0xeb21, lsl 48 mul x16, x8, x26 umulh x17, x8, x26 mul x19, x9, x26 umulh x20, x9, x26 adds x5, x5, x16 adcs x6, x6, x17 adc x13, xzr, xzr adds x6, x6, x19 adc x13, x13, x20 # Add overflows at 2 * 64 mov x26, #15 bic x7, x7, x26, lsl 60 adds x6, x6, x12 adc x7, x7, x13 # Subtract top at 2 * 64 subs x6, x6, x8 sbcs x7, x7, x9 sbc x26, x26, x26 # Conditional sub order mov x16, #0xd3ed movk x16, #0x5cf5, lsl 16 movk x16, #0x631a, lsl 32 movk x16, #0x5812, lsl 48 mov x17, #0x9cd6 movk x17, #0xa2f7, lsl 16 movk x17, #0xf9de, lsl 32 movk x17, #0x14de, lsl 48 and x16, x16, x26 and x17, x17, x26 adds x4, x4, x16 adcs x5, x5, x17 and x26, x26, #0x1000000000000000 adcs x6, x6, xzr mov x25, #15 adc x7, x7, x26 bic x7, x7, x25, lsl 60 # Store result stp x4, x5, [x0] stp x6, x7, [x0, #16] ldr x17, [x29, #24] ldr x19, [x29, #32] ldp x20, x21, [x29, #40] ldp x22, x23, [x29, #56] ldp x24, x25, [x29, #72] ldr x26, [x29, #88] ldp x29, x30, [sp], #0x60 ret #ifndef __APPLE__ .size sc_muladd,.-sc_muladd #endif /* __APPLE__ */ #endif /* HAVE_ED25519 */ #endif /* !CURVE25519_SMALL || !ED25519_SMALL */ #endif /* HAVE_CURVE25519 || HAVE_ED25519 */ #endif /* __aarch64__ */ #endif /* WOLFSSL_ARMASM */ #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif #endif /* !WOLFSSL_ARMASM_INLINE */