#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# April 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+32 bytes shared table]. There is no
# experimental performance data available yet. The only approximation
# that can be made at this point is based on code size. Inner loop is
# 32 instructions long and on single-issue core should execute in <40
# cycles. Having verified that gcc 3.4 didn't unroll corresponding
# loop, this assembler loop body was found to be ~3x smaller than
# compiler-generated one...
#
# July 2010
#
# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
# Cortex A8 core and ~25 cycles per processed byte (which was observed
# to be ~3 times faster than gcc-generated code:-)
#
# February 2011
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~23.5 cycles per byte.
#
# March 2011
#
# Add NEON implementation featuring polynomial multiplication, i.e. no
# lookup tables involved. On Cortex A8 it was measured to process one
# byte in 15 cycles or 55% faster than integer-only code.
#
# April 2014
#
# Switch to multiplication algorithm suggested in paper referred
# below and combine it with reduction algorithm from x86 module.
# Performance improvement over previous version varies from 65% on
# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
# processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in 7.63,
# Snapdragon S4 - in 9.33.
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf

# ====================================================================
# Note about "528B" variant. In ARM case it makes lesser sense to
# implement it for following reasons:
#
# - performance improvement won't be anywhere near 50%, because 128-
#   bit shift operation is neatly fused with 128-bit xor here, and
#   "528B" variant would eliminate only 4-5 instructions out of 32
#   in the inner loop (meaning that estimated improvement is ~15%);
# - ARM-based systems are often embedded ones and extra memory
#   consumption might be unappreciated (for so little improvement);
#
# Byte order [in]dependence. =========================================
#
# Caller is expected to maintain specific *dword* order in Htable,
# namely with *least* significant dword of 128-bit value at *lower*
# address. This differs completely from C code and has everything to
# do with ldm instruction and order in which dwords are "consumed" by
# algorithm. *Byte* order within these dwords in turn is whatever
# *native* byte order on current platform. See gcm128.c for working
# example...

use strict;
use warnings;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output  = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop   : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.|          ? shift : undef;

if ($flavour && $flavour ne "void") {
    # Directory part of this script's path (with trailing separator), or
    # the empty string when $0 carries no directory component.  The match
    # is checked explicitly so that a stale/undefined $1 is never used.
    my $dir = $0 =~ m/(.*[\/\\])[^\/\\]+$/ ? $1 : "";

    # Look for the translator next to this script first, then in the
    # shared perlasm directory.
    my $xlate = "${dir}arm-xlate.pl";
    $xlate = "${dir}../../perlasm/arm-xlate.pl" unless -f $xlate;
    -f $xlate or die "can't locate arm-xlate.pl";

    # Everything printed below is piped through arm-xlate.pl.  An absent
    # output name is passed as an empty argument, exactly as before.
    my $out = defined $output ? $output : "";
    open STDOUT,"| \"$^X\" $xlate $flavour \"$out\""
        or die "can't call $xlate: $!";
} else {
    # No translation requested: write straight to the output file, if any.
    if ($output) {
        open STDOUT, '>', $output or die "can't open $output: $!";
    }
}

my $Xi="r0";	# argument block
my $Htbl="r1";
my $inp="r2";
my $len="r3";

my $Zll="r4";	# variables
my $Zlh="r5";
my $Zhl="r6";
my $Zhh="r7";
my $Tll="r8";
my $Tlh="r9";
my $Thl="r10";
my $Thh="r11";
my $nlo="r12";
################# r13 is stack pointer
my $nhi="r14";
################# r15 is program counter

my $rem_4bit=$inp;	# used in gcm_gmult_4bit
my $cnt=$len;

# Accumulates the generated assembly text; printed by the loop at the end.
my $code;

# Zsmash(@lines)
#
# Appends to $code the instructions that store the four Z words
# ($Zll,$Zlh,$Zhl,$Zhh) to [$Xi] at offsets 12, 8, 4 and 0.  Three
# variants are emitted under preprocessor control: rev+str for
# little-endian ARMv7+, plain str for big-endian, and byte-by-byte
# stores otherwise (using $Tlh/$Thl/$Thh as scratch).
#
# After each word's store sequence one element of @lines is appended as
# a tab-prefixed line, which lets the caller interleave unrelated
# instructions with the stores.  When @lines is exhausted an empty
# tab-only line is appended, which keeps the output identical to what
# the original code produced.
sub Zsmash {
    my @args=@_;
    my $i=12;
    for ($Zll,$Zlh,$Zhl,$Zhh) {
	$code.=<<___;
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	$_,$_
	str	$_,[$Xi,#$i]
#elif defined(__ARMEB__)
	str	$_,[$Xi,#$i]
#else
	mov	$Tlh,$_,lsr#8
	strb	$_,[$Xi,#$i+3]
	mov	$Thl,$_,lsr#16
	strb	$Tlh,[$Xi,#$i+2]
	mov	$Thh,$_,lsr#24
	strb	$Thl,[$Xi,#$i+1]
	strb	$Thh,[$Xi,#$i]
#endif
___
	my $extra = shift(@args);
	$code.="\t".(defined $extra ? $extra : "")."\n";
	$i-=4;
    }
}

# Integer-only ("4-bit" table) implementation.
#
# NOTE: the non-Thumb paths locate rem_4bit with pc-relative arithmetic
# ("sub ...,pc,#8+32" in rem_4bit_get and "sub r12,pc,#8+48" in
# gcm_ghash_4bit).  Those constants depend on the exact layout emitted
# here: the 32-byte rem_4bit table, immediately followed by the four
# instructions of rem_4bit_get, immediately followed by gcm_ghash_4bit.
# Do not insert, remove or reorder anything between those labels.
$code=<<___;
#include "arm_arch.h"

#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
#define	ldrplb	ldrbpl
#define	ldrneb	ldrbne
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

.text

.type	rem_4bit,%object
.align	5
rem_4bit:
.short	0x0000,0x1C20,0x3840,0x2460
.short	0x7080,0x6CA0,0x48C0,0x54E0
.short	0xE100,0xFD20,0xD940,0xC560
.short	0x9180,0x8DA0,0xA9C0,0xB5E0
.size	rem_4bit,.-rem_4bit

.type	rem_4bit_get,%function
rem_4bit_get:
#if defined(__thumb2__)
	adr	$rem_4bit,rem_4bit
#else
	sub	$rem_4bit,pc,#8+32	@ &rem_4bit
#endif
	b	.Lrem_4bit_got
	nop
	nop
.size	rem_4bit_get,.-rem_4bit_get

.global	gcm_ghash_4bit
.type	gcm_ghash_4bit,%function
.align	4
gcm_ghash_4bit:
#if defined(__thumb2__)
	adr	r12,rem_4bit
#else
	sub	r12,pc,#8+48		@ &rem_4bit
#endif
	add	$len,$inp,$len		@ $len to point at the end
	stmdb	sp!,{r3-r11,lr}		@ save $len/end too

	ldmia	r12,{r4-r11}		@ copy rem_4bit ...
	stmdb	sp!,{r4-r11}		@ ... to stack

	ldrb	$nlo,[$inp,#15]
	ldrb	$nhi,[$Xi,#15]
.Louter:
	eor	$nlo,$nlo,$nhi
	and	$nhi,$nlo,#0xf0
	and	$nlo,$nlo,#0x0f
	mov	$cnt,#14

	add	$Zhh,$Htbl,$nlo,lsl#4
	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
	add	$Thh,$Htbl,$nhi
	ldrb	$nlo,[$inp,#14]

	and	$nhi,$Zll,#0xf		@ rem
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	add	$nhi,$nhi,$nhi
	eor	$Zll,$Tll,$Zll,lsr#4
	ldrh	$Tll,[sp,$nhi]		@ rem_4bit[rem]
	eor	$Zll,$Zll,$Zlh,lsl#28
	ldrb	$nhi,[$Xi,#14]
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4
	eor	$nlo,$nlo,$nhi
	and	$nhi,$nlo,#0xf0
	and	$nlo,$nlo,#0x0f
	eor	$Zhh,$Zhh,$Tll,lsl#16

.Linner:
	add	$Thh,$Htbl,$nlo,lsl#4
	and	$nlo,$Zll,#0xf		@ rem
	subs	$cnt,$cnt,#1
	add	$nlo,$nlo,$nlo
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
	eor	$Zll,$Tll,$Zll,lsr#4
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	ldrh	$Tll,[sp,$nlo]		@ rem_4bit[rem]
	eor	$Zhl,$Thl,$Zhl,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	$nlo,[$inp,$cnt]
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4

	add	$Thh,$Htbl,$nhi
	and	$nhi,$Zll,#0xf		@ rem
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	add	$nhi,$nhi,$nhi
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	eor	$Zll,$Tll,$Zll,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	$Tll,[$Xi,$cnt]
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	ldrh	$Tlh,[sp,$nhi]
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
#ifdef	__thumb2__
	it	pl
#endif
	eorpl	$nlo,$nlo,$Tll
	eor	$Zhh,$Thh,$Zhh,lsr#4
#ifdef	__thumb2__
	itt	pl
#endif
	andpl	$nhi,$nlo,#0xf0
	andpl	$nlo,$nlo,#0x0f
	eor	$Zhh,$Zhh,$Tlh,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Linner

	ldr	$len,[sp,#32]		@ re-load $len/end
	add	$inp,$inp,#16
	mov	$nhi,$Zll
___
# Store Z back to Xi; the end-of-input compare and the conditional load
# of the next input byte are interleaved with the first two word stores.
Zsmash("cmp\t$inp,$len",
       "\n".
       "#ifdef\t__thumb2__\n".
       "\tit\tne\n".
       "#endif\n".
       "\tldrneb\t$nlo,[$inp,#15]");
$code.=<<___;
	bne	.Louter

	add	sp,sp,#36
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_ghash_4bit,.-gcm_ghash_4bit

.global	gcm_gmult_4bit
.type	gcm_gmult_4bit,%function
gcm_gmult_4bit:
	stmdb	sp!,{r4-r11,lr}
	ldrb	$nlo,[$Xi,#15]
	b	rem_4bit_get
.Lrem_4bit_got:
	and	$nhi,$nlo,#0xf0
	and	$nlo,$nlo,#0x0f
	mov	$cnt,#14

	add	$Zhh,$Htbl,$nlo,lsl#4
	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
	ldrb	$nlo,[$Xi,#14]

	add	$Thh,$Htbl,$nhi
	and	$nhi,$Zll,#0xf		@ rem
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	add	$nhi,$nhi,$nhi
	eor	$Zll,$Tll,$Zll,lsr#4
	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4
	and	$nhi,$nlo,#0xf0
	eor	$Zhh,$Zhh,$Tll,lsl#16
	and	$nlo,$nlo,#0x0f

.Loop:
	add	$Thh,$Htbl,$nlo,lsl#4
	and	$nlo,$Zll,#0xf		@ rem
	subs	$cnt,$cnt,#1
	add	$nlo,$nlo,$nlo
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
	eor	$Zll,$Tll,$Zll,lsr#4
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	ldrh	$Tll,[$rem_4bit,$nlo]	@ rem_4bit[rem]
	eor	$Zhl,$Thl,$Zhl,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	$nlo,[$Xi,$cnt]
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4

	add	$Thh,$Htbl,$nhi
	and	$nhi,$Zll,#0xf		@ rem
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	add	$nhi,$nhi,$nhi
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	eor	$Zll,$Tll,$Zll,lsr#4
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4
#ifdef	__thumb2__
	itt	pl
#endif
	andpl	$nhi,$nlo,#0xf0
	andpl	$nlo,$nlo,#0x0f
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Loop
___
# Store Z back to Xi with nothing interleaved.
Zsmash();
$code.=<<___;
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
___
{
# NEON register allocation.  "qN#lo"/"qN#hi" denote the low/high 64-bit
# halves of qN and are rewritten to d-register names by the output loop
# at the bottom of this file.
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$t3)=map("q$_",(8..11));
my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));

# clmul64x64($r, $a, $b)
#
# Appends to $code a 64x64-bit polynomial multiplication built from
# vmull.p8 partial products.
#   $r - q register receiving the result (also used as scratch);
#   $a - 64-bit (d register) operand;
#   $b - 64-bit operand, given as a d register or "qN#lo"/"qN#hi".
# Uses $t0-$t3 as scratch and expects the $k48/$k32/$k16 masks to be
# loaded by the calling code.
sub clmul64x64 {
my ($r,$a,$b)=@_;
$code.=<<___;
	vext.8		$t0#lo, $a, $a, #1	@ A1
	vmull.p8	$t0, $t0#lo, $b		@ F = A1*B
	vext.8		$r#lo, $b, $b, #1	@ B1
	vmull.p8	$r, $a, $r#lo		@ E = A*B1
	vext.8		$t1#lo, $a, $a, #2	@ A2
	vmull.p8	$t1, $t1#lo, $b		@ H = A2*B
	vext.8		$t3#lo, $b, $b, #2	@ B2
	vmull.p8	$t3, $a, $t3#lo		@ G = A*B2
	vext.8		$t2#lo, $a, $a, #3	@ A3
	veor		$t0, $t0, $r		@ L = E + F
	vmull.p8	$t2, $t2#lo, $b		@ J = A3*B
	vext.8		$r#lo, $b, $b, #3	@ B3
	veor		$t1, $t1, $t3		@ M = G + H
	vmull.p8	$r, $a, $r#lo		@ I = A*B3
	veor		$t0#lo, $t0#lo, $t0#hi	@ t0 = (L) (P0 + P1) << 8
	vand		$t0#hi, $t0#hi, $k48
	vext.8		$t3#lo, $b, $b, #4	@ B4
	veor		$t1#lo, $t1#lo, $t1#hi	@ t1 = (M) (P2 + P3) << 16
	vand		$t1#hi, $t1#hi, $k32
	vmull.p8	$t3, $a, $t3#lo		@ K = A*B4
	veor		$t2, $t2, $r		@ N = I + J
	veor		$t0#lo, $t0#lo, $t0#hi
	veor		$t1#lo, $t1#lo, $t1#hi
	veor		$t2#lo, $t2#lo, $t2#hi	@ t2 = (N) (P4 + P5) << 24
	vand		$t2#hi, $t2#hi, $k16
	vext.8		$t0, $t0, $t0, #15
	veor		$t3#lo, $t3#lo, $t3#hi	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	$t3#hi, #0
	vext.8		$t1, $t1, $t1, #14
	veor		$t2#lo, $t2#lo, $t2#hi
	vmull.p8	$r, $a, $b		@ D = A*B
	vext.8		$t3, $t3, $t3, #12
	vext.8		$t2, $t2, $t2, #13
	veor		$t0, $t0, $t1
	veor		$t2, $t2, $t3
	veor		$r, $r, $t0
	veor		$r, $r, $t2
___
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
gcm_init_neon:
	vld1.64		$IN#hi,[r1]!		@ load H
	vmov.i8		$t0,#0xe1
	vld1.64		$IN#lo,[r1]
	vshl.i64	$t0#hi,#57
	vshr.u64	$t0#lo,#63		@ t0=0xc2....01
	vdup.8		$t1,$IN#hi[7]
	vshr.u64	$Hlo,$IN#lo,#63
	vshr.s8		$t1,#7			@ broadcast carry bit
	vshl.i64	$IN,$IN,#1
	vand		$t0,$t0,$t1
	vorr		$IN#hi,$Hlo		@ H<<<=1
	veor		$IN,$IN,$t0		@ twisted H
	vstmia		r0,{$IN}

	ret					@ bx lr
.size	gcm_init_neon,.-gcm_init_neon

.global	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	vld1.64		$IN#hi,[$Xi]!		@ load Xi
	vld1.64		$IN#lo,[$Xi]!
	vmov.i64	$k48,#0x0000ffffffffffff
	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H
	vmov.i64	$k32,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	$IN,$IN
#endif
	vmov.i64	$k16,#0x000000000000ffff
	veor		$Hhl,$Hlo,$Hhi		@ Karatsuba pre-processing
	mov		$len,#16
	b		.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

.global	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	vld1.64		$Xl#hi,[$Xi]!		@ load Xi
	vld1.64		$Xl#lo,[$Xi]!
	vmov.i64	$k48,#0x0000ffffffffffff
	vldmia		$Htbl,{$Hlo-$Hhi}	@ load twisted H
	vmov.i64	$k32,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	$Xl,$Xl
#endif
	vmov.i64	$k16,#0x000000000000ffff
	veor		$Hhl,$Hlo,$Hhi		@ Karatsuba pre-processing

.Loop_neon:
	vld1.64		$IN#hi,[$inp]!		@ load inp
	vld1.64		$IN#lo,[$inp]!
#ifdef __ARMEL__
	vrev64.8	$IN,$IN
#endif
	veor		$IN,$Xl			@ inp^=Xi
.Lgmult_neon:
___
	clmul64x64($Xl,$Hlo,"$IN#lo");		# H.lo·Xi.lo
$code.=<<___;
	veor		$IN#lo,$IN#lo,$IN#hi	@ Karatsuba pre-processing
___
	clmul64x64($Xm,$Hhl,"$IN#lo");		# (H.lo+H.hi)·(Xi.lo+Xi.hi)
	clmul64x64($Xh,$Hhi,"$IN#hi");		# H.hi·Xi.hi
$code.=<<___;
	veor		$Xm,$Xm,$Xl		@ Karatsuba post-processing
	veor		$Xm,$Xm,$Xh
	veor		$Xl#hi,$Xl#hi,$Xm#lo
	veor		$Xh#lo,$Xh#lo,$Xm#hi	@ Xh|Xl - 256-bit result

	@ equivalent of reduction_avx from ghash-x86_64.pl
	vshl.i64	$t1,$Xl,#57		@ 1st phase
	vshl.i64	$t2,$Xl,#62
	veor		$t2,$t2,$t1		@
	vshl.i64	$t1,$Xl,#63
	veor		$t2, $t2, $t1		@
	veor		$Xl#hi,$Xl#hi,$t2#lo	@
	veor		$Xh#lo,$Xh#lo,$t2#hi

	vshr.u64	$t2,$Xl,#1		@ 2nd phase
	veor		$Xh,$Xh,$Xl
	veor		$Xl,$Xl,$t2		@
	vshr.u64	$t2,$t2,#6
	vshr.u64	$Xl,$Xl,#1		@
	veor		$Xl,$Xl,$Xh		@
	veor		$Xl,$Xl,$t2		@

	subs		$len,#16
	bne		.Loop_neon

#ifdef __ARMEL__
	vrev64.8	$Xl,$Xl
#endif
	sub		$Xi,#16
	vst1.64		$Xl#hi,[$Xi]!		@ write out Xi
	vst1.64		$Xl#lo,[$Xi]

	ret					@ bx lr
.size	gcm_ghash_neon,.-gcm_ghash_neon
#endif
___
}
$code.=<<___;
.asciz  "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___

# Post-process and emit the accumulated assembly line by line.  Exactly
# one of the three rewrites below is applied to any given line (they are
# chained with "or"):
#   - "qN#lo"/"qN#hi" becomes the corresponding d register (d2N / d2N+1);
#   - otherwise "ret" becomes "bx lr";
#   - otherwise a literal "bx lr" becomes its raw encoding.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
	s/\bret\b/bx\tlr/go						or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;    # make it possible to compile with -march=armv4

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush