#! /usr/bin/env perl
# Copyright 2010-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Performance
# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
# and are expressed in cycles per processed byte, less is better:
#
#                 gcc 3.3.x     cc 5.2        this assembler
#
# 32-bit build    81.4          43.3          12.6    (+546%/+244%)
# 64-bit build    20.2          21.2          12.6    (+60%/+68%)
#
# Here is data collected on UltraSPARC T1 system running Linux:
#
#                 gcc 4.4.1                   this assembler
#
# 32-bit build    566                         50      (+1000%)
# 64-bit build    56                          50      (+12%)
#
# I don't quite understand why difference between 32-bit and 64-bit
# compiler-generated code is so big. Compilers *were* instructed to
# generate code for UltraSPARC and should have used 64-bit registers
# for Z vector (see C code) even in 32-bit build... Oh well, it only
# means more impressive improvement coefficients for this assembler
# module;-) Loops are aggressively modulo-scheduled in respect to
# references to input data and Z.hi updates to achieve 12 cycles
# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
#
# October 2012
#
# Add VIS3 lookup-table-free implementation using polynomial
# multiplication xmulx[hi] and extended addition addxc[cc]
# instructions.
# 4.52/7.63x improvement on T3/T4 or in absolute
# terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark
# saturates at ~15.5x single-process result on 8-core processor,
# or ~20.5GBps per 2.85GHz socket.

# The last command-line argument, if any, names the output file; with no
# argument the generated assembly goes to the inherited STDOUT.  Use the
# three-argument open and fail right here, naming the file, instead of
# letting a failed open surface only when STDOUT is closed at the end.
$output=pop;
if ($output) {
	open STDOUT,">",$output or die "can't open $output: $!";
}

# Emitted verbatim into the assembly as symbolic constants; presumably
# resolved by the C preprocessor via crypto/sparc_arch.h -- confirm.
$frame="STACK_FRAME";
$bias="STACK_BIAS";

$Zhi="%o0";	# 64-bit values
$Zlo="%o1";
$Thi="%o2";
$Tlo="%o3";
$rem="%o4";
$tmp="%o5";

$nhi="%l0";	# small values and pointers
$nlo="%l1";
$xi0="%l2";
$xi1="%l3";
$rem_4bit="%l4";
$remi="%l5";
$Htblo="%l6";
$cnt="%l7";

$Xi="%i0";	# input argument block
$Htbl="%i1";
$inp="%i2";
$len="%i3";

# Shared rem_4bit table followed by gcm_ghash_4bit.  Expressions in
# backticks are evaluated by the post-processing loop at the end of this
# file before the text is printed.
$code.=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

#ifdef  __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

.align	64
rem_4bit:
	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.type	rem_4bit,#object
.size	rem_4bit,(.-rem_4bit)

.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
	save	%sp,-$frame,%sp
	ldub	[$inp+15],$nlo
	ldub	[$Xi+15],$xi0
	ldub	[$Xi+14],$xi1
	add	$len,$inp,$len
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

.Louter:
	xor	$xi0,$nlo,$nlo
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$inp+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	xor	$xi1,$nlo,$nlo
	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lghash_inner
	sll	$nlo,4,$nlo
.align	32
.Lghash_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	ldub	[$Xi+$cnt],$xi1
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$xi1,$nlo,$nlo
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lghash_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi

	add	$inp,16,$inp
	cmp	$inp,$len
	be,pn	SIZE_T_CC,.Ldone
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+15],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]
	srl	$Zlo,8,$xi1
	and	$Zlo,0xff,$xi0
	ba	.Louter
	and	$xi1,0xff,$xi1
.align	32
.Ldone:
	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_ghash_4bit,#function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
___

# gcm_gmult_4bit below references only $Xi and $Htbl; the input pointer
# and length names are cleared here (presumably so a stray reference in
# the next template does not silently pick up a stale register name).
undef $inp;
undef $len;

$code.=<<___;
.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
	save	%sp,-$frame,%sp
	ldub	[$Xi+15],$nlo
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$Xi+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lgmult_inner
	sll	$nlo,4,$nlo
.align	32
.Lgmult_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$Xi+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lgmult_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_gmult_4bit,#function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
___

{{{
# Straightforward 128x128-bit multiplication using Karatsuba algorithm
# followed by pair of 64-bit reductions [with a shortcut in first one,
# which allowed to break dependency between reductions and remove one
# multiplication from critical path]. While it might be suboptimal
# with regard to sheer number of multiplications, other methods [such
# as aggregate reduction] would require more 64-bit registers, which
# we don't have in 32-bit application context.

# Incoming arguments live in %i0..%i3.
($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));

# Working registers: %o0-%o5 and %o7, then %g1-%g5.
($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
	(map("%o$_",(0..5,7)),map("%g$_",(1..5)));

# Only the first two generated names are consumed: %l0 and %l1.
($shl,$shr)=map("%l$_",(0..7));

# For details regarding "twisted H" see ghash-x86.pl.
$code.=<<___;
.globl	gcm_init_vis3
.align	32
gcm_init_vis3:
	save	%sp,-$frame,%sp

	ldx	[%i1+0],$Hhi
	ldx	[%i1+8],$Hlo
	mov	0xE1,$Xhi
	mov	1,$Xlo
	sllx	$Xhi,57,$Xhi
	srax	$Hhi,63,$C0		! broadcast carry
	addcc	$Hlo,$Hlo,$Hlo		! H<<=1
	addxc	$Hhi,$Hhi,$Hhi
	and	$C0,$Xlo,$Xlo
	and	$C0,$Xhi,$Xhi
	xor	$Xlo,$Hlo,$Hlo
	xor	$Xhi,$Hhi,$Hhi
	stx	$Hlo,[%i0+8]		! save twisted H
	stx	$Hhi,[%i0+0]

	sethi	%hi(0xA0406080),$V
	sethi	%hi(0x20C0E000),%l0
	or	$V,%lo(0xA0406080),$V
	or	%l0,%lo(0x20C0E000),%l0
	sllx	$V,32,$V
	or	%l0,$V,$V		! (0xE0·i)&0xff=0xA040608020C0E000
	stx	$V,[%i0+16]

	ret
	restore
.type	gcm_init_vis3,#function
.size	gcm_init_vis3,.-gcm_init_vis3

.globl	gcm_gmult_vis3
.align	32
gcm_gmult_vis3:
	save	%sp,-$frame,%sp

	ldx	[$Xip+8],$Xlo		! load Xi
	ldx	[$Xip+0],$Xhi
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1

	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_gmult_vis3,#function
.size	gcm_gmult_vis3,.-gcm_gmult_vis3

.globl	gcm_ghash_vis3
.align	32
gcm_ghash_vis3:
	save	%sp,-$frame,%sp
	nop
	srln	$len,0,$len		! needed on v8+, "nop" on v9

	ldx	[$Xip+8],$C2		! load Xi
	ldx	[$Xip+0],$C3
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

	and	$inp,7,$shl
	andn	$inp,7,$inp
	sll	$shl,3,$shl
	prefetch [$inp+63], 20
	sub	%g0,$shl,$shr

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
.Loop:
	ldx	[$inp+8],$Xlo
	brz,pt	$shl,1f
	ldx	[$inp+0],$Xhi

	ldx	[$inp+16],$C1		! align data
	srlx	$Xlo,$shr,$C0
	sllx	$Xlo,$shl,$Xlo
	sllx	$Xhi,$shl,$Xhi
	srlx	$C1,$shr,$C1
	or	$C0,$Xhi,$Xhi
	or	$C1,$Xlo,$Xlo
1:
	add	$inp,16,$inp
	sub	$len,16,$len
	xor	$C2,$Xlo,$Xlo
	xor	$C3,$Xhi,$Xhi
	prefetch [$inp+63], 20

	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1

	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	brnz,pt	$len,.Loop
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_ghash_vis3,#function
.size	gcm_ghash_vis3,.-gcm_ghash_vis3
___
}}}
$code.=<<___;
.asciz	"GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___


# Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# Idea is to reserve for option to produce "universal" binary and let
# programmer detect if current CPU is VIS capable at run-time.
# unvis3(mnemonic, rs1, rs2, rd)
#
# Hand-assemble one three-operand VIS3 instruction so the output can be
# built without VIS3 support in the assembler.
#
# Arguments are the mnemonic and the rs1, rs2 and rd operands exactly as
# they appear in the assembly text, e.g. ("xmulx", "%o3", "%o1", "%g1").
#
# Returns ".word\t0x<8 hex digits> !<original instruction text>" when the
# mnemonic is one of addxc/addxccc/xmulx/xmulxhi and every operand is an
# integer register %g/%o/%l/%i with index 0-7.  In every other case the
# instruction text "mnemonic\trs1,rs2,rd" is returned unchanged.
sub unvis3 {
    my ($mnemonic, $rs1, $rs2, $rd) = @_;

    # Register number is the bank base plus the index within the bank.
    my %regbase = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
    my %visopf  = ( "addxc"   => 0x011,
                    "addxccc" => 0x013,
                    "xmulx"   => 0x115,
                    "xmulxhi" => 0x116 );

    # Captured before the operands are overwritten with register numbers.
    my $ref = "$mnemonic\t$rs1,$rs2,$rd";

    my $opf = $visopf{$mnemonic};
    return $ref unless $opf;

    # The loop variable aliases $rs1/$rs2/$rd, so assigning to it converts
    # each operand in place.  The pattern is anchored and limited to
    # indices 0-7: an operand such as "%o8" must not be folded into the
    # next register bank and silently encoded as a different register.
    foreach my $reg ($rs1, $rs2, $rd) {
        return $ref unless $reg =~ /\A%([goli])([0-7])\z/;
        $reg = $regbase{$1} + $2;
    }

    return sprintf ".word\t0x%08x !%s",
                   0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
                   $ref;
}

# Post-process the accumulated template line by line and print it.
foreach my $line (split("\n",$code)) {
    # Replace each `expr` with the value of the Perl expression.
    $line =~ s/\`([^\`]*)\`/eval $1/ge;

    # Replace VIS3 instructions with their .word encodings.
    $line =~ s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/unvis3($1,$2,$3,$4)/ge;

    print $line,"\n";
}

# Buffered write errors only show up at close time, so check it.
close STDOUT or die "error closing STDOUT: $!";