1bbbf1280Sopenharmony_ci/* 2bbbf1280Sopenharmony_ci * strncmp - compare two strings 3bbbf1280Sopenharmony_ci * 4bbbf1280Sopenharmony_ci * Copyright (c) 2013-2021, Arm Limited. 5bbbf1280Sopenharmony_ci * SPDX-License-Identifier: MIT 6bbbf1280Sopenharmony_ci */ 7bbbf1280Sopenharmony_ci 8bbbf1280Sopenharmony_ci/* Assumptions: 9bbbf1280Sopenharmony_ci * 10bbbf1280Sopenharmony_ci * ARMv8-a, AArch64 11bbbf1280Sopenharmony_ci */ 12bbbf1280Sopenharmony_ci 13bbbf1280Sopenharmony_ci#include "../asmdefs.h" 14bbbf1280Sopenharmony_ci 15bbbf1280Sopenharmony_ci#define REP8_01 0x0101010101010101 16bbbf1280Sopenharmony_ci#define REP8_7f 0x7f7f7f7f7f7f7f7f 17bbbf1280Sopenharmony_ci 18bbbf1280Sopenharmony_ci/* Parameters and result. */ 19bbbf1280Sopenharmony_ci#define src1 x0 20bbbf1280Sopenharmony_ci#define src2 x1 21bbbf1280Sopenharmony_ci#define limit x2 22bbbf1280Sopenharmony_ci#define result x0 23bbbf1280Sopenharmony_ci 24bbbf1280Sopenharmony_ci/* Internal variables. */ 25bbbf1280Sopenharmony_ci#define data1 x3 26bbbf1280Sopenharmony_ci#define data1w w3 27bbbf1280Sopenharmony_ci#define data2 x4 28bbbf1280Sopenharmony_ci#define data2w w4 29bbbf1280Sopenharmony_ci#define has_nul x5 30bbbf1280Sopenharmony_ci#define diff x6 31bbbf1280Sopenharmony_ci#define syndrome x7 32bbbf1280Sopenharmony_ci#define tmp1 x8 33bbbf1280Sopenharmony_ci#define tmp2 x9 34bbbf1280Sopenharmony_ci#define tmp3 x10 35bbbf1280Sopenharmony_ci#define zeroones x11 36bbbf1280Sopenharmony_ci#define pos x12 37bbbf1280Sopenharmony_ci#define mask x13 38bbbf1280Sopenharmony_ci#define endloop x14 39bbbf1280Sopenharmony_ci#define count mask 40bbbf1280Sopenharmony_ci#define offset pos 41bbbf1280Sopenharmony_ci#define neg_offset x15 42bbbf1280Sopenharmony_ci 43bbbf1280Sopenharmony_ci/* Define endian dependent shift operations. 44bbbf1280Sopenharmony_ci On big-endian early bytes are at MSB and on little-endian LSB. 45bbbf1280Sopenharmony_ci LS_FW means shifting towards early bytes. 46bbbf1280Sopenharmony_ci LS_BK means shifting towards later bytes. 47bbbf1280Sopenharmony_ci */ 48bbbf1280Sopenharmony_ci#ifdef __AARCH64EB__ 49bbbf1280Sopenharmony_ci#define LS_FW lsl 50bbbf1280Sopenharmony_ci#define LS_BK lsr 51bbbf1280Sopenharmony_ci#else 52bbbf1280Sopenharmony_ci#define LS_FW lsr 53bbbf1280Sopenharmony_ci#define LS_BK lsl 54bbbf1280Sopenharmony_ci#endif 55bbbf1280Sopenharmony_ci 56bbbf1280Sopenharmony_ciENTRY (__strncmp_aarch64_mte) 57bbbf1280Sopenharmony_ci PTR_ARG (0) 58bbbf1280Sopenharmony_ci PTR_ARG (1) 59bbbf1280Sopenharmony_ci SIZE_ARG (2) 60bbbf1280Sopenharmony_ci cbz limit, L(ret0) 61bbbf1280Sopenharmony_ci eor tmp1, src1, src2 62bbbf1280Sopenharmony_ci mov zeroones, #REP8_01 63bbbf1280Sopenharmony_ci tst tmp1, #7 64bbbf1280Sopenharmony_ci and count, src1, #7 65bbbf1280Sopenharmony_ci b.ne L(misaligned8) 66bbbf1280Sopenharmony_ci cbnz count, L(mutual_align) 67bbbf1280Sopenharmony_ci 68bbbf1280Sopenharmony_ci /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 69bbbf1280Sopenharmony_ci (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and 70bbbf1280Sopenharmony_ci can be done in parallel across the entire word. */ 71bbbf1280Sopenharmony_ci .p2align 4 72bbbf1280Sopenharmony_ciL(loop_aligned): 73bbbf1280Sopenharmony_ci ldr data1, [src1], #8 74bbbf1280Sopenharmony_ci ldr data2, [src2], #8 75bbbf1280Sopenharmony_ciL(start_realigned): 76bbbf1280Sopenharmony_ci subs limit, limit, #8 77bbbf1280Sopenharmony_ci sub tmp1, data1, zeroones 78bbbf1280Sopenharmony_ci orr tmp2, data1, #REP8_7f 79bbbf1280Sopenharmony_ci eor diff, data1, data2 /* Non-zero if differences found. */ 80bbbf1280Sopenharmony_ci csinv endloop, diff, xzr, hi /* Last Dword or differences. */ 81bbbf1280Sopenharmony_ci bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ 82bbbf1280Sopenharmony_ci ccmp endloop, #0, #0, eq 83bbbf1280Sopenharmony_ci b.eq L(loop_aligned) 84bbbf1280Sopenharmony_ci /* End of main loop */ 85bbbf1280Sopenharmony_ci 86bbbf1280Sopenharmony_ciL(full_check): 87bbbf1280Sopenharmony_ci#ifndef __AARCH64EB__ 88bbbf1280Sopenharmony_ci orr syndrome, diff, has_nul 89bbbf1280Sopenharmony_ci add limit, limit, 8 /* Rewind limit to before last subs. */ 90bbbf1280Sopenharmony_ciL(syndrome_check): 91bbbf1280Sopenharmony_ci /* Limit was reached. Check if the NUL byte or the difference 92bbbf1280Sopenharmony_ci is before the limit. */ 93bbbf1280Sopenharmony_ci rev syndrome, syndrome 94bbbf1280Sopenharmony_ci rev data1, data1 95bbbf1280Sopenharmony_ci clz pos, syndrome 96bbbf1280Sopenharmony_ci rev data2, data2 97bbbf1280Sopenharmony_ci lsl data1, data1, pos 98bbbf1280Sopenharmony_ci cmp limit, pos, lsr #3 99bbbf1280Sopenharmony_ci lsl data2, data2, pos 100bbbf1280Sopenharmony_ci /* But we need to zero-extend (char is unsigned) the value and then 101bbbf1280Sopenharmony_ci perform a signed 32-bit subtraction. */ 102bbbf1280Sopenharmony_ci lsr data1, data1, #56 103bbbf1280Sopenharmony_ci sub result, data1, data2, lsr #56 104bbbf1280Sopenharmony_ci csel result, result, xzr, hi 105bbbf1280Sopenharmony_ci ret 106bbbf1280Sopenharmony_ci#else 107bbbf1280Sopenharmony_ci /* Not reached the limit, must have found the end or a diff. */ 108bbbf1280Sopenharmony_ci tbz limit, #63, L(not_limit) 109bbbf1280Sopenharmony_ci add tmp1, limit, 8 110bbbf1280Sopenharmony_ci cbz limit, L(not_limit) 111bbbf1280Sopenharmony_ci 112bbbf1280Sopenharmony_ci lsl limit, tmp1, #3 /* Bits -> bytes. */ 113bbbf1280Sopenharmony_ci mov mask, #~0 114bbbf1280Sopenharmony_ci lsr mask, mask, limit 115bbbf1280Sopenharmony_ci bic data1, data1, mask 116bbbf1280Sopenharmony_ci bic data2, data2, mask 117bbbf1280Sopenharmony_ci 118bbbf1280Sopenharmony_ci /* Make sure that the NUL byte is marked in the syndrome. */ 119bbbf1280Sopenharmony_ci orr has_nul, has_nul, mask 120bbbf1280Sopenharmony_ci 121bbbf1280Sopenharmony_ciL(not_limit): 122bbbf1280Sopenharmony_ci /* For big-endian we cannot use the trick with the syndrome value 123bbbf1280Sopenharmony_ci as carry-propagation can corrupt the upper bits if the trailing 124bbbf1280Sopenharmony_ci bytes in the string contain 0x01. */ 125bbbf1280Sopenharmony_ci /* However, if there is no NUL byte in the dword, we can generate 126bbbf1280Sopenharmony_ci the result directly. We can't just subtract the bytes as the 127bbbf1280Sopenharmony_ci MSB might be significant. */ 128bbbf1280Sopenharmony_ci cbnz has_nul, 1f 129bbbf1280Sopenharmony_ci cmp data1, data2 130bbbf1280Sopenharmony_ci cset result, ne 131bbbf1280Sopenharmony_ci cneg result, result, lo 132bbbf1280Sopenharmony_ci ret 133bbbf1280Sopenharmony_ci1: 134bbbf1280Sopenharmony_ci /* Re-compute the NUL-byte detection, using a byte-reversed value. */ 135bbbf1280Sopenharmony_ci rev tmp3, data1 136bbbf1280Sopenharmony_ci sub tmp1, tmp3, zeroones 137bbbf1280Sopenharmony_ci orr tmp2, tmp3, #REP8_7f 138bbbf1280Sopenharmony_ci bic has_nul, tmp1, tmp2 139bbbf1280Sopenharmony_ci rev has_nul, has_nul 140bbbf1280Sopenharmony_ci orr syndrome, diff, has_nul 141bbbf1280Sopenharmony_ci clz pos, syndrome 142bbbf1280Sopenharmony_ci /* The most-significant-non-zero bit of the syndrome marks either the 143bbbf1280Sopenharmony_ci first bit that is different, or the top bit of the first zero byte. 144bbbf1280Sopenharmony_ci Shifting left now will bring the critical information into the 145bbbf1280Sopenharmony_ci top bits. */ 146bbbf1280Sopenharmony_ciL(end_quick): 147bbbf1280Sopenharmony_ci lsl data1, data1, pos 148bbbf1280Sopenharmony_ci lsl data2, data2, pos 149bbbf1280Sopenharmony_ci /* But we need to zero-extend (char is unsigned) the value and then 150bbbf1280Sopenharmony_ci perform a signed 32-bit subtraction. */ 151bbbf1280Sopenharmony_ci lsr data1, data1, #56 152bbbf1280Sopenharmony_ci sub result, data1, data2, lsr #56 153bbbf1280Sopenharmony_ci ret 154bbbf1280Sopenharmony_ci#endif 155bbbf1280Sopenharmony_ci 156bbbf1280Sopenharmony_ciL(mutual_align): 157bbbf1280Sopenharmony_ci /* Sources are mutually aligned, but are not currently at an 158bbbf1280Sopenharmony_ci alignment boundary. Round down the addresses and then mask off 159bbbf1280Sopenharmony_ci the bytes that precede the start point. 160bbbf1280Sopenharmony_ci We also need to adjust the limit calculations, but without 161bbbf1280Sopenharmony_ci overflowing if the limit is near ULONG_MAX. */ 162bbbf1280Sopenharmony_ci bic src1, src1, #7 163bbbf1280Sopenharmony_ci bic src2, src2, #7 164bbbf1280Sopenharmony_ci ldr data1, [src1], #8 165bbbf1280Sopenharmony_ci neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ 166bbbf1280Sopenharmony_ci ldr data2, [src2], #8 167bbbf1280Sopenharmony_ci mov tmp2, #~0 168bbbf1280Sopenharmony_ci LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */ 169bbbf1280Sopenharmony_ci /* Adjust the limit and ensure it doesn't overflow. */ 170bbbf1280Sopenharmony_ci adds limit, limit, count 171bbbf1280Sopenharmony_ci csinv limit, limit, xzr, lo 172bbbf1280Sopenharmony_ci orr data1, data1, tmp2 173bbbf1280Sopenharmony_ci orr data2, data2, tmp2 174bbbf1280Sopenharmony_ci b L(start_realigned) 175bbbf1280Sopenharmony_ci 176bbbf1280Sopenharmony_ci .p2align 4 177bbbf1280Sopenharmony_ci /* Don't bother with dwords for up to 16 bytes. */ 178bbbf1280Sopenharmony_ciL(misaligned8): 179bbbf1280Sopenharmony_ci cmp limit, #16 180bbbf1280Sopenharmony_ci b.hs L(try_misaligned_words) 181bbbf1280Sopenharmony_ci 182bbbf1280Sopenharmony_ciL(byte_loop): 183bbbf1280Sopenharmony_ci /* Perhaps we can do better than this. */ 184bbbf1280Sopenharmony_ci ldrb data1w, [src1], #1 185bbbf1280Sopenharmony_ci ldrb data2w, [src2], #1 186bbbf1280Sopenharmony_ci subs limit, limit, #1 187bbbf1280Sopenharmony_ci ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ 188bbbf1280Sopenharmony_ci ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ 189bbbf1280Sopenharmony_ci b.eq L(byte_loop) 190bbbf1280Sopenharmony_ciL(done): 191bbbf1280Sopenharmony_ci sub result, data1, data2 192bbbf1280Sopenharmony_ci ret 193bbbf1280Sopenharmony_ci /* Align the SRC1 to a dword by doing a bytewise compare and then do 194bbbf1280Sopenharmony_ci the dword loop. */ 195bbbf1280Sopenharmony_ciL(try_misaligned_words): 196bbbf1280Sopenharmony_ci cbz count, L(src1_aligned) 197bbbf1280Sopenharmony_ci 198bbbf1280Sopenharmony_ci neg count, count 199bbbf1280Sopenharmony_ci and count, count, #7 200bbbf1280Sopenharmony_ci sub limit, limit, count 201bbbf1280Sopenharmony_ci 202bbbf1280Sopenharmony_ciL(page_end_loop): 203bbbf1280Sopenharmony_ci ldrb data1w, [src1], #1 204bbbf1280Sopenharmony_ci ldrb data2w, [src2], #1 205bbbf1280Sopenharmony_ci cmp data1w, #1 206bbbf1280Sopenharmony_ci ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ 207bbbf1280Sopenharmony_ci b.ne L(done) 208bbbf1280Sopenharmony_ci subs count, count, #1 209bbbf1280Sopenharmony_ci b.hi L(page_end_loop) 210bbbf1280Sopenharmony_ci 211bbbf1280Sopenharmony_ci /* The following diagram explains the comparison of misaligned strings. 212bbbf1280Sopenharmony_ci The bytes are shown in natural order. For little-endian, it is 213bbbf1280Sopenharmony_ci reversed in the registers. The "x" bytes are before the string. 214bbbf1280Sopenharmony_ci The "|" separates data that is loaded at one time. 215bbbf1280Sopenharmony_ci src1 | a a a a a a a a | b b b c c c c c | . . . 216bbbf1280Sopenharmony_ci src2 | x x x x x a a a a a a a a b b b | c c c c c . . . 217bbbf1280Sopenharmony_ci 218bbbf1280Sopenharmony_ci After shifting in each step, the data looks like this: 219bbbf1280Sopenharmony_ci STEP_A STEP_B STEP_C 220bbbf1280Sopenharmony_ci data1 a a a a a a a a b b b c c c c c b b b c c c c c 221bbbf1280Sopenharmony_ci data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c 222bbbf1280Sopenharmony_ci 223bbbf1280Sopenharmony_ci The bytes with "0" are eliminated from the syndrome via mask. 224bbbf1280Sopenharmony_ci 225bbbf1280Sopenharmony_ci Align SRC2 down to 16 bytes. This way we can read 16 bytes at a 226bbbf1280Sopenharmony_ci time from SRC2. The comparison happens in 3 steps. After each step 227bbbf1280Sopenharmony_ci the loop can exit, or read from SRC1 or SRC2. */ 228bbbf1280Sopenharmony_ciL(src1_aligned): 229bbbf1280Sopenharmony_ci /* Calculate offset from 8 byte alignment to string start in bits. No 230bbbf1280Sopenharmony_ci need to mask offset since shifts are ignoring upper bits. */ 231bbbf1280Sopenharmony_ci lsl offset, src2, #3 232bbbf1280Sopenharmony_ci bic src2, src2, #0xf 233bbbf1280Sopenharmony_ci mov mask, -1 234bbbf1280Sopenharmony_ci neg neg_offset, offset 235bbbf1280Sopenharmony_ci ldr data1, [src1], #8 236bbbf1280Sopenharmony_ci ldp tmp1, tmp2, [src2], #16 237bbbf1280Sopenharmony_ci LS_BK mask, mask, neg_offset 238bbbf1280Sopenharmony_ci and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */ 239bbbf1280Sopenharmony_ci /* Skip the first compare if data in tmp1 is irrelevant. */ 240bbbf1280Sopenharmony_ci tbnz offset, 6, L(misaligned_mid_loop) 241bbbf1280Sopenharmony_ci 242bbbf1280Sopenharmony_ciL(loop_misaligned): 243bbbf1280Sopenharmony_ci /* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/ 244bbbf1280Sopenharmony_ci LS_FW data2, tmp1, offset 245bbbf1280Sopenharmony_ci LS_BK tmp1, tmp2, neg_offset 246bbbf1280Sopenharmony_ci subs limit, limit, #8 247bbbf1280Sopenharmony_ci orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/ 248bbbf1280Sopenharmony_ci sub has_nul, data1, zeroones 249bbbf1280Sopenharmony_ci eor diff, data1, data2 /* Non-zero if differences found. */ 250bbbf1280Sopenharmony_ci orr tmp3, data1, #REP8_7f 251bbbf1280Sopenharmony_ci csinv endloop, diff, xzr, hi /* If limit, set to all ones. */ 252bbbf1280Sopenharmony_ci bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */ 253bbbf1280Sopenharmony_ci orr tmp3, endloop, has_nul 254bbbf1280Sopenharmony_ci cbnz tmp3, L(full_check) 255bbbf1280Sopenharmony_ci 256bbbf1280Sopenharmony_ci ldr data1, [src1], #8 257bbbf1280Sopenharmony_ciL(misaligned_mid_loop): 258bbbf1280Sopenharmony_ci /* STEP_B: Compare first part of data1 to second part of tmp2. */ 259bbbf1280Sopenharmony_ci LS_FW data2, tmp2, offset 260bbbf1280Sopenharmony_ci#ifdef __AARCH64EB__ 261bbbf1280Sopenharmony_ci /* For big-endian we do a byte reverse to avoid carry-propagation 262bbbf1280Sopenharmony_ci problem described above. This way we can reuse the has_nul in the 263bbbf1280Sopenharmony_ci next step and also use syndrome value trick at the end. */ 264bbbf1280Sopenharmony_ci rev tmp3, data1 265bbbf1280Sopenharmony_ci #define data1_fixed tmp3 266bbbf1280Sopenharmony_ci#else 267bbbf1280Sopenharmony_ci #define data1_fixed data1 268bbbf1280Sopenharmony_ci#endif 269bbbf1280Sopenharmony_ci sub has_nul, data1_fixed, zeroones 270bbbf1280Sopenharmony_ci orr tmp3, data1_fixed, #REP8_7f 271bbbf1280Sopenharmony_ci eor diff, data2, data1 /* Non-zero if differences found. */ 272bbbf1280Sopenharmony_ci bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */ 273bbbf1280Sopenharmony_ci#ifdef __AARCH64EB__ 274bbbf1280Sopenharmony_ci rev has_nul, has_nul 275bbbf1280Sopenharmony_ci#endif 276bbbf1280Sopenharmony_ci cmp limit, neg_offset, lsr #3 277bbbf1280Sopenharmony_ci orr syndrome, diff, has_nul 278bbbf1280Sopenharmony_ci bic syndrome, syndrome, mask /* Ignore later bytes. */ 279bbbf1280Sopenharmony_ci csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ 280bbbf1280Sopenharmony_ci cbnz tmp3, L(syndrome_check) 281bbbf1280Sopenharmony_ci 282bbbf1280Sopenharmony_ci /* STEP_C: Compare second part of data1 to first part of tmp1. */ 283bbbf1280Sopenharmony_ci ldp tmp1, tmp2, [src2], #16 284bbbf1280Sopenharmony_ci cmp limit, #8 285bbbf1280Sopenharmony_ci LS_BK data2, tmp1, neg_offset 286bbbf1280Sopenharmony_ci eor diff, data2, data1 /* Non-zero if differences found. */ 287bbbf1280Sopenharmony_ci orr syndrome, diff, has_nul 288bbbf1280Sopenharmony_ci and syndrome, syndrome, mask /* Ignore earlier bytes. */ 289bbbf1280Sopenharmony_ci csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */ 290bbbf1280Sopenharmony_ci cbnz tmp3, L(syndrome_check) 291bbbf1280Sopenharmony_ci 292bbbf1280Sopenharmony_ci ldr data1, [src1], #8 293bbbf1280Sopenharmony_ci sub limit, limit, #8 294bbbf1280Sopenharmony_ci b L(loop_misaligned) 295bbbf1280Sopenharmony_ci 296bbbf1280Sopenharmony_ci#ifdef __AARCH64EB__ 297bbbf1280Sopenharmony_ciL(syndrome_check): 298bbbf1280Sopenharmony_ci clz pos, syndrome 299bbbf1280Sopenharmony_ci cmp pos, limit, lsl #3 300bbbf1280Sopenharmony_ci b.lo L(end_quick) 301bbbf1280Sopenharmony_ci#endif 302bbbf1280Sopenharmony_ci 303bbbf1280Sopenharmony_ciL(ret0): 304bbbf1280Sopenharmony_ci mov result, #0 305bbbf1280Sopenharmony_ci ret 306bbbf1280Sopenharmony_ciEND(__strncmp_aarch64_mte) 307bbbf1280Sopenharmony_ci 308