/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by
 * Linaro, which can be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Fill in the buffer with character c (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - buf
 *	x1 - c
 *	x2 - n
 * Returns:
 *	x0 - buf
 *
 * Register usage:
 *	x0 is never written, so it still holds buf on every return path.
 *	x2-x9 are used as working registers (see the aliases below) and the
 *	condition flags are modified. The stack is not touched.
 *
 * Strategy:
 *	n <= 15		One store per set bit of n (8, 4, 2, 1 bytes).
 *	n >= 16		One unaligned 16-byte store covers the head, dst is
 *			then rounded up to a 16-byte boundary. Whole 64-byte
 *			groups are written by the loop at .Lnot_short, the
 *			remaining 0..63 bytes by .Ltail63.
 *	c == 0		With at least 128 bytes left after head alignment,
 *			DC ZVA is used when DCZID_EL0 allows it and the block
 *			size is a multiple of 64 bytes.
 */

dstin		.req	x0
val		.req	w1
count		.req	x2
tmp1		.req	x3
tmp1w		.req	w3
tmp2		.req	x4
tmp2w		.req	w4
zva_len_x	.req	x5
zva_len		.req	w5
zva_bits_x	.req	x6

A_l		.req	x7
A_lw		.req	w7
dst		.req	x8
tmp3w		.req	w9
tmp3		.req	x9

SYM_FUNC_START_ALIAS(__memset)
SYM_FUNC_START_WEAK_PI(memset)
	mov	dst, dstin		/* Preserve return value. */
	/* Replicate the low byte of val into all eight bytes of A_l. */
	and	A_lw, val, #255
	orr	A_lw, A_lw, A_lw, lsl #8
	orr	A_lw, A_lw, A_lw, lsl #16
	orr	A_l, A_l, A_l, lsl #32

	cmp	count, #15
	b.hi	.Lover16_proc
	/*
	 * count <= 15: bits 3..0 of count select an 8-, 4-, 2- and 1-byte
	 * store. Any of these stores may be unaligned.
	 */
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 2f
	str	A_lw, [dst], #4
2:
	tbz	count, #1, 3f
	strh	A_lw, [dst], #2
3:
	tbz	count, #0, 4f
	strb	A_lw, [dst]
4:
	ret

.Lover16_proc:
	/* tmp2 = bytes from dst up to the next 16-byte boundary (0..15). */
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	.Laligned
	/*
	 * count is at least 16 here, so a full 16-byte store at the
	 * unaligned start stays inside the buffer. dst is then advanced to
	 * the 16-byte boundary; the bytes between that boundary and the end
	 * of this store will simply be written a second time.
	 */
	stp	A_l, A_l, [dst]		/* Unaligned store. */
	/* Make dst 16-byte aligned. */
	sub	count, count, tmp2
	add	dst, dst, tmp2

.Laligned:
	cbz	A_l, .Lzero_mem		/* Fill value is zero: consider DC ZVA. */

.Ltail_maybe_long:
	cmp	count, #64
	b.ge	.Lnot_short
	/*
	 * Write the last 0..63 bytes. Only bits 5..0 of count are examined
	 * from here on, so this is also entered with the negative count left
	 * behind by the loop at .Lnot_short.
	 */
.Ltail63:
	ands	tmp1, count, #0x30	/* 0x00/0x10/0x20/0x30: whole 16-byte chunks. */
	b.eq	3f
	cmp	tmp1w, #0x20
	b.eq	1f			/* 0x20: two stores. */
	b.lt	2f			/* 0x10: one store. */
	stp	A_l, A_l, [dst], #16	/* 0x30: three stores. */
1:
	stp	A_l, A_l, [dst], #16
2:
	stp	A_l, A_l, [dst], #16
	/*
	 * Fewer than 16 bytes are left. Store the 16 bytes that end exactly
	 * at the end of the buffer: some bytes are written twice and the
	 * access is unaligned.
	 */
3:
	ands	count, count, #15
	cbz	count, 4f
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
4:
	ret

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lnot_short:
	/*
	 * Entered with count >= 64. dst is biased by -16 so that the last
	 * store of each iteration can advance it by 64 with pre-index
	 * writeback; count is biased by -64 so that the result of the subs
	 * tells whether another full 64 bytes remain.
	 */
	sub	dst, dst, #16		/* Pre-bias. */
	sub	count, count, #64
1:
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	stp	A_l, A_l, [dst, #48]
	stp	A_l, A_l, [dst, #64]!
	subs	count, count, #64
	b.ge	1b
	tst	count, #0x3f		/* Any remainder below 64 bytes? */
	add	dst, dst, #16		/* Undo the pre-bias; flags are unaffected. */
	b.ne	.Ltail63
.Lexitfunc:
	ret

	/*
	 * For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines.
	 */
.Lzero_mem:
	cmp	count, #63
	b.le	.Ltail63
	/*
	 * For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.
	 */
	cmp	count, #128
	b.lt	.Lnot_short
	/* count is at least 128 bytes. */

	mrs	tmp1, dczid_el0
	tbnz	tmp1, #4, .Lnot_short	/* Bit 4 set: do not use DC ZVA. */
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved. */
	lsl	zva_len, tmp3w, zva_len	/* zva_len = 4 << DCZID_EL0[3:0] bytes. */

	ands	tmp3w, zva_len, #63
	/*
	 * Ensure that zva_len is not less than 64.
	 * It is not meaningful to use ZVA if the block size is less than 64.
	 */
	b.ne	.Lnot_short
.Lzero_by_line:
	/*
	 * Compute how far we need to go to become suitably aligned. We're
	 * already at quad-word alignment.
	 */
	cmp	count, zva_len_x
	b.lt	.Lnot_short		/* Not enough to reach alignment. */
	sub	zva_bits_x, zva_len_x, #1	/* Mask of the offset within a block. */
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x	/* tmp2 = bytes up to the next block boundary. */
	b.eq	2f			/* Already aligned. */
	/* Not aligned: check that enough is left to zero after alignment. */
	sub	tmp1, count, tmp2
	/*
	 * Guarantee that the length left for ZVA is at least 64 and at least
	 * one ZVA block, so that neither the overrun of the alignment loop
	 * below nor the code at 2f goes beyond the buffer.
	 * If tmp1 < 64 the ccmp forces N=1, V=0, which makes b.lt taken.
	 */
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
	b.lt	.Lnot_short
	/*
	 * We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.
	 */
	mov	count, tmp1
1:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	1b
	/* We've overrun a bit (tmp2 is now negative), so adjust dst downwards. */
	add	dst, dst, tmp2
2:
	sub	count, count, zva_len_x	/* Pre-bias for the subs/b.ge below. */
3:
	dc	zva, dst
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
	/* Bytes left over after the last whole ZVA block (may exceed 63). */
	ands	count, count, zva_bits_x
	b.ne	.Ltail_maybe_long
	ret
SYM_FUNC_END_PI(memset)
EXPORT_SYMBOL(memset)
SYM_FUNC_END_ALIAS(__memset)
EXPORT_SYMBOL(__memset)