/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by
 * Linaro and can be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Fill in the buffer with character c (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - buf
 *	x1 - c
 *	x2 - n
 * Returns:
 *	x0 - buf
 *
 * Register roles (all scratch per AAPCS64; no callee-saved regs touched):
 *	dst      - running store pointer (dstin is preserved for the return)
 *	A_l      - the fill byte replicated into all 8 bytes
 *	zva_len  - DC ZVA block size in bytes (when the ZVA path is taken)
 */

dstin		.req	x0
val		.req	w1
count		.req	x2
tmp1		.req	x3
tmp1w		.req	w3
tmp2		.req	x4
tmp2w		.req	w4
zva_len_x	.req	x5
zva_len		.req	w5
zva_bits_x	.req	x6

A_l		.req	x7
A_lw		.req	w7
dst		.req	x8
tmp3w		.req	w9
tmp3		.req	x9

SYM_FUNC_START(__pi_memset)
	mov	dst, dstin	/* Preserve return value. */
	/* Broadcast the low byte of c into every byte of A_l. */
	and	A_lw, val, #255
	orr	A_lw, A_lw, A_lw, lsl #8
	orr	A_lw, A_lw, A_lw, lsl #16
	/* The 32-bit orr above zero-extended into A_l, so this fills x7. */
	orr	A_l, A_l, A_l, lsl #32

	cmp	count, #15
	b.hi	.Lover16_proc
	/*
	 * count <= 15: store the 8/4/2/1-byte pieces selected by count's
	 * bits. All of these stores may be unaligned.
	 */
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 2f
	str	A_lw, [dst], #4
2:
	tbz	count, #1, 3f
	strh	A_lw, [dst], #2
3:
	tbz	count, #0, 4f
	strb	A_lw, [dst]
4:
	ret

.Lover16_proc:
	/* Check whether the start address is 16-byte aligned. */
	neg	tmp2, dst
	ands	tmp2, tmp2, #15		/* tmp2 = bytes to next 16-byte boundary */
	b.eq	.Laligned
/*
 * The count is not less than 16, so we can use stp to store the first
 * 16 bytes, then advance dst to the next 16-byte boundary. Some of the
 * bytes written here are rewritten by the aligned code below.
 */
	stp	A_l, A_l, [dst]		/* Unaligned store of 16 bytes. */
	/* Make dst 16-byte aligned. */
	sub	count, count, tmp2
	add	dst, dst, tmp2

.Laligned:
	/* Zero fill can use the much faster DC ZVA path. */
	cbz	A_l, .Lzero_mem

.Ltail_maybe_long:
	cmp	count, #64
	b.ge	.Lnot_short
.Ltail63:
	/*
	 * 16 <= count < 64 remaining (dst 16-byte aligned): fall through
	 * 1-3 of the stp's below according to count's 0x30 bits.
	 */
	ands	tmp1, count, #0x30
	b.eq	3f
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst], #16
1:
	stp	A_l, A_l, [dst], #16
2:
	stp	A_l, A_l, [dst], #16
/*
 * The last store length is less than 16; use stp to write the last 16
 * bytes ending exactly at dst + count. This writes some bytes twice and
 * the access is unaligned, but saves a byte-by-byte tail loop.
 */
3:
	ands	count, count, #15
	cbz	count, 4f
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
4:
	ret

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lnot_short:
	sub	dst, dst, #16	/* Pre-bias for the [dst, #64]! writeback form. */
	sub	count, count, #64
1:
	/* Store 64 bytes per iteration; dst advances via the writeback. */
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	stp	A_l, A_l, [dst, #48]
	stp	A_l, A_l, [dst, #64]!
	subs	count, count, #64
	b.ge	1b
	tst	count, #0x3f		/* Any 0-63 byte tail left? */
	add	dst, dst, #16		/* Undo the pre-bias. */
	b.ne	.Ltail63
.Lexitfunc:
	ret

	/*
	 * For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines.
	 */
.Lzero_mem:
	cmp	count, #63
	b.le	.Ltail63
	/*
	 * For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.
	 */
	cmp	count, #128
	b.lt	.Lnot_short	/* count is at least 128 bytes below. */

	mrs	tmp1, dczid_el0		/* Block size and DZP (prohibit) flag. */
	tbnz	tmp1, #4, .Lnot_short	/* Bit 4 (DZP) set: DC ZVA prohibited. */
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved. */
	lsl	zva_len, tmp3w, zva_len	/* zva_len = 4 << BS, in bytes. */

	ands	tmp3w, zva_len, #63
	/*
	 * Ensure zva_len is a multiple of 64 (and hence >= 64).
	 * It is not meaningful to use ZVA if the block size is less than 64.
	 */
	b.ne	.Lnot_short
.Lzero_by_line:
	/*
	 * Compute how far we need to go to become suitably aligned. We're
	 * already at quad-word alignment.
	 */
	cmp	count, zva_len_x
	b.lt	.Lnot_short		/* Not enough to reach alignment. */
	sub	zva_bits_x, zva_len_x, #1
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x	/* tmp2 = bytes to a ZVA boundary. */
	b.eq	2f			/* Already aligned. */
	/* Not aligned, check that there's enough to copy after alignment. */
	sub	tmp1, count, tmp2
	/*
	 * Guarantee that the length remaining after alignment is at least
	 * max(64, zva_len), so the DC ZVA at 2f cannot run past the end of
	 * the buffer.
	 */
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
	b.lt	.Lnot_short
	/*
	 * We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.
	 */
	mov	count, tmp1
1:
	/*
	 * Zero 64 bytes at a time up to the ZVA boundary; may overrun the
	 * boundary by up to 64 bytes (tmp2 goes negative), fixed up below.
	 */
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	1b
	/* We've overrun a bit, so adjust dst downwards. */
	add	dst, dst, tmp2
2:
	sub	count, count, zva_len_x
3:
	dc	zva, dst		/* Zero one whole block. */
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
	ands	count, count, zva_bits_x	/* Sub-block tail remaining? */
	b.ne	.Ltail_maybe_long
	ret
SYM_FUNC_END(__pi_memset)

SYM_FUNC_ALIAS(__memset, __pi_memset)
EXPORT_SYMBOL(__memset)

SYM_FUNC_ALIAS_WEAK(memset, __pi_memset)
EXPORT_SYMBOL(memset)