// SPDX-License-Identifier: GPL-2.0-only
// Copyright (C) 2019-2020 Arm Ltd.

#include <linux/compiler.h>
#include <linux/kasan-checks.h>
#include <linux/kernel.h>

#include <net/checksum.h>

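/*
 * Add @data into @sum with end-around carry, i.e. a 64-bit one's-complement
 * addition: any carry out of bit 63 is wrapped back into bit 0.
 */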
static u64 accumulate(u64 sum, u64 data)
{
	sum += data;
	if (sum < data)
		sum += 1;
	return sum;
}

/*
 * We over-read the buffer and this makes KASAN unhappy. Instead, disable
 * instrumentation and call kasan explicitly.
 */
unsigned int __no_sanitize_address do_csum(const unsigned char *buff, int len)
{
	unsigned int offset, shift, sum;
	const u64 *ptr;
	u64 data, sum64 = 0;

	if (unlikely(len == 0))
		return 0;

	offset = (unsigned long)buff & 7;
	/*
	 * This is to all intents and purposes safe, since rounding down cannot
	 * result in a different page or cache line being accessed, and @buff
	 * should absolutely not be pointing to anything read-sensitive. We do,
	 * however, have to be careful not to piss off KASAN, which means using
	 * unchecked reads to accommodate the head and tail, for which we'll
	 * compensate with an explicit check up-front.
	 */
	kasan_check_read(buff, len);
	ptr = (u64 *)(buff - offset);
	len = len + offset - 8;
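	/*
	 * From here on, @len counts the bytes remaining beyond the first
	 * (dword-aligned, possibly partial) load below; it goes non-positive
	 * once only over-read tail bytes are left.
	 */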

	/*
	 * Head: zero out any excess leading bytes. Shifting back by the same
	 * amount should be at least as fast as any other way of handling the
	 * odd/even alignment, and means we can ignore it until the very end.
	 */
	shift = offset * 8;
	data = *ptr++;
	data = (data >> shift) << shift;

	/*
	 * Body: straightforward aligned loads from here on (the paired loads
	 * underlying the quadword type still only need dword alignment). The
	 * main loop strictly excludes the tail, so the second loop will always
	 * run at least once.
	 */
	while (unlikely(len > 64)) {
		__uint128_t tmp1, tmp2, tmp3, tmp4;

		tmp1 = *(__uint128_t *)ptr;
		tmp2 = *(__uint128_t *)(ptr + 2);
		tmp3 = *(__uint128_t *)(ptr + 4);
		tmp4 = *(__uint128_t *)(ptr + 6);

		len -= 64;
		ptr += 8;

		/* This is the "don't dump the carry flag into a GPR" idiom */
		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
		tmp2 += (tmp2 >> 64) | (tmp2 << 64);
		tmp3 += (tmp3 >> 64) | (tmp3 << 64);
		tmp4 += (tmp4 >> 64) | (tmp4 << 64);
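		/*
		 * Each fold above leaves the end-around-carry sum of a
		 * quadword's two dwords in its high 64 bits; the tree below
		 * then pairs those partial sums (and the running sum64)
		 * together with the same trick.
		 */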
		tmp1 = ((tmp1 >> 64) << 64) | (tmp2 >> 64);
		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
		tmp3 = ((tmp3 >> 64) << 64) | (tmp4 >> 64);
		tmp3 += (tmp3 >> 64) | (tmp3 << 64);
		tmp1 = ((tmp1 >> 64) << 64) | (tmp3 >> 64);
		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
		tmp1 = ((tmp1 >> 64) << 64) | sum64;
		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
		sum64 = tmp1 >> 64;
	}
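	/*
	 * Mop up 16 bytes at a time: fold in the dword deferred in @data,
	 * load the next quadword, accumulate its low dword and defer its
	 * high dword in @data for the next pass or the tail handling below.
	 */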
	while (len > 8) {
		__uint128_t tmp;

		sum64 = accumulate(sum64, data);
		tmp = *(__uint128_t *)ptr;

		len -= 16;
		ptr += 2;

		data = tmp >> 64;
		sum64 = accumulate(sum64, tmp);
	}
	if (len > 0) {
		sum64 = accumulate(sum64, data);
		data = *ptr;
		len -= 8;
	}
	/*
	 * Tail: zero any over-read bytes similarly to the head, again
	 * preserving odd/even alignment.
	 */
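	/* @len is now in [-7, 0]: clear the 8 * -@len over-read MSBs of @data */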
	shift = len * -8;
	data = (data << shift) >> shift;
	sum64 = accumulate(sum64, data);

	/* Finally, folding */
	sum64 += (sum64 >> 32) | (sum64 << 32);
	sum = sum64 >> 32;
	sum += (sum >> 16) | (sum << 16);
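	/*
	 * An odd start address means the data was accumulated shifted by one
	 * byte relative to the 16-bit checksum words; since one's-complement
	 * sums are byte-order agnostic, byte-swapping the folded result
	 * recovers the correct checksum.
	 */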
	if (offset & 1)
		return (u16)swab32(sum);

	return sum >> 16;
}

__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
			const struct in6_addr *daddr,
			__u32 len, __u8 proto, __wsum csum)
{
	__uint128_t src, dst;
	u64 sum = (__force u64)csum;

	src = *(const __uint128_t *)saddr->s6_addr;
	dst = *(const __uint128_t *)daddr->s6_addr;

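	/* Add the pseudo-header upper-layer length and next-header fields */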
	sum += (__force u32)htonl(len);
	sum += (u32)proto << 24;
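	/*
	 * Fold each 128-bit address with end-around carry; the 64-bit partial
	 * sum of each address lands in the high half of src/dst.
	 */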
	src += (src >> 64) | (src << 64);
	dst += (dst >> 64) | (dst << 64);

	sum = accumulate(sum, src >> 64);
	sum = accumulate(sum, dst >> 64);

	sum += ((sum >> 32) | (sum << 32));
	return csum_fold((__force __wsum)(sum >> 32));
}
EXPORT_SYMBOL(csum_ipv6_magic);