18c2ecf20Sopenharmony_ci/*
28c2ecf20Sopenharmony_ci * Copyright © 2016 Intel Corporation
38c2ecf20Sopenharmony_ci *
48c2ecf20Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
58c2ecf20Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
68c2ecf20Sopenharmony_ci * to deal in the Software without restriction, including without limitation
78c2ecf20Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense,
88c2ecf20Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the
98c2ecf20Sopenharmony_ci * Software is furnished to do so, subject to the following conditions:
108c2ecf20Sopenharmony_ci *
118c2ecf20Sopenharmony_ci * The above copyright notice and this permission notice (including the next
128c2ecf20Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the
138c2ecf20Sopenharmony_ci * Software.
148c2ecf20Sopenharmony_ci *
158c2ecf20Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
168c2ecf20Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
178c2ecf20Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
188c2ecf20Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
198c2ecf20Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
208c2ecf20Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
218c2ecf20Sopenharmony_ci * IN THE SOFTWARE.
228c2ecf20Sopenharmony_ci *
238c2ecf20Sopenharmony_ci */
248c2ecf20Sopenharmony_ci
258c2ecf20Sopenharmony_ci#include <linux/kernel.h>
268c2ecf20Sopenharmony_ci#include <asm/fpu/api.h>
278c2ecf20Sopenharmony_ci
288c2ecf20Sopenharmony_ci#include "i915_memcpy.h"
298c2ecf20Sopenharmony_ci
308c2ecf20Sopenharmony_ci#if IS_ENABLED(CONFIG_DRM_I915_DEBUG)
318c2ecf20Sopenharmony_ci#define CI_BUG_ON(expr) BUG_ON(expr)
328c2ecf20Sopenharmony_ci#else
338c2ecf20Sopenharmony_ci#define CI_BUG_ON(expr) BUILD_BUG_ON_INVALID(expr)
348c2ecf20Sopenharmony_ci#endif
358c2ecf20Sopenharmony_ci
368c2ecf20Sopenharmony_cistatic DEFINE_STATIC_KEY_FALSE(has_movntdqa);
378c2ecf20Sopenharmony_ci
388c2ecf20Sopenharmony_cistatic void __memcpy_ntdqa(void *dst, const void *src, unsigned long len)
398c2ecf20Sopenharmony_ci{
408c2ecf20Sopenharmony_ci	kernel_fpu_begin();
418c2ecf20Sopenharmony_ci
428c2ecf20Sopenharmony_ci	while (len >= 4) {
438c2ecf20Sopenharmony_ci		asm("movntdqa   (%0), %%xmm0\n"
448c2ecf20Sopenharmony_ci		    "movntdqa 16(%0), %%xmm1\n"
458c2ecf20Sopenharmony_ci		    "movntdqa 32(%0), %%xmm2\n"
468c2ecf20Sopenharmony_ci		    "movntdqa 48(%0), %%xmm3\n"
478c2ecf20Sopenharmony_ci		    "movaps %%xmm0,   (%1)\n"
488c2ecf20Sopenharmony_ci		    "movaps %%xmm1, 16(%1)\n"
498c2ecf20Sopenharmony_ci		    "movaps %%xmm2, 32(%1)\n"
508c2ecf20Sopenharmony_ci		    "movaps %%xmm3, 48(%1)\n"
518c2ecf20Sopenharmony_ci		    :: "r" (src), "r" (dst) : "memory");
528c2ecf20Sopenharmony_ci		src += 64;
538c2ecf20Sopenharmony_ci		dst += 64;
548c2ecf20Sopenharmony_ci		len -= 4;
558c2ecf20Sopenharmony_ci	}
568c2ecf20Sopenharmony_ci	while (len--) {
578c2ecf20Sopenharmony_ci		asm("movntdqa (%0), %%xmm0\n"
588c2ecf20Sopenharmony_ci		    "movaps %%xmm0, (%1)\n"
598c2ecf20Sopenharmony_ci		    :: "r" (src), "r" (dst) : "memory");
608c2ecf20Sopenharmony_ci		src += 16;
618c2ecf20Sopenharmony_ci		dst += 16;
628c2ecf20Sopenharmony_ci	}
638c2ecf20Sopenharmony_ci
648c2ecf20Sopenharmony_ci	kernel_fpu_end();
658c2ecf20Sopenharmony_ci}
668c2ecf20Sopenharmony_ci
678c2ecf20Sopenharmony_cistatic void __memcpy_ntdqu(void *dst, const void *src, unsigned long len)
688c2ecf20Sopenharmony_ci{
698c2ecf20Sopenharmony_ci	kernel_fpu_begin();
708c2ecf20Sopenharmony_ci
718c2ecf20Sopenharmony_ci	while (len >= 4) {
728c2ecf20Sopenharmony_ci		asm("movntdqa   (%0), %%xmm0\n"
738c2ecf20Sopenharmony_ci		    "movntdqa 16(%0), %%xmm1\n"
748c2ecf20Sopenharmony_ci		    "movntdqa 32(%0), %%xmm2\n"
758c2ecf20Sopenharmony_ci		    "movntdqa 48(%0), %%xmm3\n"
768c2ecf20Sopenharmony_ci		    "movups %%xmm0,   (%1)\n"
778c2ecf20Sopenharmony_ci		    "movups %%xmm1, 16(%1)\n"
788c2ecf20Sopenharmony_ci		    "movups %%xmm2, 32(%1)\n"
798c2ecf20Sopenharmony_ci		    "movups %%xmm3, 48(%1)\n"
808c2ecf20Sopenharmony_ci		    :: "r" (src), "r" (dst) : "memory");
818c2ecf20Sopenharmony_ci		src += 64;
828c2ecf20Sopenharmony_ci		dst += 64;
838c2ecf20Sopenharmony_ci		len -= 4;
848c2ecf20Sopenharmony_ci	}
858c2ecf20Sopenharmony_ci	while (len--) {
868c2ecf20Sopenharmony_ci		asm("movntdqa (%0), %%xmm0\n"
878c2ecf20Sopenharmony_ci		    "movups %%xmm0, (%1)\n"
888c2ecf20Sopenharmony_ci		    :: "r" (src), "r" (dst) : "memory");
898c2ecf20Sopenharmony_ci		src += 16;
908c2ecf20Sopenharmony_ci		dst += 16;
918c2ecf20Sopenharmony_ci	}
928c2ecf20Sopenharmony_ci
938c2ecf20Sopenharmony_ci	kernel_fpu_end();
948c2ecf20Sopenharmony_ci}
958c2ecf20Sopenharmony_ci
968c2ecf20Sopenharmony_ci/**
978c2ecf20Sopenharmony_ci * i915_memcpy_from_wc: perform an accelerated *aligned* read from WC
988c2ecf20Sopenharmony_ci * @dst: destination pointer
998c2ecf20Sopenharmony_ci * @src: source pointer
1008c2ecf20Sopenharmony_ci * @len: how many bytes to copy
1018c2ecf20Sopenharmony_ci *
1028c2ecf20Sopenharmony_ci * i915_memcpy_from_wc copies @len bytes from @src to @dst using
1038c2ecf20Sopenharmony_ci * non-temporal instructions where available. Note that all arguments
1048c2ecf20Sopenharmony_ci * (@src, @dst) must be aligned to 16 bytes and @len must be a multiple
1058c2ecf20Sopenharmony_ci * of 16.
1068c2ecf20Sopenharmony_ci *
1078c2ecf20Sopenharmony_ci * To test whether accelerated reads from WC are supported, use
1088c2ecf20Sopenharmony_ci * i915_memcpy_from_wc(NULL, NULL, 0);
1098c2ecf20Sopenharmony_ci *
1108c2ecf20Sopenharmony_ci * Returns true if the copy was successful, false if the preconditions
1118c2ecf20Sopenharmony_ci * are not met.
1128c2ecf20Sopenharmony_ci */
1138c2ecf20Sopenharmony_cibool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len)
1148c2ecf20Sopenharmony_ci{
1158c2ecf20Sopenharmony_ci	if (unlikely(((unsigned long)dst | (unsigned long)src | len) & 15))
1168c2ecf20Sopenharmony_ci		return false;
1178c2ecf20Sopenharmony_ci
1188c2ecf20Sopenharmony_ci	if (static_branch_likely(&has_movntdqa)) {
1198c2ecf20Sopenharmony_ci		if (likely(len))
1208c2ecf20Sopenharmony_ci			__memcpy_ntdqa(dst, src, len >> 4);
1218c2ecf20Sopenharmony_ci		return true;
1228c2ecf20Sopenharmony_ci	}
1238c2ecf20Sopenharmony_ci
1248c2ecf20Sopenharmony_ci	return false;
1258c2ecf20Sopenharmony_ci}
1268c2ecf20Sopenharmony_ci
/**
 * i915_unaligned_memcpy_from_wc: perform a mostly accelerated read from WC
 * @dst: destination pointer
 * @src: source pointer
 * @len: how many bytes to copy
 *
 * Like i915_memcpy_from_wc(), this copies @len bytes from @src to @dst
 * using non-temporal instructions, but neither pointer needs to be aligned.
 * Any bytes before the first 16-byte boundary of @src are copied with a
 * plain memcpy(); the rest is copied in whole 16-byte chunks.
 *
 * Because the chunk count is rounded up, the final chunk may extend up to
 * 15 bytes beyond @len: both buffers must be valid for that access, as the
 * extra bytes are read from @src and stored to @dst.
 *
 * Unlike i915_memcpy_from_wc() there is no fallback return value; the
 * caller must have checked i915_has_memcpy_from_wc() beforehand.
 */
void i915_unaligned_memcpy_from_wc(void *dst, void *src, unsigned long len)
{
	unsigned long start;

	CI_BUG_ON(!i915_has_memcpy_from_wc());

	start = (unsigned long)src;
	if (!IS_ALIGNED(start, 16)) {
		/* Bytes up to the next 16-byte boundary of src, capped at len. */
		unsigned long head = min(ALIGN(start, 16) - start, len);

		memcpy(dst, src, head);

		dst += head;
		src += head;
		len -= head;
	}

	if (unlikely(!len))
		return;

	/* src is now 16-byte aligned; hand over a count of 16-byte chunks. */
	__memcpy_ntdqu(dst, src, DIV_ROUND_UP(len, 16));
}
1588c2ecf20Sopenharmony_ci
1598c2ecf20Sopenharmony_civoid i915_memcpy_init_early(struct drm_i915_private *dev_priv)
1608c2ecf20Sopenharmony_ci{
1618c2ecf20Sopenharmony_ci	/*
1628c2ecf20Sopenharmony_ci	 * Some hypervisors (e.g. KVM) don't support VEX-prefix instructions
1638c2ecf20Sopenharmony_ci	 * emulation. So don't enable movntdqa in hypervisor guest.
1648c2ecf20Sopenharmony_ci	 */
1658c2ecf20Sopenharmony_ci	if (static_cpu_has(X86_FEATURE_XMM4_1) &&
1668c2ecf20Sopenharmony_ci	    !boot_cpu_has(X86_FEATURE_HYPERVISOR))
1678c2ecf20Sopenharmony_ci		static_branch_enable(&has_movntdqa);
1688c2ecf20Sopenharmony_ci}
169