162306a36Sopenharmony_ci/*
262306a36Sopenharmony_ci * Copyright © 2016 Intel Corporation
362306a36Sopenharmony_ci *
462306a36Sopenharmony_ci * Permission is hereby granted, free of charge, to any person obtaining a
562306a36Sopenharmony_ci * copy of this software and associated documentation files (the "Software"),
662306a36Sopenharmony_ci * to deal in the Software without restriction, including without limitation
762306a36Sopenharmony_ci * the rights to use, copy, modify, merge, publish, distribute, sublicense,
862306a36Sopenharmony_ci * and/or sell copies of the Software, and to permit persons to whom the
962306a36Sopenharmony_ci * Software is furnished to do so, subject to the following conditions:
1062306a36Sopenharmony_ci *
1162306a36Sopenharmony_ci * The above copyright notice and this permission notice (including the next
1262306a36Sopenharmony_ci * paragraph) shall be included in all copies or substantial portions of the
1362306a36Sopenharmony_ci * Software.
1462306a36Sopenharmony_ci *
1562306a36Sopenharmony_ci * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1662306a36Sopenharmony_ci * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1762306a36Sopenharmony_ci * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
1862306a36Sopenharmony_ci * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1962306a36Sopenharmony_ci * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
2062306a36Sopenharmony_ci * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
2162306a36Sopenharmony_ci * IN THE SOFTWARE.
2262306a36Sopenharmony_ci *
2362306a36Sopenharmony_ci */
2462306a36Sopenharmony_ci
2562306a36Sopenharmony_ci#include <linux/kernel.h>
2662306a36Sopenharmony_ci#include <asm/fpu/api.h>
2762306a36Sopenharmony_ci
2862306a36Sopenharmony_ci#include "i915_memcpy.h"
2962306a36Sopenharmony_ci
/*
 * CI_BUG_ON() is a debug-only assertion: it expands to BUG_ON() when
 * CONFIG_DRM_I915_DEBUG is enabled and to BUILD_BUG_ON_INVALID() otherwise,
 * so the expression is still seen by the compiler in non-debug builds.
 */
#if IS_ENABLED(CONFIG_DRM_I915_DEBUG)
#define CI_BUG_ON(expr) BUG_ON(expr)
#else
#define CI_BUG_ON(expr) BUILD_BUG_ON_INVALID(expr)
#endif

/*
 * Static key gating the movntdqa copy path. It starts out false, is switched
 * on by i915_memcpy_init_early() and is tested by i915_memcpy_from_wc().
 */
static DEFINE_STATIC_KEY_FALSE(has_movntdqa);
3762306a36Sopenharmony_ci
3862306a36Sopenharmony_cistatic void __memcpy_ntdqa(void *dst, const void *src, unsigned long len)
3962306a36Sopenharmony_ci{
4062306a36Sopenharmony_ci	kernel_fpu_begin();
4162306a36Sopenharmony_ci
4262306a36Sopenharmony_ci	while (len >= 4) {
4362306a36Sopenharmony_ci		asm("movntdqa   (%0), %%xmm0\n"
4462306a36Sopenharmony_ci		    "movntdqa 16(%0), %%xmm1\n"
4562306a36Sopenharmony_ci		    "movntdqa 32(%0), %%xmm2\n"
4662306a36Sopenharmony_ci		    "movntdqa 48(%0), %%xmm3\n"
4762306a36Sopenharmony_ci		    "movaps %%xmm0,   (%1)\n"
4862306a36Sopenharmony_ci		    "movaps %%xmm1, 16(%1)\n"
4962306a36Sopenharmony_ci		    "movaps %%xmm2, 32(%1)\n"
5062306a36Sopenharmony_ci		    "movaps %%xmm3, 48(%1)\n"
5162306a36Sopenharmony_ci		    :: "r" (src), "r" (dst) : "memory");
5262306a36Sopenharmony_ci		src += 64;
5362306a36Sopenharmony_ci		dst += 64;
5462306a36Sopenharmony_ci		len -= 4;
5562306a36Sopenharmony_ci	}
5662306a36Sopenharmony_ci	while (len--) {
5762306a36Sopenharmony_ci		asm("movntdqa (%0), %%xmm0\n"
5862306a36Sopenharmony_ci		    "movaps %%xmm0, (%1)\n"
5962306a36Sopenharmony_ci		    :: "r" (src), "r" (dst) : "memory");
6062306a36Sopenharmony_ci		src += 16;
6162306a36Sopenharmony_ci		dst += 16;
6262306a36Sopenharmony_ci	}
6362306a36Sopenharmony_ci
6462306a36Sopenharmony_ci	kernel_fpu_end();
6562306a36Sopenharmony_ci}
6662306a36Sopenharmony_ci
6762306a36Sopenharmony_cistatic void __memcpy_ntdqu(void *dst, const void *src, unsigned long len)
6862306a36Sopenharmony_ci{
6962306a36Sopenharmony_ci	kernel_fpu_begin();
7062306a36Sopenharmony_ci
7162306a36Sopenharmony_ci	while (len >= 4) {
7262306a36Sopenharmony_ci		asm("movntdqa   (%0), %%xmm0\n"
7362306a36Sopenharmony_ci		    "movntdqa 16(%0), %%xmm1\n"
7462306a36Sopenharmony_ci		    "movntdqa 32(%0), %%xmm2\n"
7562306a36Sopenharmony_ci		    "movntdqa 48(%0), %%xmm3\n"
7662306a36Sopenharmony_ci		    "movups %%xmm0,   (%1)\n"
7762306a36Sopenharmony_ci		    "movups %%xmm1, 16(%1)\n"
7862306a36Sopenharmony_ci		    "movups %%xmm2, 32(%1)\n"
7962306a36Sopenharmony_ci		    "movups %%xmm3, 48(%1)\n"
8062306a36Sopenharmony_ci		    :: "r" (src), "r" (dst) : "memory");
8162306a36Sopenharmony_ci		src += 64;
8262306a36Sopenharmony_ci		dst += 64;
8362306a36Sopenharmony_ci		len -= 4;
8462306a36Sopenharmony_ci	}
8562306a36Sopenharmony_ci	while (len--) {
8662306a36Sopenharmony_ci		asm("movntdqa (%0), %%xmm0\n"
8762306a36Sopenharmony_ci		    "movups %%xmm0, (%1)\n"
8862306a36Sopenharmony_ci		    :: "r" (src), "r" (dst) : "memory");
8962306a36Sopenharmony_ci		src += 16;
9062306a36Sopenharmony_ci		dst += 16;
9162306a36Sopenharmony_ci	}
9262306a36Sopenharmony_ci
9362306a36Sopenharmony_ci	kernel_fpu_end();
9462306a36Sopenharmony_ci}
9562306a36Sopenharmony_ci
9662306a36Sopenharmony_ci/**
9762306a36Sopenharmony_ci * i915_memcpy_from_wc: perform an accelerated *aligned* read from WC
9862306a36Sopenharmony_ci * @dst: destination pointer
9962306a36Sopenharmony_ci * @src: source pointer
10062306a36Sopenharmony_ci * @len: how many bytes to copy
10162306a36Sopenharmony_ci *
10262306a36Sopenharmony_ci * i915_memcpy_from_wc copies @len bytes from @src to @dst using
10362306a36Sopenharmony_ci * non-temporal instructions where available. Note that all arguments
10462306a36Sopenharmony_ci * (@src, @dst) must be aligned to 16 bytes and @len must be a multiple
10562306a36Sopenharmony_ci * of 16.
10662306a36Sopenharmony_ci *
10762306a36Sopenharmony_ci * To test whether accelerated reads from WC are supported, use
10862306a36Sopenharmony_ci * i915_memcpy_from_wc(NULL, NULL, 0);
10962306a36Sopenharmony_ci *
11062306a36Sopenharmony_ci * Returns true if the copy was successful, false if the preconditions
11162306a36Sopenharmony_ci * are not met.
11262306a36Sopenharmony_ci */
11362306a36Sopenharmony_cibool i915_memcpy_from_wc(void *dst, const void *src, unsigned long len)
11462306a36Sopenharmony_ci{
11562306a36Sopenharmony_ci	if (unlikely(((unsigned long)dst | (unsigned long)src | len) & 15))
11662306a36Sopenharmony_ci		return false;
11762306a36Sopenharmony_ci
11862306a36Sopenharmony_ci	if (static_branch_likely(&has_movntdqa)) {
11962306a36Sopenharmony_ci		if (likely(len))
12062306a36Sopenharmony_ci			__memcpy_ntdqa(dst, src, len >> 4);
12162306a36Sopenharmony_ci		return true;
12262306a36Sopenharmony_ci	}
12362306a36Sopenharmony_ci
12462306a36Sopenharmony_ci	return false;
12562306a36Sopenharmony_ci}
12662306a36Sopenharmony_ci
/**
 * i915_unaligned_memcpy_from_wc: perform a mostly accelerated read from WC
 * @dst: destination pointer
 * @src: source pointer
 * @len: how many bytes to copy
 *
 * Like i915_memcpy_from_wc(), the unaligned variant copies @len bytes from
 * @src to @dst using non-temporal instructions where available, but
 * accepts that its arguments may not be aligned.
 *
 * The bytes in front of the first 16-byte boundary of @src are copied with
 * memcpy(); the rest is moved in whole 16-byte chunks. Because the chunk
 * count is rounded up, the final chunk may read up to 15 bytes past the
 * end of @src and store as many bytes past the end of @dst, so both
 * buffers must be valid for that overrun.
 */
void i915_unaligned_memcpy_from_wc(void *dst, const void *src, unsigned long len)
{
	/* Offset of @src within its 16-byte line; 0 means already aligned. */
	const unsigned long misalign = (unsigned long)src & 15;

	CI_BUG_ON(!i915_has_memcpy_from_wc());

	if (misalign) {
		/* Unaligned head: bytes up to the next boundary, capped by @len. */
		unsigned long head = min(16 - misalign, len);

		memcpy(dst, src, head);

		src += head;
		dst += head;
		len -= head;
	}

	if (unlikely(!len))
		return;

	/* @src is now aligned; a partial tail is rounded up to a full chunk. */
	__memcpy_ntdqu(dst, src, DIV_ROUND_UP(len, 16));
}
15862306a36Sopenharmony_ci
15962306a36Sopenharmony_civoid i915_memcpy_init_early(struct drm_i915_private *dev_priv)
16062306a36Sopenharmony_ci{
16162306a36Sopenharmony_ci	/*
16262306a36Sopenharmony_ci	 * Some hypervisors (e.g. KVM) don't support VEX-prefix instructions
16362306a36Sopenharmony_ci	 * emulation. So don't enable movntdqa in hypervisor guest.
16462306a36Sopenharmony_ci	 */
16562306a36Sopenharmony_ci	if (static_cpu_has(X86_FEATURE_XMM4_1) &&
16662306a36Sopenharmony_ci	    !boot_cpu_has(X86_FEATURE_HYPERVISOR))
16762306a36Sopenharmony_ci		static_branch_enable(&has_movntdqa);
16862306a36Sopenharmony_ci}
169