162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Copyright (c) 2012, Microsoft Corporation.
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * Author:
662306a36Sopenharmony_ci *   K. Y. Srinivasan <kys@microsoft.com>
762306a36Sopenharmony_ci */
862306a36Sopenharmony_ci
962306a36Sopenharmony_ci#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
1062306a36Sopenharmony_ci
1162306a36Sopenharmony_ci#include <linux/cleanup.h>
1262306a36Sopenharmony_ci#include <linux/kernel.h>
1362306a36Sopenharmony_ci#include <linux/jiffies.h>
1462306a36Sopenharmony_ci#include <linux/mman.h>
1562306a36Sopenharmony_ci#include <linux/debugfs.h>
1662306a36Sopenharmony_ci#include <linux/delay.h>
1762306a36Sopenharmony_ci#include <linux/init.h>
1862306a36Sopenharmony_ci#include <linux/module.h>
1962306a36Sopenharmony_ci#include <linux/slab.h>
2062306a36Sopenharmony_ci#include <linux/kthread.h>
2162306a36Sopenharmony_ci#include <linux/completion.h>
2262306a36Sopenharmony_ci#include <linux/count_zeros.h>
2362306a36Sopenharmony_ci#include <linux/memory_hotplug.h>
2462306a36Sopenharmony_ci#include <linux/memory.h>
2562306a36Sopenharmony_ci#include <linux/notifier.h>
2662306a36Sopenharmony_ci#include <linux/percpu_counter.h>
2762306a36Sopenharmony_ci#include <linux/page_reporting.h>
2862306a36Sopenharmony_ci
2962306a36Sopenharmony_ci#include <linux/hyperv.h>
3062306a36Sopenharmony_ci#include <asm/hyperv-tlfs.h>
3162306a36Sopenharmony_ci
3262306a36Sopenharmony_ci#include <asm/mshyperv.h>
3362306a36Sopenharmony_ci
3462306a36Sopenharmony_ci#define CREATE_TRACE_POINTS
3562306a36Sopenharmony_ci#include "hv_trace_balloon.h"
3662306a36Sopenharmony_ci
3762306a36Sopenharmony_ci/*
3862306a36Sopenharmony_ci * We begin with definitions supporting the Dynamic Memory protocol
3962306a36Sopenharmony_ci * with the host.
4062306a36Sopenharmony_ci *
4162306a36Sopenharmony_ci * Begin protocol definitions.
4262306a36Sopenharmony_ci */
4362306a36Sopenharmony_ci
4462306a36Sopenharmony_ci
4562306a36Sopenharmony_ci
4662306a36Sopenharmony_ci/*
4762306a36Sopenharmony_ci * Protocol versions. The low word is the minor version, the high word the major
4862306a36Sopenharmony_ci * version.
4962306a36Sopenharmony_ci *
 * History:
 * Initial version 1.0
 * Changed to 0.1 on 2009/03/25
 * Changed to 0.2 on 2009/05/14
 * Changed to 0.3 on 2009/12/03
 * Changed to 1.0 on 2011/04/05
5662306a36Sopenharmony_ci */
5762306a36Sopenharmony_ci
/*
 * Helpers to build and take apart a 32-bit protocol version value.
 *
 * The major number occupies the high 16 bits and the minor number the low
 * 16 bits, so the minor mask must cover the whole low word (0xffff).
 * Major is converted to __u32 before shifting so that large major numbers
 * do not shift into the sign bit of an int.
 */
#define DYNMEM_MAKE_VERSION(Major, Minor) \
	((__u32)(((__u32)(Major) << 16) | (Minor)))
#define DYNMEM_MAJOR_VERSION(Version) ((__u32)(Version) >> 16)
#define DYNMEM_MINOR_VERSION(Version) ((__u32)(Version) & 0xffff)
6162306a36Sopenharmony_ci
/*
 * Protocol versions known to this file, encoded with DYNMEM_MAKE_VERSION()
 * (major in the high word, minor in the low word), together with aliases
 * that name them WIN7, WIN8 and WIN10.
 */
enum {
	DYNMEM_PROTOCOL_VERSION_1 = DYNMEM_MAKE_VERSION(0, 3),
	DYNMEM_PROTOCOL_VERSION_2 = DYNMEM_MAKE_VERSION(1, 0),
	DYNMEM_PROTOCOL_VERSION_3 = DYNMEM_MAKE_VERSION(2, 0),

	/* Aliases for the three versions above. */
	DYNMEM_PROTOCOL_VERSION_WIN7 = DYNMEM_PROTOCOL_VERSION_1,
	DYNMEM_PROTOCOL_VERSION_WIN8 = DYNMEM_PROTOCOL_VERSION_2,
	DYNMEM_PROTOCOL_VERSION_WIN10 = DYNMEM_PROTOCOL_VERSION_3,

	/* The newest version defined in this enum. */
	DYNMEM_PROTOCOL_VERSION_CURRENT = DYNMEM_PROTOCOL_VERSION_WIN10
};
7362306a36Sopenharmony_ci
7462306a36Sopenharmony_ci
7562306a36Sopenharmony_ci
7662306a36Sopenharmony_ci/*
7762306a36Sopenharmony_ci * Message Types
7862306a36Sopenharmony_ci */
7962306a36Sopenharmony_ci
/*
 * Identifiers of the dynamic memory messages, grouped by the protocol
 * version that introduced them.
 * NOTE(review): presumably these are the values carried in dm_header.type;
 * confirm against the message handlers.
 */
enum dm_message_type {
	/*
	 * Version 0.3
	 */
	DM_ERROR			= 0,
	DM_VERSION_REQUEST		= 1,
	DM_VERSION_RESPONSE		= 2,
	DM_CAPABILITIES_REPORT		= 3,
	DM_CAPABILITIES_RESPONSE	= 4,
	DM_STATUS_REPORT		= 5,
	DM_BALLOON_REQUEST		= 6,
	DM_BALLOON_RESPONSE		= 7,
	DM_UNBALLOON_REQUEST		= 8,
	DM_UNBALLOON_RESPONSE		= 9,
	DM_MEM_HOT_ADD_REQUEST		= 10,
	DM_MEM_HOT_ADD_RESPONSE		= 11,
	/* Highest message type defined for version 0.3. */
	DM_VERSION_03_MAX		= 11,
	/*
	 * Version 1.0.
	 */
	DM_INFO_MESSAGE			= 12,
	/* Highest message type defined for version 1.0. */
	DM_VERSION_1_MAX		= 12
};
10362306a36Sopenharmony_ci
10462306a36Sopenharmony_ci
10562306a36Sopenharmony_ci/*
10662306a36Sopenharmony_ci * Structures defining the dynamic memory management
10762306a36Sopenharmony_ci * protocol.
10862306a36Sopenharmony_ci */
10962306a36Sopenharmony_ci
/*
 * A protocol version, viewable either as separate 16-bit minor/major
 * numbers or as the single 32-bit value that overlays them.
 */
union dm_version {
	struct {
		__u16 minor_version;
		__u16 major_version;
	};
	__u32 version;
} __packed;
11762306a36Sopenharmony_ci
11862306a36Sopenharmony_ci
/*
 * Capability bits, accessible as individual bit-fields (cap_bits) or as
 * one 64-bit word (caps). Embedded in struct dm_capabilities.
 */
union dm_caps {
	struct {
		__u64 balloon:1;
		__u64 hot_add:1;
		/*
		 * To support guests that may have alignment
		 * limitations on hot-add, the guest can specify
		 * its alignment requirements; a value of n
		 * represents an alignment of 2^n in mega bytes.
		 */
		__u64 hot_add_alignment:4;
		/* Remaining 58 bits of the 64-bit word. */
		__u64 reservedz:58;
	} cap_bits;
	__u64 caps;
} __packed;
13462306a36Sopenharmony_ci
/*
 * A contiguous range of pages packed into 64 bits: a 40-bit starting PFN
 * and a 24-bit page count (finfo), or the same bits as one raw value
 * (page_range).
 */
union dm_mem_page_range {
	struct  {
		/*
		 * The PFN number of the first page in the range.
		 * 40 bits is the architectural limit of a PFN
		 * number for AMD64.
		 */
		__u64 start_page:40;
		/*
		 * The number of pages in the range.
		 */
		__u64 page_cnt:24;
	} finfo;
	__u64  page_range;
} __packed;
15062306a36Sopenharmony_ci
15162306a36Sopenharmony_ci
15262306a36Sopenharmony_ci
15362306a36Sopenharmony_ci/*
15462306a36Sopenharmony_ci * The header for all dynamic memory messages:
15562306a36Sopenharmony_ci *
15662306a36Sopenharmony_ci * type: Type of the message.
15762306a36Sopenharmony_ci * size: Size of the message in bytes; including the header.
15862306a36Sopenharmony_ci * trans_id: The guest is responsible for manufacturing this ID.
15962306a36Sopenharmony_ci */
16062306a36Sopenharmony_ci
/* Common 8-byte header placed at the start of every message below. */
struct dm_header {
	__u16 type;	/* type of the message */
	__u16 size;	/* size of the message in bytes, header included */
	__u32 trans_id;	/* transaction ID; manufactured by the guest */
} __packed;
16662306a36Sopenharmony_ci
16762306a36Sopenharmony_ci/*
16862306a36Sopenharmony_ci * A generic message format for dynamic memory.
16962306a36Sopenharmony_ci * Specific message formats are defined later in the file.
17062306a36Sopenharmony_ci */
17162306a36Sopenharmony_ci
/* Generic view of a message: the common header followed by opaque bytes. */
struct dm_message {
	struct dm_header hdr;
	__u8 data[]; /* enclosed message */
} __packed;
17662306a36Sopenharmony_ci
17762306a36Sopenharmony_ci
17862306a36Sopenharmony_ci/*
17962306a36Sopenharmony_ci * Specific message types supporting the dynamic memory protocol.
18062306a36Sopenharmony_ci */
18162306a36Sopenharmony_ci
18262306a36Sopenharmony_ci/*
18362306a36Sopenharmony_ci * Version negotiation message. Sent from the guest to the host.
18462306a36Sopenharmony_ci * The guest is free to try different versions until the host
18562306a36Sopenharmony_ci * accepts the version.
18662306a36Sopenharmony_ci *
18762306a36Sopenharmony_ci * dm_version: The protocol version requested.
18862306a36Sopenharmony_ci * is_last_attempt: If TRUE, this is the last version guest will request.
18962306a36Sopenharmony_ci * reservedz: Reserved field, set to zero.
19062306a36Sopenharmony_ci */
19162306a36Sopenharmony_ci
/* Guest-to-host version negotiation request. */
struct dm_version_request {
	struct dm_header hdr;
	union dm_version version;	/* the protocol version requested */
	__u32 is_last_attempt:1;	/* set on the guest's final attempt */
	__u32 reservedz:31;		/* reserved, set to zero */
} __packed;
19862306a36Sopenharmony_ci
19962306a36Sopenharmony_ci/*
20062306a36Sopenharmony_ci * Version response message; Host to Guest and indicates
20162306a36Sopenharmony_ci * if the host has accepted the version sent by the guest.
20262306a36Sopenharmony_ci *
20362306a36Sopenharmony_ci * is_accepted: If TRUE, host has accepted the version and the guest
20462306a36Sopenharmony_ci * should proceed to the next stage of the protocol. FALSE indicates that
20562306a36Sopenharmony_ci * guest should re-try with a different version.
20662306a36Sopenharmony_ci *
20762306a36Sopenharmony_ci * reservedz: Reserved field, set to zero.
20862306a36Sopenharmony_ci */
20962306a36Sopenharmony_ci
/* Host-to-guest answer to a struct dm_version_request. */
struct dm_version_response {
	struct dm_header hdr;
	__u64 is_accepted:1;	/* set when the host accepted the version */
	__u64 reservedz:63;	/* reserved, set to zero */
} __packed;
21562306a36Sopenharmony_ci
21662306a36Sopenharmony_ci/*
21762306a36Sopenharmony_ci * Message reporting capabilities. This is sent from the guest to the
21862306a36Sopenharmony_ci * host.
21962306a36Sopenharmony_ci */
22062306a36Sopenharmony_ci
/* Guest-to-host capabilities report. */
struct dm_capabilities {
	struct dm_header hdr;
	union dm_caps caps;	/* capability bits, see union dm_caps */
	/*
	 * NOTE(review): the meaning of the two fields below is not described
	 * in this part of the file; confirm against the code that fills in
	 * this message.
	 */
	__u64 min_page_cnt;
	__u64 max_page_number;
} __packed;
22762306a36Sopenharmony_ci
22862306a36Sopenharmony_ci/*
22962306a36Sopenharmony_ci * Response to the capabilities message. This is sent from the host to the
23062306a36Sopenharmony_ci * guest. This message notifies if the host has accepted the guest's
23162306a36Sopenharmony_ci * capabilities. If the host has not accepted, the guest must shutdown
23262306a36Sopenharmony_ci * the service.
23362306a36Sopenharmony_ci *
23462306a36Sopenharmony_ci * is_accepted: Indicates if the host has accepted guest's capabilities.
23562306a36Sopenharmony_ci * reservedz: Must be 0.
23662306a36Sopenharmony_ci */
23762306a36Sopenharmony_ci
/* Host-to-guest answer to a struct dm_capabilities report. */
struct dm_capabilities_resp_msg {
	struct dm_header hdr;
	__u64 is_accepted:1;	/* set when the host accepted the capabilities */
	__u64 reservedz:63;	/* must be 0 */
} __packed;
24362306a36Sopenharmony_ci
24462306a36Sopenharmony_ci/*
24562306a36Sopenharmony_ci * This message is used to report memory pressure from the guest.
24662306a36Sopenharmony_ci * This message is not part of any transaction and there is no
24762306a36Sopenharmony_ci * response to this message.
24862306a36Sopenharmony_ci *
24962306a36Sopenharmony_ci * num_avail: Available memory in pages.
25062306a36Sopenharmony_ci * num_committed: Committed memory in pages.
25162306a36Sopenharmony_ci * page_file_size: The accumulated size of all page files
25262306a36Sopenharmony_ci *		   in the system in pages.
25362306a36Sopenharmony_ci * zero_free: The number of zero and free pages.
25462306a36Sopenharmony_ci * page_file_writes: The writes to the page file in pages.
25562306a36Sopenharmony_ci * io_diff: An indicator of file cache efficiency or page file activity,
25662306a36Sopenharmony_ci *	    calculated as File Cache Page Fault Count - Page Read Count.
25762306a36Sopenharmony_ci *	    This value is in pages.
25862306a36Sopenharmony_ci *
25962306a36Sopenharmony_ci * Some of these metrics are Windows specific and fortunately
26062306a36Sopenharmony_ci * the algorithm on the host side that computes the guest memory
26162306a36Sopenharmony_ci * pressure only uses num_committed value.
26262306a36Sopenharmony_ci */
26362306a36Sopenharmony_ci
/* Guest memory pressure report; all quantities are expressed in pages. */
struct dm_status {
	struct dm_header hdr;
	__u64 num_avail;	/* available memory */
	__u64 num_committed;	/* committed memory */
	__u64 page_file_size;	/* accumulated size of all page files */
	__u64 zero_free;	/* number of zero and free pages */
	__u32 page_file_writes;	/* writes to the page file */
	__u32 io_diff;		/* file cache page faults minus page reads */
} __packed;
27362306a36Sopenharmony_ci
27462306a36Sopenharmony_ci
27562306a36Sopenharmony_ci/*
27662306a36Sopenharmony_ci * Message to ask the guest to allocate memory - balloon up message.
27762306a36Sopenharmony_ci * This message is sent from the host to the guest. The guest may not be
27862306a36Sopenharmony_ci * able to allocate as much memory as requested.
27962306a36Sopenharmony_ci *
28062306a36Sopenharmony_ci * num_pages: number of pages to allocate.
28162306a36Sopenharmony_ci */
28262306a36Sopenharmony_ci
/* Host-to-guest balloon-up request. */
struct dm_balloon {
	struct dm_header hdr;
	__u32 num_pages;	/* number of pages to allocate */
	__u32 reservedz;
} __packed;
28862306a36Sopenharmony_ci
28962306a36Sopenharmony_ci
/*
 * Balloon response message; this message is sent from the guest
 * to the host in response to the balloon message.
 *
 * reservedz: Reserved; must be set to zero.
 * more_pages: If FALSE, this is the last message of the transaction.
 * If TRUE there will be at least one more message from the guest.
 *
 * range_count: The number of ranges in the range array.
 *
 * range_array: An array of page ranges returned to the host.
 *
 */

struct dm_balloon_response {
	struct dm_header hdr;
	__u32 reservedz;
	__u32 more_pages:1;
	__u32 range_count:31;
	union dm_mem_page_range range_array[];
} __packed;
31162306a36Sopenharmony_ci
/*
 * Un-balloon message; this message is sent from the host
 * to the guest to give guest more memory.
 *
 * more_pages: If FALSE, this is the last message of the transaction.
 * If TRUE there will be at least one more message from the host.
 *
 * reservedz: Reserved; must be set to zero.
 *
 * range_count: The number of ranges in the range array.
 *
 * range_array: An array of page ranges being given back to the guest.
 *
 */

struct dm_unballoon_request {
	struct dm_header hdr;
	__u32 more_pages:1;
	__u32 reservedz:31;
	__u32 range_count;
	union dm_mem_page_range range_array[];
} __packed;
33462306a36Sopenharmony_ci
33562306a36Sopenharmony_ci/*
33662306a36Sopenharmony_ci * Un-balloon response message; this message is sent from the guest
33762306a36Sopenharmony_ci * to the host in response to an unballoon request.
33862306a36Sopenharmony_ci *
33962306a36Sopenharmony_ci */
34062306a36Sopenharmony_ci
/* Guest-to-host un-balloon response; consists of the header only. */
struct dm_unballoon_response {
	struct dm_header hdr;
} __packed;
34462306a36Sopenharmony_ci
34562306a36Sopenharmony_ci
34662306a36Sopenharmony_ci/*
34762306a36Sopenharmony_ci * Hot add request message. Message sent from the host to the guest.
34862306a36Sopenharmony_ci *
34962306a36Sopenharmony_ci * mem_range: Memory range to hot add.
35062306a36Sopenharmony_ci *
35162306a36Sopenharmony_ci */
35262306a36Sopenharmony_ci
/* Host-to-guest hot add request. */
struct dm_hot_add {
	struct dm_header hdr;
	union dm_mem_page_range range;	/* memory range to hot add */
} __packed;
35762306a36Sopenharmony_ci
35862306a36Sopenharmony_ci/*
35962306a36Sopenharmony_ci * Hot add response message.
36062306a36Sopenharmony_ci * This message is sent by the guest to report the status of a hot add request.
36162306a36Sopenharmony_ci * If page_count is less than the requested page count, then the host should
36262306a36Sopenharmony_ci * assume all further hot add requests will fail, since this indicates that
36362306a36Sopenharmony_ci * the guest has hit an upper physical memory barrier.
36462306a36Sopenharmony_ci *
36562306a36Sopenharmony_ci * Hot adds may also fail due to low resources; in this case, the guest must
36662306a36Sopenharmony_ci * not complete this message until the hot add can succeed, and the host must
36762306a36Sopenharmony_ci * not send a new hot add request until the response is sent.
36862306a36Sopenharmony_ci * If VSC fails to hot add memory DYNMEM_NUMBER_OF_UNSUCCESSFUL_HOTADD_ATTEMPTS
36962306a36Sopenharmony_ci * times it fails the request.
37062306a36Sopenharmony_ci *
37162306a36Sopenharmony_ci *
37262306a36Sopenharmony_ci * page_count: number of pages that were successfully hot added.
37362306a36Sopenharmony_ci *
37462306a36Sopenharmony_ci * result: result of the operation 1: success, 0: failure.
37562306a36Sopenharmony_ci *
37662306a36Sopenharmony_ci */
37762306a36Sopenharmony_ci
/* Guest-to-host status report for a hot add request. */
struct dm_hot_add_response {
	struct dm_header hdr;
	__u32 page_count;	/* pages that were successfully hot added */
	__u32 result;		/* 1: success, 0: failure */
} __packed;
38362306a36Sopenharmony_ci
38462306a36Sopenharmony_ci/*
38562306a36Sopenharmony_ci * Types of information sent from host to the guest.
38662306a36Sopenharmony_ci */
38762306a36Sopenharmony_ci
/* Kinds of information blob; used as the type of dm_info_header.type. */
enum dm_info_type {
	INFO_TYPE_MAX_PAGE_CNT = 0,
	MAX_INFO_TYPE	/* count of defined types, not a type itself */
};
39262306a36Sopenharmony_ci
39362306a36Sopenharmony_ci
39462306a36Sopenharmony_ci/*
39562306a36Sopenharmony_ci * Header for the information message.
39662306a36Sopenharmony_ci */
39762306a36Sopenharmony_ci
/* Header describing one information blob. */
struct dm_info_header {
	/*
	 * NOTE(review): the width of an enum is chosen by the compiler, so
	 * this field's size on the wire depends on that choice; confirm it
	 * matches what the host sends.
	 */
	enum dm_info_type type;
	/* NOTE(review): presumably the size in bytes of the data that follows; confirm. */
	__u32 data_size;
} __packed;
40262306a36Sopenharmony_ci
40362306a36Sopenharmony_ci/*
40462306a36Sopenharmony_ci * This message is sent from the host to the guest to pass
40562306a36Sopenharmony_ci * some relevant information (win8 addition).
40662306a36Sopenharmony_ci *
40762306a36Sopenharmony_ci * reserved: no used.
40862306a36Sopenharmony_ci * info_size: size of the information blob.
40962306a36Sopenharmony_ci * info: information blob.
41062306a36Sopenharmony_ci */
41162306a36Sopenharmony_ci
41262306a36Sopenharmony_cistruct dm_info_msg {
41362306a36Sopenharmony_ci	struct dm_header hdr;
41462306a36Sopenharmony_ci	__u32 reserved;
41562306a36Sopenharmony_ci	__u32 info_size;
41662306a36Sopenharmony_ci	__u8  info[];
41762306a36Sopenharmony_ci};
41862306a36Sopenharmony_ci
41962306a36Sopenharmony_ci/*
42062306a36Sopenharmony_ci * End protocol definitions.
42162306a36Sopenharmony_ci */
42262306a36Sopenharmony_ci
/*
 * State to manage hot adding memory into the guest.
 * The range start_pfn : end_pfn specifies the range
 * that the host has asked us to hot add. The range
 * start_pfn : ha_end_pfn specifies the range that we have
 * currently hot added. We hot add in multiples of 128M
 * chunks; it is possible that we may not be able to bring
 * online all the pages in the region. The range
 * covered_start_pfn:covered_end_pfn defines the pages that can
 * be brought online.
 */

struct hv_hotadd_state {
	/* Link in a list of regions (see hv_dynmem_device.ha_region_list). */
	struct list_head list;
	unsigned long start_pfn;
	unsigned long covered_start_pfn;
	unsigned long covered_end_pfn;
	unsigned long ha_end_pfn;
	unsigned long end_pfn;
	/*
	 * A list of gaps.
	 */
	struct list_head gap_list;
};
44762306a36Sopenharmony_ci
/*
 * One gap inside a hot-add region, linked on hv_hotadd_state.gap_list.
 * A PFN is inside the gap when start_pfn <= pfn < end_pfn.
 */
struct hv_hotadd_gap {
	struct list_head list;
	unsigned long start_pfn;
	unsigned long end_pfn;
};
45362306a36Sopenharmony_ci
/* A page count bundled with the work item for the balloon-up operation. */
struct balloon_state {
	__u32 num_pages;
	struct work_struct wrk;
};
45862306a36Sopenharmony_ci
/* Two page ranges bundled with the work item for the hot-add operation. */
struct hot_add_wrk {
	union dm_mem_page_range ha_page_range;
	union dm_mem_page_range ha_region_range;
	struct work_struct wrk;
};
46462306a36Sopenharmony_ci
46562306a36Sopenharmony_cistatic bool allow_hibernation;
46662306a36Sopenharmony_cistatic bool hot_add = true;
46762306a36Sopenharmony_cistatic bool do_hot_add;
46862306a36Sopenharmony_ci/*
46962306a36Sopenharmony_ci * Delay reporting memory pressure by
47062306a36Sopenharmony_ci * the specified number of seconds.
47162306a36Sopenharmony_ci */
47262306a36Sopenharmony_cistatic uint pressure_report_delay = 45;
47362306a36Sopenharmony_ciextern unsigned int page_reporting_order;
47462306a36Sopenharmony_ci#define HV_MAX_FAILURES	2
47562306a36Sopenharmony_ci
47662306a36Sopenharmony_ci/*
47762306a36Sopenharmony_ci * The last time we posted a pressure report to host.
47862306a36Sopenharmony_ci */
47962306a36Sopenharmony_cistatic unsigned long last_post_time;
48062306a36Sopenharmony_ci
48162306a36Sopenharmony_cistatic int hv_hypercall_multi_failure;
48262306a36Sopenharmony_ci
48362306a36Sopenharmony_cimodule_param(hot_add, bool, (S_IRUGO | S_IWUSR));
48462306a36Sopenharmony_ciMODULE_PARM_DESC(hot_add, "If set attempt memory hot_add");
48562306a36Sopenharmony_ci
48662306a36Sopenharmony_cimodule_param(pressure_report_delay, uint, (S_IRUGO | S_IWUSR));
48762306a36Sopenharmony_ciMODULE_PARM_DESC(pressure_report_delay, "Delay in secs in reporting pressure");
48862306a36Sopenharmony_cistatic atomic_t trans_id = ATOMIC_INIT(0);
48962306a36Sopenharmony_ci
49062306a36Sopenharmony_cistatic int dm_ring_size = VMBUS_RING_SIZE(16 * 1024);
49162306a36Sopenharmony_ci
49262306a36Sopenharmony_ci/*
49362306a36Sopenharmony_ci * Driver specific state.
49462306a36Sopenharmony_ci */
49562306a36Sopenharmony_ci
/* Values taken by hv_dynmem_device.state. */
enum hv_dm_state {
	DM_INITIALIZING = 0,
	DM_INITIALIZED,
	DM_BALLOON_UP,
	DM_BALLOON_DOWN,
	DM_HOT_ADD,
	DM_INIT_ERROR
};
50462306a36Sopenharmony_ci
50562306a36Sopenharmony_ci
/* Two static byte buffers, each HV_HYP_PAGE_SIZE bytes long. */
static __u8 recv_buffer[HV_HYP_PAGE_SIZE];
static __u8 balloon_up_send_buffer[HV_HYP_PAGE_SIZE];
/* Number of PAGE_SIZE pages in 2 MiB. */
#define PAGES_IN_2M (2 * 1024 * 1024 / PAGE_SIZE)
/* Number of PAGE_SIZE pages in 128 MiB. */
#define HA_CHUNK (128 * 1024 * 1024 / PAGE_SIZE)
51062306a36Sopenharmony_ci
/*
 * All driver state for the dynamic memory device. A single static
 * instance, dm_device, is defined right after this structure.
 */
struct hv_dynmem_device {
	struct hv_device *dev;
	/* One of the enum hv_dm_state values. */
	enum hv_dm_state state;
	struct completion host_event;
	struct completion config_event;

	/*
	 * Number of pages we have currently ballooned out.
	 */
	unsigned int num_pages_ballooned;
	/* Updated under ha_lock (see the comment on ha_lock below). */
	unsigned int num_pages_onlined;
	unsigned int num_pages_added;

	/*
	 * State to manage the ballooning (up) operation.
	 */
	struct balloon_state balloon_wrk;

	/*
	 * State to execute the "hot-add" operation.
	 */
	struct hot_add_wrk ha_wrk;

	/*
	 * This state tracks if the host has specified a hot-add
	 * region.
	 */
	bool host_specified_ha_region;

	/*
	 * State to synchronize hot-add.
	 */
	struct completion  ol_waitevent;
	/*
	 * This thread handles hot-add
	 * requests from the host as well as notifying
	 * the host with regards to memory pressure in
	 * the guest.
	 */
	struct task_struct *thread;

	/*
	 * Protects ha_region_list, num_pages_onlined counter and individual
	 * regions from ha_region_list.
	 */
	spinlock_t ha_lock;

	/*
	 * A list of hot-add regions.
	 */
	struct list_head ha_region_list;

	/*
	 * We start with the highest version we can support
	 * and downgrade based on the host; we save here the
	 * next version to try.
	 */
	__u32 next_version;

	/*
	 * The negotiated version agreed by host.
	 */
	__u32 version;

	struct page_reporting_dev_info pr_dev_info;

	/*
	 * Maximum number of pages that can be hot_add-ed
	 */
	__u64 max_dynamic_page_count;
};
58262306a36Sopenharmony_ci
/* The single, file-scope instance of the driver state. */
static struct hv_dynmem_device dm_device;

/* Forward declarations of static functions used before their definitions. */
static void post_status(struct hv_dynmem_device *dm);

static void enable_page_reporting(void);

static void disable_page_reporting(void);
59062306a36Sopenharmony_ci
59162306a36Sopenharmony_ci#ifdef CONFIG_MEMORY_HOTPLUG
59262306a36Sopenharmony_cistatic inline bool has_pfn_is_backed(struct hv_hotadd_state *has,
59362306a36Sopenharmony_ci				     unsigned long pfn)
59462306a36Sopenharmony_ci{
59562306a36Sopenharmony_ci	struct hv_hotadd_gap *gap;
59662306a36Sopenharmony_ci
59762306a36Sopenharmony_ci	/* The page is not backed. */
59862306a36Sopenharmony_ci	if ((pfn < has->covered_start_pfn) || (pfn >= has->covered_end_pfn))
59962306a36Sopenharmony_ci		return false;
60062306a36Sopenharmony_ci
60162306a36Sopenharmony_ci	/* Check for gaps. */
60262306a36Sopenharmony_ci	list_for_each_entry(gap, &has->gap_list, list) {
60362306a36Sopenharmony_ci		if ((pfn >= gap->start_pfn) && (pfn < gap->end_pfn))
60462306a36Sopenharmony_ci			return false;
60562306a36Sopenharmony_ci	}
60662306a36Sopenharmony_ci
60762306a36Sopenharmony_ci	return true;
60862306a36Sopenharmony_ci}
60962306a36Sopenharmony_ci
61062306a36Sopenharmony_cistatic unsigned long hv_page_offline_check(unsigned long start_pfn,
61162306a36Sopenharmony_ci					   unsigned long nr_pages)
61262306a36Sopenharmony_ci{
61362306a36Sopenharmony_ci	unsigned long pfn = start_pfn, count = 0;
61462306a36Sopenharmony_ci	struct hv_hotadd_state *has;
61562306a36Sopenharmony_ci	bool found;
61662306a36Sopenharmony_ci
61762306a36Sopenharmony_ci	while (pfn < start_pfn + nr_pages) {
61862306a36Sopenharmony_ci		/*
61962306a36Sopenharmony_ci		 * Search for HAS which covers the pfn and when we find one
62062306a36Sopenharmony_ci		 * count how many consequitive PFNs are covered.
62162306a36Sopenharmony_ci		 */
62262306a36Sopenharmony_ci		found = false;
62362306a36Sopenharmony_ci		list_for_each_entry(has, &dm_device.ha_region_list, list) {
62462306a36Sopenharmony_ci			while ((pfn >= has->start_pfn) &&
62562306a36Sopenharmony_ci			       (pfn < has->end_pfn) &&
62662306a36Sopenharmony_ci			       (pfn < start_pfn + nr_pages)) {
62762306a36Sopenharmony_ci				found = true;
62862306a36Sopenharmony_ci				if (has_pfn_is_backed(has, pfn))
62962306a36Sopenharmony_ci					count++;
63062306a36Sopenharmony_ci				pfn++;
63162306a36Sopenharmony_ci			}
63262306a36Sopenharmony_ci		}
63362306a36Sopenharmony_ci
63462306a36Sopenharmony_ci		/*
63562306a36Sopenharmony_ci		 * This PFN is not in any HAS (e.g. we're offlining a region
63662306a36Sopenharmony_ci		 * which was present at boot), no need to account for it. Go
63762306a36Sopenharmony_ci		 * to the next one.
63862306a36Sopenharmony_ci		 */
63962306a36Sopenharmony_ci		if (!found)
64062306a36Sopenharmony_ci			pfn++;
64162306a36Sopenharmony_ci	}
64262306a36Sopenharmony_ci
64362306a36Sopenharmony_ci	return count;
64462306a36Sopenharmony_ci}
64562306a36Sopenharmony_ci
64662306a36Sopenharmony_cistatic int hv_memory_notifier(struct notifier_block *nb, unsigned long val,
64762306a36Sopenharmony_ci			      void *v)
64862306a36Sopenharmony_ci{
64962306a36Sopenharmony_ci	struct memory_notify *mem = (struct memory_notify *)v;
65062306a36Sopenharmony_ci	unsigned long pfn_count;
65162306a36Sopenharmony_ci
65262306a36Sopenharmony_ci	switch (val) {
65362306a36Sopenharmony_ci	case MEM_ONLINE:
65462306a36Sopenharmony_ci	case MEM_CANCEL_ONLINE:
65562306a36Sopenharmony_ci		complete(&dm_device.ol_waitevent);
65662306a36Sopenharmony_ci		break;
65762306a36Sopenharmony_ci
65862306a36Sopenharmony_ci	case MEM_OFFLINE:
65962306a36Sopenharmony_ci		scoped_guard(spinlock_irqsave, &dm_device.ha_lock) {
66062306a36Sopenharmony_ci			pfn_count = hv_page_offline_check(mem->start_pfn,
66162306a36Sopenharmony_ci							  mem->nr_pages);
66262306a36Sopenharmony_ci			if (pfn_count <= dm_device.num_pages_onlined) {
66362306a36Sopenharmony_ci				dm_device.num_pages_onlined -= pfn_count;
66462306a36Sopenharmony_ci			} else {
66562306a36Sopenharmony_ci				/*
66662306a36Sopenharmony_ci				 * We're offlining more pages than we
66762306a36Sopenharmony_ci				 * managed to online. This is
66862306a36Sopenharmony_ci				 * unexpected. In any case don't let
66962306a36Sopenharmony_ci				 * num_pages_onlined wrap around zero.
67062306a36Sopenharmony_ci				 */
67162306a36Sopenharmony_ci				WARN_ON_ONCE(1);
67262306a36Sopenharmony_ci				dm_device.num_pages_onlined = 0;
67362306a36Sopenharmony_ci			}
67462306a36Sopenharmony_ci		}
67562306a36Sopenharmony_ci		break;
67662306a36Sopenharmony_ci	case MEM_GOING_ONLINE:
67762306a36Sopenharmony_ci	case MEM_GOING_OFFLINE:
67862306a36Sopenharmony_ci	case MEM_CANCEL_OFFLINE:
67962306a36Sopenharmony_ci		break;
68062306a36Sopenharmony_ci	}
68162306a36Sopenharmony_ci	return NOTIFY_OK;
68262306a36Sopenharmony_ci}
68362306a36Sopenharmony_ci
68462306a36Sopenharmony_cistatic struct notifier_block hv_memory_nb = {
68562306a36Sopenharmony_ci	.notifier_call = hv_memory_notifier,
68662306a36Sopenharmony_ci	.priority = 0
68762306a36Sopenharmony_ci};
68862306a36Sopenharmony_ci
68962306a36Sopenharmony_ci/* Check if the particular page is backed and can be onlined and online it. */
69062306a36Sopenharmony_cistatic void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg)
69162306a36Sopenharmony_ci{
69262306a36Sopenharmony_ci	if (!has_pfn_is_backed(has, page_to_pfn(pg))) {
69362306a36Sopenharmony_ci		if (!PageOffline(pg))
69462306a36Sopenharmony_ci			__SetPageOffline(pg);
69562306a36Sopenharmony_ci		return;
69662306a36Sopenharmony_ci	}
69762306a36Sopenharmony_ci	if (PageOffline(pg))
69862306a36Sopenharmony_ci		__ClearPageOffline(pg);
69962306a36Sopenharmony_ci
70062306a36Sopenharmony_ci	/* This frame is currently backed; online the page. */
70162306a36Sopenharmony_ci	generic_online_page(pg, 0);
70262306a36Sopenharmony_ci
70362306a36Sopenharmony_ci	lockdep_assert_held(&dm_device.ha_lock);
70462306a36Sopenharmony_ci	dm_device.num_pages_onlined++;
70562306a36Sopenharmony_ci}
70662306a36Sopenharmony_ci
/*
 * Run hv_page_online_one() on a run of consecutive page frames.
 *
 * @has:       hot-add region the frames are checked against
 * @start_pfn: first page frame number of the run
 * @size:      number of page frames in the run
 *
 * hv_page_online_one() asserts that dm_device.ha_lock is held when it
 * updates the onlined-page counter, so callers are expected to hold it.
 */
static void hv_bring_pgs_online(struct hv_hotadd_state *has,
				unsigned long start_pfn, unsigned long size)
{
	/*
	 * The index has the same type as @size: a signed int index would be
	 * compared against (and added to) an unsigned long, and would
	 * overflow before reaching a count above INT_MAX.
	 */
	unsigned long i;

	pr_debug("Online %lu pages starting at pfn 0x%lx\n", size, start_pfn);
	for (i = 0; i < size; i++)
		hv_page_online_one(has, pfn_to_page(start_pfn + i));
}
71662306a36Sopenharmony_ci
71762306a36Sopenharmony_cistatic void hv_mem_hot_add(unsigned long start, unsigned long size,
71862306a36Sopenharmony_ci				unsigned long pfn_count,
71962306a36Sopenharmony_ci				struct hv_hotadd_state *has)
72062306a36Sopenharmony_ci{
72162306a36Sopenharmony_ci	int ret = 0;
72262306a36Sopenharmony_ci	int i, nid;
72362306a36Sopenharmony_ci	unsigned long start_pfn;
72462306a36Sopenharmony_ci	unsigned long processed_pfn;
72562306a36Sopenharmony_ci	unsigned long total_pfn = pfn_count;
72662306a36Sopenharmony_ci
72762306a36Sopenharmony_ci	for (i = 0; i < (size/HA_CHUNK); i++) {
72862306a36Sopenharmony_ci		start_pfn = start + (i * HA_CHUNK);
72962306a36Sopenharmony_ci
73062306a36Sopenharmony_ci		scoped_guard(spinlock_irqsave, &dm_device.ha_lock) {
73162306a36Sopenharmony_ci			has->ha_end_pfn +=  HA_CHUNK;
73262306a36Sopenharmony_ci
73362306a36Sopenharmony_ci			if (total_pfn > HA_CHUNK) {
73462306a36Sopenharmony_ci				processed_pfn = HA_CHUNK;
73562306a36Sopenharmony_ci				total_pfn -= HA_CHUNK;
73662306a36Sopenharmony_ci			} else {
73762306a36Sopenharmony_ci				processed_pfn = total_pfn;
73862306a36Sopenharmony_ci				total_pfn = 0;
73962306a36Sopenharmony_ci			}
74062306a36Sopenharmony_ci
74162306a36Sopenharmony_ci			has->covered_end_pfn +=  processed_pfn;
74262306a36Sopenharmony_ci		}
74362306a36Sopenharmony_ci
74462306a36Sopenharmony_ci		reinit_completion(&dm_device.ol_waitevent);
74562306a36Sopenharmony_ci
74662306a36Sopenharmony_ci		nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
74762306a36Sopenharmony_ci		ret = add_memory(nid, PFN_PHYS((start_pfn)),
74862306a36Sopenharmony_ci				(HA_CHUNK << PAGE_SHIFT), MHP_MERGE_RESOURCE);
74962306a36Sopenharmony_ci
75062306a36Sopenharmony_ci		if (ret) {
75162306a36Sopenharmony_ci			pr_err("hot_add memory failed error is %d\n", ret);
75262306a36Sopenharmony_ci			if (ret == -EEXIST) {
75362306a36Sopenharmony_ci				/*
75462306a36Sopenharmony_ci				 * This error indicates that the error
75562306a36Sopenharmony_ci				 * is not a transient failure. This is the
75662306a36Sopenharmony_ci				 * case where the guest's physical address map
75762306a36Sopenharmony_ci				 * precludes hot adding memory. Stop all further
75862306a36Sopenharmony_ci				 * memory hot-add.
75962306a36Sopenharmony_ci				 */
76062306a36Sopenharmony_ci				do_hot_add = false;
76162306a36Sopenharmony_ci			}
76262306a36Sopenharmony_ci			scoped_guard(spinlock_irqsave, &dm_device.ha_lock) {
76362306a36Sopenharmony_ci				has->ha_end_pfn -= HA_CHUNK;
76462306a36Sopenharmony_ci				has->covered_end_pfn -=  processed_pfn;
76562306a36Sopenharmony_ci			}
76662306a36Sopenharmony_ci			break;
76762306a36Sopenharmony_ci		}
76862306a36Sopenharmony_ci
76962306a36Sopenharmony_ci		/*
77062306a36Sopenharmony_ci		 * Wait for memory to get onlined. If the kernel onlined the
77162306a36Sopenharmony_ci		 * memory when adding it, this will return directly. Otherwise,
77262306a36Sopenharmony_ci		 * it will wait for user space to online the memory. This helps
77362306a36Sopenharmony_ci		 * to avoid adding memory faster than it is getting onlined. As
77462306a36Sopenharmony_ci		 * adding succeeded, it is ok to proceed even if the memory was
77562306a36Sopenharmony_ci		 * not onlined in time.
77662306a36Sopenharmony_ci		 */
77762306a36Sopenharmony_ci		wait_for_completion_timeout(&dm_device.ol_waitevent, 5 * HZ);
77862306a36Sopenharmony_ci		post_status(&dm_device);
77962306a36Sopenharmony_ci	}
78062306a36Sopenharmony_ci}
78162306a36Sopenharmony_ci
78262306a36Sopenharmony_cistatic void hv_online_page(struct page *pg, unsigned int order)
78362306a36Sopenharmony_ci{
78462306a36Sopenharmony_ci	struct hv_hotadd_state *has;
78562306a36Sopenharmony_ci	unsigned long pfn = page_to_pfn(pg);
78662306a36Sopenharmony_ci
78762306a36Sopenharmony_ci	guard(spinlock_irqsave)(&dm_device.ha_lock);
78862306a36Sopenharmony_ci	list_for_each_entry(has, &dm_device.ha_region_list, list) {
78962306a36Sopenharmony_ci		/* The page belongs to a different HAS. */
79062306a36Sopenharmony_ci		if ((pfn < has->start_pfn) ||
79162306a36Sopenharmony_ci				(pfn + (1UL << order) > has->end_pfn))
79262306a36Sopenharmony_ci			continue;
79362306a36Sopenharmony_ci
79462306a36Sopenharmony_ci		hv_bring_pgs_online(has, pfn, 1UL << order);
79562306a36Sopenharmony_ci		break;
79662306a36Sopenharmony_ci	}
79762306a36Sopenharmony_ci}
79862306a36Sopenharmony_ci
79962306a36Sopenharmony_cistatic int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
80062306a36Sopenharmony_ci{
80162306a36Sopenharmony_ci	struct hv_hotadd_state *has;
80262306a36Sopenharmony_ci	struct hv_hotadd_gap *gap;
80362306a36Sopenharmony_ci	unsigned long residual, new_inc;
80462306a36Sopenharmony_ci	int ret = 0;
80562306a36Sopenharmony_ci
80662306a36Sopenharmony_ci	guard(spinlock_irqsave)(&dm_device.ha_lock);
80762306a36Sopenharmony_ci	list_for_each_entry(has, &dm_device.ha_region_list, list) {
80862306a36Sopenharmony_ci		/*
80962306a36Sopenharmony_ci		 * If the pfn range we are dealing with is not in the current
81062306a36Sopenharmony_ci		 * "hot add block", move on.
81162306a36Sopenharmony_ci		 */
81262306a36Sopenharmony_ci		if (start_pfn < has->start_pfn || start_pfn >= has->end_pfn)
81362306a36Sopenharmony_ci			continue;
81462306a36Sopenharmony_ci
81562306a36Sopenharmony_ci		/*
81662306a36Sopenharmony_ci		 * If the current start pfn is not where the covered_end
81762306a36Sopenharmony_ci		 * is, create a gap and update covered_end_pfn.
81862306a36Sopenharmony_ci		 */
81962306a36Sopenharmony_ci		if (has->covered_end_pfn != start_pfn) {
82062306a36Sopenharmony_ci			gap = kzalloc(sizeof(struct hv_hotadd_gap), GFP_ATOMIC);
82162306a36Sopenharmony_ci			if (!gap) {
82262306a36Sopenharmony_ci				ret = -ENOMEM;
82362306a36Sopenharmony_ci				break;
82462306a36Sopenharmony_ci			}
82562306a36Sopenharmony_ci
82662306a36Sopenharmony_ci			INIT_LIST_HEAD(&gap->list);
82762306a36Sopenharmony_ci			gap->start_pfn = has->covered_end_pfn;
82862306a36Sopenharmony_ci			gap->end_pfn = start_pfn;
82962306a36Sopenharmony_ci			list_add_tail(&gap->list, &has->gap_list);
83062306a36Sopenharmony_ci
83162306a36Sopenharmony_ci			has->covered_end_pfn = start_pfn;
83262306a36Sopenharmony_ci		}
83362306a36Sopenharmony_ci
83462306a36Sopenharmony_ci		/*
83562306a36Sopenharmony_ci		 * If the current hot add-request extends beyond
83662306a36Sopenharmony_ci		 * our current limit; extend it.
83762306a36Sopenharmony_ci		 */
83862306a36Sopenharmony_ci		if ((start_pfn + pfn_cnt) > has->end_pfn) {
83962306a36Sopenharmony_ci			residual = (start_pfn + pfn_cnt - has->end_pfn);
84062306a36Sopenharmony_ci			/*
84162306a36Sopenharmony_ci			 * Extend the region by multiples of HA_CHUNK.
84262306a36Sopenharmony_ci			 */
84362306a36Sopenharmony_ci			new_inc = (residual / HA_CHUNK) * HA_CHUNK;
84462306a36Sopenharmony_ci			if (residual % HA_CHUNK)
84562306a36Sopenharmony_ci				new_inc += HA_CHUNK;
84662306a36Sopenharmony_ci
84762306a36Sopenharmony_ci			has->end_pfn += new_inc;
84862306a36Sopenharmony_ci		}
84962306a36Sopenharmony_ci
85062306a36Sopenharmony_ci		ret = 1;
85162306a36Sopenharmony_ci		break;
85262306a36Sopenharmony_ci	}
85362306a36Sopenharmony_ci
85462306a36Sopenharmony_ci	return ret;
85562306a36Sopenharmony_ci}
85662306a36Sopenharmony_ci
85762306a36Sopenharmony_cistatic unsigned long handle_pg_range(unsigned long pg_start,
85862306a36Sopenharmony_ci					unsigned long pg_count)
85962306a36Sopenharmony_ci{
86062306a36Sopenharmony_ci	unsigned long start_pfn = pg_start;
86162306a36Sopenharmony_ci	unsigned long pfn_cnt = pg_count;
86262306a36Sopenharmony_ci	unsigned long size;
86362306a36Sopenharmony_ci	struct hv_hotadd_state *has;
86462306a36Sopenharmony_ci	unsigned long pgs_ol = 0;
86562306a36Sopenharmony_ci	unsigned long old_covered_state;
86662306a36Sopenharmony_ci	unsigned long res = 0, flags;
86762306a36Sopenharmony_ci
86862306a36Sopenharmony_ci	pr_debug("Hot adding %lu pages starting at pfn 0x%lx.\n", pg_count,
86962306a36Sopenharmony_ci		pg_start);
87062306a36Sopenharmony_ci
87162306a36Sopenharmony_ci	spin_lock_irqsave(&dm_device.ha_lock, flags);
87262306a36Sopenharmony_ci	list_for_each_entry(has, &dm_device.ha_region_list, list) {
87362306a36Sopenharmony_ci		/*
87462306a36Sopenharmony_ci		 * If the pfn range we are dealing with is not in the current
87562306a36Sopenharmony_ci		 * "hot add block", move on.
87662306a36Sopenharmony_ci		 */
87762306a36Sopenharmony_ci		if (start_pfn < has->start_pfn || start_pfn >= has->end_pfn)
87862306a36Sopenharmony_ci			continue;
87962306a36Sopenharmony_ci
88062306a36Sopenharmony_ci		old_covered_state = has->covered_end_pfn;
88162306a36Sopenharmony_ci
88262306a36Sopenharmony_ci		if (start_pfn < has->ha_end_pfn) {
88362306a36Sopenharmony_ci			/*
88462306a36Sopenharmony_ci			 * This is the case where we are backing pages
88562306a36Sopenharmony_ci			 * in an already hot added region. Bring
88662306a36Sopenharmony_ci			 * these pages online first.
88762306a36Sopenharmony_ci			 */
88862306a36Sopenharmony_ci			pgs_ol = has->ha_end_pfn - start_pfn;
88962306a36Sopenharmony_ci			if (pgs_ol > pfn_cnt)
89062306a36Sopenharmony_ci				pgs_ol = pfn_cnt;
89162306a36Sopenharmony_ci
89262306a36Sopenharmony_ci			has->covered_end_pfn +=  pgs_ol;
89362306a36Sopenharmony_ci			pfn_cnt -= pgs_ol;
89462306a36Sopenharmony_ci			/*
89562306a36Sopenharmony_ci			 * Check if the corresponding memory block is already
89662306a36Sopenharmony_ci			 * online. It is possible to observe struct pages still
89762306a36Sopenharmony_ci			 * being uninitialized here so check section instead.
89862306a36Sopenharmony_ci			 * In case the section is online we need to bring the
89962306a36Sopenharmony_ci			 * rest of pfns (which were not backed previously)
90062306a36Sopenharmony_ci			 * online too.
90162306a36Sopenharmony_ci			 */
90262306a36Sopenharmony_ci			if (start_pfn > has->start_pfn &&
90362306a36Sopenharmony_ci			    online_section_nr(pfn_to_section_nr(start_pfn)))
90462306a36Sopenharmony_ci				hv_bring_pgs_online(has, start_pfn, pgs_ol);
90562306a36Sopenharmony_ci
90662306a36Sopenharmony_ci		}
90762306a36Sopenharmony_ci
90862306a36Sopenharmony_ci		if ((has->ha_end_pfn < has->end_pfn) && (pfn_cnt > 0)) {
90962306a36Sopenharmony_ci			/*
91062306a36Sopenharmony_ci			 * We have some residual hot add range
91162306a36Sopenharmony_ci			 * that needs to be hot added; hot add
91262306a36Sopenharmony_ci			 * it now. Hot add a multiple of
91362306a36Sopenharmony_ci			 * HA_CHUNK that fully covers the pages
91462306a36Sopenharmony_ci			 * we have.
91562306a36Sopenharmony_ci			 */
91662306a36Sopenharmony_ci			size = (has->end_pfn - has->ha_end_pfn);
91762306a36Sopenharmony_ci			if (pfn_cnt <= size) {
91862306a36Sopenharmony_ci				size = ((pfn_cnt / HA_CHUNK) * HA_CHUNK);
91962306a36Sopenharmony_ci				if (pfn_cnt % HA_CHUNK)
92062306a36Sopenharmony_ci					size += HA_CHUNK;
92162306a36Sopenharmony_ci			} else {
92262306a36Sopenharmony_ci				pfn_cnt = size;
92362306a36Sopenharmony_ci			}
92462306a36Sopenharmony_ci			spin_unlock_irqrestore(&dm_device.ha_lock, flags);
92562306a36Sopenharmony_ci			hv_mem_hot_add(has->ha_end_pfn, size, pfn_cnt, has);
92662306a36Sopenharmony_ci			spin_lock_irqsave(&dm_device.ha_lock, flags);
92762306a36Sopenharmony_ci		}
92862306a36Sopenharmony_ci		/*
92962306a36Sopenharmony_ci		 * If we managed to online any pages that were given to us,
93062306a36Sopenharmony_ci		 * we declare success.
93162306a36Sopenharmony_ci		 */
93262306a36Sopenharmony_ci		res = has->covered_end_pfn - old_covered_state;
93362306a36Sopenharmony_ci		break;
93462306a36Sopenharmony_ci	}
93562306a36Sopenharmony_ci	spin_unlock_irqrestore(&dm_device.ha_lock, flags);
93662306a36Sopenharmony_ci
93762306a36Sopenharmony_ci	return res;
93862306a36Sopenharmony_ci}
93962306a36Sopenharmony_ci
94062306a36Sopenharmony_cistatic unsigned long process_hot_add(unsigned long pg_start,
94162306a36Sopenharmony_ci					unsigned long pfn_cnt,
94262306a36Sopenharmony_ci					unsigned long rg_start,
94362306a36Sopenharmony_ci					unsigned long rg_size)
94462306a36Sopenharmony_ci{
94562306a36Sopenharmony_ci	struct hv_hotadd_state *ha_region = NULL;
94662306a36Sopenharmony_ci	int covered;
94762306a36Sopenharmony_ci
94862306a36Sopenharmony_ci	if (pfn_cnt == 0)
94962306a36Sopenharmony_ci		return 0;
95062306a36Sopenharmony_ci
95162306a36Sopenharmony_ci	if (!dm_device.host_specified_ha_region) {
95262306a36Sopenharmony_ci		covered = pfn_covered(pg_start, pfn_cnt);
95362306a36Sopenharmony_ci		if (covered < 0)
95462306a36Sopenharmony_ci			return 0;
95562306a36Sopenharmony_ci
95662306a36Sopenharmony_ci		if (covered)
95762306a36Sopenharmony_ci			goto do_pg_range;
95862306a36Sopenharmony_ci	}
95962306a36Sopenharmony_ci
96062306a36Sopenharmony_ci	/*
96162306a36Sopenharmony_ci	 * If the host has specified a hot-add range; deal with it first.
96262306a36Sopenharmony_ci	 */
96362306a36Sopenharmony_ci
96462306a36Sopenharmony_ci	if (rg_size != 0) {
96562306a36Sopenharmony_ci		ha_region = kzalloc(sizeof(struct hv_hotadd_state), GFP_KERNEL);
96662306a36Sopenharmony_ci		if (!ha_region)
96762306a36Sopenharmony_ci			return 0;
96862306a36Sopenharmony_ci
96962306a36Sopenharmony_ci		INIT_LIST_HEAD(&ha_region->list);
97062306a36Sopenharmony_ci		INIT_LIST_HEAD(&ha_region->gap_list);
97162306a36Sopenharmony_ci
97262306a36Sopenharmony_ci		ha_region->start_pfn = rg_start;
97362306a36Sopenharmony_ci		ha_region->ha_end_pfn = rg_start;
97462306a36Sopenharmony_ci		ha_region->covered_start_pfn = pg_start;
97562306a36Sopenharmony_ci		ha_region->covered_end_pfn = pg_start;
97662306a36Sopenharmony_ci		ha_region->end_pfn = rg_start + rg_size;
97762306a36Sopenharmony_ci
97862306a36Sopenharmony_ci		scoped_guard(spinlock_irqsave, &dm_device.ha_lock) {
97962306a36Sopenharmony_ci			list_add_tail(&ha_region->list, &dm_device.ha_region_list);
98062306a36Sopenharmony_ci		}
98162306a36Sopenharmony_ci	}
98262306a36Sopenharmony_ci
98362306a36Sopenharmony_cido_pg_range:
98462306a36Sopenharmony_ci	/*
98562306a36Sopenharmony_ci	 * Process the page range specified; bringing them
98662306a36Sopenharmony_ci	 * online if possible.
98762306a36Sopenharmony_ci	 */
98862306a36Sopenharmony_ci	return handle_pg_range(pg_start, pfn_cnt);
98962306a36Sopenharmony_ci}
99062306a36Sopenharmony_ci
99162306a36Sopenharmony_ci#endif
99262306a36Sopenharmony_ci
99362306a36Sopenharmony_cistatic void hot_add_req(struct work_struct *dummy)
99462306a36Sopenharmony_ci{
99562306a36Sopenharmony_ci	struct dm_hot_add_response resp;
99662306a36Sopenharmony_ci#ifdef CONFIG_MEMORY_HOTPLUG
99762306a36Sopenharmony_ci	unsigned long pg_start, pfn_cnt;
99862306a36Sopenharmony_ci	unsigned long rg_start, rg_sz;
99962306a36Sopenharmony_ci#endif
100062306a36Sopenharmony_ci	struct hv_dynmem_device *dm = &dm_device;
100162306a36Sopenharmony_ci
100262306a36Sopenharmony_ci	memset(&resp, 0, sizeof(struct dm_hot_add_response));
100362306a36Sopenharmony_ci	resp.hdr.type = DM_MEM_HOT_ADD_RESPONSE;
100462306a36Sopenharmony_ci	resp.hdr.size = sizeof(struct dm_hot_add_response);
100562306a36Sopenharmony_ci
100662306a36Sopenharmony_ci#ifdef CONFIG_MEMORY_HOTPLUG
100762306a36Sopenharmony_ci	pg_start = dm->ha_wrk.ha_page_range.finfo.start_page;
100862306a36Sopenharmony_ci	pfn_cnt = dm->ha_wrk.ha_page_range.finfo.page_cnt;
100962306a36Sopenharmony_ci
101062306a36Sopenharmony_ci	rg_start = dm->ha_wrk.ha_region_range.finfo.start_page;
101162306a36Sopenharmony_ci	rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt;
101262306a36Sopenharmony_ci
101362306a36Sopenharmony_ci	if ((rg_start == 0) && (!dm->host_specified_ha_region)) {
101462306a36Sopenharmony_ci		unsigned long region_size;
101562306a36Sopenharmony_ci		unsigned long region_start;
101662306a36Sopenharmony_ci
101762306a36Sopenharmony_ci		/*
101862306a36Sopenharmony_ci		 * The host has not specified the hot-add region.
101962306a36Sopenharmony_ci		 * Based on the hot-add page range being specified,
102062306a36Sopenharmony_ci		 * compute a hot-add region that can cover the pages
102162306a36Sopenharmony_ci		 * that need to be hot-added while ensuring the alignment
102262306a36Sopenharmony_ci		 * and size requirements of Linux as it relates to hot-add.
102362306a36Sopenharmony_ci		 */
102462306a36Sopenharmony_ci		region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK;
102562306a36Sopenharmony_ci		if (pfn_cnt % HA_CHUNK)
102662306a36Sopenharmony_ci			region_size += HA_CHUNK;
102762306a36Sopenharmony_ci
102862306a36Sopenharmony_ci		region_start = (pg_start / HA_CHUNK) * HA_CHUNK;
102962306a36Sopenharmony_ci
103062306a36Sopenharmony_ci		rg_start = region_start;
103162306a36Sopenharmony_ci		rg_sz = region_size;
103262306a36Sopenharmony_ci	}
103362306a36Sopenharmony_ci
103462306a36Sopenharmony_ci	if (do_hot_add)
103562306a36Sopenharmony_ci		resp.page_count = process_hot_add(pg_start, pfn_cnt,
103662306a36Sopenharmony_ci						rg_start, rg_sz);
103762306a36Sopenharmony_ci
103862306a36Sopenharmony_ci	dm->num_pages_added += resp.page_count;
103962306a36Sopenharmony_ci#endif
104062306a36Sopenharmony_ci	/*
104162306a36Sopenharmony_ci	 * The result field of the response structure has the
104262306a36Sopenharmony_ci	 * following semantics:
104362306a36Sopenharmony_ci	 *
104462306a36Sopenharmony_ci	 * 1. If all or some pages hot-added: Guest should return success.
104562306a36Sopenharmony_ci	 *
104662306a36Sopenharmony_ci	 * 2. If no pages could be hot-added:
104762306a36Sopenharmony_ci	 *
104862306a36Sopenharmony_ci	 * If the guest returns success, then the host
104962306a36Sopenharmony_ci	 * will not attempt any further hot-add operations. This
105062306a36Sopenharmony_ci	 * signifies a permanent failure.
105162306a36Sopenharmony_ci	 *
105262306a36Sopenharmony_ci	 * If the guest returns failure, then this failure will be
105362306a36Sopenharmony_ci	 * treated as a transient failure and the host may retry the
105462306a36Sopenharmony_ci	 * hot-add operation after some delay.
105562306a36Sopenharmony_ci	 */
105662306a36Sopenharmony_ci	if (resp.page_count > 0)
105762306a36Sopenharmony_ci		resp.result = 1;
105862306a36Sopenharmony_ci	else if (!do_hot_add)
105962306a36Sopenharmony_ci		resp.result = 1;
106062306a36Sopenharmony_ci	else
106162306a36Sopenharmony_ci		resp.result = 0;
106262306a36Sopenharmony_ci
106362306a36Sopenharmony_ci	if (!do_hot_add || resp.page_count == 0) {
106462306a36Sopenharmony_ci		if (!allow_hibernation)
106562306a36Sopenharmony_ci			pr_err("Memory hot add failed\n");
106662306a36Sopenharmony_ci		else
106762306a36Sopenharmony_ci			pr_info("Ignore hot-add request!\n");
106862306a36Sopenharmony_ci	}
106962306a36Sopenharmony_ci
107062306a36Sopenharmony_ci	dm->state = DM_INITIALIZED;
107162306a36Sopenharmony_ci	resp.hdr.trans_id = atomic_inc_return(&trans_id);
107262306a36Sopenharmony_ci	vmbus_sendpacket(dm->dev->channel, &resp,
107362306a36Sopenharmony_ci			sizeof(struct dm_hot_add_response),
107462306a36Sopenharmony_ci			(unsigned long)NULL,
107562306a36Sopenharmony_ci			VM_PKT_DATA_INBAND, 0);
107662306a36Sopenharmony_ci}
107762306a36Sopenharmony_ci
107862306a36Sopenharmony_cistatic void process_info(struct hv_dynmem_device *dm, struct dm_info_msg *msg)
107962306a36Sopenharmony_ci{
108062306a36Sopenharmony_ci	struct dm_info_header *info_hdr;
108162306a36Sopenharmony_ci
108262306a36Sopenharmony_ci	info_hdr = (struct dm_info_header *)msg->info;
108362306a36Sopenharmony_ci
108462306a36Sopenharmony_ci	switch (info_hdr->type) {
108562306a36Sopenharmony_ci	case INFO_TYPE_MAX_PAGE_CNT:
108662306a36Sopenharmony_ci		if (info_hdr->data_size == sizeof(__u64)) {
108762306a36Sopenharmony_ci			__u64 *max_page_count = (__u64 *)&info_hdr[1];
108862306a36Sopenharmony_ci
108962306a36Sopenharmony_ci			pr_info("Max. dynamic memory size: %llu MB\n",
109062306a36Sopenharmony_ci				(*max_page_count) >> (20 - HV_HYP_PAGE_SHIFT));
109162306a36Sopenharmony_ci			dm->max_dynamic_page_count = *max_page_count;
109262306a36Sopenharmony_ci		}
109362306a36Sopenharmony_ci
109462306a36Sopenharmony_ci		break;
109562306a36Sopenharmony_ci	default:
109662306a36Sopenharmony_ci		pr_warn("Received Unknown type: %d\n", info_hdr->type);
109762306a36Sopenharmony_ci	}
109862306a36Sopenharmony_ci}
109962306a36Sopenharmony_ci
110062306a36Sopenharmony_cistatic unsigned long compute_balloon_floor(void)
110162306a36Sopenharmony_ci{
110262306a36Sopenharmony_ci	unsigned long min_pages;
110362306a36Sopenharmony_ci	unsigned long nr_pages = totalram_pages();
110462306a36Sopenharmony_ci#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
110562306a36Sopenharmony_ci	/* Simple continuous piecewiese linear function:
110662306a36Sopenharmony_ci	 *  max MiB -> min MiB  gradient
110762306a36Sopenharmony_ci	 *       0         0
110862306a36Sopenharmony_ci	 *      16        16
110962306a36Sopenharmony_ci	 *      32        24
111062306a36Sopenharmony_ci	 *     128        72    (1/2)
111162306a36Sopenharmony_ci	 *     512       168    (1/4)
111262306a36Sopenharmony_ci	 *    2048       360    (1/8)
111362306a36Sopenharmony_ci	 *    8192       744    (1/16)
111462306a36Sopenharmony_ci	 *   32768      1512	(1/32)
111562306a36Sopenharmony_ci	 */
111662306a36Sopenharmony_ci	if (nr_pages < MB2PAGES(128))
111762306a36Sopenharmony_ci		min_pages = MB2PAGES(8) + (nr_pages >> 1);
111862306a36Sopenharmony_ci	else if (nr_pages < MB2PAGES(512))
111962306a36Sopenharmony_ci		min_pages = MB2PAGES(40) + (nr_pages >> 2);
112062306a36Sopenharmony_ci	else if (nr_pages < MB2PAGES(2048))
112162306a36Sopenharmony_ci		min_pages = MB2PAGES(104) + (nr_pages >> 3);
112262306a36Sopenharmony_ci	else if (nr_pages < MB2PAGES(8192))
112362306a36Sopenharmony_ci		min_pages = MB2PAGES(232) + (nr_pages >> 4);
112462306a36Sopenharmony_ci	else
112562306a36Sopenharmony_ci		min_pages = MB2PAGES(488) + (nr_pages >> 5);
112662306a36Sopenharmony_ci#undef MB2PAGES
112762306a36Sopenharmony_ci	return min_pages;
112862306a36Sopenharmony_ci}
112962306a36Sopenharmony_ci
113062306a36Sopenharmony_ci/*
113162306a36Sopenharmony_ci * Compute total committed memory pages
113262306a36Sopenharmony_ci */
113362306a36Sopenharmony_ci
113462306a36Sopenharmony_cistatic unsigned long get_pages_committed(struct hv_dynmem_device *dm)
113562306a36Sopenharmony_ci{
113662306a36Sopenharmony_ci	return vm_memory_committed() +
113762306a36Sopenharmony_ci		dm->num_pages_ballooned +
113862306a36Sopenharmony_ci		(dm->num_pages_added > dm->num_pages_onlined ?
113962306a36Sopenharmony_ci		 dm->num_pages_added - dm->num_pages_onlined : 0) +
114062306a36Sopenharmony_ci		compute_balloon_floor();
114162306a36Sopenharmony_ci}
114262306a36Sopenharmony_ci
114362306a36Sopenharmony_ci/*
 * Post our status as it relates to memory pressure to the
 * host. The host expects guests to post this status
 * periodically at 1 second intervals.
114762306a36Sopenharmony_ci *
114862306a36Sopenharmony_ci * The metrics specified in this protocol are very Windows
114962306a36Sopenharmony_ci * specific and so we cook up numbers here to convey our memory
115062306a36Sopenharmony_ci * pressure.
115162306a36Sopenharmony_ci */
115262306a36Sopenharmony_ci
115362306a36Sopenharmony_cistatic void post_status(struct hv_dynmem_device *dm)
115462306a36Sopenharmony_ci{
115562306a36Sopenharmony_ci	struct dm_status status;
115662306a36Sopenharmony_ci	unsigned long now = jiffies;
115762306a36Sopenharmony_ci	unsigned long last_post = last_post_time;
115862306a36Sopenharmony_ci	unsigned long num_pages_avail, num_pages_committed;
115962306a36Sopenharmony_ci
116062306a36Sopenharmony_ci	if (pressure_report_delay > 0) {
116162306a36Sopenharmony_ci		--pressure_report_delay;
116262306a36Sopenharmony_ci		return;
116362306a36Sopenharmony_ci	}
116462306a36Sopenharmony_ci
116562306a36Sopenharmony_ci	if (!time_after(now, (last_post_time + HZ)))
116662306a36Sopenharmony_ci		return;
116762306a36Sopenharmony_ci
116862306a36Sopenharmony_ci	memset(&status, 0, sizeof(struct dm_status));
116962306a36Sopenharmony_ci	status.hdr.type = DM_STATUS_REPORT;
117062306a36Sopenharmony_ci	status.hdr.size = sizeof(struct dm_status);
117162306a36Sopenharmony_ci	status.hdr.trans_id = atomic_inc_return(&trans_id);
117262306a36Sopenharmony_ci
117362306a36Sopenharmony_ci	/*
117462306a36Sopenharmony_ci	 * The host expects the guest to report free and committed memory.
117562306a36Sopenharmony_ci	 * Furthermore, the host expects the pressure information to include
117662306a36Sopenharmony_ci	 * the ballooned out pages. For a given amount of memory that we are
117762306a36Sopenharmony_ci	 * managing we need to compute a floor below which we should not
117862306a36Sopenharmony_ci	 * balloon. Compute this and add it to the pressure report.
117962306a36Sopenharmony_ci	 * We also need to report all offline pages (num_pages_added -
118062306a36Sopenharmony_ci	 * num_pages_onlined) as committed to the host, otherwise it can try
118162306a36Sopenharmony_ci	 * asking us to balloon them out.
118262306a36Sopenharmony_ci	 */
118362306a36Sopenharmony_ci	num_pages_avail = si_mem_available();
118462306a36Sopenharmony_ci	num_pages_committed = get_pages_committed(dm);
118562306a36Sopenharmony_ci
118662306a36Sopenharmony_ci	trace_balloon_status(num_pages_avail, num_pages_committed,
118762306a36Sopenharmony_ci			     vm_memory_committed(), dm->num_pages_ballooned,
118862306a36Sopenharmony_ci			     dm->num_pages_added, dm->num_pages_onlined);
118962306a36Sopenharmony_ci
119062306a36Sopenharmony_ci	/* Convert numbers of pages into numbers of HV_HYP_PAGEs. */
119162306a36Sopenharmony_ci	status.num_avail = num_pages_avail * NR_HV_HYP_PAGES_IN_PAGE;
119262306a36Sopenharmony_ci	status.num_committed = num_pages_committed * NR_HV_HYP_PAGES_IN_PAGE;
119362306a36Sopenharmony_ci
119462306a36Sopenharmony_ci	/*
119562306a36Sopenharmony_ci	 * If our transaction ID is no longer current, just don't
119662306a36Sopenharmony_ci	 * send the status. This can happen if we were interrupted
119762306a36Sopenharmony_ci	 * after we picked our transaction ID.
119862306a36Sopenharmony_ci	 */
119962306a36Sopenharmony_ci	if (status.hdr.trans_id != atomic_read(&trans_id))
120062306a36Sopenharmony_ci		return;
120162306a36Sopenharmony_ci
120262306a36Sopenharmony_ci	/*
120362306a36Sopenharmony_ci	 * If the last post time that we sampled has changed,
120462306a36Sopenharmony_ci	 * we have raced, don't post the status.
120562306a36Sopenharmony_ci	 */
120662306a36Sopenharmony_ci	if (last_post != last_post_time)
120762306a36Sopenharmony_ci		return;
120862306a36Sopenharmony_ci
120962306a36Sopenharmony_ci	last_post_time = jiffies;
121062306a36Sopenharmony_ci	vmbus_sendpacket(dm->dev->channel, &status,
121162306a36Sopenharmony_ci				sizeof(struct dm_status),
121262306a36Sopenharmony_ci				(unsigned long)NULL,
121362306a36Sopenharmony_ci				VM_PKT_DATA_INBAND, 0);
121462306a36Sopenharmony_ci
121562306a36Sopenharmony_ci}
121662306a36Sopenharmony_ci
121762306a36Sopenharmony_cistatic void free_balloon_pages(struct hv_dynmem_device *dm,
121862306a36Sopenharmony_ci			 union dm_mem_page_range *range_array)
121962306a36Sopenharmony_ci{
122062306a36Sopenharmony_ci	int num_pages = range_array->finfo.page_cnt;
122162306a36Sopenharmony_ci	__u64 start_frame = range_array->finfo.start_page;
122262306a36Sopenharmony_ci	struct page *pg;
122362306a36Sopenharmony_ci	int i;
122462306a36Sopenharmony_ci
122562306a36Sopenharmony_ci	for (i = 0; i < num_pages; i++) {
122662306a36Sopenharmony_ci		pg = pfn_to_page(i + start_frame);
122762306a36Sopenharmony_ci		__ClearPageOffline(pg);
122862306a36Sopenharmony_ci		__free_page(pg);
122962306a36Sopenharmony_ci		dm->num_pages_ballooned--;
123062306a36Sopenharmony_ci		adjust_managed_page_count(pg, 1);
123162306a36Sopenharmony_ci	}
123262306a36Sopenharmony_ci}
123362306a36Sopenharmony_ci
123462306a36Sopenharmony_ci
123562306a36Sopenharmony_ci
123662306a36Sopenharmony_cistatic unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm,
123762306a36Sopenharmony_ci					unsigned int num_pages,
123862306a36Sopenharmony_ci					struct dm_balloon_response *bl_resp,
123962306a36Sopenharmony_ci					int alloc_unit)
124062306a36Sopenharmony_ci{
124162306a36Sopenharmony_ci	unsigned int i, j;
124262306a36Sopenharmony_ci	struct page *pg;
124362306a36Sopenharmony_ci
124462306a36Sopenharmony_ci	for (i = 0; i < num_pages / alloc_unit; i++) {
124562306a36Sopenharmony_ci		if (bl_resp->hdr.size + sizeof(union dm_mem_page_range) >
124662306a36Sopenharmony_ci			HV_HYP_PAGE_SIZE)
124762306a36Sopenharmony_ci			return i * alloc_unit;
124862306a36Sopenharmony_ci
124962306a36Sopenharmony_ci		/*
125062306a36Sopenharmony_ci		 * We execute this code in a thread context. Furthermore,
125162306a36Sopenharmony_ci		 * we don't want the kernel to try too hard.
125262306a36Sopenharmony_ci		 */
125362306a36Sopenharmony_ci		pg = alloc_pages(GFP_HIGHUSER | __GFP_NORETRY |
125462306a36Sopenharmony_ci				__GFP_NOMEMALLOC | __GFP_NOWARN,
125562306a36Sopenharmony_ci				get_order(alloc_unit << PAGE_SHIFT));
125662306a36Sopenharmony_ci
125762306a36Sopenharmony_ci		if (!pg)
125862306a36Sopenharmony_ci			return i * alloc_unit;
125962306a36Sopenharmony_ci
126062306a36Sopenharmony_ci		dm->num_pages_ballooned += alloc_unit;
126162306a36Sopenharmony_ci
126262306a36Sopenharmony_ci		/*
126362306a36Sopenharmony_ci		 * If we allocatted 2M pages; split them so we
126462306a36Sopenharmony_ci		 * can free them in any order we get.
126562306a36Sopenharmony_ci		 */
126662306a36Sopenharmony_ci
126762306a36Sopenharmony_ci		if (alloc_unit != 1)
126862306a36Sopenharmony_ci			split_page(pg, get_order(alloc_unit << PAGE_SHIFT));
126962306a36Sopenharmony_ci
127062306a36Sopenharmony_ci		/* mark all pages offline */
127162306a36Sopenharmony_ci		for (j = 0; j < alloc_unit; j++) {
127262306a36Sopenharmony_ci			__SetPageOffline(pg + j);
127362306a36Sopenharmony_ci			adjust_managed_page_count(pg + j, -1);
127462306a36Sopenharmony_ci		}
127562306a36Sopenharmony_ci
127662306a36Sopenharmony_ci		bl_resp->range_count++;
127762306a36Sopenharmony_ci		bl_resp->range_array[i].finfo.start_page =
127862306a36Sopenharmony_ci			page_to_pfn(pg);
127962306a36Sopenharmony_ci		bl_resp->range_array[i].finfo.page_cnt = alloc_unit;
128062306a36Sopenharmony_ci		bl_resp->hdr.size += sizeof(union dm_mem_page_range);
128162306a36Sopenharmony_ci
128262306a36Sopenharmony_ci	}
128362306a36Sopenharmony_ci
128462306a36Sopenharmony_ci	return i * alloc_unit;
128562306a36Sopenharmony_ci}
128662306a36Sopenharmony_ci
128762306a36Sopenharmony_cistatic void balloon_up(struct work_struct *dummy)
128862306a36Sopenharmony_ci{
128962306a36Sopenharmony_ci	unsigned int num_pages = dm_device.balloon_wrk.num_pages;
129062306a36Sopenharmony_ci	unsigned int num_ballooned = 0;
129162306a36Sopenharmony_ci	struct dm_balloon_response *bl_resp;
129262306a36Sopenharmony_ci	int alloc_unit;
129362306a36Sopenharmony_ci	int ret;
129462306a36Sopenharmony_ci	bool done = false;
129562306a36Sopenharmony_ci	int i;
129662306a36Sopenharmony_ci	long avail_pages;
129762306a36Sopenharmony_ci	unsigned long floor;
129862306a36Sopenharmony_ci
129962306a36Sopenharmony_ci	/*
130062306a36Sopenharmony_ci	 * We will attempt 2M allocations. However, if we fail to
130162306a36Sopenharmony_ci	 * allocate 2M chunks, we will go back to PAGE_SIZE allocations.
130262306a36Sopenharmony_ci	 */
130362306a36Sopenharmony_ci	alloc_unit = PAGES_IN_2M;
130462306a36Sopenharmony_ci
130562306a36Sopenharmony_ci	avail_pages = si_mem_available();
130662306a36Sopenharmony_ci	floor = compute_balloon_floor();
130762306a36Sopenharmony_ci
130862306a36Sopenharmony_ci	/* Refuse to balloon below the floor. */
130962306a36Sopenharmony_ci	if (avail_pages < num_pages || avail_pages - num_pages < floor) {
131062306a36Sopenharmony_ci		pr_info("Balloon request will be partially fulfilled. %s\n",
131162306a36Sopenharmony_ci			avail_pages < num_pages ? "Not enough memory." :
131262306a36Sopenharmony_ci			"Balloon floor reached.");
131362306a36Sopenharmony_ci
131462306a36Sopenharmony_ci		num_pages = avail_pages > floor ? (avail_pages - floor) : 0;
131562306a36Sopenharmony_ci	}
131662306a36Sopenharmony_ci
131762306a36Sopenharmony_ci	while (!done) {
131862306a36Sopenharmony_ci		memset(balloon_up_send_buffer, 0, HV_HYP_PAGE_SIZE);
131962306a36Sopenharmony_ci		bl_resp = (struct dm_balloon_response *)balloon_up_send_buffer;
132062306a36Sopenharmony_ci		bl_resp->hdr.type = DM_BALLOON_RESPONSE;
132162306a36Sopenharmony_ci		bl_resp->hdr.size = sizeof(struct dm_balloon_response);
132262306a36Sopenharmony_ci		bl_resp->more_pages = 1;
132362306a36Sopenharmony_ci
132462306a36Sopenharmony_ci		num_pages -= num_ballooned;
132562306a36Sopenharmony_ci		num_ballooned = alloc_balloon_pages(&dm_device, num_pages,
132662306a36Sopenharmony_ci						    bl_resp, alloc_unit);
132762306a36Sopenharmony_ci
132862306a36Sopenharmony_ci		if (alloc_unit != 1 && num_ballooned == 0) {
132962306a36Sopenharmony_ci			alloc_unit = 1;
133062306a36Sopenharmony_ci			continue;
133162306a36Sopenharmony_ci		}
133262306a36Sopenharmony_ci
133362306a36Sopenharmony_ci		if (num_ballooned == 0 || num_ballooned == num_pages) {
133462306a36Sopenharmony_ci			pr_debug("Ballooned %u out of %u requested pages.\n",
133562306a36Sopenharmony_ci				num_pages, dm_device.balloon_wrk.num_pages);
133662306a36Sopenharmony_ci
133762306a36Sopenharmony_ci			bl_resp->more_pages = 0;
133862306a36Sopenharmony_ci			done = true;
133962306a36Sopenharmony_ci			dm_device.state = DM_INITIALIZED;
134062306a36Sopenharmony_ci		}
134162306a36Sopenharmony_ci
134262306a36Sopenharmony_ci		/*
134362306a36Sopenharmony_ci		 * We are pushing a lot of data through the channel;
134462306a36Sopenharmony_ci		 * deal with transient failures caused because of the
134562306a36Sopenharmony_ci		 * lack of space in the ring buffer.
134662306a36Sopenharmony_ci		 */
134762306a36Sopenharmony_ci
134862306a36Sopenharmony_ci		do {
134962306a36Sopenharmony_ci			bl_resp->hdr.trans_id = atomic_inc_return(&trans_id);
135062306a36Sopenharmony_ci			ret = vmbus_sendpacket(dm_device.dev->channel,
135162306a36Sopenharmony_ci						bl_resp,
135262306a36Sopenharmony_ci						bl_resp->hdr.size,
135362306a36Sopenharmony_ci						(unsigned long)NULL,
135462306a36Sopenharmony_ci						VM_PKT_DATA_INBAND, 0);
135562306a36Sopenharmony_ci
135662306a36Sopenharmony_ci			if (ret == -EAGAIN)
135762306a36Sopenharmony_ci				msleep(20);
135862306a36Sopenharmony_ci			post_status(&dm_device);
135962306a36Sopenharmony_ci		} while (ret == -EAGAIN);
136062306a36Sopenharmony_ci
136162306a36Sopenharmony_ci		if (ret) {
136262306a36Sopenharmony_ci			/*
136362306a36Sopenharmony_ci			 * Free up the memory we allocatted.
136462306a36Sopenharmony_ci			 */
136562306a36Sopenharmony_ci			pr_err("Balloon response failed\n");
136662306a36Sopenharmony_ci
136762306a36Sopenharmony_ci			for (i = 0; i < bl_resp->range_count; i++)
136862306a36Sopenharmony_ci				free_balloon_pages(&dm_device,
136962306a36Sopenharmony_ci						 &bl_resp->range_array[i]);
137062306a36Sopenharmony_ci
137162306a36Sopenharmony_ci			done = true;
137262306a36Sopenharmony_ci		}
137362306a36Sopenharmony_ci	}
137462306a36Sopenharmony_ci
137562306a36Sopenharmony_ci}
137662306a36Sopenharmony_ci
137762306a36Sopenharmony_cistatic void balloon_down(struct hv_dynmem_device *dm,
137862306a36Sopenharmony_ci			struct dm_unballoon_request *req)
137962306a36Sopenharmony_ci{
138062306a36Sopenharmony_ci	union dm_mem_page_range *range_array = req->range_array;
138162306a36Sopenharmony_ci	int range_count = req->range_count;
138262306a36Sopenharmony_ci	struct dm_unballoon_response resp;
138362306a36Sopenharmony_ci	int i;
138462306a36Sopenharmony_ci	unsigned int prev_pages_ballooned = dm->num_pages_ballooned;
138562306a36Sopenharmony_ci
138662306a36Sopenharmony_ci	for (i = 0; i < range_count; i++) {
138762306a36Sopenharmony_ci		free_balloon_pages(dm, &range_array[i]);
138862306a36Sopenharmony_ci		complete(&dm_device.config_event);
138962306a36Sopenharmony_ci	}
139062306a36Sopenharmony_ci
139162306a36Sopenharmony_ci	pr_debug("Freed %u ballooned pages.\n",
139262306a36Sopenharmony_ci		prev_pages_ballooned - dm->num_pages_ballooned);
139362306a36Sopenharmony_ci
139462306a36Sopenharmony_ci	if (req->more_pages == 1)
139562306a36Sopenharmony_ci		return;
139662306a36Sopenharmony_ci
139762306a36Sopenharmony_ci	memset(&resp, 0, sizeof(struct dm_unballoon_response));
139862306a36Sopenharmony_ci	resp.hdr.type = DM_UNBALLOON_RESPONSE;
139962306a36Sopenharmony_ci	resp.hdr.trans_id = atomic_inc_return(&trans_id);
140062306a36Sopenharmony_ci	resp.hdr.size = sizeof(struct dm_unballoon_response);
140162306a36Sopenharmony_ci
140262306a36Sopenharmony_ci	vmbus_sendpacket(dm_device.dev->channel, &resp,
140362306a36Sopenharmony_ci				sizeof(struct dm_unballoon_response),
140462306a36Sopenharmony_ci				(unsigned long)NULL,
140562306a36Sopenharmony_ci				VM_PKT_DATA_INBAND, 0);
140662306a36Sopenharmony_ci
140762306a36Sopenharmony_ci	dm->state = DM_INITIALIZED;
140862306a36Sopenharmony_ci}
140962306a36Sopenharmony_ci
141062306a36Sopenharmony_cistatic void balloon_onchannelcallback(void *context);
141162306a36Sopenharmony_ci
141262306a36Sopenharmony_cistatic int dm_thread_func(void *dm_dev)
141362306a36Sopenharmony_ci{
141462306a36Sopenharmony_ci	struct hv_dynmem_device *dm = dm_dev;
141562306a36Sopenharmony_ci
141662306a36Sopenharmony_ci	while (!kthread_should_stop()) {
141762306a36Sopenharmony_ci		wait_for_completion_interruptible_timeout(
141862306a36Sopenharmony_ci						&dm_device.config_event, 1*HZ);
141962306a36Sopenharmony_ci		/*
142062306a36Sopenharmony_ci		 * The host expects us to post information on the memory
142162306a36Sopenharmony_ci		 * pressure every second.
142262306a36Sopenharmony_ci		 */
142362306a36Sopenharmony_ci		reinit_completion(&dm_device.config_event);
142462306a36Sopenharmony_ci		post_status(dm);
142562306a36Sopenharmony_ci		/*
142662306a36Sopenharmony_ci		 * disable free page reporting if multiple hypercall
142762306a36Sopenharmony_ci		 * failure flag set. It is not done in the page_reporting
142862306a36Sopenharmony_ci		 * callback context as that causes a deadlock between
142962306a36Sopenharmony_ci		 * page_reporting_process() and page_reporting_unregister()
143062306a36Sopenharmony_ci		 */
143162306a36Sopenharmony_ci		if (hv_hypercall_multi_failure >= HV_MAX_FAILURES) {
143262306a36Sopenharmony_ci			pr_err("Multiple failures in cold memory discard hypercall, disabling page reporting\n");
143362306a36Sopenharmony_ci			disable_page_reporting();
143462306a36Sopenharmony_ci			/* Reset the flag after disabling reporting */
143562306a36Sopenharmony_ci			hv_hypercall_multi_failure = 0;
143662306a36Sopenharmony_ci		}
143762306a36Sopenharmony_ci	}
143862306a36Sopenharmony_ci
143962306a36Sopenharmony_ci	return 0;
144062306a36Sopenharmony_ci}
144162306a36Sopenharmony_ci
144262306a36Sopenharmony_ci
144362306a36Sopenharmony_cistatic void version_resp(struct hv_dynmem_device *dm,
144462306a36Sopenharmony_ci			struct dm_version_response *vresp)
144562306a36Sopenharmony_ci{
144662306a36Sopenharmony_ci	struct dm_version_request version_req;
144762306a36Sopenharmony_ci	int ret;
144862306a36Sopenharmony_ci
144962306a36Sopenharmony_ci	if (vresp->is_accepted) {
145062306a36Sopenharmony_ci		/*
145162306a36Sopenharmony_ci		 * We are done; wakeup the
145262306a36Sopenharmony_ci		 * context waiting for version
145362306a36Sopenharmony_ci		 * negotiation.
145462306a36Sopenharmony_ci		 */
145562306a36Sopenharmony_ci		complete(&dm->host_event);
145662306a36Sopenharmony_ci		return;
145762306a36Sopenharmony_ci	}
145862306a36Sopenharmony_ci	/*
145962306a36Sopenharmony_ci	 * If there are more versions to try, continue
146062306a36Sopenharmony_ci	 * with negotiations; if not
146162306a36Sopenharmony_ci	 * shutdown the service since we are not able
146262306a36Sopenharmony_ci	 * to negotiate a suitable version number
146362306a36Sopenharmony_ci	 * with the host.
146462306a36Sopenharmony_ci	 */
146562306a36Sopenharmony_ci	if (dm->next_version == 0)
146662306a36Sopenharmony_ci		goto version_error;
146762306a36Sopenharmony_ci
146862306a36Sopenharmony_ci	memset(&version_req, 0, sizeof(struct dm_version_request));
146962306a36Sopenharmony_ci	version_req.hdr.type = DM_VERSION_REQUEST;
147062306a36Sopenharmony_ci	version_req.hdr.size = sizeof(struct dm_version_request);
147162306a36Sopenharmony_ci	version_req.hdr.trans_id = atomic_inc_return(&trans_id);
147262306a36Sopenharmony_ci	version_req.version.version = dm->next_version;
147362306a36Sopenharmony_ci	dm->version = version_req.version.version;
147462306a36Sopenharmony_ci
147562306a36Sopenharmony_ci	/*
147662306a36Sopenharmony_ci	 * Set the next version to try in case current version fails.
147762306a36Sopenharmony_ci	 * Win7 protocol ought to be the last one to try.
147862306a36Sopenharmony_ci	 */
147962306a36Sopenharmony_ci	switch (version_req.version.version) {
148062306a36Sopenharmony_ci	case DYNMEM_PROTOCOL_VERSION_WIN8:
148162306a36Sopenharmony_ci		dm->next_version = DYNMEM_PROTOCOL_VERSION_WIN7;
148262306a36Sopenharmony_ci		version_req.is_last_attempt = 0;
148362306a36Sopenharmony_ci		break;
148462306a36Sopenharmony_ci	default:
148562306a36Sopenharmony_ci		dm->next_version = 0;
148662306a36Sopenharmony_ci		version_req.is_last_attempt = 1;
148762306a36Sopenharmony_ci	}
148862306a36Sopenharmony_ci
148962306a36Sopenharmony_ci	ret = vmbus_sendpacket(dm->dev->channel, &version_req,
149062306a36Sopenharmony_ci				sizeof(struct dm_version_request),
149162306a36Sopenharmony_ci				(unsigned long)NULL,
149262306a36Sopenharmony_ci				VM_PKT_DATA_INBAND, 0);
149362306a36Sopenharmony_ci
149462306a36Sopenharmony_ci	if (ret)
149562306a36Sopenharmony_ci		goto version_error;
149662306a36Sopenharmony_ci
149762306a36Sopenharmony_ci	return;
149862306a36Sopenharmony_ci
149962306a36Sopenharmony_civersion_error:
150062306a36Sopenharmony_ci	dm->state = DM_INIT_ERROR;
150162306a36Sopenharmony_ci	complete(&dm->host_event);
150262306a36Sopenharmony_ci}
150362306a36Sopenharmony_ci
150462306a36Sopenharmony_cistatic void cap_resp(struct hv_dynmem_device *dm,
150562306a36Sopenharmony_ci			struct dm_capabilities_resp_msg *cap_resp)
150662306a36Sopenharmony_ci{
150762306a36Sopenharmony_ci	if (!cap_resp->is_accepted) {
150862306a36Sopenharmony_ci		pr_err("Capabilities not accepted by host\n");
150962306a36Sopenharmony_ci		dm->state = DM_INIT_ERROR;
151062306a36Sopenharmony_ci	}
151162306a36Sopenharmony_ci	complete(&dm->host_event);
151262306a36Sopenharmony_ci}
151362306a36Sopenharmony_ci
151462306a36Sopenharmony_cistatic void balloon_onchannelcallback(void *context)
151562306a36Sopenharmony_ci{
151662306a36Sopenharmony_ci	struct hv_device *dev = context;
151762306a36Sopenharmony_ci	u32 recvlen;
151862306a36Sopenharmony_ci	u64 requestid;
151962306a36Sopenharmony_ci	struct dm_message *dm_msg;
152062306a36Sopenharmony_ci	struct dm_header *dm_hdr;
152162306a36Sopenharmony_ci	struct hv_dynmem_device *dm = hv_get_drvdata(dev);
152262306a36Sopenharmony_ci	struct dm_balloon *bal_msg;
152362306a36Sopenharmony_ci	struct dm_hot_add *ha_msg;
152462306a36Sopenharmony_ci	union dm_mem_page_range *ha_pg_range;
152562306a36Sopenharmony_ci	union dm_mem_page_range *ha_region;
152662306a36Sopenharmony_ci
152762306a36Sopenharmony_ci	memset(recv_buffer, 0, sizeof(recv_buffer));
152862306a36Sopenharmony_ci	vmbus_recvpacket(dev->channel, recv_buffer,
152962306a36Sopenharmony_ci			 HV_HYP_PAGE_SIZE, &recvlen, &requestid);
153062306a36Sopenharmony_ci
153162306a36Sopenharmony_ci	if (recvlen > 0) {
153262306a36Sopenharmony_ci		dm_msg = (struct dm_message *)recv_buffer;
153362306a36Sopenharmony_ci		dm_hdr = &dm_msg->hdr;
153462306a36Sopenharmony_ci
153562306a36Sopenharmony_ci		switch (dm_hdr->type) {
153662306a36Sopenharmony_ci		case DM_VERSION_RESPONSE:
153762306a36Sopenharmony_ci			version_resp(dm,
153862306a36Sopenharmony_ci				 (struct dm_version_response *)dm_msg);
153962306a36Sopenharmony_ci			break;
154062306a36Sopenharmony_ci
154162306a36Sopenharmony_ci		case DM_CAPABILITIES_RESPONSE:
154262306a36Sopenharmony_ci			cap_resp(dm,
154362306a36Sopenharmony_ci				 (struct dm_capabilities_resp_msg *)dm_msg);
154462306a36Sopenharmony_ci			break;
154562306a36Sopenharmony_ci
154662306a36Sopenharmony_ci		case DM_BALLOON_REQUEST:
154762306a36Sopenharmony_ci			if (allow_hibernation) {
154862306a36Sopenharmony_ci				pr_info("Ignore balloon-up request!\n");
154962306a36Sopenharmony_ci				break;
155062306a36Sopenharmony_ci			}
155162306a36Sopenharmony_ci
155262306a36Sopenharmony_ci			if (dm->state == DM_BALLOON_UP)
155362306a36Sopenharmony_ci				pr_warn("Currently ballooning\n");
155462306a36Sopenharmony_ci			bal_msg = (struct dm_balloon *)recv_buffer;
155562306a36Sopenharmony_ci			dm->state = DM_BALLOON_UP;
155662306a36Sopenharmony_ci			dm_device.balloon_wrk.num_pages = bal_msg->num_pages;
155762306a36Sopenharmony_ci			schedule_work(&dm_device.balloon_wrk.wrk);
155862306a36Sopenharmony_ci			break;
155962306a36Sopenharmony_ci
156062306a36Sopenharmony_ci		case DM_UNBALLOON_REQUEST:
156162306a36Sopenharmony_ci			if (allow_hibernation) {
156262306a36Sopenharmony_ci				pr_info("Ignore balloon-down request!\n");
156362306a36Sopenharmony_ci				break;
156462306a36Sopenharmony_ci			}
156562306a36Sopenharmony_ci
156662306a36Sopenharmony_ci			dm->state = DM_BALLOON_DOWN;
156762306a36Sopenharmony_ci			balloon_down(dm,
156862306a36Sopenharmony_ci				 (struct dm_unballoon_request *)recv_buffer);
156962306a36Sopenharmony_ci			break;
157062306a36Sopenharmony_ci
157162306a36Sopenharmony_ci		case DM_MEM_HOT_ADD_REQUEST:
157262306a36Sopenharmony_ci			if (dm->state == DM_HOT_ADD)
157362306a36Sopenharmony_ci				pr_warn("Currently hot-adding\n");
157462306a36Sopenharmony_ci			dm->state = DM_HOT_ADD;
157562306a36Sopenharmony_ci			ha_msg = (struct dm_hot_add *)recv_buffer;
157662306a36Sopenharmony_ci			if (ha_msg->hdr.size == sizeof(struct dm_hot_add)) {
157762306a36Sopenharmony_ci				/*
157862306a36Sopenharmony_ci				 * This is a normal hot-add request specifying
157962306a36Sopenharmony_ci				 * hot-add memory.
158062306a36Sopenharmony_ci				 */
158162306a36Sopenharmony_ci				dm->host_specified_ha_region = false;
158262306a36Sopenharmony_ci				ha_pg_range = &ha_msg->range;
158362306a36Sopenharmony_ci				dm->ha_wrk.ha_page_range = *ha_pg_range;
158462306a36Sopenharmony_ci				dm->ha_wrk.ha_region_range.page_range = 0;
158562306a36Sopenharmony_ci			} else {
158662306a36Sopenharmony_ci				/*
158762306a36Sopenharmony_ci				 * Host is specifying that we first hot-add
158862306a36Sopenharmony_ci				 * a region and then partially populate this
158962306a36Sopenharmony_ci				 * region.
159062306a36Sopenharmony_ci				 */
159162306a36Sopenharmony_ci				dm->host_specified_ha_region = true;
159262306a36Sopenharmony_ci				ha_pg_range = &ha_msg->range;
159362306a36Sopenharmony_ci				ha_region = &ha_pg_range[1];
159462306a36Sopenharmony_ci				dm->ha_wrk.ha_page_range = *ha_pg_range;
159562306a36Sopenharmony_ci				dm->ha_wrk.ha_region_range = *ha_region;
159662306a36Sopenharmony_ci			}
159762306a36Sopenharmony_ci			schedule_work(&dm_device.ha_wrk.wrk);
159862306a36Sopenharmony_ci			break;
159962306a36Sopenharmony_ci
160062306a36Sopenharmony_ci		case DM_INFO_MESSAGE:
160162306a36Sopenharmony_ci			process_info(dm, (struct dm_info_msg *)dm_msg);
160262306a36Sopenharmony_ci			break;
160362306a36Sopenharmony_ci
160462306a36Sopenharmony_ci		default:
160562306a36Sopenharmony_ci			pr_warn_ratelimited("Unhandled message: type: %d\n", dm_hdr->type);
160662306a36Sopenharmony_ci
160762306a36Sopenharmony_ci		}
160862306a36Sopenharmony_ci	}
160962306a36Sopenharmony_ci
161062306a36Sopenharmony_ci}
161162306a36Sopenharmony_ci
161262306a36Sopenharmony_ci#define HV_LARGE_REPORTING_ORDER	9
161362306a36Sopenharmony_ci#define HV_LARGE_REPORTING_LEN (HV_HYP_PAGE_SIZE << \
161462306a36Sopenharmony_ci		HV_LARGE_REPORTING_ORDER)
161562306a36Sopenharmony_cistatic int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info,
161662306a36Sopenharmony_ci		    struct scatterlist *sgl, unsigned int nents)
161762306a36Sopenharmony_ci{
161862306a36Sopenharmony_ci	unsigned long flags;
161962306a36Sopenharmony_ci	struct hv_memory_hint *hint;
162062306a36Sopenharmony_ci	int i, order;
162162306a36Sopenharmony_ci	u64 status;
162262306a36Sopenharmony_ci	struct scatterlist *sg;
162362306a36Sopenharmony_ci
162462306a36Sopenharmony_ci	WARN_ON_ONCE(nents > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES);
162562306a36Sopenharmony_ci	WARN_ON_ONCE(sgl->length < (HV_HYP_PAGE_SIZE << page_reporting_order));
162662306a36Sopenharmony_ci	local_irq_save(flags);
162762306a36Sopenharmony_ci	hint = *this_cpu_ptr(hyperv_pcpu_input_arg);
162862306a36Sopenharmony_ci	if (!hint) {
162962306a36Sopenharmony_ci		local_irq_restore(flags);
163062306a36Sopenharmony_ci		return -ENOSPC;
163162306a36Sopenharmony_ci	}
163262306a36Sopenharmony_ci
163362306a36Sopenharmony_ci	hint->type = HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD;
163462306a36Sopenharmony_ci	hint->reserved = 0;
163562306a36Sopenharmony_ci	for_each_sg(sgl, sg, nents, i) {
163662306a36Sopenharmony_ci		union hv_gpa_page_range *range;
163762306a36Sopenharmony_ci
163862306a36Sopenharmony_ci		range = &hint->ranges[i];
163962306a36Sopenharmony_ci		range->address_space = 0;
164062306a36Sopenharmony_ci		order = get_order(sg->length);
164162306a36Sopenharmony_ci		/*
164262306a36Sopenharmony_ci		 * Hyper-V expects the additional_pages field in the units
164362306a36Sopenharmony_ci		 * of one of these 3 sizes, 4Kbytes, 2Mbytes or 1Gbytes.
164462306a36Sopenharmony_ci		 * This is dictated by the values of the fields page.largesize
164562306a36Sopenharmony_ci		 * and page_size.
164662306a36Sopenharmony_ci		 * This code however, only uses 4Kbytes and 2Mbytes units
164762306a36Sopenharmony_ci		 * and not 1Gbytes unit.
164862306a36Sopenharmony_ci		 */
164962306a36Sopenharmony_ci
165062306a36Sopenharmony_ci		/* page reporting for pages 2MB or higher */
165162306a36Sopenharmony_ci		if (order >= HV_LARGE_REPORTING_ORDER ) {
165262306a36Sopenharmony_ci			range->page.largepage = 1;
165362306a36Sopenharmony_ci			range->page_size = HV_GPA_PAGE_RANGE_PAGE_SIZE_2MB;
165462306a36Sopenharmony_ci			range->base_large_pfn = page_to_hvpfn(
165562306a36Sopenharmony_ci					sg_page(sg)) >> HV_LARGE_REPORTING_ORDER;
165662306a36Sopenharmony_ci			range->page.additional_pages =
165762306a36Sopenharmony_ci				(sg->length / HV_LARGE_REPORTING_LEN) - 1;
165862306a36Sopenharmony_ci		} else {
165962306a36Sopenharmony_ci			/* Page reporting for pages below 2MB */
166062306a36Sopenharmony_ci			range->page.basepfn = page_to_hvpfn(sg_page(sg));
166162306a36Sopenharmony_ci			range->page.largepage = false;
166262306a36Sopenharmony_ci			range->page.additional_pages =
166362306a36Sopenharmony_ci				(sg->length / HV_HYP_PAGE_SIZE) - 1;
166462306a36Sopenharmony_ci		}
166562306a36Sopenharmony_ci
166662306a36Sopenharmony_ci	}
166762306a36Sopenharmony_ci
166862306a36Sopenharmony_ci	status = hv_do_rep_hypercall(HV_EXT_CALL_MEMORY_HEAT_HINT, nents, 0,
166962306a36Sopenharmony_ci				     hint, NULL);
167062306a36Sopenharmony_ci	local_irq_restore(flags);
167162306a36Sopenharmony_ci	if (!hv_result_success(status)) {
167262306a36Sopenharmony_ci
167362306a36Sopenharmony_ci		pr_err("Cold memory discard hypercall failed with status %llx\n",
167462306a36Sopenharmony_ci				status);
167562306a36Sopenharmony_ci		if (hv_hypercall_multi_failure > 0)
167662306a36Sopenharmony_ci			hv_hypercall_multi_failure++;
167762306a36Sopenharmony_ci
167862306a36Sopenharmony_ci		if (hv_result(status) == HV_STATUS_INVALID_PARAMETER) {
167962306a36Sopenharmony_ci			pr_err("Underlying Hyper-V does not support order less than 9. Hypercall failed\n");
168062306a36Sopenharmony_ci			pr_err("Defaulting to page_reporting_order %d\n",
168162306a36Sopenharmony_ci					pageblock_order);
168262306a36Sopenharmony_ci			page_reporting_order = pageblock_order;
168362306a36Sopenharmony_ci			hv_hypercall_multi_failure++;
168462306a36Sopenharmony_ci			return -EINVAL;
168562306a36Sopenharmony_ci		}
168662306a36Sopenharmony_ci
168762306a36Sopenharmony_ci		return -EINVAL;
168862306a36Sopenharmony_ci	}
168962306a36Sopenharmony_ci
169062306a36Sopenharmony_ci	return 0;
169162306a36Sopenharmony_ci}
169262306a36Sopenharmony_ci
169362306a36Sopenharmony_cistatic void enable_page_reporting(void)
169462306a36Sopenharmony_ci{
169562306a36Sopenharmony_ci	int ret;
169662306a36Sopenharmony_ci
169762306a36Sopenharmony_ci	if (!hv_query_ext_cap(HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT)) {
169862306a36Sopenharmony_ci		pr_debug("Cold memory discard hint not supported by Hyper-V\n");
169962306a36Sopenharmony_ci		return;
170062306a36Sopenharmony_ci	}
170162306a36Sopenharmony_ci
170262306a36Sopenharmony_ci	BUILD_BUG_ON(PAGE_REPORTING_CAPACITY > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES);
170362306a36Sopenharmony_ci	dm_device.pr_dev_info.report = hv_free_page_report;
170462306a36Sopenharmony_ci	/*
170562306a36Sopenharmony_ci	 * We let the page_reporting_order parameter decide the order
170662306a36Sopenharmony_ci	 * in the page_reporting code
170762306a36Sopenharmony_ci	 */
170862306a36Sopenharmony_ci	dm_device.pr_dev_info.order = 0;
170962306a36Sopenharmony_ci	ret = page_reporting_register(&dm_device.pr_dev_info);
171062306a36Sopenharmony_ci	if (ret < 0) {
171162306a36Sopenharmony_ci		dm_device.pr_dev_info.report = NULL;
171262306a36Sopenharmony_ci		pr_err("Failed to enable cold memory discard: %d\n", ret);
171362306a36Sopenharmony_ci	} else {
171462306a36Sopenharmony_ci		pr_info("Cold memory discard hint enabled with order %d\n",
171562306a36Sopenharmony_ci				page_reporting_order);
171662306a36Sopenharmony_ci	}
171762306a36Sopenharmony_ci}
171862306a36Sopenharmony_ci
171962306a36Sopenharmony_cistatic void disable_page_reporting(void)
172062306a36Sopenharmony_ci{
172162306a36Sopenharmony_ci	if (dm_device.pr_dev_info.report) {
172262306a36Sopenharmony_ci		page_reporting_unregister(&dm_device.pr_dev_info);
172362306a36Sopenharmony_ci		dm_device.pr_dev_info.report = NULL;
172462306a36Sopenharmony_ci	}
172562306a36Sopenharmony_ci}
172662306a36Sopenharmony_ci
172762306a36Sopenharmony_cistatic int ballooning_enabled(void)
172862306a36Sopenharmony_ci{
172962306a36Sopenharmony_ci	/*
173062306a36Sopenharmony_ci	 * Disable ballooning if the page size is not 4k (HV_HYP_PAGE_SIZE),
173162306a36Sopenharmony_ci	 * since currently it's unclear to us whether an unballoon request can
173262306a36Sopenharmony_ci	 * make sure all page ranges are guest page size aligned.
173362306a36Sopenharmony_ci	 */
173462306a36Sopenharmony_ci	if (PAGE_SIZE != HV_HYP_PAGE_SIZE) {
173562306a36Sopenharmony_ci		pr_info("Ballooning disabled because page size is not 4096 bytes\n");
173662306a36Sopenharmony_ci		return 0;
173762306a36Sopenharmony_ci	}
173862306a36Sopenharmony_ci
173962306a36Sopenharmony_ci	return 1;
174062306a36Sopenharmony_ci}
174162306a36Sopenharmony_ci
174262306a36Sopenharmony_cistatic int hot_add_enabled(void)
174362306a36Sopenharmony_ci{
174462306a36Sopenharmony_ci	/*
174562306a36Sopenharmony_ci	 * Disable hot add on ARM64, because we currently rely on
174662306a36Sopenharmony_ci	 * memory_add_physaddr_to_nid() to get a node id of a hot add range,
174762306a36Sopenharmony_ci	 * however ARM64's memory_add_physaddr_to_nid() always return 0 and
174862306a36Sopenharmony_ci	 * DM_MEM_HOT_ADD_REQUEST doesn't have the NUMA node information for
174962306a36Sopenharmony_ci	 * add_memory().
175062306a36Sopenharmony_ci	 */
175162306a36Sopenharmony_ci	if (IS_ENABLED(CONFIG_ARM64)) {
175262306a36Sopenharmony_ci		pr_info("Memory hot add disabled on ARM64\n");
175362306a36Sopenharmony_ci		return 0;
175462306a36Sopenharmony_ci	}
175562306a36Sopenharmony_ci
175662306a36Sopenharmony_ci	return 1;
175762306a36Sopenharmony_ci}
175862306a36Sopenharmony_ci
175962306a36Sopenharmony_cistatic int balloon_connect_vsp(struct hv_device *dev)
176062306a36Sopenharmony_ci{
176162306a36Sopenharmony_ci	struct dm_version_request version_req;
176262306a36Sopenharmony_ci	struct dm_capabilities cap_msg;
176362306a36Sopenharmony_ci	unsigned long t;
176462306a36Sopenharmony_ci	int ret;
176562306a36Sopenharmony_ci
176662306a36Sopenharmony_ci	/*
176762306a36Sopenharmony_ci	 * max_pkt_size should be large enough for one vmbus packet header plus
176862306a36Sopenharmony_ci	 * our receive buffer size. Hyper-V sends messages up to
176962306a36Sopenharmony_ci	 * HV_HYP_PAGE_SIZE bytes long on balloon channel.
177062306a36Sopenharmony_ci	 */
177162306a36Sopenharmony_ci	dev->channel->max_pkt_size = HV_HYP_PAGE_SIZE * 2;
177262306a36Sopenharmony_ci
177362306a36Sopenharmony_ci	ret = vmbus_open(dev->channel, dm_ring_size, dm_ring_size, NULL, 0,
177462306a36Sopenharmony_ci			 balloon_onchannelcallback, dev);
177562306a36Sopenharmony_ci	if (ret)
177662306a36Sopenharmony_ci		return ret;
177762306a36Sopenharmony_ci
177862306a36Sopenharmony_ci	/*
177962306a36Sopenharmony_ci	 * Initiate the hand shake with the host and negotiate
178062306a36Sopenharmony_ci	 * a version that the host can support. We start with the
178162306a36Sopenharmony_ci	 * highest version number and go down if the host cannot
178262306a36Sopenharmony_ci	 * support it.
178362306a36Sopenharmony_ci	 */
178462306a36Sopenharmony_ci	memset(&version_req, 0, sizeof(struct dm_version_request));
178562306a36Sopenharmony_ci	version_req.hdr.type = DM_VERSION_REQUEST;
178662306a36Sopenharmony_ci	version_req.hdr.size = sizeof(struct dm_version_request);
178762306a36Sopenharmony_ci	version_req.hdr.trans_id = atomic_inc_return(&trans_id);
178862306a36Sopenharmony_ci	version_req.version.version = DYNMEM_PROTOCOL_VERSION_WIN10;
178962306a36Sopenharmony_ci	version_req.is_last_attempt = 0;
179062306a36Sopenharmony_ci	dm_device.version = version_req.version.version;
179162306a36Sopenharmony_ci
179262306a36Sopenharmony_ci	ret = vmbus_sendpacket(dev->channel, &version_req,
179362306a36Sopenharmony_ci			       sizeof(struct dm_version_request),
179462306a36Sopenharmony_ci			       (unsigned long)NULL, VM_PKT_DATA_INBAND, 0);
179562306a36Sopenharmony_ci	if (ret)
179662306a36Sopenharmony_ci		goto out;
179762306a36Sopenharmony_ci
179862306a36Sopenharmony_ci	t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ);
179962306a36Sopenharmony_ci	if (t == 0) {
180062306a36Sopenharmony_ci		ret = -ETIMEDOUT;
180162306a36Sopenharmony_ci		goto out;
180262306a36Sopenharmony_ci	}
180362306a36Sopenharmony_ci
180462306a36Sopenharmony_ci	/*
180562306a36Sopenharmony_ci	 * If we could not negotiate a compatible version with the host
180662306a36Sopenharmony_ci	 * fail the probe function.
180762306a36Sopenharmony_ci	 */
180862306a36Sopenharmony_ci	if (dm_device.state == DM_INIT_ERROR) {
180962306a36Sopenharmony_ci		ret = -EPROTO;
181062306a36Sopenharmony_ci		goto out;
181162306a36Sopenharmony_ci	}
181262306a36Sopenharmony_ci
181362306a36Sopenharmony_ci	pr_info("Using Dynamic Memory protocol version %u.%u\n",
181462306a36Sopenharmony_ci		DYNMEM_MAJOR_VERSION(dm_device.version),
181562306a36Sopenharmony_ci		DYNMEM_MINOR_VERSION(dm_device.version));
181662306a36Sopenharmony_ci
181762306a36Sopenharmony_ci	/*
181862306a36Sopenharmony_ci	 * Now submit our capabilities to the host.
181962306a36Sopenharmony_ci	 */
182062306a36Sopenharmony_ci	memset(&cap_msg, 0, sizeof(struct dm_capabilities));
182162306a36Sopenharmony_ci	cap_msg.hdr.type = DM_CAPABILITIES_REPORT;
182262306a36Sopenharmony_ci	cap_msg.hdr.size = sizeof(struct dm_capabilities);
182362306a36Sopenharmony_ci	cap_msg.hdr.trans_id = atomic_inc_return(&trans_id);
182462306a36Sopenharmony_ci
182562306a36Sopenharmony_ci	/*
182662306a36Sopenharmony_ci	 * When hibernation (i.e. virtual ACPI S4 state) is enabled, the host
182762306a36Sopenharmony_ci	 * currently still requires the bits to be set, so we have to add code
182862306a36Sopenharmony_ci	 * to fail the host's hot-add and balloon up/down requests, if any.
182962306a36Sopenharmony_ci	 */
183062306a36Sopenharmony_ci	cap_msg.caps.cap_bits.balloon = ballooning_enabled();
183162306a36Sopenharmony_ci	cap_msg.caps.cap_bits.hot_add = hot_add_enabled();
183262306a36Sopenharmony_ci
183362306a36Sopenharmony_ci	/*
183462306a36Sopenharmony_ci	 * Specify our alignment requirements as it relates
183562306a36Sopenharmony_ci	 * memory hot-add. Specify 128MB alignment.
183662306a36Sopenharmony_ci	 */
183762306a36Sopenharmony_ci	cap_msg.caps.cap_bits.hot_add_alignment = 7;
183862306a36Sopenharmony_ci
183962306a36Sopenharmony_ci	/*
184062306a36Sopenharmony_ci	 * Currently the host does not use these
184162306a36Sopenharmony_ci	 * values and we set them to what is done in the
184262306a36Sopenharmony_ci	 * Windows driver.
184362306a36Sopenharmony_ci	 */
184462306a36Sopenharmony_ci	cap_msg.min_page_cnt = 0;
184562306a36Sopenharmony_ci	cap_msg.max_page_number = -1;
184662306a36Sopenharmony_ci
184762306a36Sopenharmony_ci	ret = vmbus_sendpacket(dev->channel, &cap_msg,
184862306a36Sopenharmony_ci			       sizeof(struct dm_capabilities),
184962306a36Sopenharmony_ci			       (unsigned long)NULL, VM_PKT_DATA_INBAND, 0);
185062306a36Sopenharmony_ci	if (ret)
185162306a36Sopenharmony_ci		goto out;
185262306a36Sopenharmony_ci
185362306a36Sopenharmony_ci	t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ);
185462306a36Sopenharmony_ci	if (t == 0) {
185562306a36Sopenharmony_ci		ret = -ETIMEDOUT;
185662306a36Sopenharmony_ci		goto out;
185762306a36Sopenharmony_ci	}
185862306a36Sopenharmony_ci
185962306a36Sopenharmony_ci	/*
186062306a36Sopenharmony_ci	 * If the host does not like our capabilities,
186162306a36Sopenharmony_ci	 * fail the probe function.
186262306a36Sopenharmony_ci	 */
186362306a36Sopenharmony_ci	if (dm_device.state == DM_INIT_ERROR) {
186462306a36Sopenharmony_ci		ret = -EPROTO;
186562306a36Sopenharmony_ci		goto out;
186662306a36Sopenharmony_ci	}
186762306a36Sopenharmony_ci
186862306a36Sopenharmony_ci	return 0;
186962306a36Sopenharmony_ciout:
187062306a36Sopenharmony_ci	vmbus_close(dev->channel);
187162306a36Sopenharmony_ci	return ret;
187262306a36Sopenharmony_ci}
187362306a36Sopenharmony_ci
187462306a36Sopenharmony_ci/*
187562306a36Sopenharmony_ci * DEBUGFS Interface
187662306a36Sopenharmony_ci */
187762306a36Sopenharmony_ci#ifdef CONFIG_DEBUG_FS
187862306a36Sopenharmony_ci
187962306a36Sopenharmony_ci/**
188062306a36Sopenharmony_ci * hv_balloon_debug_show - shows statistics of balloon operations.
188162306a36Sopenharmony_ci * @f: pointer to the &struct seq_file.
188262306a36Sopenharmony_ci * @offset: ignored.
188362306a36Sopenharmony_ci *
188462306a36Sopenharmony_ci * Provides the statistics that can be accessed in hv-balloon in the debugfs.
188562306a36Sopenharmony_ci *
188662306a36Sopenharmony_ci * Return: zero on success or an error code.
188762306a36Sopenharmony_ci */
188862306a36Sopenharmony_cistatic int hv_balloon_debug_show(struct seq_file *f, void *offset)
188962306a36Sopenharmony_ci{
189062306a36Sopenharmony_ci	struct hv_dynmem_device *dm = f->private;
189162306a36Sopenharmony_ci	char *sname;
189262306a36Sopenharmony_ci
189362306a36Sopenharmony_ci	seq_printf(f, "%-22s: %u.%u\n", "host_version",
189462306a36Sopenharmony_ci				DYNMEM_MAJOR_VERSION(dm->version),
189562306a36Sopenharmony_ci				DYNMEM_MINOR_VERSION(dm->version));
189662306a36Sopenharmony_ci
189762306a36Sopenharmony_ci	seq_printf(f, "%-22s:", "capabilities");
189862306a36Sopenharmony_ci	if (ballooning_enabled())
189962306a36Sopenharmony_ci		seq_puts(f, " enabled");
190062306a36Sopenharmony_ci
190162306a36Sopenharmony_ci	if (hot_add_enabled())
190262306a36Sopenharmony_ci		seq_puts(f, " hot_add");
190362306a36Sopenharmony_ci
190462306a36Sopenharmony_ci	seq_puts(f, "\n");
190562306a36Sopenharmony_ci
190662306a36Sopenharmony_ci	seq_printf(f, "%-22s: %u", "state", dm->state);
190762306a36Sopenharmony_ci	switch (dm->state) {
190862306a36Sopenharmony_ci	case DM_INITIALIZING:
190962306a36Sopenharmony_ci			sname = "Initializing";
191062306a36Sopenharmony_ci			break;
191162306a36Sopenharmony_ci	case DM_INITIALIZED:
191262306a36Sopenharmony_ci			sname = "Initialized";
191362306a36Sopenharmony_ci			break;
191462306a36Sopenharmony_ci	case DM_BALLOON_UP:
191562306a36Sopenharmony_ci			sname = "Balloon Up";
191662306a36Sopenharmony_ci			break;
191762306a36Sopenharmony_ci	case DM_BALLOON_DOWN:
191862306a36Sopenharmony_ci			sname = "Balloon Down";
191962306a36Sopenharmony_ci			break;
192062306a36Sopenharmony_ci	case DM_HOT_ADD:
192162306a36Sopenharmony_ci			sname = "Hot Add";
192262306a36Sopenharmony_ci			break;
192362306a36Sopenharmony_ci	case DM_INIT_ERROR:
192462306a36Sopenharmony_ci			sname = "Error";
192562306a36Sopenharmony_ci			break;
192662306a36Sopenharmony_ci	default:
192762306a36Sopenharmony_ci			sname = "Unknown";
192862306a36Sopenharmony_ci	}
192962306a36Sopenharmony_ci	seq_printf(f, " (%s)\n", sname);
193062306a36Sopenharmony_ci
193162306a36Sopenharmony_ci	/* HV Page Size */
193262306a36Sopenharmony_ci	seq_printf(f, "%-22s: %ld\n", "page_size", HV_HYP_PAGE_SIZE);
193362306a36Sopenharmony_ci
193462306a36Sopenharmony_ci	/* Pages added with hot_add */
193562306a36Sopenharmony_ci	seq_printf(f, "%-22s: %u\n", "pages_added", dm->num_pages_added);
193662306a36Sopenharmony_ci
193762306a36Sopenharmony_ci	/* pages that are "onlined"/used from pages_added */
193862306a36Sopenharmony_ci	seq_printf(f, "%-22s: %u\n", "pages_onlined", dm->num_pages_onlined);
193962306a36Sopenharmony_ci
194062306a36Sopenharmony_ci	/* pages we have given back to host */
194162306a36Sopenharmony_ci	seq_printf(f, "%-22s: %u\n", "pages_ballooned", dm->num_pages_ballooned);
194262306a36Sopenharmony_ci
194362306a36Sopenharmony_ci	seq_printf(f, "%-22s: %lu\n", "total_pages_committed",
194462306a36Sopenharmony_ci				get_pages_committed(dm));
194562306a36Sopenharmony_ci
194662306a36Sopenharmony_ci	seq_printf(f, "%-22s: %llu\n", "max_dynamic_page_count",
194762306a36Sopenharmony_ci				dm->max_dynamic_page_count);
194862306a36Sopenharmony_ci
194962306a36Sopenharmony_ci	return 0;
195062306a36Sopenharmony_ci}
195162306a36Sopenharmony_ci
/*
 * Generates the hv_balloon_debug_fops file operations (passed to
 * debugfs_create_file() below) around hv_balloon_debug_show().
 */
DEFINE_SHOW_ATTRIBUTE(hv_balloon_debug);
195362306a36Sopenharmony_ci
195462306a36Sopenharmony_cistatic void  hv_balloon_debugfs_init(struct hv_dynmem_device *b)
195562306a36Sopenharmony_ci{
195662306a36Sopenharmony_ci	debugfs_create_file("hv-balloon", 0444, NULL, b,
195762306a36Sopenharmony_ci			&hv_balloon_debug_fops);
195862306a36Sopenharmony_ci}
195962306a36Sopenharmony_ci
196062306a36Sopenharmony_cistatic void  hv_balloon_debugfs_exit(struct hv_dynmem_device *b)
196162306a36Sopenharmony_ci{
196262306a36Sopenharmony_ci	debugfs_lookup_and_remove("hv-balloon", NULL);
196362306a36Sopenharmony_ci}
196462306a36Sopenharmony_ci
196562306a36Sopenharmony_ci#else
196662306a36Sopenharmony_ci
/* CONFIG_DEBUG_FS is off: there is no debugfs file to create. */
static inline void hv_balloon_debugfs_init(struct hv_dynmem_device *b) { }
197062306a36Sopenharmony_ci
/* CONFIG_DEBUG_FS is off: there is no debugfs file to remove. */
static inline void hv_balloon_debugfs_exit(struct hv_dynmem_device *b) { }
197462306a36Sopenharmony_ci
197562306a36Sopenharmony_ci#endif	/* CONFIG_DEBUG_FS */
197662306a36Sopenharmony_ci
197762306a36Sopenharmony_cistatic int balloon_probe(struct hv_device *dev,
197862306a36Sopenharmony_ci			 const struct hv_vmbus_device_id *dev_id)
197962306a36Sopenharmony_ci{
198062306a36Sopenharmony_ci	int ret;
198162306a36Sopenharmony_ci
198262306a36Sopenharmony_ci	allow_hibernation = hv_is_hibernation_supported();
198362306a36Sopenharmony_ci	if (allow_hibernation)
198462306a36Sopenharmony_ci		hot_add = false;
198562306a36Sopenharmony_ci
198662306a36Sopenharmony_ci#ifdef CONFIG_MEMORY_HOTPLUG
198762306a36Sopenharmony_ci	do_hot_add = hot_add;
198862306a36Sopenharmony_ci#else
198962306a36Sopenharmony_ci	do_hot_add = false;
199062306a36Sopenharmony_ci#endif
199162306a36Sopenharmony_ci	dm_device.dev = dev;
199262306a36Sopenharmony_ci	dm_device.state = DM_INITIALIZING;
199362306a36Sopenharmony_ci	dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN8;
199462306a36Sopenharmony_ci	init_completion(&dm_device.host_event);
199562306a36Sopenharmony_ci	init_completion(&dm_device.config_event);
199662306a36Sopenharmony_ci	INIT_LIST_HEAD(&dm_device.ha_region_list);
199762306a36Sopenharmony_ci	spin_lock_init(&dm_device.ha_lock);
199862306a36Sopenharmony_ci	INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up);
199962306a36Sopenharmony_ci	INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req);
200062306a36Sopenharmony_ci	dm_device.host_specified_ha_region = false;
200162306a36Sopenharmony_ci
200262306a36Sopenharmony_ci#ifdef CONFIG_MEMORY_HOTPLUG
200362306a36Sopenharmony_ci	set_online_page_callback(&hv_online_page);
200462306a36Sopenharmony_ci	init_completion(&dm_device.ol_waitevent);
200562306a36Sopenharmony_ci	register_memory_notifier(&hv_memory_nb);
200662306a36Sopenharmony_ci#endif
200762306a36Sopenharmony_ci
200862306a36Sopenharmony_ci	hv_set_drvdata(dev, &dm_device);
200962306a36Sopenharmony_ci
201062306a36Sopenharmony_ci	ret = balloon_connect_vsp(dev);
201162306a36Sopenharmony_ci	if (ret != 0)
201262306a36Sopenharmony_ci		goto connect_error;
201362306a36Sopenharmony_ci
201462306a36Sopenharmony_ci	enable_page_reporting();
201562306a36Sopenharmony_ci	dm_device.state = DM_INITIALIZED;
201662306a36Sopenharmony_ci
201762306a36Sopenharmony_ci	dm_device.thread =
201862306a36Sopenharmony_ci		 kthread_run(dm_thread_func, &dm_device, "hv_balloon");
201962306a36Sopenharmony_ci	if (IS_ERR(dm_device.thread)) {
202062306a36Sopenharmony_ci		ret = PTR_ERR(dm_device.thread);
202162306a36Sopenharmony_ci		goto probe_error;
202262306a36Sopenharmony_ci	}
202362306a36Sopenharmony_ci
202462306a36Sopenharmony_ci	hv_balloon_debugfs_init(&dm_device);
202562306a36Sopenharmony_ci
202662306a36Sopenharmony_ci	return 0;
202762306a36Sopenharmony_ci
202862306a36Sopenharmony_ciprobe_error:
202962306a36Sopenharmony_ci	dm_device.state = DM_INIT_ERROR;
203062306a36Sopenharmony_ci	dm_device.thread  = NULL;
203162306a36Sopenharmony_ci	disable_page_reporting();
203262306a36Sopenharmony_ci	vmbus_close(dev->channel);
203362306a36Sopenharmony_ciconnect_error:
203462306a36Sopenharmony_ci#ifdef CONFIG_MEMORY_HOTPLUG
203562306a36Sopenharmony_ci	unregister_memory_notifier(&hv_memory_nb);
203662306a36Sopenharmony_ci	restore_online_page_callback(&hv_online_page);
203762306a36Sopenharmony_ci#endif
203862306a36Sopenharmony_ci	return ret;
203962306a36Sopenharmony_ci}
204062306a36Sopenharmony_ci
204162306a36Sopenharmony_cistatic void balloon_remove(struct hv_device *dev)
204262306a36Sopenharmony_ci{
204362306a36Sopenharmony_ci	struct hv_dynmem_device *dm = hv_get_drvdata(dev);
204462306a36Sopenharmony_ci	struct hv_hotadd_state *has, *tmp;
204562306a36Sopenharmony_ci	struct hv_hotadd_gap *gap, *tmp_gap;
204662306a36Sopenharmony_ci
204762306a36Sopenharmony_ci	if (dm->num_pages_ballooned != 0)
204862306a36Sopenharmony_ci		pr_warn("Ballooned pages: %d\n", dm->num_pages_ballooned);
204962306a36Sopenharmony_ci
205062306a36Sopenharmony_ci	hv_balloon_debugfs_exit(dm);
205162306a36Sopenharmony_ci
205262306a36Sopenharmony_ci	cancel_work_sync(&dm->balloon_wrk.wrk);
205362306a36Sopenharmony_ci	cancel_work_sync(&dm->ha_wrk.wrk);
205462306a36Sopenharmony_ci
205562306a36Sopenharmony_ci	kthread_stop(dm->thread);
205662306a36Sopenharmony_ci
205762306a36Sopenharmony_ci	/*
205862306a36Sopenharmony_ci	 * This is to handle the case when balloon_resume()
205962306a36Sopenharmony_ci	 * call has failed and some cleanup has been done as
206062306a36Sopenharmony_ci	 * a part of the error handling.
206162306a36Sopenharmony_ci	 */
206262306a36Sopenharmony_ci	if (dm_device.state != DM_INIT_ERROR) {
206362306a36Sopenharmony_ci		disable_page_reporting();
206462306a36Sopenharmony_ci		vmbus_close(dev->channel);
206562306a36Sopenharmony_ci#ifdef CONFIG_MEMORY_HOTPLUG
206662306a36Sopenharmony_ci		unregister_memory_notifier(&hv_memory_nb);
206762306a36Sopenharmony_ci		restore_online_page_callback(&hv_online_page);
206862306a36Sopenharmony_ci#endif
206962306a36Sopenharmony_ci	}
207062306a36Sopenharmony_ci
207162306a36Sopenharmony_ci	guard(spinlock_irqsave)(&dm_device.ha_lock);
207262306a36Sopenharmony_ci	list_for_each_entry_safe(has, tmp, &dm->ha_region_list, list) {
207362306a36Sopenharmony_ci		list_for_each_entry_safe(gap, tmp_gap, &has->gap_list, list) {
207462306a36Sopenharmony_ci			list_del(&gap->list);
207562306a36Sopenharmony_ci			kfree(gap);
207662306a36Sopenharmony_ci		}
207762306a36Sopenharmony_ci		list_del(&has->list);
207862306a36Sopenharmony_ci		kfree(has);
207962306a36Sopenharmony_ci	}
208062306a36Sopenharmony_ci}
208162306a36Sopenharmony_ci
208262306a36Sopenharmony_cistatic int balloon_suspend(struct hv_device *hv_dev)
208362306a36Sopenharmony_ci{
208462306a36Sopenharmony_ci	struct hv_dynmem_device *dm = hv_get_drvdata(hv_dev);
208562306a36Sopenharmony_ci
208662306a36Sopenharmony_ci	tasklet_disable(&hv_dev->channel->callback_event);
208762306a36Sopenharmony_ci
208862306a36Sopenharmony_ci	cancel_work_sync(&dm->balloon_wrk.wrk);
208962306a36Sopenharmony_ci	cancel_work_sync(&dm->ha_wrk.wrk);
209062306a36Sopenharmony_ci
209162306a36Sopenharmony_ci	if (dm->thread) {
209262306a36Sopenharmony_ci		kthread_stop(dm->thread);
209362306a36Sopenharmony_ci		dm->thread = NULL;
209462306a36Sopenharmony_ci		vmbus_close(hv_dev->channel);
209562306a36Sopenharmony_ci	}
209662306a36Sopenharmony_ci
209762306a36Sopenharmony_ci	tasklet_enable(&hv_dev->channel->callback_event);
209862306a36Sopenharmony_ci
209962306a36Sopenharmony_ci	return 0;
210062306a36Sopenharmony_ci
210162306a36Sopenharmony_ci}
210262306a36Sopenharmony_ci
210362306a36Sopenharmony_cistatic int balloon_resume(struct hv_device *dev)
210462306a36Sopenharmony_ci{
210562306a36Sopenharmony_ci	int ret;
210662306a36Sopenharmony_ci
210762306a36Sopenharmony_ci	dm_device.state = DM_INITIALIZING;
210862306a36Sopenharmony_ci
210962306a36Sopenharmony_ci	ret = balloon_connect_vsp(dev);
211062306a36Sopenharmony_ci
211162306a36Sopenharmony_ci	if (ret != 0)
211262306a36Sopenharmony_ci		goto out;
211362306a36Sopenharmony_ci
211462306a36Sopenharmony_ci	dm_device.thread =
211562306a36Sopenharmony_ci		 kthread_run(dm_thread_func, &dm_device, "hv_balloon");
211662306a36Sopenharmony_ci	if (IS_ERR(dm_device.thread)) {
211762306a36Sopenharmony_ci		ret = PTR_ERR(dm_device.thread);
211862306a36Sopenharmony_ci		dm_device.thread = NULL;
211962306a36Sopenharmony_ci		goto close_channel;
212062306a36Sopenharmony_ci	}
212162306a36Sopenharmony_ci
212262306a36Sopenharmony_ci	dm_device.state = DM_INITIALIZED;
212362306a36Sopenharmony_ci	return 0;
212462306a36Sopenharmony_ciclose_channel:
212562306a36Sopenharmony_ci	vmbus_close(dev->channel);
212662306a36Sopenharmony_ciout:
212762306a36Sopenharmony_ci	dm_device.state = DM_INIT_ERROR;
212862306a36Sopenharmony_ci	disable_page_reporting();
212962306a36Sopenharmony_ci#ifdef CONFIG_MEMORY_HOTPLUG
213062306a36Sopenharmony_ci	unregister_memory_notifier(&hv_memory_nb);
213162306a36Sopenharmony_ci	restore_online_page_callback(&hv_online_page);
213262306a36Sopenharmony_ci#endif
213362306a36Sopenharmony_ci	return ret;
213462306a36Sopenharmony_ci}
213562306a36Sopenharmony_ci
/*
 * vmbus device IDs handled by this driver; the list is terminated by an
 * empty entry.
 */
static const struct hv_vmbus_device_id id_table[] = {
	/* Dynamic Memory Class ID */
	/* 525074DC-8985-46e2-8057-A307DC18A502 */
	{ HV_DM_GUID, },
	{ },
};

/* Export the ID table as the module's vmbus device table. */
MODULE_DEVICE_TABLE(vmbus, id_table);
214462306a36Sopenharmony_ci
214562306a36Sopenharmony_cistatic  struct hv_driver balloon_drv = {
214662306a36Sopenharmony_ci	.name = "hv_balloon",
214762306a36Sopenharmony_ci	.id_table = id_table,
214862306a36Sopenharmony_ci	.probe =  balloon_probe,
214962306a36Sopenharmony_ci	.remove =  balloon_remove,
215062306a36Sopenharmony_ci	.suspend = balloon_suspend,
215162306a36Sopenharmony_ci	.resume = balloon_resume,
215262306a36Sopenharmony_ci	.driver = {
215362306a36Sopenharmony_ci		.probe_type = PROBE_PREFER_ASYNCHRONOUS,
215462306a36Sopenharmony_ci	},
215562306a36Sopenharmony_ci};
215662306a36Sopenharmony_ci
215762306a36Sopenharmony_cistatic int __init init_balloon_drv(void)
215862306a36Sopenharmony_ci{
215962306a36Sopenharmony_ci
216062306a36Sopenharmony_ci	return vmbus_driver_register(&balloon_drv);
216162306a36Sopenharmony_ci}
216262306a36Sopenharmony_ci
/* Run init_balloon_drv() when the module is initialised. */
module_init(init_balloon_drv);

/* Module metadata. */
MODULE_DESCRIPTION("Hyper-V Balloon");
MODULE_LICENSE("GPL");
2167