// SPDX-License-Identifier: GPL-2.0-only
/*
 *
 * Copyright (c) 2009, Microsoft Corporation.
 *
 * Authors:
 *   Haiyang Zhang <haiyangz@microsoft.com>
 *   Hank Janssen  <hjanssen@microsoft.com>
 *   K. Y. Srinivasan <kys@microsoft.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/hyperv.h>
#include <linux/uio.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/prefetch.h>

#include "hyperv_vmbus.h"

#define VMBUS_PKT_TRAILER	8

/*
 * When we write to the ring buffer, check if the host needs to
 * be signaled. Here are the details of this protocol:
 *
 *	1. The host guarantees that while it is draining the
 *	   ring buffer, it will set the interrupt_mask to
 *	   indicate it does not need to be interrupted when
 *	   new data is placed.
 *
 *	2. The host guarantees that it will completely drain
 *	   the ring buffer before exiting the read loop. Further,
 *	   once the ring buffer is empty, it will clear the
 *	   interrupt_mask and re-check to see if new data has
 *	   arrived.
 *
 * KYS: Oct. 30, 2016:
 * It looks like Windows hosts have logic to deal with DOS attacks that
 * can be triggered if the host receives interrupts when it is not
 * expecting them. The host expects interrupts only when the ring
 * transitions from empty to non-empty (or full to non-full on the guest
 * to host ring).
 * So, base the signaling decision solely on the ring state until the
 * host logic is fixed.
 */

static void hv_signal_on_write(u32 old_write, struct vmbus_channel *channel)
{
	struct hv_ring_buffer_info *rbi = &channel->outbound;

	virt_mb();
	if (READ_ONCE(rbi->ring_buffer->interrupt_mask))
		return;

	/* check interrupt_mask before read_index */
	virt_rmb();
	/*
	 * This is the only case we need to signal: when the
	 * ring transitions from being empty to non-empty.
	 */
	if (old_write == READ_ONCE(rbi->ring_buffer->read_index)) {
		++channel->intr_out_empty;
		vmbus_setevent(channel);
	}
}
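
/*
 * Illustrative walk-through (hypothetical values, not part of the driver):
 * hv_ringbuffer_write() captures old_write == 400 before copying a packet
 * into the outbound ring. If the host has drained everything it had seen,
 * read_index is also 400, so old_write == read_index means the ring just
 * transitioned from empty to non-empty and the host must be signaled:
 *
 *	old_write = hv_get_next_write_location(outring_info);
 *	// ... copy the packet and publish the new write_index ...
 *	hv_signal_on_write(old_write, channel);	// signals only on empty->non-empty
 */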

/* Get the next write location for the specified ring buffer. */
static inline u32
hv_get_next_write_location(struct hv_ring_buffer_info *ring_info)
{
	u32 next = ring_info->ring_buffer->write_index;

	return next;
}

/* Set the next write location for the specified ring buffer. */
static inline void
hv_set_next_write_location(struct hv_ring_buffer_info *ring_info,
		     u32 next_write_location)
{
	ring_info->ring_buffer->write_index = next_write_location;
}

/* Set the next read location for the specified ring buffer. */
static inline void
hv_set_next_read_location(struct hv_ring_buffer_info *ring_info,
		    u32 next_read_location)
{
	ring_info->ring_buffer->read_index = next_read_location;
	ring_info->priv_read_index = next_read_location;
}

/* Get the size of the ring buffer. */
static inline u32
hv_get_ring_buffersize(const struct hv_ring_buffer_info *ring_info)
{
	return ring_info->ring_datasize;
}

/*
 * Get the packed ring indices as a u64: the write index occupies the
 * upper 32 bits; the read index portion is left as zero.
 */
static inline u64
hv_get_ring_bufferindices(struct hv_ring_buffer_info *ring_info)
{
	return (u64)ring_info->ring_buffer->write_index << 32;
}

/*
 * Helper routine to copy from source to ring buffer.
 * Assume there is enough room. Handles wrap-around in dest case only!!
 */
static u32 hv_copyto_ringbuffer(
	struct hv_ring_buffer_info	*ring_info,
	u32				start_write_offset,
	const void			*src,
	u32				srclen)
{
	void *ring_buffer = hv_get_ring_buffer(ring_info);
	u32 ring_buffer_size = hv_get_ring_buffersize(ring_info);

	memcpy(ring_buffer + start_write_offset, src, srclen);

	start_write_offset += srclen;
	if (start_write_offset >= ring_buffer_size)
		start_write_offset -= ring_buffer_size;

	return start_write_offset;
}
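
/*
 * Worked example (hypothetical sizes): with ring_datasize == 4096 and
 * start_write_offset == 4000, copying srclen == 200 bytes is a single
 * memcpy because the data pages are mapped twice back-to-back (see
 * hv_ringbuffer_init()); the last 104 bytes land in the aliased second
 * mapping, i.e. at offsets 0..103 of the real ring. The returned offset
 * is then wrapped: 4000 + 200 - 4096 == 104.
 */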

/*
 * hv_get_ringbuffer_availbytes()
 *
 * Get the number of bytes available to read from and to write to
 * the specified ring buffer.
 */
static void
hv_get_ringbuffer_availbytes(const struct hv_ring_buffer_info *rbi,
			     u32 *read, u32 *write)
{
	u32 read_loc, write_loc, dsize;

	/* Capture the read/write indices before they change */
	read_loc = READ_ONCE(rbi->ring_buffer->read_index);
	write_loc = READ_ONCE(rbi->ring_buffer->write_index);
	dsize = rbi->ring_datasize;

	*write = write_loc >= read_loc ? dsize - (write_loc - read_loc) :
		read_loc - write_loc;
	*read = dsize - *write;
}
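
/*
 * Worked example (hypothetical values): with dsize == 4096,
 * read_index == 1000 and write_index == 3000, there are
 * 3000 - 1000 == 2000 bytes to read and 4096 - 2000 == 2096 bytes of
 * free space to write. If the writer has wrapped (write_index == 500,
 * read_index == 3500), the free space is 3500 - 500 == 3000 bytes.
 */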

/* Get various debug metrics for the specified ring buffer. */
int hv_ringbuffer_get_debuginfo(struct hv_ring_buffer_info *ring_info,
				struct hv_ring_buffer_debug_info *debug_info)
{
	u32 bytes_avail_towrite;
	u32 bytes_avail_toread;

	mutex_lock(&ring_info->ring_buffer_mutex);

	if (!ring_info->ring_buffer) {
		mutex_unlock(&ring_info->ring_buffer_mutex);
		return -EINVAL;
	}

	hv_get_ringbuffer_availbytes(ring_info,
				     &bytes_avail_toread,
				     &bytes_avail_towrite);
	debug_info->bytes_avail_toread = bytes_avail_toread;
	debug_info->bytes_avail_towrite = bytes_avail_towrite;
	debug_info->current_read_index = ring_info->ring_buffer->read_index;
	debug_info->current_write_index = ring_info->ring_buffer->write_index;
	debug_info->current_interrupt_mask
		= ring_info->ring_buffer->interrupt_mask;
	mutex_unlock(&ring_info->ring_buffer_mutex);

	return 0;
}
EXPORT_SYMBOL_GPL(hv_ringbuffer_get_debuginfo);

/* Initialize a channel's ring buffer info mutex locks */
void hv_ringbuffer_pre_init(struct vmbus_channel *channel)
{
	mutex_init(&channel->inbound.ring_buffer_mutex);
	mutex_init(&channel->outbound.ring_buffer_mutex);
}

/* Initialize the ring buffer. */
int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
		       struct page *pages, u32 page_cnt)
{
	int i;
	struct page **pages_wraparound;

	BUILD_BUG_ON((sizeof(struct hv_ring_buffer) != PAGE_SIZE));

	/*
	 * First page holds struct hv_ring_buffer, do wraparound mapping for
	 * the rest.
	 */
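	/*
	 * Illustrative layout (hypothetical page_cnt == 4, i.e. one header
	 * page plus three data pages d1..d3): the vmapped range below is
	 *
	 *	[ hdr | d1 | d2 | d3 | d1 | d2 | d3 ]
	 *
	 * so a copy that runs past the end of d3 continues seamlessly into
	 * the aliased second mapping of d1. This is why the read and write
	 * paths can use a single memcpy without splitting at the wrap point.
	 */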
	pages_wraparound = kcalloc(page_cnt * 2 - 1, sizeof(struct page *),
				   GFP_KERNEL);
	if (!pages_wraparound)
		return -ENOMEM;

	pages_wraparound[0] = pages;
	for (i = 0; i < 2 * (page_cnt - 1); i++)
		pages_wraparound[i + 1] = &pages[i % (page_cnt - 1) + 1];

	ring_info->ring_buffer = (struct hv_ring_buffer *)
		vmap(pages_wraparound, page_cnt * 2 - 1, VM_MAP, PAGE_KERNEL);

	kfree(pages_wraparound);

	if (!ring_info->ring_buffer)
		return -ENOMEM;

	ring_info->ring_buffer->read_index =
		ring_info->ring_buffer->write_index = 0;

	/* Set the feature bit for enabling flow control. */
	ring_info->ring_buffer->feature_bits.value = 1;

	ring_info->ring_size = page_cnt << PAGE_SHIFT;
	ring_info->ring_size_div10_reciprocal =
		reciprocal_value(ring_info->ring_size / 10);
	ring_info->ring_datasize = ring_info->ring_size -
		sizeof(struct hv_ring_buffer);
	ring_info->priv_read_index = 0;

	spin_lock_init(&ring_info->ring_lock);

	return 0;
}

/* Cleanup the ring buffer. */
void hv_ringbuffer_cleanup(struct hv_ring_buffer_info *ring_info)
{
	mutex_lock(&ring_info->ring_buffer_mutex);
	vunmap(ring_info->ring_buffer);
	ring_info->ring_buffer = NULL;
	mutex_unlock(&ring_info->ring_buffer_mutex);
}

/*
 * Check if the ring buffer spinlock is available to take or not; used in
 * atomic contexts, like the panic path (see the Hyper-V framebuffer driver).
 */
bool hv_ringbuffer_spinlock_busy(struct vmbus_channel *channel)
{
	struct hv_ring_buffer_info *rinfo = &channel->outbound;

	return spin_is_locked(&rinfo->ring_lock);
}
EXPORT_SYMBOL_GPL(hv_ringbuffer_spinlock_busy);
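
/*
 * Hypothetical caller sketch (not from this file): an atomic-context
 * writer such as a panic notifier cannot afford to spin on the ring
 * lock, so it bails out instead of queuing a packet:
 *
 *	if (hv_ringbuffer_spinlock_busy(channel))
 *		return;		// assumed policy: drop the update in the panic path
 *	vmbus_sendpacket(channel, ...);
 */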

/* Write to the ring buffer. */
int hv_ringbuffer_write(struct vmbus_channel *channel,
			const struct kvec *kv_list, u32 kv_count)
{
	int i;
	u32 bytes_avail_towrite;
	u32 totalbytes_towrite = sizeof(u64);
	u32 next_write_location;
	u32 old_write;
	u64 prev_indices;
	unsigned long flags;
	struct hv_ring_buffer_info *outring_info = &channel->outbound;

	if (channel->rescind)
		return -ENODEV;

	for (i = 0; i < kv_count; i++)
		totalbytes_towrite += kv_list[i].iov_len;

	spin_lock_irqsave(&outring_info->ring_lock, flags);

	bytes_avail_towrite = hv_get_bytes_to_write(outring_info);

	/*
	 * If there is only room for the packet, assume it is full.
	 * Otherwise, the next time around, we think the ring buffer
	 * is empty since the read index == write index.
	 */
	if (bytes_avail_towrite <= totalbytes_towrite) {
		++channel->out_full_total;

		if (!channel->out_full_flag) {
			++channel->out_full_first;
			channel->out_full_flag = true;
		}

		spin_unlock_irqrestore(&outring_info->ring_lock, flags);
		return -EAGAIN;
	}

	channel->out_full_flag = false;

	/* Write to the ring buffer */
	next_write_location = hv_get_next_write_location(outring_info);

	old_write = next_write_location;

	for (i = 0; i < kv_count; i++) {
		next_write_location = hv_copyto_ringbuffer(outring_info,
						     next_write_location,
						     kv_list[i].iov_base,
						     kv_list[i].iov_len);
	}

	/* Set previous packet start */
	prev_indices = hv_get_ring_bufferindices(outring_info);

	next_write_location = hv_copyto_ringbuffer(outring_info,
					     next_write_location,
					     &prev_indices,
					     sizeof(u64));

	/* Issue a full memory barrier before updating the write index */
	virt_mb();

	/* Now, update the write location */
	hv_set_next_write_location(outring_info, next_write_location);

	spin_unlock_irqrestore(&outring_info->ring_lock, flags);

	hv_signal_on_write(old_write, channel);

	if (channel->rescind)
		return -ENODEV;

	return 0;
}
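
/*
 * Illustrative caller sketch (hypothetical descriptor and payload; real
 * drivers normally go through vmbus_sendpacket() rather than calling this
 * function directly). The trailing u64 of prev_indices is appended by
 * hv_ringbuffer_write() itself and is accounted for by the initial
 * totalbytes_towrite = sizeof(u64):
 *
 *	struct vmpacket_descriptor desc = { ... };
 *	struct kvec kv[2] = {
 *		{ .iov_base = &desc,   .iov_len = sizeof(desc) },
 *		{ .iov_base = payload, .iov_len = payload_len  },
 *	};
 *	ret = hv_ringbuffer_write(channel, kv, 2);	// -EAGAIN if the ring is full
 */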

int hv_ringbuffer_read(struct vmbus_channel *channel,
		       void *buffer, u32 buflen, u32 *buffer_actual_len,
		       u64 *requestid, bool raw)
{
	struct vmpacket_descriptor *desc;
	u32 packetlen, offset;

	if (unlikely(buflen == 0))
		return -EINVAL;

	*buffer_actual_len = 0;
	*requestid = 0;

	/* Make sure there is something to read */
	desc = hv_pkt_iter_first(channel);
	if (desc == NULL) {
		/*
		 * No error is returned when there is not even a header;
		 * drivers are expected to check buffer_actual_len.
		 */
		return 0;
	}

	offset = raw ? 0 : (desc->offset8 << 3);
	packetlen = (desc->len8 << 3) - offset;
	*buffer_actual_len = packetlen;
	*requestid = desc->trans_id;

	if (unlikely(packetlen > buflen))
		return -ENOBUFS;

	/* since ring is double mapped, only one copy is necessary */
	memcpy(buffer, (const char *)desc + offset, packetlen);

	/* Advance ring index to next packet descriptor */
	__hv_pkt_iter_next(channel, desc);

	/* Notify host of update */
	hv_pkt_iter_close(channel);

	return 0;
}
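
/*
 * Worked example of the descriptor arithmetic above (hypothetical packet):
 * offset8 and len8 count 8-byte units, so a packet with offset8 == 2 and
 * len8 == 10 has a 16-byte header and an 80-byte total length. With
 * raw == false the caller receives the 80 - 16 == 64 payload bytes; with
 * raw == true it receives all 80 bytes, header included.
 */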

/*
 * Determine number of bytes available in ring buffer after
 * the current iterator (priv_read_index) location.
 *
 * This is similar to hv_get_bytes_to_read but uses the private
 * read index instead.
 */
static u32 hv_pkt_iter_avail(const struct hv_ring_buffer_info *rbi)
{
	u32 priv_read_loc = rbi->priv_read_index;
	u32 write_loc;

	/*
	 * The Hyper-V host writes the packet data, then uses
	 * store_release() to update the write_index.  Use load_acquire()
	 * here to prevent loads of the packet data from being re-ordered
	 * before the read of the write_index and potentially getting
	 * stale data.
	 */
	write_loc = virt_load_acquire(&rbi->ring_buffer->write_index);

	if (write_loc >= priv_read_loc)
		return write_loc - priv_read_loc;
	else
		return (rbi->ring_datasize - priv_read_loc) + write_loc;
}
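
/*
 * Worked example (hypothetical values): with ring_datasize == 4096,
 * priv_read_index == 3900 and write_index == 100, the unread span wraps
 * around, so (4096 - 3900) + 100 == 296 bytes are available to the
 * iterator.
 */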

/*
 * Get first vmbus packet from ring buffer after read_index
 *
 * If ring buffer is empty, returns NULL and no other action needed.
 */
struct vmpacket_descriptor *hv_pkt_iter_first(struct vmbus_channel *channel)
{
	struct hv_ring_buffer_info *rbi = &channel->inbound;
	struct vmpacket_descriptor *desc;

	hv_debug_delay_test(channel, MESSAGE_DELAY);
	if (hv_pkt_iter_avail(rbi) < sizeof(struct vmpacket_descriptor))
		return NULL;

	desc = hv_get_ring_buffer(rbi) + rbi->priv_read_index;
	if (desc)
		prefetch((char *)desc + (desc->len8 << 3));

	return desc;
}
EXPORT_SYMBOL_GPL(hv_pkt_iter_first);

/*
 * Get next vmbus packet from ring buffer.
 *
 * Advances the current location (priv_read_index) and checks for more
 * data. If no more packets are available, return NULL.
 */
struct vmpacket_descriptor *
__hv_pkt_iter_next(struct vmbus_channel *channel,
		   const struct vmpacket_descriptor *desc)
{
	struct hv_ring_buffer_info *rbi = &channel->inbound;
	u32 packetlen = desc->len8 << 3;
	u32 dsize = rbi->ring_datasize;

	hv_debug_delay_test(channel, MESSAGE_DELAY);
	/* bump offset to next potential packet */
	rbi->priv_read_index += packetlen + VMBUS_PKT_TRAILER;
	if (rbi->priv_read_index >= dsize)
		rbi->priv_read_index -= dsize;

	/* more data? */
	return hv_pkt_iter_first(channel);
}
EXPORT_SYMBOL_GPL(__hv_pkt_iter_next);
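
/*
 * Sketch of a consumer loop over the iterator API exported above
 * (hypothetical handler; in-tree drivers often use the foreach-style
 * helpers in linux/hyperv.h instead):
 *
 *	struct vmpacket_descriptor *desc;
 *
 *	for (desc = hv_pkt_iter_first(channel); desc;
 *	     desc = __hv_pkt_iter_next(channel, desc))
 *		handle_packet(desc);		// hypothetical callback
 *
 *	hv_pkt_iter_close(channel);		// publish read_index, maybe signal host
 */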

/* How many bytes were read in this iterator cycle */
static u32 hv_pkt_iter_bytes_read(const struct hv_ring_buffer_info *rbi,
					u32 start_read_index)
{
	if (rbi->priv_read_index >= start_read_index)
		return rbi->priv_read_index - start_read_index;
	else
		return rbi->ring_datasize - start_read_index +
			rbi->priv_read_index;
}

/*
 * Update host ring buffer after iterating over packets. If the host has
 * stopped queuing new entries because it found the ring buffer full, and
 * sufficient space is being freed up, signal the host. But be careful to
 * only signal the host when necessary, both for performance reasons and
 * because Hyper-V protects itself by throttling guests that signal
 * inappropriately.
 *
 * Determining when to signal is tricky. There are three key data inputs
 * that must be handled in this order to avoid race conditions:
 *
 * 1. Update the read_index
 * 2. Read the pending_send_sz
 * 3. Read the current write_index
 *
 * The interrupt_mask is not used to determine when to signal. The
 * interrupt_mask is used only on the guest->host ring buffer when
 * sending requests to the host. The host does not use it on the host->
 * guest ring buffer to indicate whether it should be signaled.
 */
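/*
 * Worked example of the decision below (hypothetical values): suppose
 * pending_send_sz == 1024. Before this iteration there were
 * curr_write_sz - bytes_read == 900 free bytes, so the host was blocked
 * ("not enough space"). If the iteration freed enough that
 * curr_write_sz == 1500 now exceeds pending_send_sz, this is the
 * blocked -> unblocked transition and the host is signaled. Had the prior
 * free space already been above 1024, or had curr_write_sz still been
 * <= 1024, no signal would be sent.
 */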
void hv_pkt_iter_close(struct vmbus_channel *channel)
{
	struct hv_ring_buffer_info *rbi = &channel->inbound;
	u32 curr_write_sz, pending_sz, bytes_read, start_read_index;

	/*
	 * Make sure all reads are done before we update the read index since
	 * the writer may start writing to the read area once the read index
	 * is updated.
	 */
	virt_rmb();
	start_read_index = rbi->ring_buffer->read_index;
	rbi->ring_buffer->read_index = rbi->priv_read_index;

	/*
	 * Older versions of Hyper-V (before WS2012 and Win8) do not
	 * implement pending_send_sz and simply poll if the host->guest
	 * ring buffer is full.  No signaling is needed or expected.
	 */
	if (!rbi->ring_buffer->feature_bits.feat_pending_send_sz)
		return;

	/*
	 * Issue a full memory barrier before making the signaling decision.
	 * If reading pending_send_sz were to be reordered and happen
	 * before we commit the new read_index, a race could occur.  If the
	 * host were to set the pending_send_sz after we have sampled
	 * pending_send_sz, and the ring buffer blocks before we commit the
	 * read index, we could miss sending the interrupt. Issue a full
	 * memory barrier to address this.
	 */
	virt_mb();

	/*
	 * If the pending_send_sz is zero, then the ring buffer is not
	 * blocked and there is no need to signal.  This is by far the
	 * most common case, so exit quickly for best performance.
	 */
	pending_sz = READ_ONCE(rbi->ring_buffer->pending_send_sz);
	if (!pending_sz)
		return;

	/*
	 * Ensure the read of write_index in hv_get_bytes_to_write()
	 * happens after the read of pending_send_sz.
	 */
	virt_rmb();
	curr_write_sz = hv_get_bytes_to_write(rbi);
	bytes_read = hv_pkt_iter_bytes_read(rbi, start_read_index);

	/*
	 * We want to signal the host only if we're transitioning
	 * from a "not enough free space" state to an "enough free
	 * space" state.  For example, it's possible that this function
	 * could run and free up enough space to signal the host, and then
	 * run again and free up additional space before the host has a
	 * chance to clear the pending_send_sz.  The 2nd invocation would
	 * be a null transition from "enough free space" to "enough free
	 * space", which doesn't warrant a signal.
	 *
	 * Exactly filling the ring buffer is treated as "not enough
	 * space". The ring buffer must always have at least one byte
	 * empty so the empty and full conditions are distinguishable.
	 * hv_get_bytes_to_write() doesn't fully tell the truth in
	 * this regard.
	 *
	 * So first check if we were in the "enough free space" state
	 * before we began the iteration. If so, the host was not
	 * blocked, and there's no need to signal.
	 */
	if (curr_write_sz - bytes_read > pending_sz)
		return;

	/*
	 * Similarly, if the new state is "not enough space", then
	 * there's no need to signal.
	 */
	if (curr_write_sz <= pending_sz)
		return;

	++channel->intr_in_full;
	vmbus_setevent(channel);
}
EXPORT_SYMBOL_GPL(hv_pkt_iter_close);