xref: /kernel/linux/linux-5.10/drivers/md/dm-stats.c (revision 8c2ecf20)
1// SPDX-License-Identifier: GPL-2.0
2#include <linux/errno.h>
3#include <linux/numa.h>
4#include <linux/slab.h>
5#include <linux/rculist.h>
6#include <linux/threads.h>
7#include <linux/preempt.h>
8#include <linux/irqflags.h>
9#include <linux/vmalloc.h>
10#include <linux/mm.h>
11#include <linux/module.h>
12#include <linux/device-mapper.h>
13
14#include "dm-core.h"
15#include "dm-stats.h"
16
17#define DM_MSG_PREFIX "stats"
18
19static int dm_stat_need_rcu_barrier;
20
21/*
22 * Using 64-bit values to avoid overflow (which is a
23 * problem that block/genhd.c's IO accounting has).
24 */
25struct dm_stat_percpu {
26	unsigned long long sectors[2];
27	unsigned long long ios[2];
28	unsigned long long merges[2];
29	unsigned long long ticks[2];
30	unsigned long long io_ticks[2];
31	unsigned long long io_ticks_total;
32	unsigned long long time_in_queue;
33	unsigned long long *histogram;
34};
35
36struct dm_stat_shared {
37	atomic_t in_flight[2];
38	unsigned long long stamp;
39	struct dm_stat_percpu tmp;
40};
41
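/*
 * One statistics region: the range [start, end) of the device is split
 * into n_entries areas of "step" sectors each.  Every area has a
 * dm_stat_shared entry and, for each possible CPU, a dm_stat_percpu
 * counter set (plus an optional latency histogram).
 */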
42struct dm_stat {
43	struct list_head list_entry;
44	int id;
45	unsigned stat_flags;
46	size_t n_entries;
47	sector_t start;
48	sector_t end;
49	sector_t step;
50	unsigned n_histogram_entries;
51	unsigned long long *histogram_boundaries;
52	const char *program_id;
53	const char *aux_data;
54	struct rcu_head rcu_head;
55	size_t shared_alloc_size;
56	size_t percpu_alloc_size;
57	size_t histogram_alloc_size;
58	struct dm_stat_percpu *stat_percpu[NR_CPUS];
59	struct dm_stat_shared stat_shared[];
60};
61
62#define STAT_PRECISE_TIMESTAMPS		1
63
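/*
 * Per-CPU record of where the last accounted bio ended and in which
 * direction it went; dm_stats_account_io() uses it to flag a bio that
 * starts exactly there with the same direction as "merged".
 */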
64struct dm_stats_last_position {
65	sector_t last_sector;
66	unsigned last_rw;
67};
68
69/*
70 * A typo on the command line could make the kernel run out of memory and
71 * crash. To prevent the crash we account all memory used by statistics. We
72 * fail if we exhaust 1/4 of all memory or 1/2 of the vmalloc space.
73 */
74#define DM_STATS_MEMORY_FACTOR		4
75#define DM_STATS_VMALLOC_FACTOR		2
76
77static DEFINE_SPINLOCK(shared_memory_lock);
78
79static unsigned long shared_memory_amount;
80
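/*
 * Memory accounting for statistics allocations.  __check_shared_memory()
 * tests whether allocating alloc_size more bytes would still fit within
 * the limits above (the caller holds shared_memory_lock);
 * claim_shared_memory() and free_shared_memory() adjust the running
 * total shared_memory_amount under the lock.
 */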
81static bool __check_shared_memory(size_t alloc_size)
82{
83	size_t a;
84
85	a = shared_memory_amount + alloc_size;
86	if (a < shared_memory_amount)
87		return false;
88	if (a >> PAGE_SHIFT > totalram_pages() / DM_STATS_MEMORY_FACTOR)
89		return false;
90#ifdef CONFIG_MMU
91	if (a > (VMALLOC_END - VMALLOC_START) / DM_STATS_VMALLOC_FACTOR)
92		return false;
93#endif
94	return true;
95}
96
97static bool check_shared_memory(size_t alloc_size)
98{
99	bool ret;
100
101	spin_lock_irq(&shared_memory_lock);
102
103	ret = __check_shared_memory(alloc_size);
104
105	spin_unlock_irq(&shared_memory_lock);
106
107	return ret;
108}
109
110static bool claim_shared_memory(size_t alloc_size)
111{
112	spin_lock_irq(&shared_memory_lock);
113
114	if (!__check_shared_memory(alloc_size)) {
115		spin_unlock_irq(&shared_memory_lock);
116		return false;
117	}
118
119	shared_memory_amount += alloc_size;
120
121	spin_unlock_irq(&shared_memory_lock);
122
123	return true;
124}
125
126static void free_shared_memory(size_t alloc_size)
127{
128	unsigned long flags;
129
130	spin_lock_irqsave(&shared_memory_lock, flags);
131
132	if (WARN_ON_ONCE(shared_memory_amount < alloc_size)) {
133		spin_unlock_irqrestore(&shared_memory_lock, flags);
134		DMCRIT("Memory usage accounting bug.");
135		return;
136	}
137
138	shared_memory_amount -= alloc_size;
139
140	spin_unlock_irqrestore(&shared_memory_lock, flags);
141}
142
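/*
 * Allocate zeroed memory (kmalloc or vmalloc) on the requested NUMA node,
 * but only if the size fits within the accounting limits; dm_kvfree()
 * releases both the memory and the accounted amount.
 */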
143static void *dm_kvzalloc(size_t alloc_size, int node)
144{
145	void *p;
146
147	if (!claim_shared_memory(alloc_size))
148		return NULL;
149
150	p = kvzalloc_node(alloc_size, GFP_KERNEL | __GFP_NOMEMALLOC, node);
151	if (p)
152		return p;
153
154	free_shared_memory(alloc_size);
155
156	return NULL;
157}
158
159static void dm_kvfree(void *ptr, size_t alloc_size)
160{
161	if (!ptr)
162		return;
163
164	free_shared_memory(alloc_size);
165
166	kvfree(ptr);
167}
168
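/*
 * RCU callback that releases everything owned by a region: the histogram
 * boundaries, the program_id and aux_data strings, the per-CPU counter
 * and histogram arrays, and finally the dm_stat structure itself.
 */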
169static void dm_stat_free(struct rcu_head *head)
170{
171	int cpu;
172	struct dm_stat *s = container_of(head, struct dm_stat, rcu_head);
173
174	kfree(s->histogram_boundaries);
175	kfree(s->program_id);
176	kfree(s->aux_data);
177	for_each_possible_cpu(cpu) {
178		dm_kvfree(s->stat_percpu[cpu][0].histogram, s->histogram_alloc_size);
179		dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size);
180	}
181	dm_kvfree(s->stat_shared[0].tmp.histogram, s->histogram_alloc_size);
182	dm_kvfree(s, s->shared_alloc_size);
183}
184
185static int dm_stat_in_flight(struct dm_stat_shared *shared)
186{
187	return atomic_read(&shared->in_flight[READ]) +
188	       atomic_read(&shared->in_flight[WRITE]);
189}
190
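/*
 * Initialize the dm_stats structure embedded in a mapped device: an empty
 * region list and the per-CPU "last position" state used for merge
 * detection.
 */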
191int dm_stats_init(struct dm_stats *stats)
192{
193	int cpu;
194	struct dm_stats_last_position *last;
195
196	mutex_init(&stats->mutex);
197	INIT_LIST_HEAD(&stats->list);
198	stats->last = alloc_percpu(struct dm_stats_last_position);
199	if (!stats->last)
200		return -ENOMEM;
201
202	for_each_possible_cpu(cpu) {
203		last = per_cpu_ptr(stats->last, cpu);
204		last->last_sector = (sector_t)ULLONG_MAX;
205		last->last_rw = UINT_MAX;
206	}
207
208	return 0;
209}
210
211void dm_stats_cleanup(struct dm_stats *stats)
212{
213	size_t ni;
214	struct dm_stat *s;
215	struct dm_stat_shared *shared;
216
217	while (!list_empty(&stats->list)) {
218		s = container_of(stats->list.next, struct dm_stat, list_entry);
219		list_del(&s->list_entry);
220		for (ni = 0; ni < s->n_entries; ni++) {
221			shared = &s->stat_shared[ni];
222			if (WARN_ON(dm_stat_in_flight(shared))) {
223				DMCRIT("leaked in-flight counter at index %lu "
224				       "(start %llu, end %llu, step %llu): reads %d, writes %d",
225				       (unsigned long)ni,
226				       (unsigned long long)s->start,
227				       (unsigned long long)s->end,
228				       (unsigned long long)s->step,
229				       atomic_read(&shared->in_flight[READ]),
230				       atomic_read(&shared->in_flight[WRITE]));
231			}
232			cond_resched();
233		}
234		dm_stat_free(&s->rcu_head);
235	}
236	free_percpu(stats->last);
237	mutex_destroy(&stats->mutex);
238}
239
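/*
 * Create a new statistics region covering [start, end) with areas of
 * "step" sectors.  All shared, per-CPU and histogram buffers are
 * allocated up front (and accounted), the lowest unused id is assigned
 * and the region is inserted into the id-sorted list.  The device is
 * suspended and resumed around the insertion so that the new counters
 * start out exact.  Returns the region id or a negative error code.
 */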
240static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
241			   sector_t step, unsigned stat_flags,
242			   unsigned n_histogram_entries,
243			   unsigned long long *histogram_boundaries,
244			   const char *program_id, const char *aux_data,
245			   void (*suspend_callback)(struct mapped_device *),
246			   void (*resume_callback)(struct mapped_device *),
247			   struct mapped_device *md)
248{
249	struct list_head *l;
250	struct dm_stat *s, *tmp_s;
251	sector_t n_entries;
252	size_t ni;
253	size_t shared_alloc_size;
254	size_t percpu_alloc_size;
255	size_t histogram_alloc_size;
256	struct dm_stat_percpu *p;
257	int cpu;
258	int ret_id;
259	int r;
260
261	if (end < start || !step)
262		return -EINVAL;
263
264	n_entries = end - start;
265	if (dm_sector_div64(n_entries, step))
266		n_entries++;
267
268	if (n_entries != (size_t)n_entries || !(size_t)(n_entries + 1))
269		return -EOVERFLOW;
270
271	shared_alloc_size = struct_size(s, stat_shared, n_entries);
272	if ((shared_alloc_size - sizeof(struct dm_stat)) / sizeof(struct dm_stat_shared) != n_entries)
273		return -EOVERFLOW;
274
275	percpu_alloc_size = (size_t)n_entries * sizeof(struct dm_stat_percpu);
276	if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries)
277		return -EOVERFLOW;
278
279	histogram_alloc_size = (n_histogram_entries + 1) * (size_t)n_entries * sizeof(unsigned long long);
280	if (histogram_alloc_size / (n_histogram_entries + 1) != (size_t)n_entries * sizeof(unsigned long long))
281		return -EOVERFLOW;
282
283	if (!check_shared_memory(shared_alloc_size + histogram_alloc_size +
284				 num_possible_cpus() * (percpu_alloc_size + histogram_alloc_size)))
285		return -ENOMEM;
286
287	s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE);
288	if (!s)
289		return -ENOMEM;
290
291	s->stat_flags = stat_flags;
292	s->n_entries = n_entries;
293	s->start = start;
294	s->end = end;
295	s->step = step;
296	s->shared_alloc_size = shared_alloc_size;
297	s->percpu_alloc_size = percpu_alloc_size;
298	s->histogram_alloc_size = histogram_alloc_size;
299
300	s->n_histogram_entries = n_histogram_entries;
301	s->histogram_boundaries = kmemdup(histogram_boundaries,
302					  s->n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
303	if (!s->histogram_boundaries) {
304		r = -ENOMEM;
305		goto out;
306	}
307
308	s->program_id = kstrdup(program_id, GFP_KERNEL);
309	if (!s->program_id) {
310		r = -ENOMEM;
311		goto out;
312	}
313	s->aux_data = kstrdup(aux_data, GFP_KERNEL);
314	if (!s->aux_data) {
315		r = -ENOMEM;
316		goto out;
317	}
318
319	for (ni = 0; ni < n_entries; ni++) {
320		atomic_set(&s->stat_shared[ni].in_flight[READ], 0);
321		atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0);
322		cond_resched();
323	}
324
325	if (s->n_histogram_entries) {
326		unsigned long long *hi;
327		hi = dm_kvzalloc(s->histogram_alloc_size, NUMA_NO_NODE);
328		if (!hi) {
329			r = -ENOMEM;
330			goto out;
331		}
332		for (ni = 0; ni < n_entries; ni++) {
333			s->stat_shared[ni].tmp.histogram = hi;
334			hi += s->n_histogram_entries + 1;
335			cond_resched();
336		}
337	}
338
339	for_each_possible_cpu(cpu) {
340		p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu));
341		if (!p) {
342			r = -ENOMEM;
343			goto out;
344		}
345		s->stat_percpu[cpu] = p;
346		if (s->n_histogram_entries) {
347			unsigned long long *hi;
348			hi = dm_kvzalloc(s->histogram_alloc_size, cpu_to_node(cpu));
349			if (!hi) {
350				r = -ENOMEM;
351				goto out;
352			}
353			for (ni = 0; ni < n_entries; ni++) {
354				p[ni].histogram = hi;
355				hi += s->n_histogram_entries + 1;
356				cond_resched();
357			}
358		}
359	}
360
361	/*
362	 * Suspend/resume to make sure there is no i/o in flight,
363	 * so that newly created statistics will be exact.
364	 *
365	 * (note: we couldn't suspend earlier because we must not
366	 * allocate memory while suspended)
367	 */
368	suspend_callback(md);
369
370	mutex_lock(&stats->mutex);
371	s->id = 0;
372	list_for_each(l, &stats->list) {
373		tmp_s = container_of(l, struct dm_stat, list_entry);
374		if (WARN_ON(tmp_s->id < s->id)) {
375			r = -EINVAL;
376			goto out_unlock_resume;
377		}
378		if (tmp_s->id > s->id)
379			break;
380		if (unlikely(s->id == INT_MAX)) {
381			r = -ENFILE;
382			goto out_unlock_resume;
383		}
384		s->id++;
385	}
386	ret_id = s->id;
387	list_add_tail_rcu(&s->list_entry, l);
388	mutex_unlock(&stats->mutex);
389
390	resume_callback(md);
391
392	return ret_id;
393
394out_unlock_resume:
395	mutex_unlock(&stats->mutex);
396	resume_callback(md);
397out:
398	dm_stat_free(&s->rcu_head);
399	return r;
400}
401
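/*
 * Find a region by id.  The list is kept sorted by id, so the search can
 * stop early.  The caller must hold stats->mutex.
 */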
402static struct dm_stat *__dm_stats_find(struct dm_stats *stats, int id)
403{
404	struct dm_stat *s;
405
406	list_for_each_entry(s, &stats->list, list_entry) {
407		if (s->id > id)
408			break;
409		if (s->id == id)
410			return s;
411	}
412
413	return NULL;
414}
415
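/*
 * Unlink a region under the mutex and free it.  If any of its buffers
 * were vmalloc'ed, the free must happen synchronously after
 * synchronize_rcu_expedited() because vfree() may not be called from an
 * RCU callback; otherwise the free is deferred with call_rcu().
 */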
416static int dm_stats_delete(struct dm_stats *stats, int id)
417{
418	struct dm_stat *s;
419	int cpu;
420
421	mutex_lock(&stats->mutex);
422
423	s = __dm_stats_find(stats, id);
424	if (!s) {
425		mutex_unlock(&stats->mutex);
426		return -ENOENT;
427	}
428
429	list_del_rcu(&s->list_entry);
430	mutex_unlock(&stats->mutex);
431
432	/*
433	 * vfree can't be called from an RCU callback
434	 */
435	for_each_possible_cpu(cpu)
436		if (is_vmalloc_addr(s->stat_percpu) ||
437		    is_vmalloc_addr(s->stat_percpu[cpu][0].histogram))
438			goto do_sync_free;
439	if (is_vmalloc_addr(s) ||
440	    is_vmalloc_addr(s->stat_shared[0].tmp.histogram)) {
441do_sync_free:
442		synchronize_rcu_expedited();
443		dm_stat_free(&s->rcu_head);
444	} else {
445		WRITE_ONCE(dm_stat_need_rcu_barrier, 1);
446		call_rcu(&s->rcu_head, dm_stat_free);
447	}
448	return 0;
449}
450
451static int dm_stats_list(struct dm_stats *stats, const char *program,
452			 char *result, unsigned maxlen)
453{
454	struct dm_stat *s;
455	sector_t len;
456	unsigned sz = 0;
457
458	/*
459	 * Output format:
460	 *   <region_id>: <start_sector>+<length> <step> <program_id> <aux_data>
	 *	[precise_timestamps] [histogram:boundary1,...,boundaryN]
461	 */
462
463	mutex_lock(&stats->mutex);
464	list_for_each_entry(s, &stats->list, list_entry) {
465		if (!program || !strcmp(program, s->program_id)) {
466			len = s->end - s->start;
467			DMEMIT("%d: %llu+%llu %llu %s %s", s->id,
468				(unsigned long long)s->start,
469				(unsigned long long)len,
470				(unsigned long long)s->step,
471				s->program_id,
472				s->aux_data);
473			if (s->stat_flags & STAT_PRECISE_TIMESTAMPS)
474				DMEMIT(" precise_timestamps");
475			if (s->n_histogram_entries) {
476				unsigned i;
477				DMEMIT(" histogram:");
478				for (i = 0; i < s->n_histogram_entries; i++) {
479					if (i)
480						DMEMIT(",");
481					DMEMIT("%llu", s->histogram_boundaries[i]);
482				}
483			}
484			DMEMIT("\n");
485		}
486		cond_resched();
487	}
488	mutex_unlock(&stats->mutex);
489
490	return 1;
491}
492
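/*
 * Charge the time elapsed since shared->stamp to the io_ticks,
 * io_ticks_total and time_in_queue counters according to the number of
 * reads and writes currently in flight, then update the stamp.
 */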
493static void dm_stat_round(struct dm_stat *s, struct dm_stat_shared *shared,
494			  struct dm_stat_percpu *p)
495{
496	/*
497	 * This is racy, but so is part_round_stats_single.
498	 */
499	unsigned long long now, difference;
500	unsigned in_flight_read, in_flight_write;
501
502	if (likely(!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)))
503		now = jiffies;
504	else
505		now = ktime_to_ns(ktime_get());
506
507	difference = now - shared->stamp;
508	if (!difference)
509		return;
510
511	in_flight_read = (unsigned)atomic_read(&shared->in_flight[READ]);
512	in_flight_write = (unsigned)atomic_read(&shared->in_flight[WRITE]);
513	if (in_flight_read)
514		p->io_ticks[READ] += difference;
515	if (in_flight_write)
516		p->io_ticks[WRITE] += difference;
517	if (in_flight_read + in_flight_write) {
518		p->io_ticks_total += difference;
519		p->time_in_queue += (in_flight_read + in_flight_write) * difference;
520	}
521	shared->stamp = now;
522}
523
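/*
 * Update one area's counters for a bio fragment of "len" sectors.  At
 * submission (end == false) only the in-flight counter is incremented;
 * at completion the sector, io, merge and tick counters and the optional
 * latency histogram bucket are updated as well.
 */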
524static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
525			      int idx, sector_t len,
526			      struct dm_stats_aux *stats_aux, bool end,
527			      unsigned long duration_jiffies)
528{
529	struct dm_stat_shared *shared = &s->stat_shared[entry];
530	struct dm_stat_percpu *p;
531
532	/*
533	 * For strict correctness we should use local_irq_save/restore
534	 * instead of preempt_disable/enable.
535	 *
536	 * preempt_disable/enable is racy if the driver finishes bios
537	 * from non-interrupt context as well as from interrupt context
538	 * or from several different interrupt contexts.
539	 *
540	 * On 64-bit architectures the race only results in not counting some
541	 * events, so it is acceptable.  On 32-bit architectures the race could
542	 * cause a counter to be off by 2^32, so we need to do proper locking
543	 * there.
544	 *
545	 * part_stat_lock()/part_stat_unlock() have this race too.
546	 */
547#if BITS_PER_LONG == 32
548	unsigned long flags;
549	local_irq_save(flags);
550#else
551	preempt_disable();
552#endif
553	p = &s->stat_percpu[smp_processor_id()][entry];
554
555	if (!end) {
556		dm_stat_round(s, shared, p);
557		atomic_inc(&shared->in_flight[idx]);
558	} else {
559		unsigned long long duration;
560		dm_stat_round(s, shared, p);
561		atomic_dec(&shared->in_flight[idx]);
562		p->sectors[idx] += len;
563		p->ios[idx] += 1;
564		p->merges[idx] += stats_aux->merged;
565		if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)) {
566			p->ticks[idx] += duration_jiffies;
567			duration = jiffies_to_msecs(duration_jiffies);
568		} else {
569			p->ticks[idx] += stats_aux->duration_ns;
570			duration = stats_aux->duration_ns;
571		}
572		if (s->n_histogram_entries) {
573			unsigned lo = 0, hi = s->n_histogram_entries + 1;
574			while (lo + 1 < hi) {
575				unsigned mid = (lo + hi) / 2;
576				if (s->histogram_boundaries[mid - 1] > duration) {
577					hi = mid;
578				} else {
579					lo = mid;
580				}
582			}
583			p->histogram[lo]++;
584		}
585	}
586
587#if BITS_PER_LONG == 32
588	local_irq_restore(flags);
589#else
590	preempt_enable();
591#endif
592}
593
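/*
 * Account a bio against a single region: clip it to the region
 * boundaries and walk it area by area, calling dm_stat_for_entry() for
 * every fragment.
 */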
594static void __dm_stat_bio(struct dm_stat *s, int bi_rw,
595			  sector_t bi_sector, sector_t end_sector,
596			  bool end, unsigned long duration_jiffies,
597			  struct dm_stats_aux *stats_aux)
598{
599	sector_t rel_sector, offset, todo, fragment_len;
600	size_t entry;
601
602	if (end_sector <= s->start || bi_sector >= s->end)
603		return;
604	if (unlikely(bi_sector < s->start)) {
605		rel_sector = 0;
606		todo = end_sector - s->start;
607	} else {
608		rel_sector = bi_sector - s->start;
609		todo = end_sector - bi_sector;
610	}
611	if (unlikely(end_sector > s->end))
612		todo -= (end_sector - s->end);
613
614	offset = dm_sector_div64(rel_sector, s->step);
615	entry = rel_sector;
616	do {
617		if (WARN_ON_ONCE(entry >= s->n_entries)) {
618			DMCRIT("Invalid area access in region id %d", s->id);
619			return;
620		}
621		fragment_len = todo;
622		if (fragment_len > s->step - offset)
623			fragment_len = s->step - offset;
624		dm_stat_for_entry(s, entry, bi_rw, fragment_len,
625				  stats_aux, end, duration_jiffies);
626		todo -= fragment_len;
627		entry++;
628		offset = 0;
629	} while (unlikely(todo != 0));
630}
631
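/*
 * Accounting hook called at bio submission (end == false) and completion
 * (end == true).  On submission it also remembers where the bio ends so
 * that the next bio starting at that sector in the same direction can be
 * flagged as merged.
 */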
632void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
633			 sector_t bi_sector, unsigned bi_sectors, bool end,
634			 unsigned long duration_jiffies,
635			 struct dm_stats_aux *stats_aux)
636{
637	struct dm_stat *s;
638	sector_t end_sector;
639	struct dm_stats_last_position *last;
640	bool got_precise_time;
641
642	if (unlikely(!bi_sectors))
643		return;
644
645	end_sector = bi_sector + bi_sectors;
646
647	if (!end) {
648		/*
649		 * A race condition can at worst result in the merged flag being
650		 * misrepresented, so we don't have to disable preemption here.
651		 */
652		last = raw_cpu_ptr(stats->last);
653		stats_aux->merged =
654			(bi_sector == READ_ONCE(last->last_sector) &&
655			 ((bi_rw == WRITE) ==
656			  (READ_ONCE(last->last_rw) == WRITE)));
658		WRITE_ONCE(last->last_sector, end_sector);
659		WRITE_ONCE(last->last_rw, bi_rw);
660	}
661
662	rcu_read_lock();
663
664	got_precise_time = false;
665	list_for_each_entry_rcu(s, &stats->list, list_entry) {
666		if (s->stat_flags & STAT_PRECISE_TIMESTAMPS && !got_precise_time) {
667			if (!end)
668				stats_aux->duration_ns = ktime_to_ns(ktime_get());
669			else
670				stats_aux->duration_ns = ktime_to_ns(ktime_get()) - stats_aux->duration_ns;
671			got_precise_time = true;
672		}
673		__dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration_jiffies, stats_aux);
674	}
675
676	rcu_read_unlock();
677}
678
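/*
 * Sum the per-CPU counters of one area into shared->tmp so that they can
 * be printed or cleared; the local CPU's counters are rounded first with
 * interrupts disabled.
 */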
679static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared,
680						   struct dm_stat *s, size_t x)
681{
682	int cpu;
683	struct dm_stat_percpu *p;
684
685	local_irq_disable();
686	p = &s->stat_percpu[smp_processor_id()][x];
687	dm_stat_round(s, shared, p);
688	local_irq_enable();
689
690	shared->tmp.sectors[READ] = 0;
691	shared->tmp.sectors[WRITE] = 0;
692	shared->tmp.ios[READ] = 0;
693	shared->tmp.ios[WRITE] = 0;
694	shared->tmp.merges[READ] = 0;
695	shared->tmp.merges[WRITE] = 0;
696	shared->tmp.ticks[READ] = 0;
697	shared->tmp.ticks[WRITE] = 0;
698	shared->tmp.io_ticks[READ] = 0;
699	shared->tmp.io_ticks[WRITE] = 0;
700	shared->tmp.io_ticks_total = 0;
701	shared->tmp.time_in_queue = 0;
702
703	if (s->n_histogram_entries)
704		memset(shared->tmp.histogram, 0, (s->n_histogram_entries + 1) * sizeof(unsigned long long));
705
706	for_each_possible_cpu(cpu) {
707		p = &s->stat_percpu[cpu][x];
708		shared->tmp.sectors[READ] += READ_ONCE(p->sectors[READ]);
709		shared->tmp.sectors[WRITE] += READ_ONCE(p->sectors[WRITE]);
710		shared->tmp.ios[READ] += READ_ONCE(p->ios[READ]);
711		shared->tmp.ios[WRITE] += READ_ONCE(p->ios[WRITE]);
712		shared->tmp.merges[READ] += READ_ONCE(p->merges[READ]);
713		shared->tmp.merges[WRITE] += READ_ONCE(p->merges[WRITE]);
714		shared->tmp.ticks[READ] += READ_ONCE(p->ticks[READ]);
715		shared->tmp.ticks[WRITE] += READ_ONCE(p->ticks[WRITE]);
716		shared->tmp.io_ticks[READ] += READ_ONCE(p->io_ticks[READ]);
717		shared->tmp.io_ticks[WRITE] += READ_ONCE(p->io_ticks[WRITE]);
718		shared->tmp.io_ticks_total += READ_ONCE(p->io_ticks_total);
719		shared->tmp.time_in_queue += READ_ONCE(p->time_in_queue);
720		if (s->n_histogram_entries) {
721			unsigned i;
722			for (i = 0; i < s->n_histogram_entries + 1; i++)
723				shared->tmp.histogram[i] += READ_ONCE(p->histogram[i]);
724		}
725	}
726}
727
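/*
 * Clear areas [idx_start, idx_end) by subtracting the totals snapshotted
 * in shared->tmp from the local CPU's counters, so that subsequent sums
 * over all CPUs start again from zero.
 */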
728static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end,
729			    bool init_tmp_percpu_totals)
730{
731	size_t x;
732	struct dm_stat_shared *shared;
733	struct dm_stat_percpu *p;
734
735	for (x = idx_start; x < idx_end; x++) {
736		shared = &s->stat_shared[x];
737		if (init_tmp_percpu_totals)
738			__dm_stat_init_temporary_percpu_totals(shared, s, x);
739		local_irq_disable();
740		p = &s->stat_percpu[smp_processor_id()][x];
741		p->sectors[READ] -= shared->tmp.sectors[READ];
742		p->sectors[WRITE] -= shared->tmp.sectors[WRITE];
743		p->ios[READ] -= shared->tmp.ios[READ];
744		p->ios[WRITE] -= shared->tmp.ios[WRITE];
745		p->merges[READ] -= shared->tmp.merges[READ];
746		p->merges[WRITE] -= shared->tmp.merges[WRITE];
747		p->ticks[READ] -= shared->tmp.ticks[READ];
748		p->ticks[WRITE] -= shared->tmp.ticks[WRITE];
749		p->io_ticks[READ] -= shared->tmp.io_ticks[READ];
750		p->io_ticks[WRITE] -= shared->tmp.io_ticks[WRITE];
751		p->io_ticks_total -= shared->tmp.io_ticks_total;
752		p->time_in_queue -= shared->tmp.time_in_queue;
753		local_irq_enable();
754		if (s->n_histogram_entries) {
755			unsigned i;
756			for (i = 0; i < s->n_histogram_entries + 1; i++) {
757				local_irq_disable();
758				p = &s->stat_percpu[smp_processor_id()][x];
759				p->histogram[i] -= shared->tmp.histogram[i];
760				local_irq_enable();
761			}
762		}
763		cond_resched();
764	}
765}
766
767static int dm_stats_clear(struct dm_stats *stats, int id)
768{
769	struct dm_stat *s;
770
771	mutex_lock(&stats->mutex);
772
773	s = __dm_stats_find(stats, id);
774	if (!s) {
775		mutex_unlock(&stats->mutex);
776		return -ENOENT;
777	}
778
779	__dm_stat_clear(s, 0, s->n_entries, true);
780
781	mutex_unlock(&stats->mutex);
782
783	return 1;
784}
785
786/*
787 * This is like jiffies_to_msecs(), but works for 64-bit values.
788 */
789static unsigned long long dm_jiffies_to_msec64(struct dm_stat *s, unsigned long long j)
790{
791	unsigned long long result;
792	unsigned mult;
793
794	if (s->stat_flags & STAT_PRECISE_TIMESTAMPS)
795		return j;
796
797	result = 0;
798	if (j)
799		result = jiffies_to_msecs(j & 0x3fffff);
800	if (j >= 1 << 22) {
801		mult = jiffies_to_msecs(1 << 22);
802		result += (unsigned long long)mult * (unsigned long long)jiffies_to_msecs((j >> 22) & 0x3fffff);
803	}
804	if (j >= 1ULL << 44)
805		result += (unsigned long long)mult * (unsigned long long)mult * (unsigned long long)jiffies_to_msecs(j >> 44);
806
807	return result;
808}
809
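/*
 * Print the counters of areas [idx_start, idx_start + idx_len).  Each
 * line starts with "<start_sector>+<length>" followed by: read ios,
 * read merges, sectors read, read ticks, write ios, write merges,
 * sectors written, write ticks, ios in flight, total busy time, weighted
 * time in queue, read busy time, write busy time and, if configured, the
 * histogram bucket counts.  Times are reported in milliseconds unless
 * precise_timestamps is set, in which case they are in nanoseconds.
 */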
810static int dm_stats_print(struct dm_stats *stats, int id,
811			  size_t idx_start, size_t idx_len,
812			  bool clear, char *result, unsigned maxlen)
813{
814	unsigned sz = 0;
815	struct dm_stat *s;
816	size_t x;
817	sector_t start, end, step;
818	size_t idx_end;
819	struct dm_stat_shared *shared;
820
821	/*
822	 * Output format:
823	 *   <start_sector>+<length> counters
824	 */
825
826	mutex_lock(&stats->mutex);
827
828	s = __dm_stats_find(stats, id);
829	if (!s) {
830		mutex_unlock(&stats->mutex);
831		return -ENOENT;
832	}
833
834	idx_end = idx_start + idx_len;
835	if (idx_end < idx_start ||
836	    idx_end > s->n_entries)
837		idx_end = s->n_entries;
838
839	if (idx_start > idx_end)
840		idx_start = idx_end;
841
842	step = s->step;
843	start = s->start + (step * idx_start);
844
845	for (x = idx_start; x < idx_end; x++, start = end) {
846		shared = &s->stat_shared[x];
847		end = start + step;
848		if (unlikely(end > s->end))
849			end = s->end;
850
851		__dm_stat_init_temporary_percpu_totals(shared, s, x);
852
853		DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu",
854		       (unsigned long long)start,
855		       (unsigned long long)step,
856		       shared->tmp.ios[READ],
857		       shared->tmp.merges[READ],
858		       shared->tmp.sectors[READ],
859		       dm_jiffies_to_msec64(s, shared->tmp.ticks[READ]),
860		       shared->tmp.ios[WRITE],
861		       shared->tmp.merges[WRITE],
862		       shared->tmp.sectors[WRITE],
863		       dm_jiffies_to_msec64(s, shared->tmp.ticks[WRITE]),
864		       dm_stat_in_flight(shared),
865		       dm_jiffies_to_msec64(s, shared->tmp.io_ticks_total),
866		       dm_jiffies_to_msec64(s, shared->tmp.time_in_queue),
867		       dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]),
868		       dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE]));
869		if (s->n_histogram_entries) {
870			unsigned i;
871			for (i = 0; i < s->n_histogram_entries + 1; i++) {
872				DMEMIT("%s%llu", !i ? " " : ":", shared->tmp.histogram[i]);
873			}
874		}
875		DMEMIT("\n");
876
877		if (unlikely(sz + 1 >= maxlen))
878			goto buffer_overflow;
879
880		cond_resched();
881	}
882
883	if (clear)
884		__dm_stat_clear(s, idx_start, idx_end, false);
885
886buffer_overflow:
887	mutex_unlock(&stats->mutex);
888
889	return 1;
890}
891
892static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data)
893{
894	struct dm_stat *s;
895	const char *new_aux_data;
896
897	mutex_lock(&stats->mutex);
898
899	s = __dm_stats_find(stats, id);
900	if (!s) {
901		mutex_unlock(&stats->mutex);
902		return -ENOENT;
903	}
904
905	new_aux_data = kstrdup(aux_data, GFP_KERNEL);
906	if (!new_aux_data) {
907		mutex_unlock(&stats->mutex);
908		return -ENOMEM;
909	}
910
911	kfree(s->aux_data);
912	s->aux_data = new_aux_data;
913
914	mutex_unlock(&stats->mutex);
915
916	return 0;
917}
918
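/*
 * Parse the boundary list of a "histogram:" argument: a comma-separated,
 * strictly increasing sequence of time values.  On success the number of
 * boundaries and a kmalloc'ed array holding them are returned via the
 * output parameters.
 */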
919static int parse_histogram(const char *h, unsigned *n_histogram_entries,
920			   unsigned long long **histogram_boundaries)
921{
922	const char *q;
923	unsigned n;
924	unsigned long long last;
925
926	*n_histogram_entries = 1;
927	for (q = h; *q; q++)
928		if (*q == ',')
929			(*n_histogram_entries)++;
930
931	*histogram_boundaries = kmalloc_array(*n_histogram_entries,
932					      sizeof(unsigned long long),
933					      GFP_KERNEL);
934	if (!*histogram_boundaries)
935		return -ENOMEM;
936
937	n = 0;
938	last = 0;
939	while (1) {
940		unsigned long long hi;
941		int s;
942		char ch;
943		s = sscanf(h, "%llu%c", &hi, &ch);
944		if (!s || (s == 2 && ch != ','))
945			return -EINVAL;
946		if (hi <= last)
947			return -EINVAL;
948		last = hi;
949		(*histogram_boundaries)[n] = hi;
950		if (s == 1)
951			return 0;
952		h = strchr(h, ',') + 1;
953		n++;
954	}
955}
956
957static int message_stats_create(struct mapped_device *md,
958				unsigned argc, char **argv,
959				char *result, unsigned maxlen)
960{
961	int r;
962	int id;
963	char dummy;
964	unsigned long long start, end, len, step;
965	unsigned divisor;
966	const char *program_id, *aux_data;
967	unsigned stat_flags = 0;
968
969	unsigned n_histogram_entries = 0;
970	unsigned long long *histogram_boundaries = NULL;
971
972	struct dm_arg_set as, as_backup;
973	const char *a;
974	unsigned feature_args;
975
976	/*
977	 * Input format:
978	 *   <range> <step> [<extra_parameters> <parameters>] [<program_id> [<aux_data>]]
979	 */
980
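	/*
	 * For example (illustrative), sent with "dmsetup message <dev> 0 ...":
	 *   @stats_create - /100
	 * creates a region covering the whole device, split into 100 areas
	 * of roughly equal size, while
	 *   @stats_create 0+1024 256
	 * covers sectors 0..1023 with 256-sector areas.
	 */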
981	if (argc < 3)
982		goto ret_einval;
983
984	as.argc = argc;
985	as.argv = argv;
986	dm_consume_args(&as, 1);
987
988	a = dm_shift_arg(&as);
989	if (!strcmp(a, "-")) {
990		start = 0;
991		len = dm_get_size(md);
992		if (!len)
993			len = 1;
994	} else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 ||
995		   start != (sector_t)start || len != (sector_t)len)
996		goto ret_einval;
997
998	end = start + len;
999	if (start >= end)
1000		goto ret_einval;
1001
1002	a = dm_shift_arg(&as);
1003	if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) {
1004		if (!divisor)
1005			return -EINVAL;
1006		step = end - start;
1007		if (do_div(step, divisor))
1008			step++;
1009		if (!step)
1010			step = 1;
1011	} else if (sscanf(a, "%llu%c", &step, &dummy) != 1 ||
1012		   step != (sector_t)step || !step)
1013		goto ret_einval;
1014
1015	as_backup = as;
1016	a = dm_shift_arg(&as);
1017	if (a && sscanf(a, "%u%c", &feature_args, &dummy) == 1) {
1018		while (feature_args--) {
1019			a = dm_shift_arg(&as);
1020			if (!a)
1021				goto ret_einval;
1022			if (!strcasecmp(a, "precise_timestamps"))
1023				stat_flags |= STAT_PRECISE_TIMESTAMPS;
1024			else if (!strncasecmp(a, "histogram:", 10)) {
1025				if (n_histogram_entries)
1026					goto ret_einval;
1027				if ((r = parse_histogram(a + 10, &n_histogram_entries, &histogram_boundaries)))
1028					goto ret;
1029			} else
1030				goto ret_einval;
1031		}
1032	} else {
1033		as = as_backup;
1034	}
1035
1036	program_id = "-";
1037	aux_data = "-";
1038
1039	a = dm_shift_arg(&as);
1040	if (a)
1041		program_id = a;
1042
1043	a = dm_shift_arg(&as);
1044	if (a)
1045		aux_data = a;
1046
1047	if (as.argc)
1048		goto ret_einval;
1049
1050	/*
1051	 * If a buffer overflow happens after we created the region,
1052	 * it's too late (userspace would retry with a larger
1053	 * buffer, but the region id that caused the overflow is already
1054	 * leaked).  So we must detect buffer overflow in advance.
1055	 */
1056	snprintf(result, maxlen, "%d", INT_MAX);
1057	if (dm_message_test_buffer_overflow(result, maxlen)) {
1058		r = 1;
1059		goto ret;
1060	}
1061
1062	id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags,
1063			     n_histogram_entries, histogram_boundaries, program_id, aux_data,
1064			     dm_internal_suspend_fast, dm_internal_resume_fast, md);
1065	if (id < 0) {
1066		r = id;
1067		goto ret;
1068	}
1069
1070	snprintf(result, maxlen, "%d", id);
1071
1072	r = 1;
1073	goto ret;
1074
1075ret_einval:
1076	r = -EINVAL;
1077ret:
1078	kfree(histogram_boundaries);
1079	return r;
1080}
1081
1082static int message_stats_delete(struct mapped_device *md,
1083				unsigned argc, char **argv)
1084{
1085	int id;
1086	char dummy;
1087
1088	if (argc != 2)
1089		return -EINVAL;
1090
1091	if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
1092		return -EINVAL;
1093
1094	return dm_stats_delete(dm_get_stats(md), id);
1095}
1096
1097static int message_stats_clear(struct mapped_device *md,
1098			       unsigned argc, char **argv)
1099{
1100	int id;
1101	char dummy;
1102
1103	if (argc != 2)
1104		return -EINVAL;
1105
1106	if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
1107		return -EINVAL;
1108
1109	return dm_stats_clear(dm_get_stats(md), id);
1110}
1111
1112static int message_stats_list(struct mapped_device *md,
1113			      unsigned argc, char **argv,
1114			      char *result, unsigned maxlen)
1115{
1116	int r;
1117	const char *program = NULL;
1118
1119	if (argc < 1 || argc > 2)
1120		return -EINVAL;
1121
1122	if (argc > 1) {
1123		program = kstrdup(argv[1], GFP_KERNEL);
1124		if (!program)
1125			return -ENOMEM;
1126	}
1127
1128	r = dm_stats_list(dm_get_stats(md), program, result, maxlen);
1129
1130	kfree(program);
1131
1132	return r;
1133}
1134
1135static int message_stats_print(struct mapped_device *md,
1136			       unsigned argc, char **argv, bool clear,
1137			       char *result, unsigned maxlen)
1138{
1139	int id;
1140	char dummy;
1141	unsigned long idx_start = 0, idx_len = ULONG_MAX;
1142
1143	if (argc != 2 && argc != 4)
1144		return -EINVAL;
1145
1146	if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
1147		return -EINVAL;
1148
1149	if (argc > 3) {
1150		if (strcmp(argv[2], "-") &&
1151		    sscanf(argv[2], "%lu%c", &idx_start, &dummy) != 1)
1152			return -EINVAL;
1153		if (strcmp(argv[3], "-") &&
1154		    sscanf(argv[3], "%lu%c", &idx_len, &dummy) != 1)
1155			return -EINVAL;
1156	}
1157
1158	return dm_stats_print(dm_get_stats(md), id, idx_start, idx_len, clear,
1159			      result, maxlen);
1160}
1161
1162static int message_stats_set_aux(struct mapped_device *md,
1163				 unsigned argc, char **argv)
1164{
1165	int id;
1166	char dummy;
1167
1168	if (argc != 3)
1169		return -EINVAL;
1170
1171	if (sscanf(argv[1], "%d%c", &id, &dummy) != 1 || id < 0)
1172		return -EINVAL;
1173
1174	return dm_stats_set_aux(dm_get_stats(md), id, argv[2]);
1175}
1176
1177int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
1178		     char *result, unsigned maxlen)
1179{
1180	int r;
1181
1182	/* All messages here must start with '@' */
1183	if (!strcasecmp(argv[0], "@stats_create"))
1184		r = message_stats_create(md, argc, argv, result, maxlen);
1185	else if (!strcasecmp(argv[0], "@stats_delete"))
1186		r = message_stats_delete(md, argc, argv);
1187	else if (!strcasecmp(argv[0], "@stats_clear"))
1188		r = message_stats_clear(md, argc, argv);
1189	else if (!strcasecmp(argv[0], "@stats_list"))
1190		r = message_stats_list(md, argc, argv, result, maxlen);
1191	else if (!strcasecmp(argv[0], "@stats_print"))
1192		r = message_stats_print(md, argc, argv, false, result, maxlen);
1193	else if (!strcasecmp(argv[0], "@stats_print_clear"))
1194		r = message_stats_print(md, argc, argv, true, result, maxlen);
1195	else if (!strcasecmp(argv[0], "@stats_set_aux"))
1196		r = message_stats_set_aux(md, argc, argv);
1197	else
1198		return 2; /* this wasn't a stats message */
1199
1200	if (r == -EINVAL)
1201		DMWARN("Invalid parameters for message %s", argv[0]);
1202
1203	return r;
1204}
1205
1206int __init dm_statistics_init(void)
1207{
1208	shared_memory_amount = 0;
1209	dm_stat_need_rcu_barrier = 0;
1210	return 0;
1211}
1212
1213void dm_statistics_exit(void)
1214{
1215	if (dm_stat_need_rcu_barrier)
1216		rcu_barrier();
1217	if (WARN_ON(shared_memory_amount))
1218		DMCRIT("shared_memory_amount leaked: %lu", shared_memory_amount);
1219}
1220
1221module_param_named(stats_current_allocated_bytes, shared_memory_amount, ulong, S_IRUGO);
1222MODULE_PARM_DESC(stats_current_allocated_bytes, "Memory currently used by statistics");
1223