/*
 * Copyright (C) 2015 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm-cache-background-tracker.h"
#include "dm-cache-policy-internal.h"
#include "dm-cache-policy.h"
#include "dm.h"

#include <linux/hash.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/math64.h>

#define DM_MSG_PREFIX "cache-policy-smq"

/*----------------------------------------------------------------*/

/*
 * Safe division functions that return zero on divide by zero.
 */
static unsigned safe_div(unsigned n, unsigned d)
{
	return d ? n / d : 0u;
}

static unsigned safe_mod(unsigned n, unsigned d)
{
	return d ? n % d : 0u;
}

/*----------------------------------------------------------------*/

struct entry {
	unsigned hash_next:28;
	unsigned prev:28;
	unsigned next:28;
	unsigned level:6;
	bool dirty:1;
	bool allocated:1;
	bool sentinel:1;
	bool pending_work:1;

	dm_oblock_t oblock;
};
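
/*
 * Entries refer to each other with 28 bit indices into the entry_space
 * rather than pointers, which keeps struct entry small.  The all-ones
 * index (INDEXER_NULL, defined below) plays the role of NULL, so an
 * entry_space is limited to just under 2^28 entries.
 */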

/*----------------------------------------------------------------*/

#define INDEXER_NULL ((1u << 28u) - 1u)

/*
 * An entry_space manages a set of entries that we use for the queues.
 * The clean and dirty queues share entries, so this object is separate
 * from the queue itself.
 */
struct entry_space {
	struct entry *begin;
	struct entry *end;
};

static int space_init(struct entry_space *es, unsigned nr_entries)
{
	if (!nr_entries) {
		es->begin = es->end = NULL;
		return 0;
	}

	es->begin = vzalloc(array_size(nr_entries, sizeof(struct entry)));
	if (!es->begin)
		return -ENOMEM;

	es->end = es->begin + nr_entries;
	return 0;
}

static void space_exit(struct entry_space *es)
{
	vfree(es->begin);
}

static struct entry *__get_entry(struct entry_space *es, unsigned block)
{
	struct entry *e;

	e = es->begin + block;
	BUG_ON(e >= es->end);

	return e;
}

static unsigned to_index(struct entry_space *es, struct entry *e)
{
	BUG_ON(e < es->begin || e >= es->end);
	return e - es->begin;
}

static struct entry *to_entry(struct entry_space *es, unsigned block)
{
	if (block == INDEXER_NULL)
		return NULL;

	return __get_entry(es, block);
}

/*----------------------------------------------------------------*/

struct ilist {
	unsigned nr_elts;	/* excluding sentinel entries */
	unsigned head, tail;
};
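
/*
 * An ilist is a doubly linked list threaded through an entry_space via
 * the prev/next indices of the entries themselves.  Sentinel entries may
 * be linked in, but they are never counted in nr_elts.
 */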

static void l_init(struct ilist *l)
{
	l->nr_elts = 0;
	l->head = l->tail = INDEXER_NULL;
}

static struct entry *l_head(struct entry_space *es, struct ilist *l)
{
	return to_entry(es, l->head);
}

static struct entry *l_tail(struct entry_space *es, struct ilist *l)
{
	return to_entry(es, l->tail);
}

static struct entry *l_next(struct entry_space *es, struct entry *e)
{
	return to_entry(es, e->next);
}

static struct entry *l_prev(struct entry_space *es, struct entry *e)
{
	return to_entry(es, e->prev);
}

static bool l_empty(struct ilist *l)
{
	return l->head == INDEXER_NULL;
}

static void l_add_head(struct entry_space *es, struct ilist *l, struct entry *e)
{
	struct entry *head = l_head(es, l);

	e->next = l->head;
	e->prev = INDEXER_NULL;

	if (head)
		head->prev = l->head = to_index(es, e);
	else
		l->head = l->tail = to_index(es, e);

	if (!e->sentinel)
		l->nr_elts++;
}

static void l_add_tail(struct entry_space *es, struct ilist *l, struct entry *e)
{
	struct entry *tail = l_tail(es, l);

	e->next = INDEXER_NULL;
	e->prev = l->tail;

	if (tail)
		tail->next = l->tail = to_index(es, e);
	else
		l->head = l->tail = to_index(es, e);

	if (!e->sentinel)
		l->nr_elts++;
}

static void l_add_before(struct entry_space *es, struct ilist *l,
			 struct entry *old, struct entry *e)
{
	struct entry *prev = l_prev(es, old);

	if (!prev)
		l_add_head(es, l, e);

	else {
		e->prev = old->prev;
		e->next = to_index(es, old);
		prev->next = old->prev = to_index(es, e);

		if (!e->sentinel)
			l->nr_elts++;
	}
}

static void l_del(struct entry_space *es, struct ilist *l, struct entry *e)
{
	struct entry *prev = l_prev(es, e);
	struct entry *next = l_next(es, e);

	if (prev)
		prev->next = e->next;
	else
		l->head = e->next;

	if (next)
		next->prev = e->prev;
	else
		l->tail = e->prev;

	if (!e->sentinel)
		l->nr_elts--;
}

static struct entry *l_pop_head(struct entry_space *es, struct ilist *l)
{
	struct entry *e;

	for (e = l_head(es, l); e; e = l_next(es, e))
		if (!e->sentinel) {
			l_del(es, l, e);
			return e;
		}

	return NULL;
}

static struct entry *l_pop_tail(struct entry_space *es, struct ilist *l)
{
	struct entry *e;

	for (e = l_tail(es, l); e; e = l_prev(es, e))
		if (!e->sentinel) {
			l_del(es, l, e);
			return e;
		}

	return NULL;
}

/*----------------------------------------------------------------*/

/*
 * The stochastic-multi-queue is a set of lru lists stacked into levels.
 * Entries are moved up levels when they are used, which loosely orders the
 * most accessed entries in the top levels and least in the bottom.  This
 * structure is *much* better than a single lru list.
 */
#define MAX_LEVELS 64u

struct queue {
	struct entry_space *es;

	unsigned nr_elts;
	unsigned nr_levels;
	struct ilist qs[MAX_LEVELS];

	/*
	 * We maintain a count of the number of entries we would like in each
	 * level.
	 */
	unsigned last_target_nr_elts;
	unsigned nr_top_levels;
	unsigned nr_in_top_levels;
	unsigned target_count[MAX_LEVELS];
};

static void q_init(struct queue *q, struct entry_space *es, unsigned nr_levels)
{
	unsigned i;

	q->es = es;
	q->nr_elts = 0;
	q->nr_levels = nr_levels;

	for (i = 0; i < q->nr_levels; i++) {
		l_init(q->qs + i);
		q->target_count[i] = 0u;
	}

	q->last_target_nr_elts = 0u;
	q->nr_top_levels = 0u;
	q->nr_in_top_levels = 0u;
}

static unsigned q_size(struct queue *q)
{
	return q->nr_elts;
}

/*
 * Insert an entry to the back of the given level.
 */
static void q_push(struct queue *q, struct entry *e)
{
	BUG_ON(e->pending_work);

	if (!e->sentinel)
		q->nr_elts++;

	l_add_tail(q->es, q->qs + e->level, e);
}

static void q_push_front(struct queue *q, struct entry *e)
{
	BUG_ON(e->pending_work);

	if (!e->sentinel)
		q->nr_elts++;

	l_add_head(q->es, q->qs + e->level, e);
}

static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
{
	BUG_ON(e->pending_work);

	if (!e->sentinel)
		q->nr_elts++;

	l_add_before(q->es, q->qs + e->level, old, e);
}

static void q_del(struct queue *q, struct entry *e)
{
	l_del(q->es, q->qs + e->level, e);
	if (!e->sentinel)
		q->nr_elts--;
}

/*
 * Return the oldest entry of the lowest populated level.
 */
static struct entry *q_peek(struct queue *q, unsigned max_level, bool can_cross_sentinel)
{
	unsigned level;
	struct entry *e;

	max_level = min(max_level, q->nr_levels);

	for (level = 0; level < max_level; level++)
		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
			if (e->sentinel) {
				if (can_cross_sentinel)
					continue;
				else
					break;
			}

			return e;
		}

	return NULL;
}

static struct entry *q_pop(struct queue *q)
{
	struct entry *e = q_peek(q, q->nr_levels, true);

	if (e)
		q_del(q, e);

	return e;
}

/*
 * This function assumes there is a non-sentinel entry to pop.  It's only
 * used by redistribute, so we know this is true.  It also doesn't adjust
 * the q->nr_elts count.
 */
static struct entry *__redist_pop_from(struct queue *q, unsigned level)
{
	struct entry *e;

	for (; level < q->nr_levels; level++)
		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e))
			if (!e->sentinel) {
				l_del(q->es, q->qs + e->level, e);
				return e;
			}

	return NULL;
}

static void q_set_targets_subrange_(struct queue *q, unsigned nr_elts, unsigned lbegin, unsigned lend)
{
	unsigned level, nr_levels, entries_per_level, remainder;

	BUG_ON(lbegin > lend);
	BUG_ON(lend > q->nr_levels);
	nr_levels = lend - lbegin;
	entries_per_level = safe_div(nr_elts, nr_levels);
	remainder = safe_mod(nr_elts, nr_levels);

	for (level = lbegin; level < lend; level++)
		q->target_count[level] =
			(level < (lbegin + remainder)) ? entries_per_level + 1u : entries_per_level;
}
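
/*
 * Example: spreading nr_elts = 10 across 4 levels gives entries_per_level = 2
 * and remainder = 2, so the targets become 3, 3, 2, 2 (the first 'remainder'
 * levels receive the extra entry).
 */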

/*
 * Typically we have fewer elements in the top few levels which allows us
 * to adjust the promote threshold nicely.
 */
static void q_set_targets(struct queue *q)
{
	if (q->last_target_nr_elts == q->nr_elts)
		return;

	q->last_target_nr_elts = q->nr_elts;

	if (q->nr_top_levels > q->nr_levels)
		q_set_targets_subrange_(q, q->nr_elts, 0, q->nr_levels);

	else {
		q_set_targets_subrange_(q, q->nr_in_top_levels,
					q->nr_levels - q->nr_top_levels, q->nr_levels);

		if (q->nr_in_top_levels < q->nr_elts)
			q_set_targets_subrange_(q, q->nr_elts - q->nr_in_top_levels,
						0, q->nr_levels - q->nr_top_levels);
		else
			q_set_targets_subrange_(q, 0, 0, q->nr_levels - q->nr_top_levels);
	}
}

static void q_redistribute(struct queue *q)
{
	unsigned target, level;
	struct ilist *l, *l_above;
	struct entry *e;

	q_set_targets(q);

	for (level = 0u; level < q->nr_levels - 1u; level++) {
		l = q->qs + level;
		target = q->target_count[level];

		/*
		 * Pull down some entries from the level above.
		 */
		while (l->nr_elts < target) {
			e = __redist_pop_from(q, level + 1u);
			if (!e) {
				/* bug in nr_elts */
				break;
			}

			e->level = level;
			l_add_tail(q->es, l, e);
		}

		/*
		 * Push some entries up.
		 */
		l_above = q->qs + level + 1u;
		while (l->nr_elts > target) {
			e = l_pop_tail(q->es, l);

			if (!e)
				/* bug in nr_elts */
				break;

			e->level = level + 1u;
			l_add_tail(q->es, l_above, e);
		}
	}
}

static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels,
		      struct entry *s1, struct entry *s2)
{
	struct entry *de;
	unsigned sentinels_passed = 0;
	unsigned new_level = min(q->nr_levels - 1u, e->level + extra_levels);

	/* try and find an entry to swap with */
	if (extra_levels && (e->level < q->nr_levels - 1u)) {
		for (de = l_head(q->es, q->qs + new_level); de && de->sentinel; de = l_next(q->es, de))
			sentinels_passed++;

		if (de) {
			q_del(q, de);
			de->level = e->level;
			if (s1) {
				switch (sentinels_passed) {
				case 0:
					q_push_before(q, s1, de);
					break;

				case 1:
					q_push_before(q, s2, de);
					break;

				default:
					q_push(q, de);
				}
			} else
				q_push(q, de);
		}
	}

	q_del(q, e);
	e->level = new_level;
	q_push(q, e);
}
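
/*
 * q_requeue() moves e up by extra_levels (capped at the top level).  To keep
 * the level populations balanced it pulls the first non-sentinel entry from
 * the destination level down to e's old level, reinserting it before s1 or
 * s2 depending on how many sentinels were skipped at the destination, so its
 * position relative to the sentinels is preserved.
 */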

/*----------------------------------------------------------------*/

#define FP_SHIFT 8
#define SIXTEENTH (1u << (FP_SHIFT - 4u))
#define EIGHTH (1u << (FP_SHIFT - 3u))
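
/*
 * Hit ratios are held as 8 bit fixed point: 1.0 is represented by 256, so
 * SIXTEENTH is 16 (6.25%) and EIGHTH is 32 (12.5%).
 */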

struct stats {
	unsigned hit_threshold;
	unsigned hits;
	unsigned misses;
};

enum performance {
	Q_POOR,
	Q_FAIR,
	Q_WELL
};

static void stats_init(struct stats *s, unsigned nr_levels)
{
	s->hit_threshold = (nr_levels * 3u) / 4u;
	s->hits = 0u;
	s->misses = 0u;
}

static void stats_reset(struct stats *s)
{
	s->hits = s->misses = 0u;
}

static void stats_level_accessed(struct stats *s, unsigned level)
{
	if (level >= s->hit_threshold)
		s->hits++;
	else
		s->misses++;
}

static void stats_miss(struct stats *s)
{
	s->misses++;
}

/*
 * There are times when we don't have any confidence in the hotspot queue.
 * Such as when a fresh cache is created and the blocks have been spread
 * out across the levels, or if an io load changes.  We detect this by
 * seeing how often a lookup is in the top levels of the hotspot queue.
 */
static enum performance stats_assess(struct stats *s)
{
	unsigned confidence = safe_div(s->hits << FP_SHIFT, s->hits + s->misses);

	if (confidence < SIXTEENTH)
		return Q_POOR;

	else if (confidence < EIGHTH)
		return Q_FAIR;

	else
		return Q_WELL;
}

/*----------------------------------------------------------------*/

struct smq_hash_table {
	struct entry_space *es;
	unsigned long long hash_bits;
	unsigned *buckets;
};

/*
 * All cache entries are stored in a chained hash table.  To save space we
 * use indexing again, and only store indexes to the next entry.
 */
static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned nr_entries)
{
	unsigned i, nr_buckets;

	ht->es = es;
	nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u));
	ht->hash_bits = __ffs(nr_buckets);

	ht->buckets = vmalloc(array_size(nr_buckets, sizeof(*ht->buckets)));
	if (!ht->buckets)
		return -ENOMEM;

	for (i = 0; i < nr_buckets; i++)
		ht->buckets[i] = INDEXER_NULL;

	return 0;
}
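
/*
 * The number of buckets is a power of two, roughly a quarter of the number
 * of entries (minimum 16), and hash_bits is log2 of that, as hash_64()
 * below expects.
 */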

static void h_exit(struct smq_hash_table *ht)
{
	vfree(ht->buckets);
}

static struct entry *h_head(struct smq_hash_table *ht, unsigned bucket)
{
	return to_entry(ht->es, ht->buckets[bucket]);
}

static struct entry *h_next(struct smq_hash_table *ht, struct entry *e)
{
	return to_entry(ht->es, e->hash_next);
}

static void __h_insert(struct smq_hash_table *ht, unsigned bucket, struct entry *e)
{
	e->hash_next = ht->buckets[bucket];
	ht->buckets[bucket] = to_index(ht->es, e);
}

static void h_insert(struct smq_hash_table *ht, struct entry *e)
{
	unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
	__h_insert(ht, h, e);
}

static struct entry *__h_lookup(struct smq_hash_table *ht, unsigned h, dm_oblock_t oblock,
				struct entry **prev)
{
	struct entry *e;

	*prev = NULL;
	for (e = h_head(ht, h); e; e = h_next(ht, e)) {
		if (e->oblock == oblock)
			return e;

		*prev = e;
	}

	return NULL;
}

static void __h_unlink(struct smq_hash_table *ht, unsigned h,
		       struct entry *e, struct entry *prev)
{
	if (prev)
		prev->hash_next = e->hash_next;
	else
		ht->buckets[h] = e->hash_next;
}

/*
 * Also moves each entry to the front of the bucket.
 */
static struct entry *h_lookup(struct smq_hash_table *ht, dm_oblock_t oblock)
{
	struct entry *e, *prev;
	unsigned h = hash_64(from_oblock(oblock), ht->hash_bits);

	e = __h_lookup(ht, h, oblock, &prev);
	if (e && prev) {
		/*
		 * Move to the front because this entry is likely
		 * to be hit again.
		 */
		__h_unlink(ht, h, e, prev);
		__h_insert(ht, h, e);
	}

	return e;
}

static void h_remove(struct smq_hash_table *ht, struct entry *e)
{
	unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
	struct entry *prev;

	/*
	 * The down side of using a singly linked list is we have to
	 * iterate the bucket to remove an item.
	 */
	e = __h_lookup(ht, h, e->oblock, &prev);
	if (e)
		__h_unlink(ht, h, e, prev);
}

/*----------------------------------------------------------------*/

struct entry_alloc {
	struct entry_space *es;
	unsigned begin;

	unsigned nr_allocated;
	struct ilist free;
};
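
/*
 * An entry_alloc hands out entries from a contiguous [begin, end) slice of
 * the shared entry_space, keeping the unused ones on a free list.
 */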

static void init_allocator(struct entry_alloc *ea, struct entry_space *es,
			   unsigned begin, unsigned end)
{
	unsigned i;

	ea->es = es;
	ea->nr_allocated = 0u;
	ea->begin = begin;

	l_init(&ea->free);
	for (i = begin; i != end; i++)
		l_add_tail(ea->es, &ea->free, __get_entry(ea->es, i));
}

static void init_entry(struct entry *e)
{
	/*
	 * We can't memset because that would clear the hotspot and
	 * sentinel bits which remain constant.
	 */
	e->hash_next = INDEXER_NULL;
	e->next = INDEXER_NULL;
	e->prev = INDEXER_NULL;
	e->level = 0u;
	e->dirty = true;	/* FIXME: audit */
	e->allocated = true;
	e->sentinel = false;
	e->pending_work = false;
}

static struct entry *alloc_entry(struct entry_alloc *ea)
{
	struct entry *e;

	if (l_empty(&ea->free))
		return NULL;

	e = l_pop_head(ea->es, &ea->free);
	init_entry(e);
	ea->nr_allocated++;

	return e;
}

/*
 * This assumes the cblock hasn't already been allocated.
 */
static struct entry *alloc_particular_entry(struct entry_alloc *ea, unsigned i)
{
	struct entry *e = __get_entry(ea->es, ea->begin + i);

	BUG_ON(e->allocated);

	l_del(ea->es, &ea->free, e);
	init_entry(e);
	ea->nr_allocated++;

	return e;
}

static void free_entry(struct entry_alloc *ea, struct entry *e)
{
	BUG_ON(!ea->nr_allocated);
	BUG_ON(!e->allocated);

	ea->nr_allocated--;
	e->allocated = false;
	l_add_tail(ea->es, &ea->free, e);
}

static bool allocator_empty(struct entry_alloc *ea)
{
	return l_empty(&ea->free);
}

static unsigned get_index(struct entry_alloc *ea, struct entry *e)
{
	return to_index(ea->es, e) - ea->begin;
}

static struct entry *get_entry(struct entry_alloc *ea, unsigned index)
{
	return __get_entry(ea->es, ea->begin + index);
}

/*----------------------------------------------------------------*/

#define NR_HOTSPOT_LEVELS 64u
#define NR_CACHE_LEVELS 64u

#define WRITEBACK_PERIOD (10ul * HZ)
#define DEMOTE_PERIOD (60ul * HZ)

#define HOTSPOT_UPDATE_PERIOD (HZ)
#define CACHE_UPDATE_PERIOD (60ul * HZ)

struct smq_policy {
	struct dm_cache_policy policy;

	/* protects everything */
	spinlock_t lock;
	dm_cblock_t cache_size;
	sector_t cache_block_size;

	sector_t hotspot_block_size;
	unsigned nr_hotspot_blocks;
	unsigned cache_blocks_per_hotspot_block;
	unsigned hotspot_level_jump;

	struct entry_space es;
	struct entry_alloc writeback_sentinel_alloc;
	struct entry_alloc demote_sentinel_alloc;
	struct entry_alloc hotspot_alloc;
	struct entry_alloc cache_alloc;

	unsigned long *hotspot_hit_bits;
	unsigned long *cache_hit_bits;

	/*
	 * We maintain three queues of entries.  The cache proper consists
	 * of a clean queue and a dirty queue, which together contain the
	 * currently active mappings.  The hotspot queue uses a larger block
	 * size to track blocks that are being hit frequently and are
	 * potential candidates for promotion to the cache.
	 */
	struct queue hotspot;
	struct queue clean;
	struct queue dirty;

	struct stats hotspot_stats;
	struct stats cache_stats;

	/*
	 * Keeps track of time, incremented by the core.  We use this to
	 * avoid attributing multiple hits within the same tick.
	 */
	unsigned tick;

	/*
	 * The hash tables allow us to quickly find an entry by origin
	 * block.
	 */
	struct smq_hash_table table;
	struct smq_hash_table hotspot_table;

	bool current_writeback_sentinels;
	unsigned long next_writeback_period;

	bool current_demote_sentinels;
	unsigned long next_demote_period;

	unsigned write_promote_level;
	unsigned read_promote_level;

	unsigned long next_hotspot_period;
	unsigned long next_cache_period;

	struct background_tracker *bg_work;

	bool migrations_allowed:1;

	/*
	 * If this is set the policy will try and clean the whole cache
	 * even if the device is not idle.
	 */
	bool cleaner:1;
};

/*----------------------------------------------------------------*/

static struct entry *get_sentinel(struct entry_alloc *ea, unsigned level, bool which)
{
	return get_entry(ea, which ? level : NR_CACHE_LEVELS + level);
}

static struct entry *writeback_sentinel(struct smq_policy *mq, unsigned level)
{
	return get_sentinel(&mq->writeback_sentinel_alloc, level, mq->current_writeback_sentinels);
}

static struct entry *demote_sentinel(struct smq_policy *mq, unsigned level)
{
	return get_sentinel(&mq->demote_sentinel_alloc, level, mq->current_demote_sentinels);
}

static void __update_writeback_sentinels(struct smq_policy *mq)
{
	unsigned level;
	struct queue *q = &mq->dirty;
	struct entry *sentinel;

	for (level = 0; level < q->nr_levels; level++) {
		sentinel = writeback_sentinel(mq, level);
		q_del(q, sentinel);
		q_push(q, sentinel);
	}
}

static void __update_demote_sentinels(struct smq_policy *mq)
{
	unsigned level;
	struct queue *q = &mq->clean;
	struct entry *sentinel;

	for (level = 0; level < q->nr_levels; level++) {
		sentinel = demote_sentinel(mq, level);
		q_del(q, sentinel);
		q_push(q, sentinel);
	}
}

static void update_sentinels(struct smq_policy *mq)
{
	if (time_after(jiffies, mq->next_writeback_period)) {
		mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
		mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
		__update_writeback_sentinels(mq);
	}

	if (time_after(jiffies, mq->next_demote_period)) {
		mq->next_demote_period = jiffies + DEMOTE_PERIOD;
		mq->current_demote_sentinels = !mq->current_demote_sentinels;
		__update_demote_sentinels(mq);
	}
}

static void __sentinels_init(struct smq_policy *mq)
{
	unsigned level;
	struct entry *sentinel;

	for (level = 0; level < NR_CACHE_LEVELS; level++) {
		sentinel = writeback_sentinel(mq, level);
		sentinel->level = level;
		q_push(&mq->dirty, sentinel);

		sentinel = demote_sentinel(mq, level);
		sentinel->level = level;
		q_push(&mq->clean, sentinel);
	}
}

static void sentinels_init(struct smq_policy *mq)
{
	mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
	mq->next_demote_period = jiffies + DEMOTE_PERIOD;

	mq->current_writeback_sentinels = false;
	mq->current_demote_sentinels = false;
	__sentinels_init(mq);

	mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
	mq->current_demote_sentinels = !mq->current_demote_sentinels;
	__sentinels_init(mq);
}
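
/*
 * Both sentinel sets are pushed at init time; update_sentinels() then flips
 * the 'current' set every period and re-pushes it to the level tails.
 * Entries found ahead of a sentinel have therefore been in that level since
 * before the sentinel was last pushed.
 */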

/*----------------------------------------------------------------*/

static void del_queue(struct smq_policy *mq, struct entry *e)
{
	q_del(e->dirty ? &mq->dirty : &mq->clean, e);
}

static void push_queue(struct smq_policy *mq, struct entry *e)
{
	if (e->dirty)
		q_push(&mq->dirty, e);
	else
		q_push(&mq->clean, e);
}

// !h, !q, a -> h, q, a
static void push(struct smq_policy *mq, struct entry *e)
{
	h_insert(&mq->table, e);
	if (!e->pending_work)
		push_queue(mq, e);
}

static void push_queue_front(struct smq_policy *mq, struct entry *e)
{
	if (e->dirty)
		q_push_front(&mq->dirty, e);
	else
		q_push_front(&mq->clean, e);
}

static void push_front(struct smq_policy *mq, struct entry *e)
{
	h_insert(&mq->table, e);
	if (!e->pending_work)
		push_queue_front(mq, e);
}

static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
{
	return to_cblock(get_index(&mq->cache_alloc, e));
}

static void requeue(struct smq_policy *mq, struct entry *e)
{
	/*
	 * Pending work has temporarily been taken out of the queues.
	 */
	if (e->pending_work)
		return;

	if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
		if (!e->dirty) {
			q_requeue(&mq->clean, e, 1u, NULL, NULL);
			return;
		}

		q_requeue(&mq->dirty, e, 1u,
			  get_sentinel(&mq->writeback_sentinel_alloc, e->level, !mq->current_writeback_sentinels),
			  get_sentinel(&mq->writeback_sentinel_alloc, e->level, mq->current_writeback_sentinels));
	}
}
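
/*
 * The cache_hit_bits bitset limits each cache block to one requeue per
 * cache period.  For dirty entries the two writeback sentinels for the
 * level are passed to q_requeue() so that the entry displaced from the
 * level above is reinserted in the right position relative to them.
 */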

static unsigned default_promote_level(struct smq_policy *mq)
{
	/*
	 * The promote level depends on the current performance of the
	 * cache.
	 *
	 * If the cache is performing badly, then we can't afford
	 * to promote much without causing performance to drop below that
	 * of the origin device.
	 *
	 * If the cache is performing well, then we don't need to promote
	 * much.  If it isn't broken, don't fix it.
	 *
	 * If the cache is middling then we promote more.
	 *
	 * This scheme reminds me of a graph of entropy vs probability of a
	 * binary variable.
	 */
	static unsigned table[] = {1, 1, 1, 2, 4, 6, 7, 8, 7, 6, 4, 4, 3, 3, 2, 2, 1};

	unsigned hits = mq->cache_stats.hits;
	unsigned misses = mq->cache_stats.misses;
	unsigned index = safe_div(hits << 4u, hits + misses);
	return table[index];
}
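
/*
 * The table above is indexed by hits / (hits + misses) scaled to 0..16, so
 * a middling hit ratio yields the most aggressive promote level (8) and the
 * extremes yield the least (1).
 */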

static void update_promote_levels(struct smq_policy *mq)
{
	/*
	 * If there are unused cache entries then we want to be really
	 * eager to promote.
	 */
	unsigned threshold_level = allocator_empty(&mq->cache_alloc) ?
		default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);

	threshold_level = max(threshold_level, NR_HOTSPOT_LEVELS);

	/*
	 * If the hotspot queue is performing badly then we have little
	 * confidence that we know which blocks to promote.  So we cut down
	 * the amount of promotions.
	 */
	switch (stats_assess(&mq->hotspot_stats)) {
	case Q_POOR:
		threshold_level /= 4u;
		break;

	case Q_FAIR:
		threshold_level /= 2u;
		break;

	case Q_WELL:
		break;
	}

	mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
	mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level);
}

/*
 * If the hotspot queue is performing badly, then we try and move entries
 * around more quickly.
 */
static void update_level_jump(struct smq_policy *mq)
{
	switch (stats_assess(&mq->hotspot_stats)) {
	case Q_POOR:
		mq->hotspot_level_jump = 4u;
		break;

	case Q_FAIR:
		mq->hotspot_level_jump = 2u;
		break;

	case Q_WELL:
		mq->hotspot_level_jump = 1u;
		break;
	}
}

static void end_hotspot_period(struct smq_policy *mq)
{
	clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
	update_promote_levels(mq);

	if (time_after(jiffies, mq->next_hotspot_period)) {
		update_level_jump(mq);
		q_redistribute(&mq->hotspot);
		stats_reset(&mq->hotspot_stats);
		mq->next_hotspot_period = jiffies + HOTSPOT_UPDATE_PERIOD;
	}
}

static void end_cache_period(struct smq_policy *mq)
{
	if (time_after(jiffies, mq->next_cache_period)) {
		clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));

		q_redistribute(&mq->dirty);
		q_redistribute(&mq->clean);
		stats_reset(&mq->cache_stats);

		mq->next_cache_period = jiffies + CACHE_UPDATE_PERIOD;
	}
}

/*----------------------------------------------------------------*/

/*
 * Targets are given as a percentage.
 */
#define CLEAN_TARGET 25u
#define FREE_TARGET 25u

static unsigned percent_to_target(struct smq_policy *mq, unsigned p)
{
	return from_cblock(mq->cache_size) * p / 100u;
}

static bool clean_target_met(struct smq_policy *mq, bool idle)
{
	/*
	 * Cache entries may not be populated.  So we cannot rely on the
	 * size of the clean queue.
	 */
	if (idle || mq->cleaner) {
		/*
		 * We'd like to clean everything.
		 */
		return q_size(&mq->dirty) == 0u;
	}

	/*
	 * If we're busy we don't worry about cleaning at all.
	 */
	return true;
}

static bool free_target_met(struct smq_policy *mq)
{
	unsigned nr_free;

	nr_free = from_cblock(mq->cache_size) - mq->cache_alloc.nr_allocated;
	return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >=
		percent_to_target(mq, FREE_TARGET);
}

/*----------------------------------------------------------------*/

static void mark_pending(struct smq_policy *mq, struct entry *e)
{
	BUG_ON(e->sentinel);
	BUG_ON(!e->allocated);
	BUG_ON(e->pending_work);
	e->pending_work = true;
}

static void clear_pending(struct smq_policy *mq, struct entry *e)
{
	BUG_ON(!e->pending_work);
	e->pending_work = false;
}

static void queue_writeback(struct smq_policy *mq, bool idle)
{
	int r;
	struct policy_work work;
	struct entry *e;

	e = q_peek(&mq->dirty, mq->dirty.nr_levels, idle);
	if (e) {
		mark_pending(mq, e);
		q_del(&mq->dirty, e);

		work.op = POLICY_WRITEBACK;
		work.oblock = e->oblock;
		work.cblock = infer_cblock(mq, e);

		r = btracker_queue(mq->bg_work, &work, NULL);
		if (r) {
			clear_pending(mq, e);
			q_push_front(&mq->dirty, e);
		}
	}
}

static void queue_demotion(struct smq_policy *mq)
{
	int r;
	struct policy_work work;
	struct entry *e;

	if (WARN_ON_ONCE(!mq->migrations_allowed))
		return;

	e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true);
	if (!e) {
		if (!clean_target_met(mq, true))
			queue_writeback(mq, false);
		return;
	}

	mark_pending(mq, e);
	q_del(&mq->clean, e);

	work.op = POLICY_DEMOTE;
	work.oblock = e->oblock;
	work.cblock = infer_cblock(mq, e);
	r = btracker_queue(mq->bg_work, &work, NULL);
	if (r) {
		clear_pending(mq, e);
		q_push_front(&mq->clean, e);
	}
}

static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock,
			    struct policy_work **workp)
{
	int r;
	struct entry *e;
	struct policy_work work;

	if (!mq->migrations_allowed)
		return;

	if (allocator_empty(&mq->cache_alloc)) {
		/*
		 * We always claim to be 'idle' to ensure some demotions happen
		 * with continuous loads.
		 */
		if (!free_target_met(mq))
			queue_demotion(mq);
		return;
	}

	if (btracker_promotion_already_present(mq->bg_work, oblock))
		return;

	/*
	 * We allocate the entry now to reserve the cblock.  If the
	 * background work is aborted we must remember to free it.
	 */
	e = alloc_entry(&mq->cache_alloc);
	BUG_ON(!e);
	e->pending_work = true;
	work.op = POLICY_PROMOTE;
	work.oblock = oblock;
	work.cblock = infer_cblock(mq, e);
	r = btracker_queue(mq->bg_work, &work, workp);
	if (r)
		free_entry(&mq->cache_alloc, e);
}

/*----------------------------------------------------------------*/

enum promote_result {
	PROMOTE_NOT,
	PROMOTE_TEMPORARY,
	PROMOTE_PERMANENT
};

/*
 * Converts a boolean into a promote result.
 */
static enum promote_result maybe_promote(bool promote)
{
	return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
}

static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e,
					  int data_dir, bool fast_promote)
{
	if (data_dir == WRITE) {
		if (!allocator_empty(&mq->cache_alloc) && fast_promote)
			return PROMOTE_TEMPORARY;

		return maybe_promote(hs_e->level >= mq->write_promote_level);
	} else
		return maybe_promote(hs_e->level >= mq->read_promote_level);
}

static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
{
	sector_t r = from_oblock(b);
	(void) sector_div(r, mq->cache_blocks_per_hotspot_block);
	return to_oblock(r);
}

static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b)
{
	unsigned hi;
	dm_oblock_t hb = to_hblock(mq, b);
	struct entry *e = h_lookup(&mq->hotspot_table, hb);

	if (e) {
		stats_level_accessed(&mq->hotspot_stats, e->level);

		hi = get_index(&mq->hotspot_alloc, e);
		q_requeue(&mq->hotspot, e,
			  test_and_set_bit(hi, mq->hotspot_hit_bits) ?
			  0u : mq->hotspot_level_jump,
			  NULL, NULL);

	} else {
		stats_miss(&mq->hotspot_stats);

		e = alloc_entry(&mq->hotspot_alloc);
		if (!e) {
			e = q_pop(&mq->hotspot);
			if (e) {
				h_remove(&mq->hotspot_table, e);
				hi = get_index(&mq->hotspot_alloc, e);
				clear_bit(hi, mq->hotspot_hit_bits);
			}

		}

		if (e) {
			e->oblock = hb;
			q_push(&mq->hotspot, e);
			h_insert(&mq->hotspot_table, e);
		}
	}

	return e;
}

/*----------------------------------------------------------------*/

/*
 * Public interface, via the policy struct.  See dm-cache-policy.h for a
 * description of these.
 */

static struct smq_policy *to_smq_policy(struct dm_cache_policy *p)
{
	return container_of(p, struct smq_policy, policy);
}

static void smq_destroy(struct dm_cache_policy *p)
{
	struct smq_policy *mq = to_smq_policy(p);

	btracker_destroy(mq->bg_work);
	h_exit(&mq->hotspot_table);
	h_exit(&mq->table);
	free_bitset(mq->hotspot_hit_bits);
	free_bitset(mq->cache_hit_bits);
	space_exit(&mq->es);
	kfree(mq);
}

/*----------------------------------------------------------------*/

static int __lookup(struct smq_policy *mq, dm_oblock_t oblock, dm_cblock_t *cblock,
		    int data_dir, bool fast_copy,
		    struct policy_work **work, bool *background_work)
{
	struct entry *e, *hs_e;
	enum promote_result pr;

	*background_work = false;

	e = h_lookup(&mq->table, oblock);
	if (e) {
		stats_level_accessed(&mq->cache_stats, e->level);

		requeue(mq, e);
		*cblock = infer_cblock(mq, e);
		return 0;

	} else {
		stats_miss(&mq->cache_stats);

		/*
		 * The hotspot queue only gets updated with misses.
		 */
		hs_e = update_hotspot_queue(mq, oblock);

		pr = should_promote(mq, hs_e, data_dir, fast_copy);
		if (pr != PROMOTE_NOT) {
			queue_promotion(mq, oblock, work);
			*background_work = true;
		}

		return -ENOENT;
	}
}

static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
		      int data_dir, bool fast_copy,
		      bool *background_work)
{
	int r;
	unsigned long flags;
	struct smq_policy *mq = to_smq_policy(p);

	spin_lock_irqsave(&mq->lock, flags);
	r = __lookup(mq, oblock, cblock,
		     data_dir, fast_copy,
		     NULL, background_work);
	spin_unlock_irqrestore(&mq->lock, flags);

	return r;
}

static int smq_lookup_with_work(struct dm_cache_policy *p,
				dm_oblock_t oblock, dm_cblock_t *cblock,
				int data_dir, bool fast_copy,
				struct policy_work **work)
{
	int r;
	bool background_queued;
	unsigned long flags;
	struct smq_policy *mq = to_smq_policy(p);

	spin_lock_irqsave(&mq->lock, flags);
	r = __lookup(mq, oblock, cblock, data_dir, fast_copy, work, &background_queued);
	spin_unlock_irqrestore(&mq->lock, flags);

	return r;
}

static int smq_get_background_work(struct dm_cache_policy *p, bool idle,
				   struct policy_work **result)
{
	int r;
	unsigned long flags;
	struct smq_policy *mq = to_smq_policy(p);

	spin_lock_irqsave(&mq->lock, flags);
	r = btracker_issue(mq->bg_work, result);
	if (r == -ENODATA) {
		if (!clean_target_met(mq, idle)) {
			queue_writeback(mq, idle);
			r = btracker_issue(mq->bg_work, result);
		}
	}
	spin_unlock_irqrestore(&mq->lock, flags);

	return r;
}

/*
 * We need to clear any pending work flags that have been set, and in the
 * case of promotion free the entry for the destination cblock.
 */
static void __complete_background_work(struct smq_policy *mq,
				       struct policy_work *work,
				       bool success)
{
	struct entry *e = get_entry(&mq->cache_alloc,
				    from_cblock(work->cblock));

	switch (work->op) {
	case POLICY_PROMOTE:
		// !h, !q, a
		clear_pending(mq, e);
		if (success) {
			e->oblock = work->oblock;
			e->level = NR_CACHE_LEVELS - 1;
			push(mq, e);
			// h, q, a
		} else {
			free_entry(&mq->cache_alloc, e);
			// !h, !q, !a
		}
		break;

	case POLICY_DEMOTE:
		// h, !q, a
		if (success) {
			h_remove(&mq->table, e);
			free_entry(&mq->cache_alloc, e);
			// !h, !q, !a
		} else {
			clear_pending(mq, e);
			push_queue(mq, e);
			// h, q, a
		}
		break;

	case POLICY_WRITEBACK:
		// h, !q, a
		clear_pending(mq, e);
		push_queue(mq, e);
		// h, q, a
		break;
	}

	btracker_complete(mq->bg_work, work);
}

static void smq_complete_background_work(struct dm_cache_policy *p,
					 struct policy_work *work,
					 bool success)
{
	unsigned long flags;
	struct smq_policy *mq = to_smq_policy(p);

	spin_lock_irqsave(&mq->lock, flags);
	__complete_background_work(mq, work, success);
	spin_unlock_irqrestore(&mq->lock, flags);
}

// in_hash(oblock) -> in_hash(oblock)
static void __smq_set_clear_dirty(struct smq_policy *mq, dm_cblock_t cblock, bool set)
{
	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));

	if (e->pending_work)
		e->dirty = set;
	else {
		del_queue(mq, e);
		e->dirty = set;
		push_queue(mq, e);
	}
}

static void smq_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
{
	unsigned long flags;
	struct smq_policy *mq = to_smq_policy(p);

	spin_lock_irqsave(&mq->lock, flags);
	__smq_set_clear_dirty(mq, cblock, true);
	spin_unlock_irqrestore(&mq->lock, flags);
}

static void smq_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
{
	struct smq_policy *mq = to_smq_policy(p);
	unsigned long flags;

	spin_lock_irqsave(&mq->lock, flags);
	__smq_set_clear_dirty(mq, cblock, false);
	spin_unlock_irqrestore(&mq->lock, flags);
}

static unsigned random_level(dm_cblock_t cblock)
{
	return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1);
}

static int smq_load_mapping(struct dm_cache_policy *p,
			    dm_oblock_t oblock, dm_cblock_t cblock,
			    bool dirty, uint32_t hint, bool hint_valid)
{
	struct smq_policy *mq = to_smq_policy(p);
	struct entry *e;

	e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
	e->oblock = oblock;
	e->dirty = dirty;
	e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock);
	e->pending_work = false;

	/*
	 * When we load mappings we push ahead of both sentinels in order to
	 * allow demotions and cleaning to occur immediately.
	 */
	push_front(mq, e);

	return 0;
}

static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock)
{
	struct smq_policy *mq = to_smq_policy(p);
	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));

	if (!e->allocated)
		return -ENODATA;

	// FIXME: what if this block has pending background work?
	del_queue(mq, e);
	h_remove(&mq->table, e);
	free_entry(&mq->cache_alloc, e);
	return 0;
}

static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock)
{
	struct smq_policy *mq = to_smq_policy(p);
	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));

	if (!e->allocated)
		return 0;

	return e->level;
}

static dm_cblock_t smq_residency(struct dm_cache_policy *p)
{
	dm_cblock_t r;
	unsigned long flags;
	struct smq_policy *mq = to_smq_policy(p);

	spin_lock_irqsave(&mq->lock, flags);
	r = to_cblock(mq->cache_alloc.nr_allocated);
	spin_unlock_irqrestore(&mq->lock, flags);

	return r;
}

static void smq_tick(struct dm_cache_policy *p, bool can_block)
{
	struct smq_policy *mq = to_smq_policy(p);
	unsigned long flags;

	spin_lock_irqsave(&mq->lock, flags);
	mq->tick++;
	update_sentinels(mq);
	end_hotspot_period(mq);
	end_cache_period(mq);
	spin_unlock_irqrestore(&mq->lock, flags);
}

static void smq_allow_migrations(struct dm_cache_policy *p, bool allow)
{
	struct smq_policy *mq = to_smq_policy(p);
	mq->migrations_allowed = allow;
}

/*
 * smq has no config values, but the old mq policy did.  To avoid breaking
 * software we continue to accept these configurables for the mq policy,
 * but they have no effect.
 */
static int mq_set_config_value(struct dm_cache_policy *p,
			       const char *key, const char *value)
{
	unsigned long tmp;

	if (kstrtoul(value, 10, &tmp))
		return -EINVAL;

	if (!strcasecmp(key, "random_threshold") ||
	    !strcasecmp(key, "sequential_threshold") ||
	    !strcasecmp(key, "discard_promote_adjustment") ||
	    !strcasecmp(key, "read_promote_adjustment") ||
	    !strcasecmp(key, "write_promote_adjustment")) {
		DMWARN("tunable '%s' no longer has any effect, mq policy is now an alias for smq", key);
		return 0;
	}

	return -EINVAL;
}

static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
				 unsigned maxlen, ssize_t *sz_ptr)
{
	ssize_t sz = *sz_ptr;

	DMEMIT("10 random_threshold 0 "
	       "sequential_threshold 0 "
	       "discard_promote_adjustment 0 "
	       "read_promote_adjustment 0 "
	       "write_promote_adjustment 0 ");

	*sz_ptr = sz;
	return 0;
}

/* Init the policy plugin interface function pointers. */
static void init_policy_functions(struct smq_policy *mq, bool mimic_mq)
{
	mq->policy.destroy = smq_destroy;
	mq->policy.lookup = smq_lookup;
	mq->policy.lookup_with_work = smq_lookup_with_work;
	mq->policy.get_background_work = smq_get_background_work;
	mq->policy.complete_background_work = smq_complete_background_work;
	mq->policy.set_dirty = smq_set_dirty;
	mq->policy.clear_dirty = smq_clear_dirty;
	mq->policy.load_mapping = smq_load_mapping;
	mq->policy.invalidate_mapping = smq_invalidate_mapping;
	mq->policy.get_hint = smq_get_hint;
	mq->policy.residency = smq_residency;
	mq->policy.tick = smq_tick;
	mq->policy.allow_migrations = smq_allow_migrations;

	if (mimic_mq) {
		mq->policy.set_config_value = mq_set_config_value;
		mq->policy.emit_config_values = mq_emit_config_values;
	}
}

static bool too_many_hotspot_blocks(sector_t origin_size,
				    sector_t hotspot_block_size,
				    unsigned nr_hotspot_blocks)
{
	return (hotspot_block_size * nr_hotspot_blocks) > origin_size;
}

static void calc_hotspot_params(sector_t origin_size,
				sector_t cache_block_size,
				unsigned nr_cache_blocks,
				sector_t *hotspot_block_size,
				unsigned *nr_hotspot_blocks)
{
	*hotspot_block_size = cache_block_size * 16u;
	*nr_hotspot_blocks = max(nr_cache_blocks / 4u, 1024u);

	while ((*hotspot_block_size > cache_block_size) &&
	       too_many_hotspot_blocks(origin_size, *hotspot_block_size, *nr_hotspot_blocks))
		*hotspot_block_size /= 2u;
}
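
/*
 * Hotspot blocks start at 16 times the cache block size and are halved until
 * nr_hotspot_blocks of them fit within the origin device (or the cache block
 * size is reached).
 */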

static struct dm_cache_policy *
__smq_create(dm_cblock_t cache_size, sector_t origin_size, sector_t cache_block_size,
	     bool mimic_mq, bool migrations_allowed, bool cleaner)
{
	unsigned i;
	unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
	unsigned total_sentinels = 2u * nr_sentinels_per_queue;
	struct smq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);

	if (!mq)
		return NULL;

	init_policy_functions(mq, mimic_mq);
	mq->cache_size = cache_size;
	mq->cache_block_size = cache_block_size;

	calc_hotspot_params(origin_size, cache_block_size, from_cblock(cache_size),
			    &mq->hotspot_block_size, &mq->nr_hotspot_blocks);

	mq->cache_blocks_per_hotspot_block = div64_u64(mq->hotspot_block_size, mq->cache_block_size);
	mq->hotspot_level_jump = 1u;
	if (space_init(&mq->es, total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size))) {
		DMERR("couldn't initialize entry space");
		goto bad_pool_init;
	}
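
	/*
	 * The single entry_space is carved up as:
	 * [writeback sentinels | demote sentinels | hotspot entries | cache entries]
	 */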

	init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
	for (i = 0; i < nr_sentinels_per_queue; i++)
		get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;

	init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
	for (i = 0; i < nr_sentinels_per_queue; i++)
		get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;

	init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
		       total_sentinels + mq->nr_hotspot_blocks);

	init_allocator(&mq->cache_alloc, &mq->es,
		       total_sentinels + mq->nr_hotspot_blocks,
		       total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size));

	mq->hotspot_hit_bits = alloc_bitset(mq->nr_hotspot_blocks);
	if (!mq->hotspot_hit_bits) {
		DMERR("couldn't allocate hotspot hit bitset");
		goto bad_hotspot_hit_bits;
	}
	clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);

	if (from_cblock(cache_size)) {
		mq->cache_hit_bits = alloc_bitset(from_cblock(cache_size));
		if (!mq->cache_hit_bits) {
			DMERR("couldn't allocate cache hit bitset");
			goto bad_cache_hit_bits;
		}
		clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
	} else
		mq->cache_hit_bits = NULL;

	mq->tick = 0;
	spin_lock_init(&mq->lock);

	q_init(&mq->hotspot, &mq->es, NR_HOTSPOT_LEVELS);
	mq->hotspot.nr_top_levels = 8;
	mq->hotspot.nr_in_top_levels = min(mq->nr_hotspot_blocks / NR_HOTSPOT_LEVELS,
					   from_cblock(mq->cache_size) / mq->cache_blocks_per_hotspot_block);

	q_init(&mq->clean, &mq->es, NR_CACHE_LEVELS);
	q_init(&mq->dirty, &mq->es, NR_CACHE_LEVELS);

	stats_init(&mq->hotspot_stats, NR_HOTSPOT_LEVELS);
	stats_init(&mq->cache_stats, NR_CACHE_LEVELS);

	if (h_init(&mq->table, &mq->es, from_cblock(cache_size)))
		goto bad_alloc_table;

	if (h_init(&mq->hotspot_table, &mq->es, mq->nr_hotspot_blocks))
		goto bad_alloc_hotspot_table;

	sentinels_init(mq);
	mq->write_promote_level = mq->read_promote_level = NR_HOTSPOT_LEVELS;

	mq->next_hotspot_period = jiffies;
	mq->next_cache_period = jiffies;

	mq->bg_work = btracker_create(4096); /* FIXME: hard coded value */
	if (!mq->bg_work)
		goto bad_btracker;

	mq->migrations_allowed = migrations_allowed;
	mq->cleaner = cleaner;

	return &mq->policy;

bad_btracker:
	h_exit(&mq->hotspot_table);
bad_alloc_hotspot_table:
	h_exit(&mq->table);
bad_alloc_table:
	free_bitset(mq->cache_hit_bits);
bad_cache_hit_bits:
	free_bitset(mq->hotspot_hit_bits);
bad_hotspot_hit_bits:
	space_exit(&mq->es);
bad_pool_init:
	kfree(mq);

	return NULL;
}

static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
					  sector_t origin_size,
					  sector_t cache_block_size)
{
	return __smq_create(cache_size, origin_size, cache_block_size,
			    false, true, false);
}

static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
					 sector_t origin_size,
					 sector_t cache_block_size)
{
	return __smq_create(cache_size, origin_size, cache_block_size,
			    true, true, false);
}

static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size,
					      sector_t origin_size,
					      sector_t cache_block_size)
{
	return __smq_create(cache_size, origin_size, cache_block_size,
			    false, false, true);
}

/*----------------------------------------------------------------*/

static struct dm_cache_policy_type smq_policy_type = {
	.name = "smq",
	.version = {2, 0, 0},
	.hint_size = 4,
	.owner = THIS_MODULE,
	.create = smq_create
};

static struct dm_cache_policy_type mq_policy_type = {
	.name = "mq",
	.version = {2, 0, 0},
	.hint_size = 4,
	.owner = THIS_MODULE,
	.create = mq_create,
};

static struct dm_cache_policy_type cleaner_policy_type = {
	.name = "cleaner",
	.version = {2, 0, 0},
	.hint_size = 4,
	.owner = THIS_MODULE,
	.create = cleaner_create,
};

static struct dm_cache_policy_type default_policy_type = {
	.name = "default",
	.version = {2, 0, 0},
	.hint_size = 4,
	.owner = THIS_MODULE,
	.create = smq_create,
	.real = &smq_policy_type
};

static int __init smq_init(void)
{
	int r;

	r = dm_cache_policy_register(&smq_policy_type);
	if (r) {
		DMERR("register failed %d", r);
		return -ENOMEM;
	}

	r = dm_cache_policy_register(&mq_policy_type);
	if (r) {
		DMERR("register failed (as mq) %d", r);
		goto out_mq;
	}

	r = dm_cache_policy_register(&cleaner_policy_type);
	if (r) {
		DMERR("register failed (as cleaner) %d", r);
		goto out_cleaner;
	}

	r = dm_cache_policy_register(&default_policy_type);
	if (r) {
		DMERR("register failed (as default) %d", r);
		goto out_default;
	}

	return 0;

out_default:
	dm_cache_policy_unregister(&cleaner_policy_type);
out_cleaner:
	dm_cache_policy_unregister(&mq_policy_type);
out_mq:
	dm_cache_policy_unregister(&smq_policy_type);

	return -ENOMEM;
}

static void __exit smq_exit(void)
{
	dm_cache_policy_unregister(&cleaner_policy_type);
	dm_cache_policy_unregister(&smq_policy_type);
	dm_cache_policy_unregister(&mq_policy_type);
	dm_cache_policy_unregister(&default_policy_type);
}

module_init(smq_init);
module_exit(smq_exit);

MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("smq cache policy");

MODULE_ALIAS("dm-cache-default");
MODULE_ALIAS("dm-cache-mq");
MODULE_ALIAS("dm-cache-cleaner");