// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *
 * Copyright (C) 2005 Oracle.  All rights reserved.
 */

/* This quorum hack is only here until we transition to some more rational
 * approach that is driven from userspace.  Honest.  No foolin'.
 *
 * Imagine two nodes lose network connectivity to each other but they're still
 * up and operating in every other way.  Presumably a network timeout indicates
 * that a node is broken and should be recovered.  They can't both recover each
 * other and both carry on without serialising their access to the file system.
 * They need to decide who is authoritative.  Now extend that problem to
 * arbitrary groups of nodes losing connectivity between each other.
 *
 * So we declare that a node which has given up on connecting to a majority
 * of nodes who are still heartbeating will fence itself.
 *
 * There are huge opportunities for races here.  After we give up on a node's
 * connection we need to wait long enough to give heartbeat an opportunity
 * to declare the node as truly dead.  We also need to be careful with the
 * race between when we see a node start heartbeating and when we connect
 * to it.
 *
 * So nodes that are in this transition put a hold on the quorum decision
 * with a counter.  As they fall out of this transition they drop the count
 * and if they're the last, they fire off the decision.
 */
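/* A concrete example of the hold lifecycle: suppose we see node B begin
 * heartbeating before our connection to it is established.  o2quo_hb_up()
 * notices there is no connection yet and takes a hold, deferring any quorum
 * decision.  When o2quo_conn_up() later fires for B it drops that hold, and
 * if B was the last node in transition, any pending decision runs then. */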
#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/reboot.h>

#include "heartbeat.h"
#include "nodemanager.h"
#define MLOG_MASK_PREFIX ML_QUORUM
#include "masklog.h"
#include "quorum.h"

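/* All quorum state lives in this single struct, serialised by qs_lock:
 * qs_heartbeating/qs_hb_bm track nodes currently writing heartbeat,
 * qs_connected/qs_conn_bm track nodes we hold network connections to, and
 * qs_holds/qs_hold_bm count nodes whose state is still in transition.
 * While qs_holds is nonzero any quorum decision is deferred; qs_pending
 * records that qs_work should fire once the last hold drops. */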
static struct o2quo_state {
	spinlock_t		qs_lock;
	struct work_struct	qs_work;
	int			qs_pending;
	int			qs_heartbeating;
	unsigned long		qs_hb_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
	int			qs_connected;
	unsigned long		qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
	int			qs_holds;
	unsigned long		qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
} o2quo_state;

/* this is horribly heavy-handed.  It should instead flip the file
 * system RO and call some userspace script. */
static void o2quo_fence_self(void)
{
	/* panic spins with interrupts enabled.  with preempt
	 * threads can still schedule, etc, etc */
	o2hb_stop_all_regions();

	switch (o2nm_single_cluster->cl_fence_method) {
	case O2NM_FENCE_PANIC:
		panic("*** ocfs2 is very sorry to be fencing this system by "
		      "panicing ***\n");
		break;
	default:
		WARN_ON(o2nm_single_cluster->cl_fence_method >=
			O2NM_FENCE_METHODS);
		fallthrough;
	case O2NM_FENCE_RESET:
		printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this "
		       "system by restarting ***\n");
		emergency_restart();
		break;
	}
}
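
/* Note: cl_fence_method is chosen from userspace.  In mainline o2nm it is
 * typically set through configfs, e.g. by writing "panic" or "reset" to
 * /sys/kernel/config/cluster/<name>/fence_method, though the exact path
 * depends on the cluster tools in use. */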

/* Indicate that a timeout occurred on a heartbeat region write. The
 * other nodes in the cluster may consider us dead at that time so we
 * want to "fence" ourselves so that we don't scribble on the disk
 * after they think they've recovered us. This can't solve all
 * problems related to writeout after recovery but this hack can at
 * least close some of those gaps. When we have real fencing, this can
 * go away as our node would be fenced externally before other nodes
 * begin recovery. */
void o2quo_disk_timeout(void)
{
	o2quo_fence_self();
}

static void o2quo_make_decision(struct work_struct *work)
{
	int quorum;
	int lowest_hb, lowest_reachable = 0, fence = 0;
	struct o2quo_state *qs = &o2quo_state;

	spin_lock_bh(&qs->qs_lock);

	lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES);
	if (lowest_hb != O2NM_MAX_NODES)
		lowest_reachable = test_bit(lowest_hb, qs->qs_conn_bm);

	mlog(0, "heartbeating: %d, connected: %d, "
	     "lowest: %d (%sreachable)\n", qs->qs_heartbeating,
	     qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un");

	if (!test_bit(o2nm_this_node(), qs->qs_hb_bm) ||
	    qs->qs_heartbeating == 1)
		goto out;

	if (qs->qs_heartbeating & 1) {
		/* the odd numbered cluster case is straightforward --
		 * if we can't talk to the majority we're hosed */
		quorum = (qs->qs_heartbeating + 1)/2;
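		/* e.g. with qs_heartbeating == 5, quorum = (5 + 1) / 2 = 3;
		 * fewer than 3 connections to heartbeating nodes means we
		 * fence below. */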
		if (qs->qs_connected < quorum) {
			mlog(ML_ERROR, "fencing this node because it is "
			     "only connected to %u nodes and %u is needed "
			     "to make a quorum out of %u heartbeating nodes\n",
			     qs->qs_connected, quorum,
			     qs->qs_heartbeating);
			fence = 1;
		}
	} else {
		/* the even numbered cluster adds the possibility of each half
		 * of the cluster being able to talk amongst themselves.. in
		 * that case we're hosed if we can't talk to the group that has
		 * the lowest numbered node */
		quorum = qs->qs_heartbeating / 2;
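		/* e.g. with qs_heartbeating == 4, quorum = 4 / 2 = 2: fewer
		 * than 2 connections fences us outright, while exactly 2
		 * survives only if the lowest-numbered heartbeating node is
		 * among those we can reach, breaking the 2/2 split tie. */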
		if (qs->qs_connected < quorum) {
			mlog(ML_ERROR, "fencing this node because it is "
			     "only connected to %u nodes and %u is needed "
			     "to make a quorum out of %u heartbeating nodes\n",
			     qs->qs_connected, quorum,
			     qs->qs_heartbeating);
			fence = 1;
		} else if ((qs->qs_connected == quorum) &&
			   !lowest_reachable) {
			mlog(ML_ERROR, "fencing this node because it is "
			     "connected to a half-quorum of %u out of %u "
			     "nodes which doesn't include the lowest active "
			     "node %u\n", quorum, qs->qs_heartbeating,
			     lowest_hb);
			fence = 1;
		}
	}

out:
	if (fence) {
		spin_unlock_bh(&qs->qs_lock);
		o2quo_fence_self();
	} else {
		mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, "
			"connected: %d, lowest: %d (%sreachable)\n",
			qs->qs_heartbeating, qs->qs_connected, lowest_hb,
			lowest_reachable ? "" : "un");
		spin_unlock_bh(&qs->qs_lock);
	}
}

static void o2quo_set_hold(struct o2quo_state *qs, u8 node)
{
	assert_spin_locked(&qs->qs_lock);

	if (!test_and_set_bit(node, qs->qs_hold_bm)) {
		qs->qs_holds++;
		mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES,
			        "node %u\n", node);
		mlog(0, "node %u, %d total\n", node, qs->qs_holds);
	}
}

static void o2quo_clear_hold(struct o2quo_state *qs, u8 node)
{
	assert_spin_locked(&qs->qs_lock);

	if (test_and_clear_bit(node, qs->qs_hold_bm)) {
		mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1);
		if (--qs->qs_holds == 0) {
			if (qs->qs_pending) {
				qs->qs_pending = 0;
				schedule_work(&qs->qs_work);
			}
		}
		mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n",
				node, qs->qs_holds);
	}
}

/* as a node comes up we delay the quorum decision until we know the fate of
 * the connection.  the hold will be dropped in conn_up or hb_down.  it might
 * be perpetuated by conn_err until hb_down.  if we already have a conn, we
 * might be dropping a hold that conn_up got. */
void o2quo_hb_up(u8 node)
{
	struct o2quo_state *qs = &o2quo_state;

	spin_lock_bh(&qs->qs_lock);

	qs->qs_heartbeating++;
	mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES,
		        "node %u\n", node);
	mlog_bug_on_msg(test_bit(node, qs->qs_hb_bm), "node %u\n", node);
	set_bit(node, qs->qs_hb_bm);

	mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);

	if (!test_bit(node, qs->qs_conn_bm))
		o2quo_set_hold(qs, node);
	else
		o2quo_clear_hold(qs, node);

	spin_unlock_bh(&qs->qs_lock);
}

/* hb going down releases any holds we might have had due to this node from
 * conn_up, conn_err, or hb_up */
void o2quo_hb_down(u8 node)
{
	struct o2quo_state *qs = &o2quo_state;

	spin_lock_bh(&qs->qs_lock);

	qs->qs_heartbeating--;
	mlog_bug_on_msg(qs->qs_heartbeating < 0,
			"node %u, %d heartbeating\n",
			node, qs->qs_heartbeating);
	mlog_bug_on_msg(!test_bit(node, qs->qs_hb_bm), "node %u\n", node);
	clear_bit(node, qs->qs_hb_bm);

	mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);

	o2quo_clear_hold(qs, node);

	spin_unlock_bh(&qs->qs_lock);
}

/* this tells us that we've decided that the node is still heartbeating
 * even though we've lost its conn.  it must only be called after conn_err
 * and indicates that we must now make a quorum decision in the future,
 * though we might be doing so after waiting for holds to drain.  Here
 * we'll be dropping the hold from conn_err. */
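/* For example: the network layer loses the socket to node N and calls
 * o2quo_conn_err(N), which takes a hold because N is still heartbeating.
 * If heartbeat keeps seeing N's disk writes, the caller reports that here;
 * dropping conn_err's hold (with qs_pending set) is what lets the deferred
 * quorum decision finally run once all holds drain. */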
void o2quo_hb_still_up(u8 node)
{
	struct o2quo_state *qs = &o2quo_state;

	spin_lock_bh(&qs->qs_lock);

	mlog(0, "node %u\n", node);

	qs->qs_pending = 1;
	o2quo_clear_hold(qs, node);

	spin_unlock_bh(&qs->qs_lock);
}

/* This is analogous to hb_up.  as a node's connection comes up we delay the
 * quorum decision until we see it heartbeating.  the hold will be dropped in
 * hb_up or hb_down.  it might be perpetuated by conn_err until hb_down.  if
 * it's already heartbeating we might be dropping a hold that conn_up got. */
void o2quo_conn_up(u8 node)
{
	struct o2quo_state *qs = &o2quo_state;

	spin_lock_bh(&qs->qs_lock);

	qs->qs_connected++;
	mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES,
		        "node %u\n", node);
	mlog_bug_on_msg(test_bit(node, qs->qs_conn_bm), "node %u\n", node);
	set_bit(node, qs->qs_conn_bm);

	mlog(0, "node %u, %d total\n", node, qs->qs_connected);

	if (!test_bit(node, qs->qs_hb_bm))
		o2quo_set_hold(qs, node);
	else
		o2quo_clear_hold(qs, node);

	spin_unlock_bh(&qs->qs_lock);
}

/* we've decided that we won't ever be connecting to the node again.  if it's
 * still heartbeating we grab a hold that will delay decisions until either the
 * node stops heartbeating from hb_down or the caller decides that the node is
 * still up and calls still_up */
void o2quo_conn_err(u8 node)
{
	struct o2quo_state *qs = &o2quo_state;

	spin_lock_bh(&qs->qs_lock);

	if (test_bit(node, qs->qs_conn_bm)) {
		qs->qs_connected--;
		mlog_bug_on_msg(qs->qs_connected < 0,
				"node %u, connected %d\n",
				node, qs->qs_connected);

		clear_bit(node, qs->qs_conn_bm);

		if (test_bit(node, qs->qs_hb_bm))
			o2quo_set_hold(qs, node);
	}

	mlog(0, "node %u, %d total\n", node, qs->qs_connected);

	spin_unlock_bh(&qs->qs_lock);
}

void o2quo_init(void)
{
	struct o2quo_state *qs = &o2quo_state;

	spin_lock_init(&qs->qs_lock);
	INIT_WORK(&qs->qs_work, o2quo_make_decision);
}

void o2quo_exit(void)
{
	struct o2quo_state *qs = &o2quo_state;

	flush_work(&qs->qs_work);
}