162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * 462306a36Sopenharmony_ci * Copyright (C) 2005 Oracle. All rights reserved. 562306a36Sopenharmony_ci */ 662306a36Sopenharmony_ci 762306a36Sopenharmony_ci/* This quorum hack is only here until we transition to some more rational 862306a36Sopenharmony_ci * approach that is driven from userspace. Honest. No foolin'. 962306a36Sopenharmony_ci * 1062306a36Sopenharmony_ci * Imagine two nodes lose network connectivity to each other but they're still 1162306a36Sopenharmony_ci * up and operating in every other way. Presumably a network timeout indicates 1262306a36Sopenharmony_ci * that a node is broken and should be recovered. They can't both recover each 1362306a36Sopenharmony_ci * other and both carry on without serialising their access to the file system. 1462306a36Sopenharmony_ci * They need to decide who is authoritative. Now extend that problem to 1562306a36Sopenharmony_ci * arbitrary groups of nodes losing connectivity between each other. 1662306a36Sopenharmony_ci * 1762306a36Sopenharmony_ci * So we declare that a node which has given up on connecting to a majority 1862306a36Sopenharmony_ci * of nodes who are still heartbeating will fence itself. 1962306a36Sopenharmony_ci * 2062306a36Sopenharmony_ci * There are huge opportunities for races here. After we give up on a node's 2162306a36Sopenharmony_ci * connection we need to wait long enough to give heartbeat an opportunity 2262306a36Sopenharmony_ci * to declare the node as truly dead. We also need to be careful with the 2362306a36Sopenharmony_ci * race between when we see a node start heartbeating and when we connect 2462306a36Sopenharmony_ci * to it. 2562306a36Sopenharmony_ci * 2662306a36Sopenharmony_ci * So nodes that are in this transtion put a hold on the quorum decision 2762306a36Sopenharmony_ci * with a counter. As they fall out of this transition they drop the count 2862306a36Sopenharmony_ci * and if they're the last, they fire off the decision. 2962306a36Sopenharmony_ci */ 3062306a36Sopenharmony_ci#include <linux/kernel.h> 3162306a36Sopenharmony_ci#include <linux/workqueue.h> 3262306a36Sopenharmony_ci#include <linux/reboot.h> 3362306a36Sopenharmony_ci 3462306a36Sopenharmony_ci#include "heartbeat.h" 3562306a36Sopenharmony_ci#include "nodemanager.h" 3662306a36Sopenharmony_ci#define MLOG_MASK_PREFIX ML_QUORUM 3762306a36Sopenharmony_ci#include "masklog.h" 3862306a36Sopenharmony_ci#include "quorum.h" 3962306a36Sopenharmony_ci 4062306a36Sopenharmony_cistatic struct o2quo_state { 4162306a36Sopenharmony_ci spinlock_t qs_lock; 4262306a36Sopenharmony_ci struct work_struct qs_work; 4362306a36Sopenharmony_ci int qs_pending; 4462306a36Sopenharmony_ci int qs_heartbeating; 4562306a36Sopenharmony_ci unsigned long qs_hb_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; 4662306a36Sopenharmony_ci int qs_connected; 4762306a36Sopenharmony_ci unsigned long qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; 4862306a36Sopenharmony_ci int qs_holds; 4962306a36Sopenharmony_ci unsigned long qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; 5062306a36Sopenharmony_ci} o2quo_state; 5162306a36Sopenharmony_ci 5262306a36Sopenharmony_ci/* this is horribly heavy-handed. It should instead flip the file 5362306a36Sopenharmony_ci * system RO and call some userspace script. */ 5462306a36Sopenharmony_cistatic void o2quo_fence_self(void) 5562306a36Sopenharmony_ci{ 5662306a36Sopenharmony_ci /* panic spins with interrupts enabled. with preempt 5762306a36Sopenharmony_ci * threads can still schedule, etc, etc */ 5862306a36Sopenharmony_ci o2hb_stop_all_regions(); 5962306a36Sopenharmony_ci 6062306a36Sopenharmony_ci switch (o2nm_single_cluster->cl_fence_method) { 6162306a36Sopenharmony_ci case O2NM_FENCE_PANIC: 6262306a36Sopenharmony_ci panic("*** ocfs2 is very sorry to be fencing this system by " 6362306a36Sopenharmony_ci "panicing ***\n"); 6462306a36Sopenharmony_ci break; 6562306a36Sopenharmony_ci default: 6662306a36Sopenharmony_ci WARN_ON(o2nm_single_cluster->cl_fence_method >= 6762306a36Sopenharmony_ci O2NM_FENCE_METHODS); 6862306a36Sopenharmony_ci fallthrough; 6962306a36Sopenharmony_ci case O2NM_FENCE_RESET: 7062306a36Sopenharmony_ci printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this " 7162306a36Sopenharmony_ci "system by restarting ***\n"); 7262306a36Sopenharmony_ci emergency_restart(); 7362306a36Sopenharmony_ci break; 7462306a36Sopenharmony_ci } 7562306a36Sopenharmony_ci} 7662306a36Sopenharmony_ci 7762306a36Sopenharmony_ci/* Indicate that a timeout occurred on a heartbeat region write. The 7862306a36Sopenharmony_ci * other nodes in the cluster may consider us dead at that time so we 7962306a36Sopenharmony_ci * want to "fence" ourselves so that we don't scribble on the disk 8062306a36Sopenharmony_ci * after they think they've recovered us. This can't solve all 8162306a36Sopenharmony_ci * problems related to writeout after recovery but this hack can at 8262306a36Sopenharmony_ci * least close some of those gaps. When we have real fencing, this can 8362306a36Sopenharmony_ci * go away as our node would be fenced externally before other nodes 8462306a36Sopenharmony_ci * begin recovery. */ 8562306a36Sopenharmony_civoid o2quo_disk_timeout(void) 8662306a36Sopenharmony_ci{ 8762306a36Sopenharmony_ci o2quo_fence_self(); 8862306a36Sopenharmony_ci} 8962306a36Sopenharmony_ci 9062306a36Sopenharmony_cistatic void o2quo_make_decision(struct work_struct *work) 9162306a36Sopenharmony_ci{ 9262306a36Sopenharmony_ci int quorum; 9362306a36Sopenharmony_ci int lowest_hb, lowest_reachable = 0, fence = 0; 9462306a36Sopenharmony_ci struct o2quo_state *qs = &o2quo_state; 9562306a36Sopenharmony_ci 9662306a36Sopenharmony_ci spin_lock_bh(&qs->qs_lock); 9762306a36Sopenharmony_ci 9862306a36Sopenharmony_ci lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES); 9962306a36Sopenharmony_ci if (lowest_hb != O2NM_MAX_NODES) 10062306a36Sopenharmony_ci lowest_reachable = test_bit(lowest_hb, qs->qs_conn_bm); 10162306a36Sopenharmony_ci 10262306a36Sopenharmony_ci mlog(0, "heartbeating: %d, connected: %d, " 10362306a36Sopenharmony_ci "lowest: %d (%sreachable)\n", qs->qs_heartbeating, 10462306a36Sopenharmony_ci qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un"); 10562306a36Sopenharmony_ci 10662306a36Sopenharmony_ci if (!test_bit(o2nm_this_node(), qs->qs_hb_bm) || 10762306a36Sopenharmony_ci qs->qs_heartbeating == 1) 10862306a36Sopenharmony_ci goto out; 10962306a36Sopenharmony_ci 11062306a36Sopenharmony_ci if (qs->qs_heartbeating & 1) { 11162306a36Sopenharmony_ci /* the odd numbered cluster case is straight forward -- 11262306a36Sopenharmony_ci * if we can't talk to the majority we're hosed */ 11362306a36Sopenharmony_ci quorum = (qs->qs_heartbeating + 1)/2; 11462306a36Sopenharmony_ci if (qs->qs_connected < quorum) { 11562306a36Sopenharmony_ci mlog(ML_ERROR, "fencing this node because it is " 11662306a36Sopenharmony_ci "only connected to %u nodes and %u is needed " 11762306a36Sopenharmony_ci "to make a quorum out of %u heartbeating nodes\n", 11862306a36Sopenharmony_ci qs->qs_connected, quorum, 11962306a36Sopenharmony_ci qs->qs_heartbeating); 12062306a36Sopenharmony_ci fence = 1; 12162306a36Sopenharmony_ci } 12262306a36Sopenharmony_ci } else { 12362306a36Sopenharmony_ci /* the even numbered cluster adds the possibility of each half 12462306a36Sopenharmony_ci * of the cluster being able to talk amongst themselves.. in 12562306a36Sopenharmony_ci * that case we're hosed if we can't talk to the group that has 12662306a36Sopenharmony_ci * the lowest numbered node */ 12762306a36Sopenharmony_ci quorum = qs->qs_heartbeating / 2; 12862306a36Sopenharmony_ci if (qs->qs_connected < quorum) { 12962306a36Sopenharmony_ci mlog(ML_ERROR, "fencing this node because it is " 13062306a36Sopenharmony_ci "only connected to %u nodes and %u is needed " 13162306a36Sopenharmony_ci "to make a quorum out of %u heartbeating nodes\n", 13262306a36Sopenharmony_ci qs->qs_connected, quorum, 13362306a36Sopenharmony_ci qs->qs_heartbeating); 13462306a36Sopenharmony_ci fence = 1; 13562306a36Sopenharmony_ci } 13662306a36Sopenharmony_ci else if ((qs->qs_connected == quorum) && 13762306a36Sopenharmony_ci !lowest_reachable) { 13862306a36Sopenharmony_ci mlog(ML_ERROR, "fencing this node because it is " 13962306a36Sopenharmony_ci "connected to a half-quorum of %u out of %u " 14062306a36Sopenharmony_ci "nodes which doesn't include the lowest active " 14162306a36Sopenharmony_ci "node %u\n", quorum, qs->qs_heartbeating, 14262306a36Sopenharmony_ci lowest_hb); 14362306a36Sopenharmony_ci fence = 1; 14462306a36Sopenharmony_ci } 14562306a36Sopenharmony_ci } 14662306a36Sopenharmony_ci 14762306a36Sopenharmony_ciout: 14862306a36Sopenharmony_ci if (fence) { 14962306a36Sopenharmony_ci spin_unlock_bh(&qs->qs_lock); 15062306a36Sopenharmony_ci o2quo_fence_self(); 15162306a36Sopenharmony_ci } else { 15262306a36Sopenharmony_ci mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, " 15362306a36Sopenharmony_ci "connected: %d, lowest: %d (%sreachable)\n", 15462306a36Sopenharmony_ci qs->qs_heartbeating, qs->qs_connected, lowest_hb, 15562306a36Sopenharmony_ci lowest_reachable ? "" : "un"); 15662306a36Sopenharmony_ci spin_unlock_bh(&qs->qs_lock); 15762306a36Sopenharmony_ci 15862306a36Sopenharmony_ci } 15962306a36Sopenharmony_ci 16062306a36Sopenharmony_ci} 16162306a36Sopenharmony_ci 16262306a36Sopenharmony_cistatic void o2quo_set_hold(struct o2quo_state *qs, u8 node) 16362306a36Sopenharmony_ci{ 16462306a36Sopenharmony_ci assert_spin_locked(&qs->qs_lock); 16562306a36Sopenharmony_ci 16662306a36Sopenharmony_ci if (!test_and_set_bit(node, qs->qs_hold_bm)) { 16762306a36Sopenharmony_ci qs->qs_holds++; 16862306a36Sopenharmony_ci mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES, 16962306a36Sopenharmony_ci "node %u\n", node); 17062306a36Sopenharmony_ci mlog(0, "node %u, %d total\n", node, qs->qs_holds); 17162306a36Sopenharmony_ci } 17262306a36Sopenharmony_ci} 17362306a36Sopenharmony_ci 17462306a36Sopenharmony_cistatic void o2quo_clear_hold(struct o2quo_state *qs, u8 node) 17562306a36Sopenharmony_ci{ 17662306a36Sopenharmony_ci assert_spin_locked(&qs->qs_lock); 17762306a36Sopenharmony_ci 17862306a36Sopenharmony_ci if (test_and_clear_bit(node, qs->qs_hold_bm)) { 17962306a36Sopenharmony_ci mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1); 18062306a36Sopenharmony_ci if (--qs->qs_holds == 0) { 18162306a36Sopenharmony_ci if (qs->qs_pending) { 18262306a36Sopenharmony_ci qs->qs_pending = 0; 18362306a36Sopenharmony_ci schedule_work(&qs->qs_work); 18462306a36Sopenharmony_ci } 18562306a36Sopenharmony_ci } 18662306a36Sopenharmony_ci mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n", 18762306a36Sopenharmony_ci node, qs->qs_holds); 18862306a36Sopenharmony_ci } 18962306a36Sopenharmony_ci} 19062306a36Sopenharmony_ci 19162306a36Sopenharmony_ci/* as a node comes up we delay the quorum decision until we know the fate of 19262306a36Sopenharmony_ci * the connection. the hold will be droped in conn_up or hb_down. it might be 19362306a36Sopenharmony_ci * perpetuated by con_err until hb_down. if we already have a conn, we might 19462306a36Sopenharmony_ci * be dropping a hold that conn_up got. */ 19562306a36Sopenharmony_civoid o2quo_hb_up(u8 node) 19662306a36Sopenharmony_ci{ 19762306a36Sopenharmony_ci struct o2quo_state *qs = &o2quo_state; 19862306a36Sopenharmony_ci 19962306a36Sopenharmony_ci spin_lock_bh(&qs->qs_lock); 20062306a36Sopenharmony_ci 20162306a36Sopenharmony_ci qs->qs_heartbeating++; 20262306a36Sopenharmony_ci mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES, 20362306a36Sopenharmony_ci "node %u\n", node); 20462306a36Sopenharmony_ci mlog_bug_on_msg(test_bit(node, qs->qs_hb_bm), "node %u\n", node); 20562306a36Sopenharmony_ci set_bit(node, qs->qs_hb_bm); 20662306a36Sopenharmony_ci 20762306a36Sopenharmony_ci mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating); 20862306a36Sopenharmony_ci 20962306a36Sopenharmony_ci if (!test_bit(node, qs->qs_conn_bm)) 21062306a36Sopenharmony_ci o2quo_set_hold(qs, node); 21162306a36Sopenharmony_ci else 21262306a36Sopenharmony_ci o2quo_clear_hold(qs, node); 21362306a36Sopenharmony_ci 21462306a36Sopenharmony_ci spin_unlock_bh(&qs->qs_lock); 21562306a36Sopenharmony_ci} 21662306a36Sopenharmony_ci 21762306a36Sopenharmony_ci/* hb going down releases any holds we might have had due to this node from 21862306a36Sopenharmony_ci * conn_up, conn_err, or hb_up */ 21962306a36Sopenharmony_civoid o2quo_hb_down(u8 node) 22062306a36Sopenharmony_ci{ 22162306a36Sopenharmony_ci struct o2quo_state *qs = &o2quo_state; 22262306a36Sopenharmony_ci 22362306a36Sopenharmony_ci spin_lock_bh(&qs->qs_lock); 22462306a36Sopenharmony_ci 22562306a36Sopenharmony_ci qs->qs_heartbeating--; 22662306a36Sopenharmony_ci mlog_bug_on_msg(qs->qs_heartbeating < 0, 22762306a36Sopenharmony_ci "node %u, %d heartbeating\n", 22862306a36Sopenharmony_ci node, qs->qs_heartbeating); 22962306a36Sopenharmony_ci mlog_bug_on_msg(!test_bit(node, qs->qs_hb_bm), "node %u\n", node); 23062306a36Sopenharmony_ci clear_bit(node, qs->qs_hb_bm); 23162306a36Sopenharmony_ci 23262306a36Sopenharmony_ci mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating); 23362306a36Sopenharmony_ci 23462306a36Sopenharmony_ci o2quo_clear_hold(qs, node); 23562306a36Sopenharmony_ci 23662306a36Sopenharmony_ci spin_unlock_bh(&qs->qs_lock); 23762306a36Sopenharmony_ci} 23862306a36Sopenharmony_ci 23962306a36Sopenharmony_ci/* this tells us that we've decided that the node is still heartbeating 24062306a36Sopenharmony_ci * even though we've lost it's conn. it must only be called after conn_err 24162306a36Sopenharmony_ci * and indicates that we must now make a quorum decision in the future, 24262306a36Sopenharmony_ci * though we might be doing so after waiting for holds to drain. Here 24362306a36Sopenharmony_ci * we'll be dropping the hold from conn_err. */ 24462306a36Sopenharmony_civoid o2quo_hb_still_up(u8 node) 24562306a36Sopenharmony_ci{ 24662306a36Sopenharmony_ci struct o2quo_state *qs = &o2quo_state; 24762306a36Sopenharmony_ci 24862306a36Sopenharmony_ci spin_lock_bh(&qs->qs_lock); 24962306a36Sopenharmony_ci 25062306a36Sopenharmony_ci mlog(0, "node %u\n", node); 25162306a36Sopenharmony_ci 25262306a36Sopenharmony_ci qs->qs_pending = 1; 25362306a36Sopenharmony_ci o2quo_clear_hold(qs, node); 25462306a36Sopenharmony_ci 25562306a36Sopenharmony_ci spin_unlock_bh(&qs->qs_lock); 25662306a36Sopenharmony_ci} 25762306a36Sopenharmony_ci 25862306a36Sopenharmony_ci/* This is analogous to hb_up. as a node's connection comes up we delay the 25962306a36Sopenharmony_ci * quorum decision until we see it heartbeating. the hold will be droped in 26062306a36Sopenharmony_ci * hb_up or hb_down. it might be perpetuated by con_err until hb_down. if 26162306a36Sopenharmony_ci * it's already heartbeating we might be dropping a hold that conn_up got. 26262306a36Sopenharmony_ci * */ 26362306a36Sopenharmony_civoid o2quo_conn_up(u8 node) 26462306a36Sopenharmony_ci{ 26562306a36Sopenharmony_ci struct o2quo_state *qs = &o2quo_state; 26662306a36Sopenharmony_ci 26762306a36Sopenharmony_ci spin_lock_bh(&qs->qs_lock); 26862306a36Sopenharmony_ci 26962306a36Sopenharmony_ci qs->qs_connected++; 27062306a36Sopenharmony_ci mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES, 27162306a36Sopenharmony_ci "node %u\n", node); 27262306a36Sopenharmony_ci mlog_bug_on_msg(test_bit(node, qs->qs_conn_bm), "node %u\n", node); 27362306a36Sopenharmony_ci set_bit(node, qs->qs_conn_bm); 27462306a36Sopenharmony_ci 27562306a36Sopenharmony_ci mlog(0, "node %u, %d total\n", node, qs->qs_connected); 27662306a36Sopenharmony_ci 27762306a36Sopenharmony_ci if (!test_bit(node, qs->qs_hb_bm)) 27862306a36Sopenharmony_ci o2quo_set_hold(qs, node); 27962306a36Sopenharmony_ci else 28062306a36Sopenharmony_ci o2quo_clear_hold(qs, node); 28162306a36Sopenharmony_ci 28262306a36Sopenharmony_ci spin_unlock_bh(&qs->qs_lock); 28362306a36Sopenharmony_ci} 28462306a36Sopenharmony_ci 28562306a36Sopenharmony_ci/* we've decided that we won't ever be connecting to the node again. if it's 28662306a36Sopenharmony_ci * still heartbeating we grab a hold that will delay decisions until either the 28762306a36Sopenharmony_ci * node stops heartbeating from hb_down or the caller decides that the node is 28862306a36Sopenharmony_ci * still up and calls still_up */ 28962306a36Sopenharmony_civoid o2quo_conn_err(u8 node) 29062306a36Sopenharmony_ci{ 29162306a36Sopenharmony_ci struct o2quo_state *qs = &o2quo_state; 29262306a36Sopenharmony_ci 29362306a36Sopenharmony_ci spin_lock_bh(&qs->qs_lock); 29462306a36Sopenharmony_ci 29562306a36Sopenharmony_ci if (test_bit(node, qs->qs_conn_bm)) { 29662306a36Sopenharmony_ci qs->qs_connected--; 29762306a36Sopenharmony_ci mlog_bug_on_msg(qs->qs_connected < 0, 29862306a36Sopenharmony_ci "node %u, connected %d\n", 29962306a36Sopenharmony_ci node, qs->qs_connected); 30062306a36Sopenharmony_ci 30162306a36Sopenharmony_ci clear_bit(node, qs->qs_conn_bm); 30262306a36Sopenharmony_ci 30362306a36Sopenharmony_ci if (test_bit(node, qs->qs_hb_bm)) 30462306a36Sopenharmony_ci o2quo_set_hold(qs, node); 30562306a36Sopenharmony_ci } 30662306a36Sopenharmony_ci 30762306a36Sopenharmony_ci mlog(0, "node %u, %d total\n", node, qs->qs_connected); 30862306a36Sopenharmony_ci 30962306a36Sopenharmony_ci 31062306a36Sopenharmony_ci spin_unlock_bh(&qs->qs_lock); 31162306a36Sopenharmony_ci} 31262306a36Sopenharmony_ci 31362306a36Sopenharmony_civoid o2quo_init(void) 31462306a36Sopenharmony_ci{ 31562306a36Sopenharmony_ci struct o2quo_state *qs = &o2quo_state; 31662306a36Sopenharmony_ci 31762306a36Sopenharmony_ci spin_lock_init(&qs->qs_lock); 31862306a36Sopenharmony_ci INIT_WORK(&qs->qs_work, o2quo_make_decision); 31962306a36Sopenharmony_ci} 32062306a36Sopenharmony_ci 32162306a36Sopenharmony_civoid o2quo_exit(void) 32262306a36Sopenharmony_ci{ 32362306a36Sopenharmony_ci struct o2quo_state *qs = &o2quo_state; 32462306a36Sopenharmony_ci 32562306a36Sopenharmony_ci flush_work(&qs->qs_work); 32662306a36Sopenharmony_ci} 327