18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only
28c2ecf20Sopenharmony_ci/******************************************************************************
38c2ecf20Sopenharmony_ci*******************************************************************************
48c2ecf20Sopenharmony_ci**
58c2ecf20Sopenharmony_ci**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
68c2ecf20Sopenharmony_ci**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
78c2ecf20Sopenharmony_ci**
88c2ecf20Sopenharmony_ci**
98c2ecf20Sopenharmony_ci*******************************************************************************
108c2ecf20Sopenharmony_ci******************************************************************************/
118c2ecf20Sopenharmony_ci
128c2ecf20Sopenharmony_ci#include "dlm_internal.h"
138c2ecf20Sopenharmony_ci#include "lockspace.h"
148c2ecf20Sopenharmony_ci#include "dir.h"
158c2ecf20Sopenharmony_ci#include "config.h"
168c2ecf20Sopenharmony_ci#include "ast.h"
178c2ecf20Sopenharmony_ci#include "memory.h"
188c2ecf20Sopenharmony_ci#include "rcom.h"
198c2ecf20Sopenharmony_ci#include "lock.h"
208c2ecf20Sopenharmony_ci#include "lowcomms.h"
218c2ecf20Sopenharmony_ci#include "member.h"
228c2ecf20Sopenharmony_ci#include "recover.h"
238c2ecf20Sopenharmony_ci
248c2ecf20Sopenharmony_ci
258c2ecf20Sopenharmony_ci/*
268c2ecf20Sopenharmony_ci * Recovery waiting routines: these functions wait for a particular reply from
278c2ecf20Sopenharmony_ci * a remote node, or for the remote node to report a certain status.  They need
288c2ecf20Sopenharmony_ci * to abort if the lockspace is stopped indicating a node has failed (perhaps
298c2ecf20Sopenharmony_ci * the one being waited for).
308c2ecf20Sopenharmony_ci */
318c2ecf20Sopenharmony_ci
328c2ecf20Sopenharmony_ci/*
338c2ecf20Sopenharmony_ci * Wait until given function returns non-zero or lockspace is stopped
348c2ecf20Sopenharmony_ci * (LS_RECOVERY_STOP set due to failure of a node in ls_nodes).  When another
358c2ecf20Sopenharmony_ci * function thinks it could have completed the waited-on task, they should wake
368c2ecf20Sopenharmony_ci * up ls_wait_general to get an immediate response rather than waiting for the
378c2ecf20Sopenharmony_ci * timeout.  This uses a timeout so it can check periodically if the wait
388c2ecf20Sopenharmony_ci * should abort due to node failure (which doesn't cause a wake_up).
398c2ecf20Sopenharmony_ci * This should only be called by the dlm_recoverd thread.
408c2ecf20Sopenharmony_ci */
418c2ecf20Sopenharmony_ci
428c2ecf20Sopenharmony_ciint dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
438c2ecf20Sopenharmony_ci{
448c2ecf20Sopenharmony_ci	int error = 0;
458c2ecf20Sopenharmony_ci	int rv;
468c2ecf20Sopenharmony_ci
478c2ecf20Sopenharmony_ci	while (1) {
488c2ecf20Sopenharmony_ci		rv = wait_event_timeout(ls->ls_wait_general,
498c2ecf20Sopenharmony_ci					testfn(ls) || dlm_recovery_stopped(ls),
508c2ecf20Sopenharmony_ci					dlm_config.ci_recover_timer * HZ);
518c2ecf20Sopenharmony_ci		if (rv)
528c2ecf20Sopenharmony_ci			break;
538c2ecf20Sopenharmony_ci		if (test_bit(LSFL_RCOM_WAIT, &ls->ls_flags)) {
548c2ecf20Sopenharmony_ci			log_debug(ls, "dlm_wait_function timed out");
558c2ecf20Sopenharmony_ci			return -ETIMEDOUT;
568c2ecf20Sopenharmony_ci		}
578c2ecf20Sopenharmony_ci	}
588c2ecf20Sopenharmony_ci
598c2ecf20Sopenharmony_ci	if (dlm_recovery_stopped(ls)) {
608c2ecf20Sopenharmony_ci		log_debug(ls, "dlm_wait_function aborted");
618c2ecf20Sopenharmony_ci		error = -EINTR;
628c2ecf20Sopenharmony_ci	}
638c2ecf20Sopenharmony_ci	return error;
648c2ecf20Sopenharmony_ci}
658c2ecf20Sopenharmony_ci
668c2ecf20Sopenharmony_ci/*
678c2ecf20Sopenharmony_ci * An efficient way for all nodes to wait for all others to have a certain
688c2ecf20Sopenharmony_ci * status.  The node with the lowest nodeid polls all the others for their
698c2ecf20Sopenharmony_ci * status (wait_status_all) and all the others poll the node with the low id
708c2ecf20Sopenharmony_ci * for its accumulated result (wait_status_low).  When all nodes have set
718c2ecf20Sopenharmony_ci * status flag X, then status flag X_ALL will be set on the low nodeid.
728c2ecf20Sopenharmony_ci */
738c2ecf20Sopenharmony_ci
748c2ecf20Sopenharmony_ciuint32_t dlm_recover_status(struct dlm_ls *ls)
758c2ecf20Sopenharmony_ci{
768c2ecf20Sopenharmony_ci	uint32_t status;
778c2ecf20Sopenharmony_ci	spin_lock(&ls->ls_recover_lock);
788c2ecf20Sopenharmony_ci	status = ls->ls_recover_status;
798c2ecf20Sopenharmony_ci	spin_unlock(&ls->ls_recover_lock);
808c2ecf20Sopenharmony_ci	return status;
818c2ecf20Sopenharmony_ci}
828c2ecf20Sopenharmony_ci
838c2ecf20Sopenharmony_cistatic void _set_recover_status(struct dlm_ls *ls, uint32_t status)
848c2ecf20Sopenharmony_ci{
858c2ecf20Sopenharmony_ci	ls->ls_recover_status |= status;
868c2ecf20Sopenharmony_ci}
878c2ecf20Sopenharmony_ci
888c2ecf20Sopenharmony_civoid dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
898c2ecf20Sopenharmony_ci{
908c2ecf20Sopenharmony_ci	spin_lock(&ls->ls_recover_lock);
918c2ecf20Sopenharmony_ci	_set_recover_status(ls, status);
928c2ecf20Sopenharmony_ci	spin_unlock(&ls->ls_recover_lock);
938c2ecf20Sopenharmony_ci}
948c2ecf20Sopenharmony_ci
958c2ecf20Sopenharmony_cistatic int wait_status_all(struct dlm_ls *ls, uint32_t wait_status,
968c2ecf20Sopenharmony_ci			   int save_slots)
978c2ecf20Sopenharmony_ci{
988c2ecf20Sopenharmony_ci	struct dlm_rcom *rc = ls->ls_recover_buf;
998c2ecf20Sopenharmony_ci	struct dlm_member *memb;
1008c2ecf20Sopenharmony_ci	int error = 0, delay;
1018c2ecf20Sopenharmony_ci
1028c2ecf20Sopenharmony_ci	list_for_each_entry(memb, &ls->ls_nodes, list) {
1038c2ecf20Sopenharmony_ci		delay = 0;
1048c2ecf20Sopenharmony_ci		for (;;) {
1058c2ecf20Sopenharmony_ci			if (dlm_recovery_stopped(ls)) {
1068c2ecf20Sopenharmony_ci				error = -EINTR;
1078c2ecf20Sopenharmony_ci				goto out;
1088c2ecf20Sopenharmony_ci			}
1098c2ecf20Sopenharmony_ci
1108c2ecf20Sopenharmony_ci			error = dlm_rcom_status(ls, memb->nodeid, 0);
1118c2ecf20Sopenharmony_ci			if (error)
1128c2ecf20Sopenharmony_ci				goto out;
1138c2ecf20Sopenharmony_ci
1148c2ecf20Sopenharmony_ci			if (save_slots)
1158c2ecf20Sopenharmony_ci				dlm_slot_save(ls, rc, memb);
1168c2ecf20Sopenharmony_ci
1178c2ecf20Sopenharmony_ci			if (rc->rc_result & wait_status)
1188c2ecf20Sopenharmony_ci				break;
1198c2ecf20Sopenharmony_ci			if (delay < 1000)
1208c2ecf20Sopenharmony_ci				delay += 20;
1218c2ecf20Sopenharmony_ci			msleep(delay);
1228c2ecf20Sopenharmony_ci		}
1238c2ecf20Sopenharmony_ci	}
1248c2ecf20Sopenharmony_ci out:
1258c2ecf20Sopenharmony_ci	return error;
1268c2ecf20Sopenharmony_ci}
1278c2ecf20Sopenharmony_ci
1288c2ecf20Sopenharmony_cistatic int wait_status_low(struct dlm_ls *ls, uint32_t wait_status,
1298c2ecf20Sopenharmony_ci			   uint32_t status_flags)
1308c2ecf20Sopenharmony_ci{
1318c2ecf20Sopenharmony_ci	struct dlm_rcom *rc = ls->ls_recover_buf;
1328c2ecf20Sopenharmony_ci	int error = 0, delay = 0, nodeid = ls->ls_low_nodeid;
1338c2ecf20Sopenharmony_ci
1348c2ecf20Sopenharmony_ci	for (;;) {
1358c2ecf20Sopenharmony_ci		if (dlm_recovery_stopped(ls)) {
1368c2ecf20Sopenharmony_ci			error = -EINTR;
1378c2ecf20Sopenharmony_ci			goto out;
1388c2ecf20Sopenharmony_ci		}
1398c2ecf20Sopenharmony_ci
1408c2ecf20Sopenharmony_ci		error = dlm_rcom_status(ls, nodeid, status_flags);
1418c2ecf20Sopenharmony_ci		if (error)
1428c2ecf20Sopenharmony_ci			break;
1438c2ecf20Sopenharmony_ci
1448c2ecf20Sopenharmony_ci		if (rc->rc_result & wait_status)
1458c2ecf20Sopenharmony_ci			break;
1468c2ecf20Sopenharmony_ci		if (delay < 1000)
1478c2ecf20Sopenharmony_ci			delay += 20;
1488c2ecf20Sopenharmony_ci		msleep(delay);
1498c2ecf20Sopenharmony_ci	}
1508c2ecf20Sopenharmony_ci out:
1518c2ecf20Sopenharmony_ci	return error;
1528c2ecf20Sopenharmony_ci}
1538c2ecf20Sopenharmony_ci
1548c2ecf20Sopenharmony_cistatic int wait_status(struct dlm_ls *ls, uint32_t status)
1558c2ecf20Sopenharmony_ci{
1568c2ecf20Sopenharmony_ci	uint32_t status_all = status << 1;
1578c2ecf20Sopenharmony_ci	int error;
1588c2ecf20Sopenharmony_ci
1598c2ecf20Sopenharmony_ci	if (ls->ls_low_nodeid == dlm_our_nodeid()) {
1608c2ecf20Sopenharmony_ci		error = wait_status_all(ls, status, 0);
1618c2ecf20Sopenharmony_ci		if (!error)
1628c2ecf20Sopenharmony_ci			dlm_set_recover_status(ls, status_all);
1638c2ecf20Sopenharmony_ci	} else
1648c2ecf20Sopenharmony_ci		error = wait_status_low(ls, status_all, 0);
1658c2ecf20Sopenharmony_ci
1668c2ecf20Sopenharmony_ci	return error;
1678c2ecf20Sopenharmony_ci}
1688c2ecf20Sopenharmony_ci
1698c2ecf20Sopenharmony_ciint dlm_recover_members_wait(struct dlm_ls *ls)
1708c2ecf20Sopenharmony_ci{
1718c2ecf20Sopenharmony_ci	struct dlm_member *memb;
1728c2ecf20Sopenharmony_ci	struct dlm_slot *slots;
1738c2ecf20Sopenharmony_ci	int num_slots, slots_size;
1748c2ecf20Sopenharmony_ci	int error, rv;
1758c2ecf20Sopenharmony_ci	uint32_t gen;
1768c2ecf20Sopenharmony_ci
1778c2ecf20Sopenharmony_ci	list_for_each_entry(memb, &ls->ls_nodes, list) {
1788c2ecf20Sopenharmony_ci		memb->slot = -1;
1798c2ecf20Sopenharmony_ci		memb->generation = 0;
1808c2ecf20Sopenharmony_ci	}
1818c2ecf20Sopenharmony_ci
1828c2ecf20Sopenharmony_ci	if (ls->ls_low_nodeid == dlm_our_nodeid()) {
1838c2ecf20Sopenharmony_ci		error = wait_status_all(ls, DLM_RS_NODES, 1);
1848c2ecf20Sopenharmony_ci		if (error)
1858c2ecf20Sopenharmony_ci			goto out;
1868c2ecf20Sopenharmony_ci
1878c2ecf20Sopenharmony_ci		/* slots array is sparse, slots_size may be > num_slots */
1888c2ecf20Sopenharmony_ci
1898c2ecf20Sopenharmony_ci		rv = dlm_slots_assign(ls, &num_slots, &slots_size, &slots, &gen);
1908c2ecf20Sopenharmony_ci		if (!rv) {
1918c2ecf20Sopenharmony_ci			spin_lock(&ls->ls_recover_lock);
1928c2ecf20Sopenharmony_ci			_set_recover_status(ls, DLM_RS_NODES_ALL);
1938c2ecf20Sopenharmony_ci			ls->ls_num_slots = num_slots;
1948c2ecf20Sopenharmony_ci			ls->ls_slots_size = slots_size;
1958c2ecf20Sopenharmony_ci			ls->ls_slots = slots;
1968c2ecf20Sopenharmony_ci			ls->ls_generation = gen;
1978c2ecf20Sopenharmony_ci			spin_unlock(&ls->ls_recover_lock);
1988c2ecf20Sopenharmony_ci		} else {
1998c2ecf20Sopenharmony_ci			dlm_set_recover_status(ls, DLM_RS_NODES_ALL);
2008c2ecf20Sopenharmony_ci		}
2018c2ecf20Sopenharmony_ci	} else {
2028c2ecf20Sopenharmony_ci		error = wait_status_low(ls, DLM_RS_NODES_ALL, DLM_RSF_NEED_SLOTS);
2038c2ecf20Sopenharmony_ci		if (error)
2048c2ecf20Sopenharmony_ci			goto out;
2058c2ecf20Sopenharmony_ci
2068c2ecf20Sopenharmony_ci		dlm_slots_copy_in(ls);
2078c2ecf20Sopenharmony_ci	}
2088c2ecf20Sopenharmony_ci out:
2098c2ecf20Sopenharmony_ci	return error;
2108c2ecf20Sopenharmony_ci}
2118c2ecf20Sopenharmony_ci
2128c2ecf20Sopenharmony_ciint dlm_recover_directory_wait(struct dlm_ls *ls)
2138c2ecf20Sopenharmony_ci{
2148c2ecf20Sopenharmony_ci	return wait_status(ls, DLM_RS_DIR);
2158c2ecf20Sopenharmony_ci}
2168c2ecf20Sopenharmony_ci
2178c2ecf20Sopenharmony_ciint dlm_recover_locks_wait(struct dlm_ls *ls)
2188c2ecf20Sopenharmony_ci{
2198c2ecf20Sopenharmony_ci	return wait_status(ls, DLM_RS_LOCKS);
2208c2ecf20Sopenharmony_ci}
2218c2ecf20Sopenharmony_ci
2228c2ecf20Sopenharmony_ciint dlm_recover_done_wait(struct dlm_ls *ls)
2238c2ecf20Sopenharmony_ci{
2248c2ecf20Sopenharmony_ci	return wait_status(ls, DLM_RS_DONE);
2258c2ecf20Sopenharmony_ci}
2268c2ecf20Sopenharmony_ci
2278c2ecf20Sopenharmony_ci/*
2288c2ecf20Sopenharmony_ci * The recover_list contains all the rsb's for which we've requested the new
2298c2ecf20Sopenharmony_ci * master nodeid.  As replies are returned from the resource directories the
2308c2ecf20Sopenharmony_ci * rsb's are removed from the list.  When the list is empty we're done.
2318c2ecf20Sopenharmony_ci *
2328c2ecf20Sopenharmony_ci * The recover_list is later similarly used for all rsb's for which we've sent
2338c2ecf20Sopenharmony_ci * new lkb's and need to receive new corresponding lkid's.
2348c2ecf20Sopenharmony_ci *
2358c2ecf20Sopenharmony_ci * We use the address of the rsb struct as a simple local identifier for the
2368c2ecf20Sopenharmony_ci * rsb so we can match an rcom reply with the rsb it was sent for.
2378c2ecf20Sopenharmony_ci */
2388c2ecf20Sopenharmony_ci
2398c2ecf20Sopenharmony_cistatic int recover_list_empty(struct dlm_ls *ls)
2408c2ecf20Sopenharmony_ci{
2418c2ecf20Sopenharmony_ci	int empty;
2428c2ecf20Sopenharmony_ci
2438c2ecf20Sopenharmony_ci	spin_lock(&ls->ls_recover_list_lock);
2448c2ecf20Sopenharmony_ci	empty = list_empty(&ls->ls_recover_list);
2458c2ecf20Sopenharmony_ci	spin_unlock(&ls->ls_recover_list_lock);
2468c2ecf20Sopenharmony_ci
2478c2ecf20Sopenharmony_ci	return empty;
2488c2ecf20Sopenharmony_ci}
2498c2ecf20Sopenharmony_ci
2508c2ecf20Sopenharmony_cistatic void recover_list_add(struct dlm_rsb *r)
2518c2ecf20Sopenharmony_ci{
2528c2ecf20Sopenharmony_ci	struct dlm_ls *ls = r->res_ls;
2538c2ecf20Sopenharmony_ci
2548c2ecf20Sopenharmony_ci	spin_lock(&ls->ls_recover_list_lock);
2558c2ecf20Sopenharmony_ci	if (list_empty(&r->res_recover_list)) {
2568c2ecf20Sopenharmony_ci		list_add_tail(&r->res_recover_list, &ls->ls_recover_list);
2578c2ecf20Sopenharmony_ci		ls->ls_recover_list_count++;
2588c2ecf20Sopenharmony_ci		dlm_hold_rsb(r);
2598c2ecf20Sopenharmony_ci	}
2608c2ecf20Sopenharmony_ci	spin_unlock(&ls->ls_recover_list_lock);
2618c2ecf20Sopenharmony_ci}
2628c2ecf20Sopenharmony_ci
2638c2ecf20Sopenharmony_cistatic void recover_list_del(struct dlm_rsb *r)
2648c2ecf20Sopenharmony_ci{
2658c2ecf20Sopenharmony_ci	struct dlm_ls *ls = r->res_ls;
2668c2ecf20Sopenharmony_ci
2678c2ecf20Sopenharmony_ci	spin_lock(&ls->ls_recover_list_lock);
2688c2ecf20Sopenharmony_ci	list_del_init(&r->res_recover_list);
2698c2ecf20Sopenharmony_ci	ls->ls_recover_list_count--;
2708c2ecf20Sopenharmony_ci	spin_unlock(&ls->ls_recover_list_lock);
2718c2ecf20Sopenharmony_ci
2728c2ecf20Sopenharmony_ci	dlm_put_rsb(r);
2738c2ecf20Sopenharmony_ci}
2748c2ecf20Sopenharmony_ci
2758c2ecf20Sopenharmony_cistatic void recover_list_clear(struct dlm_ls *ls)
2768c2ecf20Sopenharmony_ci{
2778c2ecf20Sopenharmony_ci	struct dlm_rsb *r, *s;
2788c2ecf20Sopenharmony_ci
2798c2ecf20Sopenharmony_ci	spin_lock(&ls->ls_recover_list_lock);
2808c2ecf20Sopenharmony_ci	list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) {
2818c2ecf20Sopenharmony_ci		list_del_init(&r->res_recover_list);
2828c2ecf20Sopenharmony_ci		r->res_recover_locks_count = 0;
2838c2ecf20Sopenharmony_ci		dlm_put_rsb(r);
2848c2ecf20Sopenharmony_ci		ls->ls_recover_list_count--;
2858c2ecf20Sopenharmony_ci	}
2868c2ecf20Sopenharmony_ci
2878c2ecf20Sopenharmony_ci	if (ls->ls_recover_list_count != 0) {
2888c2ecf20Sopenharmony_ci		log_error(ls, "warning: recover_list_count %d",
2898c2ecf20Sopenharmony_ci			  ls->ls_recover_list_count);
2908c2ecf20Sopenharmony_ci		ls->ls_recover_list_count = 0;
2918c2ecf20Sopenharmony_ci	}
2928c2ecf20Sopenharmony_ci	spin_unlock(&ls->ls_recover_list_lock);
2938c2ecf20Sopenharmony_ci}
2948c2ecf20Sopenharmony_ci
2958c2ecf20Sopenharmony_cistatic int recover_idr_empty(struct dlm_ls *ls)
2968c2ecf20Sopenharmony_ci{
2978c2ecf20Sopenharmony_ci	int empty = 1;
2988c2ecf20Sopenharmony_ci
2998c2ecf20Sopenharmony_ci	spin_lock(&ls->ls_recover_idr_lock);
3008c2ecf20Sopenharmony_ci	if (ls->ls_recover_list_count)
3018c2ecf20Sopenharmony_ci		empty = 0;
3028c2ecf20Sopenharmony_ci	spin_unlock(&ls->ls_recover_idr_lock);
3038c2ecf20Sopenharmony_ci
3048c2ecf20Sopenharmony_ci	return empty;
3058c2ecf20Sopenharmony_ci}
3068c2ecf20Sopenharmony_ci
3078c2ecf20Sopenharmony_cistatic int recover_idr_add(struct dlm_rsb *r)
3088c2ecf20Sopenharmony_ci{
3098c2ecf20Sopenharmony_ci	struct dlm_ls *ls = r->res_ls;
3108c2ecf20Sopenharmony_ci	int rv;
3118c2ecf20Sopenharmony_ci
3128c2ecf20Sopenharmony_ci	idr_preload(GFP_NOFS);
3138c2ecf20Sopenharmony_ci	spin_lock(&ls->ls_recover_idr_lock);
3148c2ecf20Sopenharmony_ci	if (r->res_id) {
3158c2ecf20Sopenharmony_ci		rv = -1;
3168c2ecf20Sopenharmony_ci		goto out_unlock;
3178c2ecf20Sopenharmony_ci	}
3188c2ecf20Sopenharmony_ci	rv = idr_alloc(&ls->ls_recover_idr, r, 1, 0, GFP_NOWAIT);
3198c2ecf20Sopenharmony_ci	if (rv < 0)
3208c2ecf20Sopenharmony_ci		goto out_unlock;
3218c2ecf20Sopenharmony_ci
3228c2ecf20Sopenharmony_ci	r->res_id = rv;
3238c2ecf20Sopenharmony_ci	ls->ls_recover_list_count++;
3248c2ecf20Sopenharmony_ci	dlm_hold_rsb(r);
3258c2ecf20Sopenharmony_ci	rv = 0;
3268c2ecf20Sopenharmony_ciout_unlock:
3278c2ecf20Sopenharmony_ci	spin_unlock(&ls->ls_recover_idr_lock);
3288c2ecf20Sopenharmony_ci	idr_preload_end();
3298c2ecf20Sopenharmony_ci	return rv;
3308c2ecf20Sopenharmony_ci}
3318c2ecf20Sopenharmony_ci
3328c2ecf20Sopenharmony_cistatic void recover_idr_del(struct dlm_rsb *r)
3338c2ecf20Sopenharmony_ci{
3348c2ecf20Sopenharmony_ci	struct dlm_ls *ls = r->res_ls;
3358c2ecf20Sopenharmony_ci
3368c2ecf20Sopenharmony_ci	spin_lock(&ls->ls_recover_idr_lock);
3378c2ecf20Sopenharmony_ci	idr_remove(&ls->ls_recover_idr, r->res_id);
3388c2ecf20Sopenharmony_ci	r->res_id = 0;
3398c2ecf20Sopenharmony_ci	ls->ls_recover_list_count--;
3408c2ecf20Sopenharmony_ci	spin_unlock(&ls->ls_recover_idr_lock);
3418c2ecf20Sopenharmony_ci
3428c2ecf20Sopenharmony_ci	dlm_put_rsb(r);
3438c2ecf20Sopenharmony_ci}
3448c2ecf20Sopenharmony_ci
3458c2ecf20Sopenharmony_cistatic struct dlm_rsb *recover_idr_find(struct dlm_ls *ls, uint64_t id)
3468c2ecf20Sopenharmony_ci{
3478c2ecf20Sopenharmony_ci	struct dlm_rsb *r;
3488c2ecf20Sopenharmony_ci
3498c2ecf20Sopenharmony_ci	spin_lock(&ls->ls_recover_idr_lock);
3508c2ecf20Sopenharmony_ci	r = idr_find(&ls->ls_recover_idr, (int)id);
3518c2ecf20Sopenharmony_ci	spin_unlock(&ls->ls_recover_idr_lock);
3528c2ecf20Sopenharmony_ci	return r;
3538c2ecf20Sopenharmony_ci}
3548c2ecf20Sopenharmony_ci
3558c2ecf20Sopenharmony_cistatic void recover_idr_clear(struct dlm_ls *ls)
3568c2ecf20Sopenharmony_ci{
3578c2ecf20Sopenharmony_ci	struct dlm_rsb *r;
3588c2ecf20Sopenharmony_ci	int id;
3598c2ecf20Sopenharmony_ci
3608c2ecf20Sopenharmony_ci	spin_lock(&ls->ls_recover_idr_lock);
3618c2ecf20Sopenharmony_ci
3628c2ecf20Sopenharmony_ci	idr_for_each_entry(&ls->ls_recover_idr, r, id) {
3638c2ecf20Sopenharmony_ci		idr_remove(&ls->ls_recover_idr, id);
3648c2ecf20Sopenharmony_ci		r->res_id = 0;
3658c2ecf20Sopenharmony_ci		r->res_recover_locks_count = 0;
3668c2ecf20Sopenharmony_ci		ls->ls_recover_list_count--;
3678c2ecf20Sopenharmony_ci
3688c2ecf20Sopenharmony_ci		dlm_put_rsb(r);
3698c2ecf20Sopenharmony_ci	}
3708c2ecf20Sopenharmony_ci
3718c2ecf20Sopenharmony_ci	if (ls->ls_recover_list_count != 0) {
3728c2ecf20Sopenharmony_ci		log_error(ls, "warning: recover_list_count %d",
3738c2ecf20Sopenharmony_ci			  ls->ls_recover_list_count);
3748c2ecf20Sopenharmony_ci		ls->ls_recover_list_count = 0;
3758c2ecf20Sopenharmony_ci	}
3768c2ecf20Sopenharmony_ci	spin_unlock(&ls->ls_recover_idr_lock);
3778c2ecf20Sopenharmony_ci}
3788c2ecf20Sopenharmony_ci
3798c2ecf20Sopenharmony_ci
3808c2ecf20Sopenharmony_ci/* Master recovery: find new master node for rsb's that were
3818c2ecf20Sopenharmony_ci   mastered on nodes that have been removed.
3828c2ecf20Sopenharmony_ci
3838c2ecf20Sopenharmony_ci   dlm_recover_masters
3848c2ecf20Sopenharmony_ci   recover_master
3858c2ecf20Sopenharmony_ci   dlm_send_rcom_lookup            ->  receive_rcom_lookup
3868c2ecf20Sopenharmony_ci                                       dlm_dir_lookup
3878c2ecf20Sopenharmony_ci   receive_rcom_lookup_reply       <-
3888c2ecf20Sopenharmony_ci   dlm_recover_master_reply
3898c2ecf20Sopenharmony_ci   set_new_master
3908c2ecf20Sopenharmony_ci   set_master_lkbs
3918c2ecf20Sopenharmony_ci   set_lock_master
3928c2ecf20Sopenharmony_ci*/
3938c2ecf20Sopenharmony_ci
3948c2ecf20Sopenharmony_ci/*
3958c2ecf20Sopenharmony_ci * Set the lock master for all LKBs in a lock queue
3968c2ecf20Sopenharmony_ci * If we are the new master of the rsb, we may have received new
3978c2ecf20Sopenharmony_ci * MSTCPY locks from other nodes already which we need to ignore
3988c2ecf20Sopenharmony_ci * when setting the new nodeid.
3998c2ecf20Sopenharmony_ci */
4008c2ecf20Sopenharmony_ci
4018c2ecf20Sopenharmony_cistatic void set_lock_master(struct list_head *queue, int nodeid)
4028c2ecf20Sopenharmony_ci{
4038c2ecf20Sopenharmony_ci	struct dlm_lkb *lkb;
4048c2ecf20Sopenharmony_ci
4058c2ecf20Sopenharmony_ci	list_for_each_entry(lkb, queue, lkb_statequeue) {
4068c2ecf20Sopenharmony_ci		if (!(lkb->lkb_flags & DLM_IFL_MSTCPY)) {
4078c2ecf20Sopenharmony_ci			lkb->lkb_nodeid = nodeid;
4088c2ecf20Sopenharmony_ci			lkb->lkb_remid = 0;
4098c2ecf20Sopenharmony_ci		}
4108c2ecf20Sopenharmony_ci	}
4118c2ecf20Sopenharmony_ci}
4128c2ecf20Sopenharmony_ci
4138c2ecf20Sopenharmony_cistatic void set_master_lkbs(struct dlm_rsb *r)
4148c2ecf20Sopenharmony_ci{
4158c2ecf20Sopenharmony_ci	set_lock_master(&r->res_grantqueue, r->res_nodeid);
4168c2ecf20Sopenharmony_ci	set_lock_master(&r->res_convertqueue, r->res_nodeid);
4178c2ecf20Sopenharmony_ci	set_lock_master(&r->res_waitqueue, r->res_nodeid);
4188c2ecf20Sopenharmony_ci}
4198c2ecf20Sopenharmony_ci
4208c2ecf20Sopenharmony_ci/*
4218c2ecf20Sopenharmony_ci * Propagate the new master nodeid to locks
4228c2ecf20Sopenharmony_ci * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
4238c2ecf20Sopenharmony_ci * The NEW_MASTER2 flag tells recover_lvb() and recover_grant() which
4248c2ecf20Sopenharmony_ci * rsb's to consider.
4258c2ecf20Sopenharmony_ci */
4268c2ecf20Sopenharmony_ci
4278c2ecf20Sopenharmony_cistatic void set_new_master(struct dlm_rsb *r)
4288c2ecf20Sopenharmony_ci{
4298c2ecf20Sopenharmony_ci	set_master_lkbs(r);
4308c2ecf20Sopenharmony_ci	rsb_set_flag(r, RSB_NEW_MASTER);
4318c2ecf20Sopenharmony_ci	rsb_set_flag(r, RSB_NEW_MASTER2);
4328c2ecf20Sopenharmony_ci}
4338c2ecf20Sopenharmony_ci
4348c2ecf20Sopenharmony_ci/*
4358c2ecf20Sopenharmony_ci * We do async lookups on rsb's that need new masters.  The rsb's
4368c2ecf20Sopenharmony_ci * waiting for a lookup reply are kept on the recover_list.
4378c2ecf20Sopenharmony_ci *
4388c2ecf20Sopenharmony_ci * Another node recovering the master may have sent us a rcom lookup,
4398c2ecf20Sopenharmony_ci * and our dlm_master_lookup() set it as the new master, along with
4408c2ecf20Sopenharmony_ci * NEW_MASTER so that we'll recover it here (this implies dir_nodeid
4418c2ecf20Sopenharmony_ci * equals our_nodeid below).
4428c2ecf20Sopenharmony_ci */
4438c2ecf20Sopenharmony_ci
4448c2ecf20Sopenharmony_cistatic int recover_master(struct dlm_rsb *r, unsigned int *count)
4458c2ecf20Sopenharmony_ci{
4468c2ecf20Sopenharmony_ci	struct dlm_ls *ls = r->res_ls;
4478c2ecf20Sopenharmony_ci	int our_nodeid, dir_nodeid;
4488c2ecf20Sopenharmony_ci	int is_removed = 0;
4498c2ecf20Sopenharmony_ci	int error;
4508c2ecf20Sopenharmony_ci
4518c2ecf20Sopenharmony_ci	if (is_master(r))
4528c2ecf20Sopenharmony_ci		return 0;
4538c2ecf20Sopenharmony_ci
4548c2ecf20Sopenharmony_ci	is_removed = dlm_is_removed(ls, r->res_nodeid);
4558c2ecf20Sopenharmony_ci
4568c2ecf20Sopenharmony_ci	if (!is_removed && !rsb_flag(r, RSB_NEW_MASTER))
4578c2ecf20Sopenharmony_ci		return 0;
4588c2ecf20Sopenharmony_ci
4598c2ecf20Sopenharmony_ci	our_nodeid = dlm_our_nodeid();
4608c2ecf20Sopenharmony_ci	dir_nodeid = dlm_dir_nodeid(r);
4618c2ecf20Sopenharmony_ci
4628c2ecf20Sopenharmony_ci	if (dir_nodeid == our_nodeid) {
4638c2ecf20Sopenharmony_ci		if (is_removed) {
4648c2ecf20Sopenharmony_ci			r->res_master_nodeid = our_nodeid;
4658c2ecf20Sopenharmony_ci			r->res_nodeid = 0;
4668c2ecf20Sopenharmony_ci		}
4678c2ecf20Sopenharmony_ci
4688c2ecf20Sopenharmony_ci		/* set master of lkbs to ourself when is_removed, or to
4698c2ecf20Sopenharmony_ci		   another new master which we set along with NEW_MASTER
4708c2ecf20Sopenharmony_ci		   in dlm_master_lookup */
4718c2ecf20Sopenharmony_ci		set_new_master(r);
4728c2ecf20Sopenharmony_ci		error = 0;
4738c2ecf20Sopenharmony_ci	} else {
4748c2ecf20Sopenharmony_ci		recover_idr_add(r);
4758c2ecf20Sopenharmony_ci		error = dlm_send_rcom_lookup(r, dir_nodeid);
4768c2ecf20Sopenharmony_ci	}
4778c2ecf20Sopenharmony_ci
4788c2ecf20Sopenharmony_ci	(*count)++;
4798c2ecf20Sopenharmony_ci	return error;
4808c2ecf20Sopenharmony_ci}
4818c2ecf20Sopenharmony_ci
4828c2ecf20Sopenharmony_ci/*
4838c2ecf20Sopenharmony_ci * All MSTCPY locks are purged and rebuilt, even if the master stayed the same.
4848c2ecf20Sopenharmony_ci * This is necessary because recovery can be started, aborted and restarted,
4858c2ecf20Sopenharmony_ci * causing the master nodeid to briefly change during the aborted recovery, and
4868c2ecf20Sopenharmony_ci * change back to the original value in the second recovery.  The MSTCPY locks
4878c2ecf20Sopenharmony_ci * may or may not have been purged during the aborted recovery.  Another node
4888c2ecf20Sopenharmony_ci * with an outstanding request in waiters list and a request reply saved in the
4898c2ecf20Sopenharmony_ci * requestqueue, cannot know whether it should ignore the reply and resend the
4908c2ecf20Sopenharmony_ci * request, or accept the reply and complete the request.  It must do the
4918c2ecf20Sopenharmony_ci * former if the remote node purged MSTCPY locks, and it must do the latter if
4928c2ecf20Sopenharmony_ci * the remote node did not.  This is solved by always purging MSTCPY locks, in
4938c2ecf20Sopenharmony_ci * which case, the request reply would always be ignored and the request
4948c2ecf20Sopenharmony_ci * resent.
4958c2ecf20Sopenharmony_ci */
4968c2ecf20Sopenharmony_ci
4978c2ecf20Sopenharmony_cistatic int recover_master_static(struct dlm_rsb *r, unsigned int *count)
4988c2ecf20Sopenharmony_ci{
4998c2ecf20Sopenharmony_ci	int dir_nodeid = dlm_dir_nodeid(r);
5008c2ecf20Sopenharmony_ci	int new_master = dir_nodeid;
5018c2ecf20Sopenharmony_ci
5028c2ecf20Sopenharmony_ci	if (dir_nodeid == dlm_our_nodeid())
5038c2ecf20Sopenharmony_ci		new_master = 0;
5048c2ecf20Sopenharmony_ci
5058c2ecf20Sopenharmony_ci	dlm_purge_mstcpy_locks(r);
5068c2ecf20Sopenharmony_ci	r->res_master_nodeid = dir_nodeid;
5078c2ecf20Sopenharmony_ci	r->res_nodeid = new_master;
5088c2ecf20Sopenharmony_ci	set_new_master(r);
5098c2ecf20Sopenharmony_ci	(*count)++;
5108c2ecf20Sopenharmony_ci	return 0;
5118c2ecf20Sopenharmony_ci}
5128c2ecf20Sopenharmony_ci
5138c2ecf20Sopenharmony_ci/*
5148c2ecf20Sopenharmony_ci * Go through local root resources and for each rsb which has a master which
5158c2ecf20Sopenharmony_ci * has departed, get the new master nodeid from the directory.  The dir will
5168c2ecf20Sopenharmony_ci * assign mastery to the first node to look up the new master.  That means
5178c2ecf20Sopenharmony_ci * we'll discover in this lookup if we're the new master of any rsb's.
5188c2ecf20Sopenharmony_ci *
5198c2ecf20Sopenharmony_ci * We fire off all the dir lookup requests individually and asynchronously to
5208c2ecf20Sopenharmony_ci * the correct dir node.
5218c2ecf20Sopenharmony_ci */
5228c2ecf20Sopenharmony_ci
5238c2ecf20Sopenharmony_ciint dlm_recover_masters(struct dlm_ls *ls)
5248c2ecf20Sopenharmony_ci{
5258c2ecf20Sopenharmony_ci	struct dlm_rsb *r;
5268c2ecf20Sopenharmony_ci	unsigned int total = 0;
5278c2ecf20Sopenharmony_ci	unsigned int count = 0;
5288c2ecf20Sopenharmony_ci	int nodir = dlm_no_directory(ls);
5298c2ecf20Sopenharmony_ci	int error;
5308c2ecf20Sopenharmony_ci
5318c2ecf20Sopenharmony_ci	log_rinfo(ls, "dlm_recover_masters");
5328c2ecf20Sopenharmony_ci
5338c2ecf20Sopenharmony_ci	down_read(&ls->ls_root_sem);
5348c2ecf20Sopenharmony_ci	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
5358c2ecf20Sopenharmony_ci		if (dlm_recovery_stopped(ls)) {
5368c2ecf20Sopenharmony_ci			up_read(&ls->ls_root_sem);
5378c2ecf20Sopenharmony_ci			error = -EINTR;
5388c2ecf20Sopenharmony_ci			goto out;
5398c2ecf20Sopenharmony_ci		}
5408c2ecf20Sopenharmony_ci
5418c2ecf20Sopenharmony_ci		lock_rsb(r);
5428c2ecf20Sopenharmony_ci		if (nodir)
5438c2ecf20Sopenharmony_ci			error = recover_master_static(r, &count);
5448c2ecf20Sopenharmony_ci		else
5458c2ecf20Sopenharmony_ci			error = recover_master(r, &count);
5468c2ecf20Sopenharmony_ci		unlock_rsb(r);
5478c2ecf20Sopenharmony_ci		cond_resched();
5488c2ecf20Sopenharmony_ci		total++;
5498c2ecf20Sopenharmony_ci
5508c2ecf20Sopenharmony_ci		if (error) {
5518c2ecf20Sopenharmony_ci			up_read(&ls->ls_root_sem);
5528c2ecf20Sopenharmony_ci			goto out;
5538c2ecf20Sopenharmony_ci		}
5548c2ecf20Sopenharmony_ci	}
5558c2ecf20Sopenharmony_ci	up_read(&ls->ls_root_sem);
5568c2ecf20Sopenharmony_ci
5578c2ecf20Sopenharmony_ci	log_rinfo(ls, "dlm_recover_masters %u of %u", count, total);
5588c2ecf20Sopenharmony_ci
5598c2ecf20Sopenharmony_ci	error = dlm_wait_function(ls, &recover_idr_empty);
5608c2ecf20Sopenharmony_ci out:
5618c2ecf20Sopenharmony_ci	if (error)
5628c2ecf20Sopenharmony_ci		recover_idr_clear(ls);
5638c2ecf20Sopenharmony_ci	return error;
5648c2ecf20Sopenharmony_ci}
5658c2ecf20Sopenharmony_ci
5668c2ecf20Sopenharmony_ciint dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
5678c2ecf20Sopenharmony_ci{
5688c2ecf20Sopenharmony_ci	struct dlm_rsb *r;
5698c2ecf20Sopenharmony_ci	int ret_nodeid, new_master;
5708c2ecf20Sopenharmony_ci
5718c2ecf20Sopenharmony_ci	r = recover_idr_find(ls, rc->rc_id);
5728c2ecf20Sopenharmony_ci	if (!r) {
5738c2ecf20Sopenharmony_ci		log_error(ls, "dlm_recover_master_reply no id %llx",
5748c2ecf20Sopenharmony_ci			  (unsigned long long)rc->rc_id);
5758c2ecf20Sopenharmony_ci		goto out;
5768c2ecf20Sopenharmony_ci	}
5778c2ecf20Sopenharmony_ci
5788c2ecf20Sopenharmony_ci	ret_nodeid = rc->rc_result;
5798c2ecf20Sopenharmony_ci
5808c2ecf20Sopenharmony_ci	if (ret_nodeid == dlm_our_nodeid())
5818c2ecf20Sopenharmony_ci		new_master = 0;
5828c2ecf20Sopenharmony_ci	else
5838c2ecf20Sopenharmony_ci		new_master = ret_nodeid;
5848c2ecf20Sopenharmony_ci
5858c2ecf20Sopenharmony_ci	lock_rsb(r);
5868c2ecf20Sopenharmony_ci	r->res_master_nodeid = ret_nodeid;
5878c2ecf20Sopenharmony_ci	r->res_nodeid = new_master;
5888c2ecf20Sopenharmony_ci	set_new_master(r);
5898c2ecf20Sopenharmony_ci	unlock_rsb(r);
5908c2ecf20Sopenharmony_ci	recover_idr_del(r);
5918c2ecf20Sopenharmony_ci
5928c2ecf20Sopenharmony_ci	if (recover_idr_empty(ls))
5938c2ecf20Sopenharmony_ci		wake_up(&ls->ls_wait_general);
5948c2ecf20Sopenharmony_ci out:
5958c2ecf20Sopenharmony_ci	return 0;
5968c2ecf20Sopenharmony_ci}
5978c2ecf20Sopenharmony_ci
5988c2ecf20Sopenharmony_ci
5998c2ecf20Sopenharmony_ci/* Lock recovery: rebuild the process-copy locks we hold on a
6008c2ecf20Sopenharmony_ci   remastered rsb on the new rsb master.
6018c2ecf20Sopenharmony_ci
6028c2ecf20Sopenharmony_ci   dlm_recover_locks
6038c2ecf20Sopenharmony_ci   recover_locks
6048c2ecf20Sopenharmony_ci   recover_locks_queue
6058c2ecf20Sopenharmony_ci   dlm_send_rcom_lock              ->  receive_rcom_lock
6068c2ecf20Sopenharmony_ci                                       dlm_recover_master_copy
6078c2ecf20Sopenharmony_ci   receive_rcom_lock_reply         <-
6088c2ecf20Sopenharmony_ci   dlm_recover_process_copy
6098c2ecf20Sopenharmony_ci*/
6108c2ecf20Sopenharmony_ci
6118c2ecf20Sopenharmony_ci
6128c2ecf20Sopenharmony_ci/*
6138c2ecf20Sopenharmony_ci * keep a count of the number of lkb's we send to the new master; when we get
6148c2ecf20Sopenharmony_ci * an equal number of replies then recovery for the rsb is done
6158c2ecf20Sopenharmony_ci */
6168c2ecf20Sopenharmony_ci
6178c2ecf20Sopenharmony_cistatic int recover_locks_queue(struct dlm_rsb *r, struct list_head *head)
6188c2ecf20Sopenharmony_ci{
6198c2ecf20Sopenharmony_ci	struct dlm_lkb *lkb;
6208c2ecf20Sopenharmony_ci	int error = 0;
6218c2ecf20Sopenharmony_ci
6228c2ecf20Sopenharmony_ci	list_for_each_entry(lkb, head, lkb_statequeue) {
6238c2ecf20Sopenharmony_ci	   	error = dlm_send_rcom_lock(r, lkb);
6248c2ecf20Sopenharmony_ci		if (error)
6258c2ecf20Sopenharmony_ci			break;
6268c2ecf20Sopenharmony_ci		r->res_recover_locks_count++;
6278c2ecf20Sopenharmony_ci	}
6288c2ecf20Sopenharmony_ci
6298c2ecf20Sopenharmony_ci	return error;
6308c2ecf20Sopenharmony_ci}
6318c2ecf20Sopenharmony_ci
6328c2ecf20Sopenharmony_cistatic int recover_locks(struct dlm_rsb *r)
6338c2ecf20Sopenharmony_ci{
6348c2ecf20Sopenharmony_ci	int error = 0;
6358c2ecf20Sopenharmony_ci
6368c2ecf20Sopenharmony_ci	lock_rsb(r);
6378c2ecf20Sopenharmony_ci
6388c2ecf20Sopenharmony_ci	DLM_ASSERT(!r->res_recover_locks_count, dlm_dump_rsb(r););
6398c2ecf20Sopenharmony_ci
6408c2ecf20Sopenharmony_ci	error = recover_locks_queue(r, &r->res_grantqueue);
6418c2ecf20Sopenharmony_ci	if (error)
6428c2ecf20Sopenharmony_ci		goto out;
6438c2ecf20Sopenharmony_ci	error = recover_locks_queue(r, &r->res_convertqueue);
6448c2ecf20Sopenharmony_ci	if (error)
6458c2ecf20Sopenharmony_ci		goto out;
6468c2ecf20Sopenharmony_ci	error = recover_locks_queue(r, &r->res_waitqueue);
6478c2ecf20Sopenharmony_ci	if (error)
6488c2ecf20Sopenharmony_ci		goto out;
6498c2ecf20Sopenharmony_ci
6508c2ecf20Sopenharmony_ci	if (r->res_recover_locks_count)
6518c2ecf20Sopenharmony_ci		recover_list_add(r);
6528c2ecf20Sopenharmony_ci	else
6538c2ecf20Sopenharmony_ci		rsb_clear_flag(r, RSB_NEW_MASTER);
6548c2ecf20Sopenharmony_ci out:
6558c2ecf20Sopenharmony_ci	unlock_rsb(r);
6568c2ecf20Sopenharmony_ci	return error;
6578c2ecf20Sopenharmony_ci}
6588c2ecf20Sopenharmony_ci
6598c2ecf20Sopenharmony_ciint dlm_recover_locks(struct dlm_ls *ls)
6608c2ecf20Sopenharmony_ci{
6618c2ecf20Sopenharmony_ci	struct dlm_rsb *r;
6628c2ecf20Sopenharmony_ci	int error, count = 0;
6638c2ecf20Sopenharmony_ci
6648c2ecf20Sopenharmony_ci	down_read(&ls->ls_root_sem);
6658c2ecf20Sopenharmony_ci	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
6668c2ecf20Sopenharmony_ci		if (is_master(r)) {
6678c2ecf20Sopenharmony_ci			rsb_clear_flag(r, RSB_NEW_MASTER);
6688c2ecf20Sopenharmony_ci			continue;
6698c2ecf20Sopenharmony_ci		}
6708c2ecf20Sopenharmony_ci
6718c2ecf20Sopenharmony_ci		if (!rsb_flag(r, RSB_NEW_MASTER))
6728c2ecf20Sopenharmony_ci			continue;
6738c2ecf20Sopenharmony_ci
6748c2ecf20Sopenharmony_ci		if (dlm_recovery_stopped(ls)) {
6758c2ecf20Sopenharmony_ci			error = -EINTR;
6768c2ecf20Sopenharmony_ci			up_read(&ls->ls_root_sem);
6778c2ecf20Sopenharmony_ci			goto out;
6788c2ecf20Sopenharmony_ci		}
6798c2ecf20Sopenharmony_ci
6808c2ecf20Sopenharmony_ci		error = recover_locks(r);
6818c2ecf20Sopenharmony_ci		if (error) {
6828c2ecf20Sopenharmony_ci			up_read(&ls->ls_root_sem);
6838c2ecf20Sopenharmony_ci			goto out;
6848c2ecf20Sopenharmony_ci		}
6858c2ecf20Sopenharmony_ci
6868c2ecf20Sopenharmony_ci		count += r->res_recover_locks_count;
6878c2ecf20Sopenharmony_ci	}
6888c2ecf20Sopenharmony_ci	up_read(&ls->ls_root_sem);
6898c2ecf20Sopenharmony_ci
6908c2ecf20Sopenharmony_ci	log_rinfo(ls, "dlm_recover_locks %d out", count);
6918c2ecf20Sopenharmony_ci
6928c2ecf20Sopenharmony_ci	error = dlm_wait_function(ls, &recover_list_empty);
6938c2ecf20Sopenharmony_ci out:
6948c2ecf20Sopenharmony_ci	if (error)
6958c2ecf20Sopenharmony_ci		recover_list_clear(ls);
6968c2ecf20Sopenharmony_ci	return error;
6978c2ecf20Sopenharmony_ci}
6988c2ecf20Sopenharmony_ci
6998c2ecf20Sopenharmony_civoid dlm_recovered_lock(struct dlm_rsb *r)
7008c2ecf20Sopenharmony_ci{
7018c2ecf20Sopenharmony_ci	DLM_ASSERT(rsb_flag(r, RSB_NEW_MASTER), dlm_dump_rsb(r););
7028c2ecf20Sopenharmony_ci
7038c2ecf20Sopenharmony_ci	r->res_recover_locks_count--;
7048c2ecf20Sopenharmony_ci	if (!r->res_recover_locks_count) {
7058c2ecf20Sopenharmony_ci		rsb_clear_flag(r, RSB_NEW_MASTER);
7068c2ecf20Sopenharmony_ci		recover_list_del(r);
7078c2ecf20Sopenharmony_ci	}
7088c2ecf20Sopenharmony_ci
7098c2ecf20Sopenharmony_ci	if (recover_list_empty(r->res_ls))
7108c2ecf20Sopenharmony_ci		wake_up(&r->res_ls->ls_wait_general);
7118c2ecf20Sopenharmony_ci}
7128c2ecf20Sopenharmony_ci
7138c2ecf20Sopenharmony_ci/*
7148c2ecf20Sopenharmony_ci * The lvb needs to be recovered on all master rsb's.  This includes setting
7158c2ecf20Sopenharmony_ci * the VALNOTVALID flag if necessary, and determining the correct lvb contents
7168c2ecf20Sopenharmony_ci * based on the lvb's of the locks held on the rsb.
7178c2ecf20Sopenharmony_ci *
7188c2ecf20Sopenharmony_ci * RSB_VALNOTVALID is set in two cases:
7198c2ecf20Sopenharmony_ci *
7208c2ecf20Sopenharmony_ci * 1. we are master, but not new, and we purged an EX/PW lock held by a
7218c2ecf20Sopenharmony_ci * failed node (in dlm_recover_purge which set RSB_RECOVER_LVB_INVAL)
7228c2ecf20Sopenharmony_ci *
7238c2ecf20Sopenharmony_ci * 2. we are a new master, and there are only NL/CR locks left.
7248c2ecf20Sopenharmony_ci * (We could probably improve this by only invaliding in this way when
7258c2ecf20Sopenharmony_ci * the previous master left uncleanly.  VMS docs mention that.)
7268c2ecf20Sopenharmony_ci *
7278c2ecf20Sopenharmony_ci * The LVB contents are only considered for changing when this is a new master
7288c2ecf20Sopenharmony_ci * of the rsb (NEW_MASTER2).  Then, the rsb's lvb is taken from any lkb with
7298c2ecf20Sopenharmony_ci * mode > CR.  If no lkb's exist with mode above CR, the lvb contents are taken
7308c2ecf20Sopenharmony_ci * from the lkb with the largest lvb sequence number.
7318c2ecf20Sopenharmony_ci */
7328c2ecf20Sopenharmony_ci
7338c2ecf20Sopenharmony_cistatic void recover_lvb(struct dlm_rsb *r)
7348c2ecf20Sopenharmony_ci{
7358c2ecf20Sopenharmony_ci	struct dlm_lkb *big_lkb = NULL, *iter, *high_lkb = NULL;
7368c2ecf20Sopenharmony_ci	uint32_t high_seq = 0;
7378c2ecf20Sopenharmony_ci	int lock_lvb_exists = 0;
7388c2ecf20Sopenharmony_ci	int lvblen = r->res_ls->ls_lvblen;
7398c2ecf20Sopenharmony_ci
7408c2ecf20Sopenharmony_ci	if (!rsb_flag(r, RSB_NEW_MASTER2) &&
7418c2ecf20Sopenharmony_ci	    rsb_flag(r, RSB_RECOVER_LVB_INVAL)) {
7428c2ecf20Sopenharmony_ci		/* case 1 above */
7438c2ecf20Sopenharmony_ci		rsb_set_flag(r, RSB_VALNOTVALID);
7448c2ecf20Sopenharmony_ci		return;
7458c2ecf20Sopenharmony_ci	}
7468c2ecf20Sopenharmony_ci
7478c2ecf20Sopenharmony_ci	if (!rsb_flag(r, RSB_NEW_MASTER2))
7488c2ecf20Sopenharmony_ci		return;
7498c2ecf20Sopenharmony_ci
7508c2ecf20Sopenharmony_ci	/* we are the new master, so figure out if VALNOTVALID should
7518c2ecf20Sopenharmony_ci	   be set, and set the rsb lvb from the best lkb available. */
7528c2ecf20Sopenharmony_ci
7538c2ecf20Sopenharmony_ci	list_for_each_entry(iter, &r->res_grantqueue, lkb_statequeue) {
7548c2ecf20Sopenharmony_ci		if (!(iter->lkb_exflags & DLM_LKF_VALBLK))
7558c2ecf20Sopenharmony_ci			continue;
7568c2ecf20Sopenharmony_ci
7578c2ecf20Sopenharmony_ci		lock_lvb_exists = 1;
7588c2ecf20Sopenharmony_ci
7598c2ecf20Sopenharmony_ci		if (iter->lkb_grmode > DLM_LOCK_CR) {
7608c2ecf20Sopenharmony_ci			big_lkb = iter;
7618c2ecf20Sopenharmony_ci			goto setflag;
7628c2ecf20Sopenharmony_ci		}
7638c2ecf20Sopenharmony_ci
7648c2ecf20Sopenharmony_ci		if (((int)iter->lkb_lvbseq - (int)high_seq) >= 0) {
7658c2ecf20Sopenharmony_ci			high_lkb = iter;
7668c2ecf20Sopenharmony_ci			high_seq = iter->lkb_lvbseq;
7678c2ecf20Sopenharmony_ci		}
7688c2ecf20Sopenharmony_ci	}
7698c2ecf20Sopenharmony_ci
7708c2ecf20Sopenharmony_ci	list_for_each_entry(iter, &r->res_convertqueue, lkb_statequeue) {
7718c2ecf20Sopenharmony_ci		if (!(iter->lkb_exflags & DLM_LKF_VALBLK))
7728c2ecf20Sopenharmony_ci			continue;
7738c2ecf20Sopenharmony_ci
7748c2ecf20Sopenharmony_ci		lock_lvb_exists = 1;
7758c2ecf20Sopenharmony_ci
7768c2ecf20Sopenharmony_ci		if (iter->lkb_grmode > DLM_LOCK_CR) {
7778c2ecf20Sopenharmony_ci			big_lkb = iter;
7788c2ecf20Sopenharmony_ci			goto setflag;
7798c2ecf20Sopenharmony_ci		}
7808c2ecf20Sopenharmony_ci
7818c2ecf20Sopenharmony_ci		if (((int)iter->lkb_lvbseq - (int)high_seq) >= 0) {
7828c2ecf20Sopenharmony_ci			high_lkb = iter;
7838c2ecf20Sopenharmony_ci			high_seq = iter->lkb_lvbseq;
7848c2ecf20Sopenharmony_ci		}
7858c2ecf20Sopenharmony_ci	}
7868c2ecf20Sopenharmony_ci
7878c2ecf20Sopenharmony_ci setflag:
7888c2ecf20Sopenharmony_ci	if (!lock_lvb_exists)
7898c2ecf20Sopenharmony_ci		goto out;
7908c2ecf20Sopenharmony_ci
7918c2ecf20Sopenharmony_ci	/* lvb is invalidated if only NL/CR locks remain */
7928c2ecf20Sopenharmony_ci	if (!big_lkb)
7938c2ecf20Sopenharmony_ci		rsb_set_flag(r, RSB_VALNOTVALID);
7948c2ecf20Sopenharmony_ci
7958c2ecf20Sopenharmony_ci	if (!r->res_lvbptr) {
7968c2ecf20Sopenharmony_ci		r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
7978c2ecf20Sopenharmony_ci		if (!r->res_lvbptr)
7988c2ecf20Sopenharmony_ci			goto out;
7998c2ecf20Sopenharmony_ci	}
8008c2ecf20Sopenharmony_ci
8018c2ecf20Sopenharmony_ci	if (big_lkb) {
8028c2ecf20Sopenharmony_ci		r->res_lvbseq = big_lkb->lkb_lvbseq;
8038c2ecf20Sopenharmony_ci		memcpy(r->res_lvbptr, big_lkb->lkb_lvbptr, lvblen);
8048c2ecf20Sopenharmony_ci	} else if (high_lkb) {
8058c2ecf20Sopenharmony_ci		r->res_lvbseq = high_lkb->lkb_lvbseq;
8068c2ecf20Sopenharmony_ci		memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen);
8078c2ecf20Sopenharmony_ci	} else {
8088c2ecf20Sopenharmony_ci		r->res_lvbseq = 0;
8098c2ecf20Sopenharmony_ci		memset(r->res_lvbptr, 0, lvblen);
8108c2ecf20Sopenharmony_ci	}
8118c2ecf20Sopenharmony_ci out:
8128c2ecf20Sopenharmony_ci	return;
8138c2ecf20Sopenharmony_ci}
8148c2ecf20Sopenharmony_ci
8158c2ecf20Sopenharmony_ci/* All master rsb's flagged RECOVER_CONVERT need to be looked at.  The locks
8168c2ecf20Sopenharmony_ci   converting PR->CW or CW->PR need to have their lkb_grmode set. */
8178c2ecf20Sopenharmony_ci
8188c2ecf20Sopenharmony_cistatic void recover_conversion(struct dlm_rsb *r)
8198c2ecf20Sopenharmony_ci{
8208c2ecf20Sopenharmony_ci	struct dlm_ls *ls = r->res_ls;
8218c2ecf20Sopenharmony_ci	struct dlm_lkb *lkb;
8228c2ecf20Sopenharmony_ci	int grmode = -1;
8238c2ecf20Sopenharmony_ci
8248c2ecf20Sopenharmony_ci	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
8258c2ecf20Sopenharmony_ci		if (lkb->lkb_grmode == DLM_LOCK_PR ||
8268c2ecf20Sopenharmony_ci		    lkb->lkb_grmode == DLM_LOCK_CW) {
8278c2ecf20Sopenharmony_ci			grmode = lkb->lkb_grmode;
8288c2ecf20Sopenharmony_ci			break;
8298c2ecf20Sopenharmony_ci		}
8308c2ecf20Sopenharmony_ci	}
8318c2ecf20Sopenharmony_ci
8328c2ecf20Sopenharmony_ci	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
8338c2ecf20Sopenharmony_ci		if (lkb->lkb_grmode != DLM_LOCK_IV)
8348c2ecf20Sopenharmony_ci			continue;
8358c2ecf20Sopenharmony_ci		if (grmode == -1) {
8368c2ecf20Sopenharmony_ci			log_debug(ls, "recover_conversion %x set gr to rq %d",
8378c2ecf20Sopenharmony_ci				  lkb->lkb_id, lkb->lkb_rqmode);
8388c2ecf20Sopenharmony_ci			lkb->lkb_grmode = lkb->lkb_rqmode;
8398c2ecf20Sopenharmony_ci		} else {
8408c2ecf20Sopenharmony_ci			log_debug(ls, "recover_conversion %x set gr %d",
8418c2ecf20Sopenharmony_ci				  lkb->lkb_id, grmode);
8428c2ecf20Sopenharmony_ci			lkb->lkb_grmode = grmode;
8438c2ecf20Sopenharmony_ci		}
8448c2ecf20Sopenharmony_ci	}
8458c2ecf20Sopenharmony_ci}
8468c2ecf20Sopenharmony_ci
8478c2ecf20Sopenharmony_ci/* We've become the new master for this rsb and waiting/converting locks may
8488c2ecf20Sopenharmony_ci   need to be granted in dlm_recover_grant() due to locks that may have
8498c2ecf20Sopenharmony_ci   existed from a removed node. */
8508c2ecf20Sopenharmony_ci
8518c2ecf20Sopenharmony_cistatic void recover_grant(struct dlm_rsb *r)
8528c2ecf20Sopenharmony_ci{
8538c2ecf20Sopenharmony_ci	if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue))
8548c2ecf20Sopenharmony_ci		rsb_set_flag(r, RSB_RECOVER_GRANT);
8558c2ecf20Sopenharmony_ci}
8568c2ecf20Sopenharmony_ci
8578c2ecf20Sopenharmony_civoid dlm_recover_rsbs(struct dlm_ls *ls)
8588c2ecf20Sopenharmony_ci{
8598c2ecf20Sopenharmony_ci	struct dlm_rsb *r;
8608c2ecf20Sopenharmony_ci	unsigned int count = 0;
8618c2ecf20Sopenharmony_ci
8628c2ecf20Sopenharmony_ci	down_read(&ls->ls_root_sem);
8638c2ecf20Sopenharmony_ci	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
8648c2ecf20Sopenharmony_ci		lock_rsb(r);
8658c2ecf20Sopenharmony_ci		if (is_master(r)) {
8668c2ecf20Sopenharmony_ci			if (rsb_flag(r, RSB_RECOVER_CONVERT))
8678c2ecf20Sopenharmony_ci				recover_conversion(r);
8688c2ecf20Sopenharmony_ci
8698c2ecf20Sopenharmony_ci			/* recover lvb before granting locks so the updated
8708c2ecf20Sopenharmony_ci			   lvb/VALNOTVALID is presented in the completion */
8718c2ecf20Sopenharmony_ci			recover_lvb(r);
8728c2ecf20Sopenharmony_ci
8738c2ecf20Sopenharmony_ci			if (rsb_flag(r, RSB_NEW_MASTER2))
8748c2ecf20Sopenharmony_ci				recover_grant(r);
8758c2ecf20Sopenharmony_ci			count++;
8768c2ecf20Sopenharmony_ci		} else {
8778c2ecf20Sopenharmony_ci			rsb_clear_flag(r, RSB_VALNOTVALID);
8788c2ecf20Sopenharmony_ci		}
8798c2ecf20Sopenharmony_ci		rsb_clear_flag(r, RSB_RECOVER_CONVERT);
8808c2ecf20Sopenharmony_ci		rsb_clear_flag(r, RSB_RECOVER_LVB_INVAL);
8818c2ecf20Sopenharmony_ci		rsb_clear_flag(r, RSB_NEW_MASTER2);
8828c2ecf20Sopenharmony_ci		unlock_rsb(r);
8838c2ecf20Sopenharmony_ci	}
8848c2ecf20Sopenharmony_ci	up_read(&ls->ls_root_sem);
8858c2ecf20Sopenharmony_ci
8868c2ecf20Sopenharmony_ci	if (count)
8878c2ecf20Sopenharmony_ci		log_rinfo(ls, "dlm_recover_rsbs %d done", count);
8888c2ecf20Sopenharmony_ci}
8898c2ecf20Sopenharmony_ci
8908c2ecf20Sopenharmony_ci/* Create a single list of all root rsb's to be used during recovery */
8918c2ecf20Sopenharmony_ci
8928c2ecf20Sopenharmony_ciint dlm_create_root_list(struct dlm_ls *ls)
8938c2ecf20Sopenharmony_ci{
8948c2ecf20Sopenharmony_ci	struct rb_node *n;
8958c2ecf20Sopenharmony_ci	struct dlm_rsb *r;
8968c2ecf20Sopenharmony_ci	int i, error = 0;
8978c2ecf20Sopenharmony_ci
8988c2ecf20Sopenharmony_ci	down_write(&ls->ls_root_sem);
8998c2ecf20Sopenharmony_ci	if (!list_empty(&ls->ls_root_list)) {
9008c2ecf20Sopenharmony_ci		log_error(ls, "root list not empty");
9018c2ecf20Sopenharmony_ci		error = -EINVAL;
9028c2ecf20Sopenharmony_ci		goto out;
9038c2ecf20Sopenharmony_ci	}
9048c2ecf20Sopenharmony_ci
9058c2ecf20Sopenharmony_ci	for (i = 0; i < ls->ls_rsbtbl_size; i++) {
9068c2ecf20Sopenharmony_ci		spin_lock(&ls->ls_rsbtbl[i].lock);
9078c2ecf20Sopenharmony_ci		for (n = rb_first(&ls->ls_rsbtbl[i].keep); n; n = rb_next(n)) {
9088c2ecf20Sopenharmony_ci			r = rb_entry(n, struct dlm_rsb, res_hashnode);
9098c2ecf20Sopenharmony_ci			list_add(&r->res_root_list, &ls->ls_root_list);
9108c2ecf20Sopenharmony_ci			dlm_hold_rsb(r);
9118c2ecf20Sopenharmony_ci		}
9128c2ecf20Sopenharmony_ci
9138c2ecf20Sopenharmony_ci		if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[i].toss))
9148c2ecf20Sopenharmony_ci			log_error(ls, "dlm_create_root_list toss not empty");
9158c2ecf20Sopenharmony_ci		spin_unlock(&ls->ls_rsbtbl[i].lock);
9168c2ecf20Sopenharmony_ci	}
9178c2ecf20Sopenharmony_ci out:
9188c2ecf20Sopenharmony_ci	up_write(&ls->ls_root_sem);
9198c2ecf20Sopenharmony_ci	return error;
9208c2ecf20Sopenharmony_ci}
9218c2ecf20Sopenharmony_ci
9228c2ecf20Sopenharmony_civoid dlm_release_root_list(struct dlm_ls *ls)
9238c2ecf20Sopenharmony_ci{
9248c2ecf20Sopenharmony_ci	struct dlm_rsb *r, *safe;
9258c2ecf20Sopenharmony_ci
9268c2ecf20Sopenharmony_ci	down_write(&ls->ls_root_sem);
9278c2ecf20Sopenharmony_ci	list_for_each_entry_safe(r, safe, &ls->ls_root_list, res_root_list) {
9288c2ecf20Sopenharmony_ci		list_del_init(&r->res_root_list);
9298c2ecf20Sopenharmony_ci		dlm_put_rsb(r);
9308c2ecf20Sopenharmony_ci	}
9318c2ecf20Sopenharmony_ci	up_write(&ls->ls_root_sem);
9328c2ecf20Sopenharmony_ci}
9338c2ecf20Sopenharmony_ci
9348c2ecf20Sopenharmony_civoid dlm_clear_toss(struct dlm_ls *ls)
9358c2ecf20Sopenharmony_ci{
9368c2ecf20Sopenharmony_ci	struct rb_node *n, *next;
9378c2ecf20Sopenharmony_ci	struct dlm_rsb *r;
9388c2ecf20Sopenharmony_ci	unsigned int count = 0;
9398c2ecf20Sopenharmony_ci	int i;
9408c2ecf20Sopenharmony_ci
9418c2ecf20Sopenharmony_ci	for (i = 0; i < ls->ls_rsbtbl_size; i++) {
9428c2ecf20Sopenharmony_ci		spin_lock(&ls->ls_rsbtbl[i].lock);
9438c2ecf20Sopenharmony_ci		for (n = rb_first(&ls->ls_rsbtbl[i].toss); n; n = next) {
9448c2ecf20Sopenharmony_ci			next = rb_next(n);
9458c2ecf20Sopenharmony_ci			r = rb_entry(n, struct dlm_rsb, res_hashnode);
9468c2ecf20Sopenharmony_ci			rb_erase(n, &ls->ls_rsbtbl[i].toss);
9478c2ecf20Sopenharmony_ci			dlm_free_rsb(r);
9488c2ecf20Sopenharmony_ci			count++;
9498c2ecf20Sopenharmony_ci		}
9508c2ecf20Sopenharmony_ci		spin_unlock(&ls->ls_rsbtbl[i].lock);
9518c2ecf20Sopenharmony_ci	}
9528c2ecf20Sopenharmony_ci
9538c2ecf20Sopenharmony_ci	if (count)
9548c2ecf20Sopenharmony_ci		log_rinfo(ls, "dlm_clear_toss %u done", count);
9558c2ecf20Sopenharmony_ci}
9568c2ecf20Sopenharmony_ci
957