18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later
28c2ecf20Sopenharmony_ci/* -*- mode: c; c-basic-offset: 8; -*-
38c2ecf20Sopenharmony_ci * vim: noexpandtab sw=8 ts=8 sts=0:
48c2ecf20Sopenharmony_ci *
58c2ecf20Sopenharmony_ci * dlmrecovery.c
68c2ecf20Sopenharmony_ci *
78c2ecf20Sopenharmony_ci * recovery stuff
88c2ecf20Sopenharmony_ci *
98c2ecf20Sopenharmony_ci * Copyright (C) 2004 Oracle.  All rights reserved.
108c2ecf20Sopenharmony_ci */
118c2ecf20Sopenharmony_ci
128c2ecf20Sopenharmony_ci
138c2ecf20Sopenharmony_ci#include <linux/module.h>
148c2ecf20Sopenharmony_ci#include <linux/fs.h>
158c2ecf20Sopenharmony_ci#include <linux/types.h>
168c2ecf20Sopenharmony_ci#include <linux/slab.h>
178c2ecf20Sopenharmony_ci#include <linux/highmem.h>
188c2ecf20Sopenharmony_ci#include <linux/init.h>
198c2ecf20Sopenharmony_ci#include <linux/sysctl.h>
208c2ecf20Sopenharmony_ci#include <linux/random.h>
218c2ecf20Sopenharmony_ci#include <linux/blkdev.h>
228c2ecf20Sopenharmony_ci#include <linux/socket.h>
238c2ecf20Sopenharmony_ci#include <linux/inet.h>
248c2ecf20Sopenharmony_ci#include <linux/timer.h>
258c2ecf20Sopenharmony_ci#include <linux/kthread.h>
268c2ecf20Sopenharmony_ci#include <linux/delay.h>
278c2ecf20Sopenharmony_ci
288c2ecf20Sopenharmony_ci
298c2ecf20Sopenharmony_ci#include "../cluster/heartbeat.h"
308c2ecf20Sopenharmony_ci#include "../cluster/nodemanager.h"
318c2ecf20Sopenharmony_ci#include "../cluster/tcp.h"
328c2ecf20Sopenharmony_ci
338c2ecf20Sopenharmony_ci#include "dlmapi.h"
348c2ecf20Sopenharmony_ci#include "dlmcommon.h"
358c2ecf20Sopenharmony_ci#include "dlmdomain.h"
368c2ecf20Sopenharmony_ci
378c2ecf20Sopenharmony_ci#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_RECOVERY)
388c2ecf20Sopenharmony_ci#include "../cluster/masklog.h"
398c2ecf20Sopenharmony_ci
408c2ecf20Sopenharmony_cistatic void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);
418c2ecf20Sopenharmony_ci
428c2ecf20Sopenharmony_cistatic int dlm_recovery_thread(void *data);
438c2ecf20Sopenharmony_cistatic int dlm_do_recovery(struct dlm_ctxt *dlm);
448c2ecf20Sopenharmony_ci
458c2ecf20Sopenharmony_cistatic int dlm_pick_recovery_master(struct dlm_ctxt *dlm);
468c2ecf20Sopenharmony_cistatic int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node);
478c2ecf20Sopenharmony_cistatic int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
488c2ecf20Sopenharmony_cistatic int dlm_request_all_locks(struct dlm_ctxt *dlm,
498c2ecf20Sopenharmony_ci				 u8 request_from, u8 dead_node);
508c2ecf20Sopenharmony_cistatic void dlm_destroy_recovery_area(struct dlm_ctxt *dlm);
518c2ecf20Sopenharmony_ci
528c2ecf20Sopenharmony_cistatic inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res);
538c2ecf20Sopenharmony_cistatic void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
548c2ecf20Sopenharmony_ci					const char *lockname, int namelen,
558c2ecf20Sopenharmony_ci					int total_locks, u64 cookie,
568c2ecf20Sopenharmony_ci					u8 flags, u8 master);
578c2ecf20Sopenharmony_cistatic int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
588c2ecf20Sopenharmony_ci				    struct dlm_migratable_lockres *mres,
598c2ecf20Sopenharmony_ci				    u8 send_to,
608c2ecf20Sopenharmony_ci				    struct dlm_lock_resource *res,
618c2ecf20Sopenharmony_ci				    int total_locks);
628c2ecf20Sopenharmony_cistatic int dlm_process_recovery_data(struct dlm_ctxt *dlm,
638c2ecf20Sopenharmony_ci				     struct dlm_lock_resource *res,
648c2ecf20Sopenharmony_ci				     struct dlm_migratable_lockres *mres);
658c2ecf20Sopenharmony_cistatic int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm);
668c2ecf20Sopenharmony_cistatic int dlm_send_all_done_msg(struct dlm_ctxt *dlm,
678c2ecf20Sopenharmony_ci				 u8 dead_node, u8 send_to);
688c2ecf20Sopenharmony_cistatic int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node);
698c2ecf20Sopenharmony_cistatic void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
708c2ecf20Sopenharmony_ci					struct list_head *list, u8 dead_node);
718c2ecf20Sopenharmony_cistatic void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
728c2ecf20Sopenharmony_ci					      u8 dead_node, u8 new_master);
738c2ecf20Sopenharmony_cistatic void dlm_reco_ast(void *astdata);
748c2ecf20Sopenharmony_cistatic void dlm_reco_bast(void *astdata, int blocked_type);
758c2ecf20Sopenharmony_cistatic void dlm_reco_unlock_ast(void *astdata, enum dlm_status st);
768c2ecf20Sopenharmony_cistatic void dlm_request_all_locks_worker(struct dlm_work_item *item,
778c2ecf20Sopenharmony_ci					 void *data);
788c2ecf20Sopenharmony_cistatic void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data);
798c2ecf20Sopenharmony_cistatic int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
808c2ecf20Sopenharmony_ci				      struct dlm_lock_resource *res,
818c2ecf20Sopenharmony_ci				      u8 *real_master);
828c2ecf20Sopenharmony_ci
838c2ecf20Sopenharmony_cistatic u64 dlm_get_next_mig_cookie(void);
848c2ecf20Sopenharmony_ci
858c2ecf20Sopenharmony_cistatic DEFINE_SPINLOCK(dlm_reco_state_lock);
868c2ecf20Sopenharmony_cistatic DEFINE_SPINLOCK(dlm_mig_cookie_lock);
878c2ecf20Sopenharmony_cistatic u64 dlm_mig_cookie = 1;
888c2ecf20Sopenharmony_ci
898c2ecf20Sopenharmony_cistatic u64 dlm_get_next_mig_cookie(void)
908c2ecf20Sopenharmony_ci{
918c2ecf20Sopenharmony_ci	u64 c;
928c2ecf20Sopenharmony_ci	spin_lock(&dlm_mig_cookie_lock);
938c2ecf20Sopenharmony_ci	c = dlm_mig_cookie;
948c2ecf20Sopenharmony_ci	if (dlm_mig_cookie == (~0ULL))
958c2ecf20Sopenharmony_ci		dlm_mig_cookie = 1;
968c2ecf20Sopenharmony_ci	else
978c2ecf20Sopenharmony_ci		dlm_mig_cookie++;
988c2ecf20Sopenharmony_ci	spin_unlock(&dlm_mig_cookie_lock);
998c2ecf20Sopenharmony_ci	return c;
1008c2ecf20Sopenharmony_ci}
1018c2ecf20Sopenharmony_ci
/*
 * Record which dead node this recovery session is working on.
 * Caller must hold dlm->spinlock.  Logs only when the value actually
 * changes.
 */
static inline void dlm_set_reco_dead_node(struct dlm_ctxt *dlm,
					  u8 dead_node)
{
	assert_spin_locked(&dlm->spinlock);
	if (dlm->reco.dead_node != dead_node)
		mlog(0, "%s: changing dead_node from %u to %u\n",
		     dlm->name, dlm->reco.dead_node, dead_node);
	dlm->reco.dead_node = dead_node;
}
1118c2ecf20Sopenharmony_ci
/*
 * Record which node is mastering this recovery session.
 * Caller must hold dlm->spinlock.  Unlike dlm_set_reco_dead_node(),
 * this logs unconditionally, even when the value is unchanged.
 */
static inline void dlm_set_reco_master(struct dlm_ctxt *dlm,
				       u8 master)
{
	assert_spin_locked(&dlm->spinlock);
	mlog(0, "%s: changing new_master from %u to %u\n",
	     dlm->name, dlm->reco.new_master, master);
	dlm->reco.new_master = master;
}
1208c2ecf20Sopenharmony_ci
/*
 * Clear all per-session recovery state: drop the current dead node from
 * the recovery map and invalidate both dead_node and new_master.
 * Caller must hold dlm->spinlock (lock-free variant of
 * dlm_reset_recovery()).
 */
static inline void __dlm_reset_recovery(struct dlm_ctxt *dlm)
{
	assert_spin_locked(&dlm->spinlock);
	clear_bit(dlm->reco.dead_node, dlm->recovery_map);
	dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
	dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM);
}
1288c2ecf20Sopenharmony_ci
/* Locked wrapper around __dlm_reset_recovery(). */
static inline void dlm_reset_recovery(struct dlm_ctxt *dlm)
{
	spin_lock(&dlm->spinlock);
	__dlm_reset_recovery(dlm);
	spin_unlock(&dlm->spinlock);
}
1358c2ecf20Sopenharmony_ci
/* Worker function used during recovery.  Drains dlm->work_list and runs
 * each queued dlm_work_item, dropping the dlm reference and freeing the
 * item afterwards. */
void dlm_dispatch_work(struct work_struct *work)
{
	struct dlm_ctxt *dlm =
		container_of(work, struct dlm_ctxt, dispatched_work);
	LIST_HEAD(tmp_list);
	struct dlm_work_item *item, *next;
	dlm_workfunc_t *workfunc;
	int tot=0;

	/* detach the whole pending list under the lock so the work
	 * functions below may sleep without holding work_lock */
	spin_lock(&dlm->work_lock);
	list_splice_init(&dlm->work_list, &tmp_list);
	spin_unlock(&dlm->work_lock);

	/* first pass only counts items, purely for the debug message */
	list_for_each_entry(item, &tmp_list, list) {
		tot++;
	}
	mlog(0, "%s: work thread has %d work items\n", dlm->name, tot);

	list_for_each_entry_safe(item, next, &tmp_list, list) {
		workfunc = item->func;
		list_del_init(&item->list);

		/* already have ref on dlm to avoid having
		 * it disappear.  just double-check. */
		BUG_ON(item->dlm != dlm);

		/* this is allowed to sleep and
		 * call network stuff */
		workfunc(item, item->data);

		/* release the reference taken when the item was queued */
		dlm_put(dlm);
		kfree(item);
	}
}
1718c2ecf20Sopenharmony_ci
1728c2ecf20Sopenharmony_ci/*
1738c2ecf20Sopenharmony_ci * RECOVERY THREAD
1748c2ecf20Sopenharmony_ci */
1758c2ecf20Sopenharmony_ci
/* Prod the recovery thread so it re-evaluates recovery state now. */
void dlm_kick_recovery_thread(struct dlm_ctxt *dlm)
{
	/* wake the recovery thread
	 * this will wake the reco thread in one of three places
	 * 1) sleeping with no recovery happening
	 * 2) sleeping with recovery mastered elsewhere
	 * 3) recovery mastered here, waiting on reco data */

	wake_up(&dlm->dlm_reco_thread_wq);
}
1868c2ecf20Sopenharmony_ci
1878c2ecf20Sopenharmony_ci/* Launch the recovery thread */
1888c2ecf20Sopenharmony_ciint dlm_launch_recovery_thread(struct dlm_ctxt *dlm)
1898c2ecf20Sopenharmony_ci{
1908c2ecf20Sopenharmony_ci	mlog(0, "starting dlm recovery thread...\n");
1918c2ecf20Sopenharmony_ci
1928c2ecf20Sopenharmony_ci	dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm,
1938c2ecf20Sopenharmony_ci			"dlm_reco-%s", dlm->name);
1948c2ecf20Sopenharmony_ci	if (IS_ERR(dlm->dlm_reco_thread_task)) {
1958c2ecf20Sopenharmony_ci		mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task));
1968c2ecf20Sopenharmony_ci		dlm->dlm_reco_thread_task = NULL;
1978c2ecf20Sopenharmony_ci		return -EINVAL;
1988c2ecf20Sopenharmony_ci	}
1998c2ecf20Sopenharmony_ci
2008c2ecf20Sopenharmony_ci	return 0;
2018c2ecf20Sopenharmony_ci}
2028c2ecf20Sopenharmony_ci
2038c2ecf20Sopenharmony_civoid dlm_complete_recovery_thread(struct dlm_ctxt *dlm)
2048c2ecf20Sopenharmony_ci{
2058c2ecf20Sopenharmony_ci	if (dlm->dlm_reco_thread_task) {
2068c2ecf20Sopenharmony_ci		mlog(0, "waiting for dlm recovery thread to exit\n");
2078c2ecf20Sopenharmony_ci		kthread_stop(dlm->dlm_reco_thread_task);
2088c2ecf20Sopenharmony_ci		dlm->dlm_reco_thread_task = NULL;
2098c2ecf20Sopenharmony_ci	}
2108c2ecf20Sopenharmony_ci}
2118c2ecf20Sopenharmony_ci
2128c2ecf20Sopenharmony_ci
2138c2ecf20Sopenharmony_ci
2148c2ecf20Sopenharmony_ci/*
2158c2ecf20Sopenharmony_ci * this is lame, but here's how recovery works...
2168c2ecf20Sopenharmony_ci * 1) all recovery threads cluster wide will work on recovering
2178c2ecf20Sopenharmony_ci *    ONE node at a time
2188c2ecf20Sopenharmony_ci * 2) negotiate who will take over all the locks for the dead node.
2198c2ecf20Sopenharmony_ci *    thats right... ALL the locks.
2208c2ecf20Sopenharmony_ci * 3) once a new master is chosen, everyone scans all locks
2218c2ecf20Sopenharmony_ci *    and moves aside those mastered by the dead guy
2228c2ecf20Sopenharmony_ci * 4) each of these locks should be locked until recovery is done
2238c2ecf20Sopenharmony_ci * 5) the new master collects up all of secondary lock queue info
2248c2ecf20Sopenharmony_ci *    one lock at a time, forcing each node to communicate back
2258c2ecf20Sopenharmony_ci *    before continuing
2268c2ecf20Sopenharmony_ci * 6) each secondary lock queue responds with the full known lock info
2278c2ecf20Sopenharmony_ci * 7) once the new master has run all its locks, it sends a ALLDONE!
2288c2ecf20Sopenharmony_ci *    message to everyone
2298c2ecf20Sopenharmony_ci * 8) upon receiving this message, the secondary queue node unlocks
2308c2ecf20Sopenharmony_ci *    and responds to the ALLDONE
2318c2ecf20Sopenharmony_ci * 9) once the new master gets responses from everyone, he unlocks
2328c2ecf20Sopenharmony_ci *    everything and recovery for this dead node is done
2338c2ecf20Sopenharmony_ci *10) go back to 2) while there are still dead nodes
2348c2ecf20Sopenharmony_ci *
2358c2ecf20Sopenharmony_ci */
2368c2ecf20Sopenharmony_ci
2378c2ecf20Sopenharmony_cistatic void dlm_print_reco_node_status(struct dlm_ctxt *dlm)
2388c2ecf20Sopenharmony_ci{
2398c2ecf20Sopenharmony_ci	struct dlm_reco_node_data *ndata;
2408c2ecf20Sopenharmony_ci	struct dlm_lock_resource *res;
2418c2ecf20Sopenharmony_ci
2428c2ecf20Sopenharmony_ci	mlog(ML_NOTICE, "%s(%d): recovery info, state=%s, dead=%u, master=%u\n",
2438c2ecf20Sopenharmony_ci	     dlm->name, task_pid_nr(dlm->dlm_reco_thread_task),
2448c2ecf20Sopenharmony_ci	     dlm->reco.state & DLM_RECO_STATE_ACTIVE ? "ACTIVE" : "inactive",
2458c2ecf20Sopenharmony_ci	     dlm->reco.dead_node, dlm->reco.new_master);
2468c2ecf20Sopenharmony_ci
2478c2ecf20Sopenharmony_ci	list_for_each_entry(ndata, &dlm->reco.node_data, list) {
2488c2ecf20Sopenharmony_ci		char *st = "unknown";
2498c2ecf20Sopenharmony_ci		switch (ndata->state) {
2508c2ecf20Sopenharmony_ci			case DLM_RECO_NODE_DATA_INIT:
2518c2ecf20Sopenharmony_ci				st = "init";
2528c2ecf20Sopenharmony_ci				break;
2538c2ecf20Sopenharmony_ci			case DLM_RECO_NODE_DATA_REQUESTING:
2548c2ecf20Sopenharmony_ci				st = "requesting";
2558c2ecf20Sopenharmony_ci				break;
2568c2ecf20Sopenharmony_ci			case DLM_RECO_NODE_DATA_DEAD:
2578c2ecf20Sopenharmony_ci				st = "dead";
2588c2ecf20Sopenharmony_ci				break;
2598c2ecf20Sopenharmony_ci			case DLM_RECO_NODE_DATA_RECEIVING:
2608c2ecf20Sopenharmony_ci				st = "receiving";
2618c2ecf20Sopenharmony_ci				break;
2628c2ecf20Sopenharmony_ci			case DLM_RECO_NODE_DATA_REQUESTED:
2638c2ecf20Sopenharmony_ci				st = "requested";
2648c2ecf20Sopenharmony_ci				break;
2658c2ecf20Sopenharmony_ci			case DLM_RECO_NODE_DATA_DONE:
2668c2ecf20Sopenharmony_ci				st = "done";
2678c2ecf20Sopenharmony_ci				break;
2688c2ecf20Sopenharmony_ci			case DLM_RECO_NODE_DATA_FINALIZE_SENT:
2698c2ecf20Sopenharmony_ci				st = "finalize-sent";
2708c2ecf20Sopenharmony_ci				break;
2718c2ecf20Sopenharmony_ci			default:
2728c2ecf20Sopenharmony_ci				st = "bad";
2738c2ecf20Sopenharmony_ci				break;
2748c2ecf20Sopenharmony_ci		}
2758c2ecf20Sopenharmony_ci		mlog(ML_NOTICE, "%s: reco state, node %u, state=%s\n",
2768c2ecf20Sopenharmony_ci		     dlm->name, ndata->node_num, st);
2778c2ecf20Sopenharmony_ci	}
2788c2ecf20Sopenharmony_ci	list_for_each_entry(res, &dlm->reco.resources, recovering) {
2798c2ecf20Sopenharmony_ci		mlog(ML_NOTICE, "%s: lockres %.*s on recovering list\n",
2808c2ecf20Sopenharmony_ci		     dlm->name, res->lockname.len, res->lockname.name);
2818c2ecf20Sopenharmony_ci	}
2828c2ecf20Sopenharmony_ci}
2838c2ecf20Sopenharmony_ci
2848c2ecf20Sopenharmony_ci#define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000)
2858c2ecf20Sopenharmony_ci
/*
 * Main loop of the per-domain recovery kthread.  Once the domain is
 * fully joined, repeatedly runs dlm_do_recovery(); -EAGAIN means a node
 * was just recovered, so loop again immediately to look for more dead
 * nodes.  Otherwise sleep until kicked or until the periodic timeout.
 */
static int dlm_recovery_thread(void *data)
{
	int status;
	struct dlm_ctxt *dlm = data;
	unsigned long timeout = msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS);

	mlog(0, "dlm thread running for %s...\n", dlm->name);

	while (!kthread_should_stop()) {
		if (dlm_domain_fully_joined(dlm)) {
			status = dlm_do_recovery(dlm);
			if (status == -EAGAIN) {
				/* do not sleep, recheck immediately. */
				continue;
			}
			if (status < 0)
				mlog_errno(status);
		}

		/* wait for a kick (dlm_kick_recovery_thread), thread
		 * stop, or the periodic timeout */
		wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq,
						 kthread_should_stop(),
						 timeout);
	}

	mlog(0, "quitting DLM recovery thread\n");
	return 0;
}
3138c2ecf20Sopenharmony_ci
3148c2ecf20Sopenharmony_ci/* returns true when the recovery master has contacted us */
3158c2ecf20Sopenharmony_cistatic int dlm_reco_master_ready(struct dlm_ctxt *dlm)
3168c2ecf20Sopenharmony_ci{
3178c2ecf20Sopenharmony_ci	int ready;
3188c2ecf20Sopenharmony_ci	spin_lock(&dlm->spinlock);
3198c2ecf20Sopenharmony_ci	ready = (dlm->reco.new_master != O2NM_INVALID_NODE_NUM);
3208c2ecf20Sopenharmony_ci	spin_unlock(&dlm->spinlock);
3218c2ecf20Sopenharmony_ci	return ready;
3228c2ecf20Sopenharmony_ci}
3238c2ecf20Sopenharmony_ci
3248c2ecf20Sopenharmony_ci/* returns true if node is no longer in the domain
3258c2ecf20Sopenharmony_ci * could be dead or just not joined */
3268c2ecf20Sopenharmony_ciint dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node)
3278c2ecf20Sopenharmony_ci{
3288c2ecf20Sopenharmony_ci	int dead;
3298c2ecf20Sopenharmony_ci	spin_lock(&dlm->spinlock);
3308c2ecf20Sopenharmony_ci	dead = !test_bit(node, dlm->domain_map);
3318c2ecf20Sopenharmony_ci	spin_unlock(&dlm->spinlock);
3328c2ecf20Sopenharmony_ci	return dead;
3338c2ecf20Sopenharmony_ci}
3348c2ecf20Sopenharmony_ci
3358c2ecf20Sopenharmony_ci/* returns true if node is no longer in the domain
3368c2ecf20Sopenharmony_ci * could be dead or just not joined */
3378c2ecf20Sopenharmony_cistatic int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node)
3388c2ecf20Sopenharmony_ci{
3398c2ecf20Sopenharmony_ci	int recovered;
3408c2ecf20Sopenharmony_ci	spin_lock(&dlm->spinlock);
3418c2ecf20Sopenharmony_ci	recovered = !test_bit(node, dlm->recovery_map);
3428c2ecf20Sopenharmony_ci	spin_unlock(&dlm->spinlock);
3438c2ecf20Sopenharmony_ci	return recovered;
3448c2ecf20Sopenharmony_ci}
3458c2ecf20Sopenharmony_ci
3468c2ecf20Sopenharmony_ci
3478c2ecf20Sopenharmony_civoid dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
3488c2ecf20Sopenharmony_ci{
3498c2ecf20Sopenharmony_ci	if (dlm_is_node_dead(dlm, node))
3508c2ecf20Sopenharmony_ci		return;
3518c2ecf20Sopenharmony_ci
3528c2ecf20Sopenharmony_ci	printk(KERN_NOTICE "o2dlm: Waiting on the death of node %u in "
3538c2ecf20Sopenharmony_ci	       "domain %s\n", node, dlm->name);
3548c2ecf20Sopenharmony_ci
3558c2ecf20Sopenharmony_ci	if (timeout)
3568c2ecf20Sopenharmony_ci		wait_event_timeout(dlm->dlm_reco_thread_wq,
3578c2ecf20Sopenharmony_ci				   dlm_is_node_dead(dlm, node),
3588c2ecf20Sopenharmony_ci				   msecs_to_jiffies(timeout));
3598c2ecf20Sopenharmony_ci	else
3608c2ecf20Sopenharmony_ci		wait_event(dlm->dlm_reco_thread_wq,
3618c2ecf20Sopenharmony_ci			   dlm_is_node_dead(dlm, node));
3628c2ecf20Sopenharmony_ci}
3638c2ecf20Sopenharmony_ci
3648c2ecf20Sopenharmony_civoid dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
3658c2ecf20Sopenharmony_ci{
3668c2ecf20Sopenharmony_ci	if (dlm_is_node_recovered(dlm, node))
3678c2ecf20Sopenharmony_ci		return;
3688c2ecf20Sopenharmony_ci
3698c2ecf20Sopenharmony_ci	printk(KERN_NOTICE "o2dlm: Waiting on the recovery of node %u in "
3708c2ecf20Sopenharmony_ci	       "domain %s\n", node, dlm->name);
3718c2ecf20Sopenharmony_ci
3728c2ecf20Sopenharmony_ci	if (timeout)
3738c2ecf20Sopenharmony_ci		wait_event_timeout(dlm->dlm_reco_thread_wq,
3748c2ecf20Sopenharmony_ci				   dlm_is_node_recovered(dlm, node),
3758c2ecf20Sopenharmony_ci				   msecs_to_jiffies(timeout));
3768c2ecf20Sopenharmony_ci	else
3778c2ecf20Sopenharmony_ci		wait_event(dlm->dlm_reco_thread_wq,
3788c2ecf20Sopenharmony_ci			   dlm_is_node_recovered(dlm, node));
3798c2ecf20Sopenharmony_ci}
3808c2ecf20Sopenharmony_ci
3818c2ecf20Sopenharmony_ci/* callers of the top-level api calls (dlmlock/dlmunlock) should
3828c2ecf20Sopenharmony_ci * block on the dlm->reco.event when recovery is in progress.
3838c2ecf20Sopenharmony_ci * the dlm recovery thread will set this state when it begins
3848c2ecf20Sopenharmony_ci * recovering a dead node (as the new master or not) and clear
3858c2ecf20Sopenharmony_ci * the state and wake as soon as all affected lock resources have
3868c2ecf20Sopenharmony_ci * been marked with the RECOVERY flag */
3878c2ecf20Sopenharmony_cistatic int dlm_in_recovery(struct dlm_ctxt *dlm)
3888c2ecf20Sopenharmony_ci{
3898c2ecf20Sopenharmony_ci	int in_recovery;
3908c2ecf20Sopenharmony_ci	spin_lock(&dlm->spinlock);
3918c2ecf20Sopenharmony_ci	in_recovery = !!(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
3928c2ecf20Sopenharmony_ci	spin_unlock(&dlm->spinlock);
3938c2ecf20Sopenharmony_ci	return in_recovery;
3948c2ecf20Sopenharmony_ci}
3958c2ecf20Sopenharmony_ci
3968c2ecf20Sopenharmony_ci
/* Block the caller until no recovery is active (see dlm_in_recovery). */
void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
{
	if (dlm_in_recovery(dlm)) {
		/* NOTE: reco.state/new_master/dead_node are read here
		 * without dlm->spinlock; they feed only this debug
		 * message, so a stale snapshot appears harmless. */
		mlog(0, "%s: reco thread %d in recovery: "
		     "state=%d, master=%u, dead=%u\n",
		     dlm->name, task_pid_nr(dlm->dlm_reco_thread_task),
		     dlm->reco.state, dlm->reco.new_master,
		     dlm->reco.dead_node);
	}
	wait_event(dlm->reco.event, !dlm_in_recovery(dlm));
}
4088c2ecf20Sopenharmony_ci
/*
 * Mark recovery active for dlm->reco.dead_node.  Caller must hold
 * dlm->spinlock.  Until dlm_end_recovery() clears the flag, callers of
 * dlm_wait_for_recovery() will block.
 */
static void dlm_begin_recovery(struct dlm_ctxt *dlm)
{
	assert_spin_locked(&dlm->spinlock);
	BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
	printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n",
	       dlm->name, dlm->reco.dead_node);
	dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
}
4178c2ecf20Sopenharmony_ci
/*
 * Clear the ACTIVE recovery state (taking dlm->spinlock itself) and
 * wake everyone blocked in dlm_wait_for_recovery().
 */
static void dlm_end_recovery(struct dlm_ctxt *dlm)
{
	spin_lock(&dlm->spinlock);
	BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE));
	dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE;
	spin_unlock(&dlm->spinlock);
	printk(KERN_NOTICE "o2dlm: End recovery on domain %s\n", dlm->name);
	wake_up(&dlm->reco.event);
}
4278c2ecf20Sopenharmony_ci
4288c2ecf20Sopenharmony_cistatic void dlm_print_recovery_master(struct dlm_ctxt *dlm)
4298c2ecf20Sopenharmony_ci{
4308c2ecf20Sopenharmony_ci	printk(KERN_NOTICE "o2dlm: Node %u (%s) is the Recovery Master for the "
4318c2ecf20Sopenharmony_ci	       "dead node %u in domain %s\n", dlm->reco.new_master,
4328c2ecf20Sopenharmony_ci	       (dlm->node_num == dlm->reco.new_master ? "me" : "he"),
4338c2ecf20Sopenharmony_ci	       dlm->reco.dead_node, dlm->name);
4348c2ecf20Sopenharmony_ci}
4358c2ecf20Sopenharmony_ci
/*
 * One pass of the recovery state machine, run from dlm_recovery_thread().
 *
 * Returns 0 when there is nothing (more) to do or another node is
 * mastering the session, and -EAGAIN after this node finishes (or fails
 * and will retry) mastering a dead node, so the caller loops again
 * immediately to look for further dead nodes.
 */
static int dlm_do_recovery(struct dlm_ctxt *dlm)
{
	int status = 0;
	int ret;

	spin_lock(&dlm->spinlock);

	/* all lock resources were already migrated away; nothing to do */
	if (dlm->migrate_done) {
		mlog(0, "%s: no need do recovery after migrating all "
		     "lock resources\n", dlm->name);
		spin_unlock(&dlm->spinlock);
		return 0;
	}

	/* check to see if the new master has died */
	if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM &&
	    test_bit(dlm->reco.new_master, dlm->recovery_map)) {
		mlog(0, "new master %u died while recovering %u!\n",
		     dlm->reco.new_master, dlm->reco.dead_node);
		/* unset the new_master, leave dead_node */
		dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM);
	}

	/* select a target to recover: first set bit in the recovery map */
	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
		int bit;

		bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0);
		if (bit >= O2NM_MAX_NODES || bit < 0)
			dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
		else
			dlm_set_reco_dead_node(dlm, bit);
	} else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) {
		/* BUG? */
		mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n",
		     dlm->reco.dead_node);
		dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
	}

	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
		// mlog(0, "nothing to recover!  sleeping now!\n");
		spin_unlock(&dlm->spinlock);
		/* return to main thread loop and sleep. */
		return 0;
	}
	mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n",
	     dlm->name, task_pid_nr(dlm->dlm_reco_thread_task),
	     dlm->reco.dead_node);

	/* take write barrier */
	/* (stops the list reshuffling thread, proxy ast handling) */
	dlm_begin_recovery(dlm);

	spin_unlock(&dlm->spinlock);

	/* this node was previously chosen as master; resume mastering */
	if (dlm->reco.new_master == dlm->node_num)
		goto master_here;

	if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
		/* choose a new master, returns 0 if this node
		 * is the master, -EEXIST if it's another node.
		 * this does not return until a new master is chosen
		 * or recovery completes entirely. */
		ret = dlm_pick_recovery_master(dlm);
		if (!ret) {
			/* already notified everyone.  go. */
			goto master_here;
		}
		mlog(0, "another node will master this recovery session.\n");
	}

	dlm_print_recovery_master(dlm);

	/* it is safe to start everything back up here
	 * because all of the dead node's lock resources
	 * have been marked as in-recovery */
	dlm_end_recovery(dlm);

	/* sleep out in main dlm_recovery_thread loop. */
	return 0;

master_here:
	dlm_print_recovery_master(dlm);

	status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
	if (status < 0) {
		/* we should never hit this anymore */
		mlog(ML_ERROR, "%s: Error %d remastering locks for node %u, "
		     "retrying.\n", dlm->name, status, dlm->reco.dead_node);
		/* yield a bit to allow any final network messages
		 * to get handled on remaining nodes */
		msleep(100);
	} else {
		/* success!  see if any other nodes need recovery */
		mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n",
		     dlm->name, dlm->reco.dead_node, dlm->node_num);
		spin_lock(&dlm->spinlock);
		__dlm_reset_recovery(dlm);
		dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
		spin_unlock(&dlm->spinlock);
	}
	dlm_end_recovery(dlm);

	/* continue and look for another dead node */
	return -EAGAIN;
}
5428c2ecf20Sopenharmony_ci
5438c2ecf20Sopenharmony_cistatic int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
5448c2ecf20Sopenharmony_ci{
5458c2ecf20Sopenharmony_ci	int status = 0;
5468c2ecf20Sopenharmony_ci	struct dlm_reco_node_data *ndata;
5478c2ecf20Sopenharmony_ci	int all_nodes_done;
5488c2ecf20Sopenharmony_ci	int destroy = 0;
5498c2ecf20Sopenharmony_ci	int pass = 0;
5508c2ecf20Sopenharmony_ci
5518c2ecf20Sopenharmony_ci	do {
5528c2ecf20Sopenharmony_ci		/* we have become recovery master.  there is no escaping
5538c2ecf20Sopenharmony_ci		 * this, so just keep trying until we get it. */
5548c2ecf20Sopenharmony_ci		status = dlm_init_recovery_area(dlm, dead_node);
5558c2ecf20Sopenharmony_ci		if (status < 0) {
5568c2ecf20Sopenharmony_ci			mlog(ML_ERROR, "%s: failed to alloc recovery area, "
5578c2ecf20Sopenharmony_ci			     "retrying\n", dlm->name);
5588c2ecf20Sopenharmony_ci			msleep(1000);
5598c2ecf20Sopenharmony_ci		}
5608c2ecf20Sopenharmony_ci	} while (status != 0);
5618c2ecf20Sopenharmony_ci
5628c2ecf20Sopenharmony_ci	/* safe to access the node data list without a lock, since this
5638c2ecf20Sopenharmony_ci	 * process is the only one to change the list */
5648c2ecf20Sopenharmony_ci	list_for_each_entry(ndata, &dlm->reco.node_data, list) {
5658c2ecf20Sopenharmony_ci		BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
5668c2ecf20Sopenharmony_ci		ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
5678c2ecf20Sopenharmony_ci
5688c2ecf20Sopenharmony_ci		mlog(0, "%s: Requesting lock info from node %u\n", dlm->name,
5698c2ecf20Sopenharmony_ci		     ndata->node_num);
5708c2ecf20Sopenharmony_ci
5718c2ecf20Sopenharmony_ci		if (ndata->node_num == dlm->node_num) {
5728c2ecf20Sopenharmony_ci			ndata->state = DLM_RECO_NODE_DATA_DONE;
5738c2ecf20Sopenharmony_ci			continue;
5748c2ecf20Sopenharmony_ci		}
5758c2ecf20Sopenharmony_ci
5768c2ecf20Sopenharmony_ci		do {
5778c2ecf20Sopenharmony_ci			status = dlm_request_all_locks(dlm, ndata->node_num,
5788c2ecf20Sopenharmony_ci						       dead_node);
5798c2ecf20Sopenharmony_ci			if (status < 0) {
5808c2ecf20Sopenharmony_ci				mlog_errno(status);
5818c2ecf20Sopenharmony_ci				if (dlm_is_host_down(status)) {
5828c2ecf20Sopenharmony_ci					/* node died, ignore it for recovery */
5838c2ecf20Sopenharmony_ci					status = 0;
5848c2ecf20Sopenharmony_ci					ndata->state = DLM_RECO_NODE_DATA_DEAD;
5858c2ecf20Sopenharmony_ci					/* wait for the domain map to catch up
5868c2ecf20Sopenharmony_ci					 * with the network state. */
5878c2ecf20Sopenharmony_ci					wait_event_timeout(dlm->dlm_reco_thread_wq,
5888c2ecf20Sopenharmony_ci							   dlm_is_node_dead(dlm,
5898c2ecf20Sopenharmony_ci								ndata->node_num),
5908c2ecf20Sopenharmony_ci							   msecs_to_jiffies(1000));
5918c2ecf20Sopenharmony_ci					mlog(0, "waited 1 sec for %u, "
5928c2ecf20Sopenharmony_ci					     "dead? %s\n", ndata->node_num,
5938c2ecf20Sopenharmony_ci					     dlm_is_node_dead(dlm, ndata->node_num) ?
5948c2ecf20Sopenharmony_ci					     "yes" : "no");
5958c2ecf20Sopenharmony_ci				} else {
5968c2ecf20Sopenharmony_ci					/* -ENOMEM on the other node */
5978c2ecf20Sopenharmony_ci					mlog(0, "%s: node %u returned "
5988c2ecf20Sopenharmony_ci					     "%d during recovery, retrying "
5998c2ecf20Sopenharmony_ci					     "after a short wait\n",
6008c2ecf20Sopenharmony_ci					     dlm->name, ndata->node_num,
6018c2ecf20Sopenharmony_ci					     status);
6028c2ecf20Sopenharmony_ci					msleep(100);
6038c2ecf20Sopenharmony_ci				}
6048c2ecf20Sopenharmony_ci			}
6058c2ecf20Sopenharmony_ci		} while (status != 0);
6068c2ecf20Sopenharmony_ci
6078c2ecf20Sopenharmony_ci		spin_lock(&dlm_reco_state_lock);
6088c2ecf20Sopenharmony_ci		switch (ndata->state) {
6098c2ecf20Sopenharmony_ci			case DLM_RECO_NODE_DATA_INIT:
6108c2ecf20Sopenharmony_ci			case DLM_RECO_NODE_DATA_FINALIZE_SENT:
6118c2ecf20Sopenharmony_ci			case DLM_RECO_NODE_DATA_REQUESTED:
6128c2ecf20Sopenharmony_ci				BUG();
6138c2ecf20Sopenharmony_ci				break;
6148c2ecf20Sopenharmony_ci			case DLM_RECO_NODE_DATA_DEAD:
6158c2ecf20Sopenharmony_ci				mlog(0, "node %u died after requesting "
6168c2ecf20Sopenharmony_ci				     "recovery info for node %u\n",
6178c2ecf20Sopenharmony_ci				     ndata->node_num, dead_node);
6188c2ecf20Sopenharmony_ci				/* fine.  don't need this node's info.
6198c2ecf20Sopenharmony_ci				 * continue without it. */
6208c2ecf20Sopenharmony_ci				break;
6218c2ecf20Sopenharmony_ci			case DLM_RECO_NODE_DATA_REQUESTING:
6228c2ecf20Sopenharmony_ci				ndata->state = DLM_RECO_NODE_DATA_REQUESTED;
6238c2ecf20Sopenharmony_ci				mlog(0, "now receiving recovery data from "
6248c2ecf20Sopenharmony_ci				     "node %u for dead node %u\n",
6258c2ecf20Sopenharmony_ci				     ndata->node_num, dead_node);
6268c2ecf20Sopenharmony_ci				break;
6278c2ecf20Sopenharmony_ci			case DLM_RECO_NODE_DATA_RECEIVING:
6288c2ecf20Sopenharmony_ci				mlog(0, "already receiving recovery data from "
6298c2ecf20Sopenharmony_ci				     "node %u for dead node %u\n",
6308c2ecf20Sopenharmony_ci				     ndata->node_num, dead_node);
6318c2ecf20Sopenharmony_ci				break;
6328c2ecf20Sopenharmony_ci			case DLM_RECO_NODE_DATA_DONE:
6338c2ecf20Sopenharmony_ci				mlog(0, "already DONE receiving recovery data "
6348c2ecf20Sopenharmony_ci				     "from node %u for dead node %u\n",
6358c2ecf20Sopenharmony_ci				     ndata->node_num, dead_node);
6368c2ecf20Sopenharmony_ci				break;
6378c2ecf20Sopenharmony_ci		}
6388c2ecf20Sopenharmony_ci		spin_unlock(&dlm_reco_state_lock);
6398c2ecf20Sopenharmony_ci	}
6408c2ecf20Sopenharmony_ci
6418c2ecf20Sopenharmony_ci	mlog(0, "%s: Done requesting all lock info\n", dlm->name);
6428c2ecf20Sopenharmony_ci
6438c2ecf20Sopenharmony_ci	/* nodes should be sending reco data now
6448c2ecf20Sopenharmony_ci	 * just need to wait */
6458c2ecf20Sopenharmony_ci
6468c2ecf20Sopenharmony_ci	while (1) {
6478c2ecf20Sopenharmony_ci		/* check all the nodes now to see if we are
6488c2ecf20Sopenharmony_ci		 * done, or if anyone died */
6498c2ecf20Sopenharmony_ci		all_nodes_done = 1;
6508c2ecf20Sopenharmony_ci		spin_lock(&dlm_reco_state_lock);
6518c2ecf20Sopenharmony_ci		list_for_each_entry(ndata, &dlm->reco.node_data, list) {
6528c2ecf20Sopenharmony_ci			mlog(0, "checking recovery state of node %u\n",
6538c2ecf20Sopenharmony_ci			     ndata->node_num);
6548c2ecf20Sopenharmony_ci			switch (ndata->state) {
6558c2ecf20Sopenharmony_ci				case DLM_RECO_NODE_DATA_INIT:
6568c2ecf20Sopenharmony_ci				case DLM_RECO_NODE_DATA_REQUESTING:
6578c2ecf20Sopenharmony_ci					mlog(ML_ERROR, "bad ndata state for "
6588c2ecf20Sopenharmony_ci					     "node %u: state=%d\n",
6598c2ecf20Sopenharmony_ci					     ndata->node_num, ndata->state);
6608c2ecf20Sopenharmony_ci					BUG();
6618c2ecf20Sopenharmony_ci					break;
6628c2ecf20Sopenharmony_ci				case DLM_RECO_NODE_DATA_DEAD:
6638c2ecf20Sopenharmony_ci					mlog(0, "node %u died after "
6648c2ecf20Sopenharmony_ci					     "requesting recovery info for "
6658c2ecf20Sopenharmony_ci					     "node %u\n", ndata->node_num,
6668c2ecf20Sopenharmony_ci					     dead_node);
6678c2ecf20Sopenharmony_ci					break;
6688c2ecf20Sopenharmony_ci				case DLM_RECO_NODE_DATA_RECEIVING:
6698c2ecf20Sopenharmony_ci				case DLM_RECO_NODE_DATA_REQUESTED:
6708c2ecf20Sopenharmony_ci					mlog(0, "%s: node %u still in state %s\n",
6718c2ecf20Sopenharmony_ci					     dlm->name, ndata->node_num,
6728c2ecf20Sopenharmony_ci					     ndata->state==DLM_RECO_NODE_DATA_RECEIVING ?
6738c2ecf20Sopenharmony_ci					     "receiving" : "requested");
6748c2ecf20Sopenharmony_ci					all_nodes_done = 0;
6758c2ecf20Sopenharmony_ci					break;
6768c2ecf20Sopenharmony_ci				case DLM_RECO_NODE_DATA_DONE:
6778c2ecf20Sopenharmony_ci					mlog(0, "%s: node %u state is done\n",
6788c2ecf20Sopenharmony_ci					     dlm->name, ndata->node_num);
6798c2ecf20Sopenharmony_ci					break;
6808c2ecf20Sopenharmony_ci				case DLM_RECO_NODE_DATA_FINALIZE_SENT:
6818c2ecf20Sopenharmony_ci					mlog(0, "%s: node %u state is finalize\n",
6828c2ecf20Sopenharmony_ci					     dlm->name, ndata->node_num);
6838c2ecf20Sopenharmony_ci					break;
6848c2ecf20Sopenharmony_ci			}
6858c2ecf20Sopenharmony_ci		}
6868c2ecf20Sopenharmony_ci		spin_unlock(&dlm_reco_state_lock);
6878c2ecf20Sopenharmony_ci
6888c2ecf20Sopenharmony_ci		mlog(0, "pass #%d, all_nodes_done?: %s\n", ++pass,
6898c2ecf20Sopenharmony_ci		     all_nodes_done?"yes":"no");
6908c2ecf20Sopenharmony_ci		if (all_nodes_done) {
6918c2ecf20Sopenharmony_ci			int ret;
6928c2ecf20Sopenharmony_ci
6938c2ecf20Sopenharmony_ci			/* Set this flag on recovery master to avoid
6948c2ecf20Sopenharmony_ci			 * a new recovery for another dead node start
6958c2ecf20Sopenharmony_ci			 * before the recovery is not done. That may
6968c2ecf20Sopenharmony_ci			 * cause recovery hung.*/
6978c2ecf20Sopenharmony_ci			spin_lock(&dlm->spinlock);
6988c2ecf20Sopenharmony_ci			dlm->reco.state |= DLM_RECO_STATE_FINALIZE;
6998c2ecf20Sopenharmony_ci			spin_unlock(&dlm->spinlock);
7008c2ecf20Sopenharmony_ci
7018c2ecf20Sopenharmony_ci			/* all nodes are now in DLM_RECO_NODE_DATA_DONE state
7028c2ecf20Sopenharmony_ci	 		 * just send a finalize message to everyone and
7038c2ecf20Sopenharmony_ci	 		 * clean up */
7048c2ecf20Sopenharmony_ci			mlog(0, "all nodes are done! send finalize\n");
7058c2ecf20Sopenharmony_ci			ret = dlm_send_finalize_reco_message(dlm);
7068c2ecf20Sopenharmony_ci			if (ret < 0)
7078c2ecf20Sopenharmony_ci				mlog_errno(ret);
7088c2ecf20Sopenharmony_ci
7098c2ecf20Sopenharmony_ci			spin_lock(&dlm->spinlock);
7108c2ecf20Sopenharmony_ci			dlm_finish_local_lockres_recovery(dlm, dead_node,
7118c2ecf20Sopenharmony_ci							  dlm->node_num);
7128c2ecf20Sopenharmony_ci			spin_unlock(&dlm->spinlock);
7138c2ecf20Sopenharmony_ci			mlog(0, "should be done with recovery!\n");
7148c2ecf20Sopenharmony_ci
7158c2ecf20Sopenharmony_ci			mlog(0, "finishing recovery of %s at %lu, "
7168c2ecf20Sopenharmony_ci			     "dead=%u, this=%u, new=%u\n", dlm->name,
7178c2ecf20Sopenharmony_ci			     jiffies, dlm->reco.dead_node,
7188c2ecf20Sopenharmony_ci			     dlm->node_num, dlm->reco.new_master);
7198c2ecf20Sopenharmony_ci			destroy = 1;
7208c2ecf20Sopenharmony_ci			status = 0;
7218c2ecf20Sopenharmony_ci			/* rescan everything marked dirty along the way */
7228c2ecf20Sopenharmony_ci			dlm_kick_thread(dlm, NULL);
7238c2ecf20Sopenharmony_ci			break;
7248c2ecf20Sopenharmony_ci		}
7258c2ecf20Sopenharmony_ci		/* wait to be signalled, with periodic timeout
7268c2ecf20Sopenharmony_ci		 * to check for node death */
7278c2ecf20Sopenharmony_ci		wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq,
7288c2ecf20Sopenharmony_ci					 kthread_should_stop(),
7298c2ecf20Sopenharmony_ci					 msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS));
7308c2ecf20Sopenharmony_ci
7318c2ecf20Sopenharmony_ci	}
7328c2ecf20Sopenharmony_ci
7338c2ecf20Sopenharmony_ci	if (destroy)
7348c2ecf20Sopenharmony_ci		dlm_destroy_recovery_area(dlm);
7358c2ecf20Sopenharmony_ci
7368c2ecf20Sopenharmony_ci	return status;
7378c2ecf20Sopenharmony_ci}
7388c2ecf20Sopenharmony_ci
7398c2ecf20Sopenharmony_cistatic int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
7408c2ecf20Sopenharmony_ci{
7418c2ecf20Sopenharmony_ci	int num=0;
7428c2ecf20Sopenharmony_ci	struct dlm_reco_node_data *ndata;
7438c2ecf20Sopenharmony_ci
7448c2ecf20Sopenharmony_ci	spin_lock(&dlm->spinlock);
7458c2ecf20Sopenharmony_ci	memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map));
7468c2ecf20Sopenharmony_ci	/* nodes can only be removed (by dying) after dropping
7478c2ecf20Sopenharmony_ci	 * this lock, and death will be trapped later, so this should do */
7488c2ecf20Sopenharmony_ci	spin_unlock(&dlm->spinlock);
7498c2ecf20Sopenharmony_ci
7508c2ecf20Sopenharmony_ci	while (1) {
7518c2ecf20Sopenharmony_ci		num = find_next_bit (dlm->reco.node_map, O2NM_MAX_NODES, num);
7528c2ecf20Sopenharmony_ci		if (num >= O2NM_MAX_NODES) {
7538c2ecf20Sopenharmony_ci			break;
7548c2ecf20Sopenharmony_ci		}
7558c2ecf20Sopenharmony_ci		BUG_ON(num == dead_node);
7568c2ecf20Sopenharmony_ci
7578c2ecf20Sopenharmony_ci		ndata = kzalloc(sizeof(*ndata), GFP_NOFS);
7588c2ecf20Sopenharmony_ci		if (!ndata) {
7598c2ecf20Sopenharmony_ci			dlm_destroy_recovery_area(dlm);
7608c2ecf20Sopenharmony_ci			return -ENOMEM;
7618c2ecf20Sopenharmony_ci		}
7628c2ecf20Sopenharmony_ci		ndata->node_num = num;
7638c2ecf20Sopenharmony_ci		ndata->state = DLM_RECO_NODE_DATA_INIT;
7648c2ecf20Sopenharmony_ci		spin_lock(&dlm_reco_state_lock);
7658c2ecf20Sopenharmony_ci		list_add_tail(&ndata->list, &dlm->reco.node_data);
7668c2ecf20Sopenharmony_ci		spin_unlock(&dlm_reco_state_lock);
7678c2ecf20Sopenharmony_ci		num++;
7688c2ecf20Sopenharmony_ci	}
7698c2ecf20Sopenharmony_ci
7708c2ecf20Sopenharmony_ci	return 0;
7718c2ecf20Sopenharmony_ci}
7728c2ecf20Sopenharmony_ci
7738c2ecf20Sopenharmony_cistatic void dlm_destroy_recovery_area(struct dlm_ctxt *dlm)
7748c2ecf20Sopenharmony_ci{
7758c2ecf20Sopenharmony_ci	struct dlm_reco_node_data *ndata, *next;
7768c2ecf20Sopenharmony_ci	LIST_HEAD(tmplist);
7778c2ecf20Sopenharmony_ci
7788c2ecf20Sopenharmony_ci	spin_lock(&dlm_reco_state_lock);
7798c2ecf20Sopenharmony_ci	list_splice_init(&dlm->reco.node_data, &tmplist);
7808c2ecf20Sopenharmony_ci	spin_unlock(&dlm_reco_state_lock);
7818c2ecf20Sopenharmony_ci
7828c2ecf20Sopenharmony_ci	list_for_each_entry_safe(ndata, next, &tmplist, list) {
7838c2ecf20Sopenharmony_ci		list_del_init(&ndata->list);
7848c2ecf20Sopenharmony_ci		kfree(ndata);
7858c2ecf20Sopenharmony_ci	}
7868c2ecf20Sopenharmony_ci}
7878c2ecf20Sopenharmony_ci
7888c2ecf20Sopenharmony_cistatic int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
7898c2ecf20Sopenharmony_ci				 u8 dead_node)
7908c2ecf20Sopenharmony_ci{
7918c2ecf20Sopenharmony_ci	struct dlm_lock_request lr;
7928c2ecf20Sopenharmony_ci	int ret;
7938c2ecf20Sopenharmony_ci	int status;
7948c2ecf20Sopenharmony_ci
7958c2ecf20Sopenharmony_ci	mlog(0, "\n");
7968c2ecf20Sopenharmony_ci
7978c2ecf20Sopenharmony_ci
7988c2ecf20Sopenharmony_ci	mlog(0, "dlm_request_all_locks: dead node is %u, sending request "
7998c2ecf20Sopenharmony_ci		  "to %u\n", dead_node, request_from);
8008c2ecf20Sopenharmony_ci
8018c2ecf20Sopenharmony_ci	memset(&lr, 0, sizeof(lr));
8028c2ecf20Sopenharmony_ci	lr.node_idx = dlm->node_num;
8038c2ecf20Sopenharmony_ci	lr.dead_node = dead_node;
8048c2ecf20Sopenharmony_ci
8058c2ecf20Sopenharmony_ci	// send message
8068c2ecf20Sopenharmony_ci	ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
8078c2ecf20Sopenharmony_ci				 &lr, sizeof(lr), request_from, &status);
8088c2ecf20Sopenharmony_ci
8098c2ecf20Sopenharmony_ci	/* negative status is handled by caller */
8108c2ecf20Sopenharmony_ci	if (ret < 0)
8118c2ecf20Sopenharmony_ci		mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u "
8128c2ecf20Sopenharmony_ci		     "to recover dead node %u\n", dlm->name, ret,
8138c2ecf20Sopenharmony_ci		     request_from, dead_node);
8148c2ecf20Sopenharmony_ci	else
8158c2ecf20Sopenharmony_ci		ret = status;
8168c2ecf20Sopenharmony_ci	// return from here, then
8178c2ecf20Sopenharmony_ci	// sleep until all received or error
8188c2ecf20Sopenharmony_ci	return ret;
8198c2ecf20Sopenharmony_ci
8208c2ecf20Sopenharmony_ci}
8218c2ecf20Sopenharmony_ci
8228c2ecf20Sopenharmony_ciint dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data,
8238c2ecf20Sopenharmony_ci				  void **ret_data)
8248c2ecf20Sopenharmony_ci{
8258c2ecf20Sopenharmony_ci	struct dlm_ctxt *dlm = data;
8268c2ecf20Sopenharmony_ci	struct dlm_lock_request *lr = (struct dlm_lock_request *)msg->buf;
8278c2ecf20Sopenharmony_ci	char *buf = NULL;
8288c2ecf20Sopenharmony_ci	struct dlm_work_item *item = NULL;
8298c2ecf20Sopenharmony_ci
8308c2ecf20Sopenharmony_ci	if (!dlm_grab(dlm))
8318c2ecf20Sopenharmony_ci		return -EINVAL;
8328c2ecf20Sopenharmony_ci
8338c2ecf20Sopenharmony_ci	if (lr->dead_node != dlm->reco.dead_node) {
8348c2ecf20Sopenharmony_ci		mlog(ML_ERROR, "%s: node %u sent dead_node=%u, but local "
8358c2ecf20Sopenharmony_ci		     "dead_node is %u\n", dlm->name, lr->node_idx,
8368c2ecf20Sopenharmony_ci		     lr->dead_node, dlm->reco.dead_node);
8378c2ecf20Sopenharmony_ci		dlm_print_reco_node_status(dlm);
8388c2ecf20Sopenharmony_ci		/* this is a hack */
8398c2ecf20Sopenharmony_ci		dlm_put(dlm);
8408c2ecf20Sopenharmony_ci		return -ENOMEM;
8418c2ecf20Sopenharmony_ci	}
8428c2ecf20Sopenharmony_ci	BUG_ON(lr->dead_node != dlm->reco.dead_node);
8438c2ecf20Sopenharmony_ci
8448c2ecf20Sopenharmony_ci	item = kzalloc(sizeof(*item), GFP_NOFS);
8458c2ecf20Sopenharmony_ci	if (!item) {
8468c2ecf20Sopenharmony_ci		dlm_put(dlm);
8478c2ecf20Sopenharmony_ci		return -ENOMEM;
8488c2ecf20Sopenharmony_ci	}
8498c2ecf20Sopenharmony_ci
8508c2ecf20Sopenharmony_ci	/* this will get freed by dlm_request_all_locks_worker */
8518c2ecf20Sopenharmony_ci	buf = (char *) __get_free_page(GFP_NOFS);
8528c2ecf20Sopenharmony_ci	if (!buf) {
8538c2ecf20Sopenharmony_ci		kfree(item);
8548c2ecf20Sopenharmony_ci		dlm_put(dlm);
8558c2ecf20Sopenharmony_ci		return -ENOMEM;
8568c2ecf20Sopenharmony_ci	}
8578c2ecf20Sopenharmony_ci
8588c2ecf20Sopenharmony_ci	/* queue up work for dlm_request_all_locks_worker */
8598c2ecf20Sopenharmony_ci	dlm_grab(dlm);  /* get an extra ref for the work item */
8608c2ecf20Sopenharmony_ci	dlm_init_work_item(dlm, item, dlm_request_all_locks_worker, buf);
8618c2ecf20Sopenharmony_ci	item->u.ral.reco_master = lr->node_idx;
8628c2ecf20Sopenharmony_ci	item->u.ral.dead_node = lr->dead_node;
8638c2ecf20Sopenharmony_ci	spin_lock(&dlm->work_lock);
8648c2ecf20Sopenharmony_ci	list_add_tail(&item->list, &dlm->work_list);
8658c2ecf20Sopenharmony_ci	spin_unlock(&dlm->work_lock);
8668c2ecf20Sopenharmony_ci	queue_work(dlm->dlm_worker, &dlm->dispatched_work);
8678c2ecf20Sopenharmony_ci
8688c2ecf20Sopenharmony_ci	dlm_put(dlm);
8698c2ecf20Sopenharmony_ci	return 0;
8708c2ecf20Sopenharmony_ci}
8718c2ecf20Sopenharmony_ci
8728c2ecf20Sopenharmony_cistatic void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
8738c2ecf20Sopenharmony_ci{
8748c2ecf20Sopenharmony_ci	struct dlm_migratable_lockres *mres;
8758c2ecf20Sopenharmony_ci	struct dlm_lock_resource *res;
8768c2ecf20Sopenharmony_ci	struct dlm_ctxt *dlm;
8778c2ecf20Sopenharmony_ci	LIST_HEAD(resources);
8788c2ecf20Sopenharmony_ci	int ret;
8798c2ecf20Sopenharmony_ci	u8 dead_node, reco_master;
8808c2ecf20Sopenharmony_ci	int skip_all_done = 0;
8818c2ecf20Sopenharmony_ci
8828c2ecf20Sopenharmony_ci	dlm = item->dlm;
8838c2ecf20Sopenharmony_ci	dead_node = item->u.ral.dead_node;
8848c2ecf20Sopenharmony_ci	reco_master = item->u.ral.reco_master;
8858c2ecf20Sopenharmony_ci	mres = (struct dlm_migratable_lockres *)data;
8868c2ecf20Sopenharmony_ci
8878c2ecf20Sopenharmony_ci	mlog(0, "%s: recovery worker started, dead=%u, master=%u\n",
8888c2ecf20Sopenharmony_ci	     dlm->name, dead_node, reco_master);
8898c2ecf20Sopenharmony_ci
8908c2ecf20Sopenharmony_ci	if (dead_node != dlm->reco.dead_node ||
8918c2ecf20Sopenharmony_ci	    reco_master != dlm->reco.new_master) {
8928c2ecf20Sopenharmony_ci		/* worker could have been created before the recovery master
8938c2ecf20Sopenharmony_ci		 * died.  if so, do not continue, but do not error. */
8948c2ecf20Sopenharmony_ci		if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
8958c2ecf20Sopenharmony_ci			mlog(ML_NOTICE, "%s: will not send recovery state, "
8968c2ecf20Sopenharmony_ci			     "recovery master %u died, thread=(dead=%u,mas=%u)"
8978c2ecf20Sopenharmony_ci			     " current=(dead=%u,mas=%u)\n", dlm->name,
8988c2ecf20Sopenharmony_ci			     reco_master, dead_node, reco_master,
8998c2ecf20Sopenharmony_ci			     dlm->reco.dead_node, dlm->reco.new_master);
9008c2ecf20Sopenharmony_ci		} else {
9018c2ecf20Sopenharmony_ci			mlog(ML_NOTICE, "%s: reco state invalid: reco(dead=%u, "
9028c2ecf20Sopenharmony_ci			     "master=%u), request(dead=%u, master=%u)\n",
9038c2ecf20Sopenharmony_ci			     dlm->name, dlm->reco.dead_node,
9048c2ecf20Sopenharmony_ci			     dlm->reco.new_master, dead_node, reco_master);
9058c2ecf20Sopenharmony_ci		}
9068c2ecf20Sopenharmony_ci		goto leave;
9078c2ecf20Sopenharmony_ci	}
9088c2ecf20Sopenharmony_ci
9098c2ecf20Sopenharmony_ci	/* lock resources should have already been moved to the
9108c2ecf20Sopenharmony_ci 	 * dlm->reco.resources list.  now move items from that list
9118c2ecf20Sopenharmony_ci 	 * to a temp list if the dead owner matches.  note that the
9128c2ecf20Sopenharmony_ci	 * whole cluster recovers only one node at a time, so we
9138c2ecf20Sopenharmony_ci	 * can safely move UNKNOWN lock resources for each recovery
9148c2ecf20Sopenharmony_ci	 * session. */
9158c2ecf20Sopenharmony_ci	dlm_move_reco_locks_to_list(dlm, &resources, dead_node);
9168c2ecf20Sopenharmony_ci
9178c2ecf20Sopenharmony_ci	/* now we can begin blasting lockreses without the dlm lock */
9188c2ecf20Sopenharmony_ci
9198c2ecf20Sopenharmony_ci	/* any errors returned will be due to the new_master dying,
9208c2ecf20Sopenharmony_ci	 * the dlm_reco_thread should detect this */
9218c2ecf20Sopenharmony_ci	list_for_each_entry(res, &resources, recovering) {
9228c2ecf20Sopenharmony_ci		ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
9238c2ecf20Sopenharmony_ci				   	DLM_MRES_RECOVERY);
9248c2ecf20Sopenharmony_ci		if (ret < 0) {
9258c2ecf20Sopenharmony_ci			mlog(ML_ERROR, "%s: node %u went down while sending "
9268c2ecf20Sopenharmony_ci			     "recovery state for dead node %u, ret=%d\n", dlm->name,
9278c2ecf20Sopenharmony_ci			     reco_master, dead_node, ret);
9288c2ecf20Sopenharmony_ci			skip_all_done = 1;
9298c2ecf20Sopenharmony_ci			break;
9308c2ecf20Sopenharmony_ci		}
9318c2ecf20Sopenharmony_ci	}
9328c2ecf20Sopenharmony_ci
9338c2ecf20Sopenharmony_ci	/* move the resources back to the list */
9348c2ecf20Sopenharmony_ci	spin_lock(&dlm->spinlock);
9358c2ecf20Sopenharmony_ci	list_splice_init(&resources, &dlm->reco.resources);
9368c2ecf20Sopenharmony_ci	spin_unlock(&dlm->spinlock);
9378c2ecf20Sopenharmony_ci
9388c2ecf20Sopenharmony_ci	if (!skip_all_done) {
9398c2ecf20Sopenharmony_ci		ret = dlm_send_all_done_msg(dlm, dead_node, reco_master);
9408c2ecf20Sopenharmony_ci		if (ret < 0) {
9418c2ecf20Sopenharmony_ci			mlog(ML_ERROR, "%s: node %u went down while sending "
9428c2ecf20Sopenharmony_ci			     "recovery all-done for dead node %u, ret=%d\n",
9438c2ecf20Sopenharmony_ci			     dlm->name, reco_master, dead_node, ret);
9448c2ecf20Sopenharmony_ci		}
9458c2ecf20Sopenharmony_ci	}
9468c2ecf20Sopenharmony_cileave:
9478c2ecf20Sopenharmony_ci	free_page((unsigned long)data);
9488c2ecf20Sopenharmony_ci}
9498c2ecf20Sopenharmony_ci
9508c2ecf20Sopenharmony_ci
9518c2ecf20Sopenharmony_cistatic int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
9528c2ecf20Sopenharmony_ci{
9538c2ecf20Sopenharmony_ci	int ret, tmpret;
9548c2ecf20Sopenharmony_ci	struct dlm_reco_data_done done_msg;
9558c2ecf20Sopenharmony_ci
9568c2ecf20Sopenharmony_ci	memset(&done_msg, 0, sizeof(done_msg));
9578c2ecf20Sopenharmony_ci	done_msg.node_idx = dlm->node_num;
9588c2ecf20Sopenharmony_ci	done_msg.dead_node = dead_node;
9598c2ecf20Sopenharmony_ci	mlog(0, "sending DATA DONE message to %u, "
9608c2ecf20Sopenharmony_ci	     "my node=%u, dead node=%u\n", send_to, done_msg.node_idx,
9618c2ecf20Sopenharmony_ci	     done_msg.dead_node);
9628c2ecf20Sopenharmony_ci
9638c2ecf20Sopenharmony_ci	ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
9648c2ecf20Sopenharmony_ci				 sizeof(done_msg), send_to, &tmpret);
9658c2ecf20Sopenharmony_ci	if (ret < 0) {
9668c2ecf20Sopenharmony_ci		mlog(ML_ERROR, "%s: Error %d send RECO_DATA_DONE to node %u "
9678c2ecf20Sopenharmony_ci		     "to recover dead node %u\n", dlm->name, ret, send_to,
9688c2ecf20Sopenharmony_ci		     dead_node);
9698c2ecf20Sopenharmony_ci		if (!dlm_is_host_down(ret)) {
9708c2ecf20Sopenharmony_ci			BUG();
9718c2ecf20Sopenharmony_ci		}
9728c2ecf20Sopenharmony_ci	} else
9738c2ecf20Sopenharmony_ci		ret = tmpret;
9748c2ecf20Sopenharmony_ci	return ret;
9758c2ecf20Sopenharmony_ci}
9768c2ecf20Sopenharmony_ci
9778c2ecf20Sopenharmony_ci
9788c2ecf20Sopenharmony_ciint dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
9798c2ecf20Sopenharmony_ci			       void **ret_data)
9808c2ecf20Sopenharmony_ci{
9818c2ecf20Sopenharmony_ci	struct dlm_ctxt *dlm = data;
9828c2ecf20Sopenharmony_ci	struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf;
9838c2ecf20Sopenharmony_ci	struct dlm_reco_node_data *ndata = NULL;
9848c2ecf20Sopenharmony_ci	int ret = -EINVAL;
9858c2ecf20Sopenharmony_ci
9868c2ecf20Sopenharmony_ci	if (!dlm_grab(dlm))
9878c2ecf20Sopenharmony_ci		return -EINVAL;
9888c2ecf20Sopenharmony_ci
9898c2ecf20Sopenharmony_ci	mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, "
9908c2ecf20Sopenharmony_ci	     "node_idx=%u, this node=%u\n", done->dead_node,
9918c2ecf20Sopenharmony_ci	     dlm->reco.dead_node, done->node_idx, dlm->node_num);
9928c2ecf20Sopenharmony_ci
9938c2ecf20Sopenharmony_ci	mlog_bug_on_msg((done->dead_node != dlm->reco.dead_node),
9948c2ecf20Sopenharmony_ci			"Got DATA DONE: dead_node=%u, reco.dead_node=%u, "
9958c2ecf20Sopenharmony_ci			"node_idx=%u, this node=%u\n", done->dead_node,
9968c2ecf20Sopenharmony_ci			dlm->reco.dead_node, done->node_idx, dlm->node_num);
9978c2ecf20Sopenharmony_ci
9988c2ecf20Sopenharmony_ci	spin_lock(&dlm_reco_state_lock);
9998c2ecf20Sopenharmony_ci	list_for_each_entry(ndata, &dlm->reco.node_data, list) {
10008c2ecf20Sopenharmony_ci		if (ndata->node_num != done->node_idx)
10018c2ecf20Sopenharmony_ci			continue;
10028c2ecf20Sopenharmony_ci
10038c2ecf20Sopenharmony_ci		switch (ndata->state) {
10048c2ecf20Sopenharmony_ci			/* should have moved beyond INIT but not to FINALIZE yet */
10058c2ecf20Sopenharmony_ci			case DLM_RECO_NODE_DATA_INIT:
10068c2ecf20Sopenharmony_ci			case DLM_RECO_NODE_DATA_DEAD:
10078c2ecf20Sopenharmony_ci			case DLM_RECO_NODE_DATA_FINALIZE_SENT:
10088c2ecf20Sopenharmony_ci				mlog(ML_ERROR, "bad ndata state for node %u:"
10098c2ecf20Sopenharmony_ci				     " state=%d\n", ndata->node_num,
10108c2ecf20Sopenharmony_ci				     ndata->state);
10118c2ecf20Sopenharmony_ci				BUG();
10128c2ecf20Sopenharmony_ci				break;
10138c2ecf20Sopenharmony_ci			/* these states are possible at this point, anywhere along
10148c2ecf20Sopenharmony_ci			 * the line of recovery */
10158c2ecf20Sopenharmony_ci			case DLM_RECO_NODE_DATA_DONE:
10168c2ecf20Sopenharmony_ci			case DLM_RECO_NODE_DATA_RECEIVING:
10178c2ecf20Sopenharmony_ci			case DLM_RECO_NODE_DATA_REQUESTED:
10188c2ecf20Sopenharmony_ci			case DLM_RECO_NODE_DATA_REQUESTING:
10198c2ecf20Sopenharmony_ci				mlog(0, "node %u is DONE sending "
10208c2ecf20Sopenharmony_ci					  "recovery data!\n",
10218c2ecf20Sopenharmony_ci					  ndata->node_num);
10228c2ecf20Sopenharmony_ci
10238c2ecf20Sopenharmony_ci				ndata->state = DLM_RECO_NODE_DATA_DONE;
10248c2ecf20Sopenharmony_ci				ret = 0;
10258c2ecf20Sopenharmony_ci				break;
10268c2ecf20Sopenharmony_ci		}
10278c2ecf20Sopenharmony_ci	}
10288c2ecf20Sopenharmony_ci	spin_unlock(&dlm_reco_state_lock);
10298c2ecf20Sopenharmony_ci
10308c2ecf20Sopenharmony_ci	/* wake the recovery thread, some node is done */
10318c2ecf20Sopenharmony_ci	if (!ret)
10328c2ecf20Sopenharmony_ci		dlm_kick_recovery_thread(dlm);
10338c2ecf20Sopenharmony_ci
10348c2ecf20Sopenharmony_ci	if (ret < 0)
10358c2ecf20Sopenharmony_ci		mlog(ML_ERROR, "failed to find recovery node data for node "
10368c2ecf20Sopenharmony_ci		     "%u\n", done->node_idx);
10378c2ecf20Sopenharmony_ci	dlm_put(dlm);
10388c2ecf20Sopenharmony_ci
10398c2ecf20Sopenharmony_ci	mlog(0, "leaving reco data done handler, ret=%d\n", ret);
10408c2ecf20Sopenharmony_ci	return ret;
10418c2ecf20Sopenharmony_ci}
10428c2ecf20Sopenharmony_ci
10438c2ecf20Sopenharmony_cistatic void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
10448c2ecf20Sopenharmony_ci					struct list_head *list,
10458c2ecf20Sopenharmony_ci				       	u8 dead_node)
10468c2ecf20Sopenharmony_ci{
10478c2ecf20Sopenharmony_ci	struct dlm_lock_resource *res, *next;
10488c2ecf20Sopenharmony_ci	struct dlm_lock *lock;
10498c2ecf20Sopenharmony_ci
10508c2ecf20Sopenharmony_ci	spin_lock(&dlm->spinlock);
10518c2ecf20Sopenharmony_ci	list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
10528c2ecf20Sopenharmony_ci		/* always prune any $RECOVERY entries for dead nodes,
10538c2ecf20Sopenharmony_ci		 * otherwise hangs can occur during later recovery */
10548c2ecf20Sopenharmony_ci		if (dlm_is_recovery_lock(res->lockname.name,
10558c2ecf20Sopenharmony_ci					 res->lockname.len)) {
10568c2ecf20Sopenharmony_ci			spin_lock(&res->spinlock);
10578c2ecf20Sopenharmony_ci			list_for_each_entry(lock, &res->granted, list) {
10588c2ecf20Sopenharmony_ci				if (lock->ml.node == dead_node) {
10598c2ecf20Sopenharmony_ci					mlog(0, "AHA! there was "
10608c2ecf20Sopenharmony_ci					     "a $RECOVERY lock for dead "
10618c2ecf20Sopenharmony_ci					     "node %u (%s)!\n",
10628c2ecf20Sopenharmony_ci					     dead_node, dlm->name);
10638c2ecf20Sopenharmony_ci					list_del_init(&lock->list);
10648c2ecf20Sopenharmony_ci					dlm_lock_put(lock);
10658c2ecf20Sopenharmony_ci					/* Can't schedule DLM_UNLOCK_FREE_LOCK
10668c2ecf20Sopenharmony_ci					 * - do manually */
10678c2ecf20Sopenharmony_ci					dlm_lock_put(lock);
10688c2ecf20Sopenharmony_ci					break;
10698c2ecf20Sopenharmony_ci				}
10708c2ecf20Sopenharmony_ci			}
10718c2ecf20Sopenharmony_ci			spin_unlock(&res->spinlock);
10728c2ecf20Sopenharmony_ci			continue;
10738c2ecf20Sopenharmony_ci		}
10748c2ecf20Sopenharmony_ci
10758c2ecf20Sopenharmony_ci		if (res->owner == dead_node) {
10768c2ecf20Sopenharmony_ci			mlog(0, "found lockres owned by dead node while "
10778c2ecf20Sopenharmony_ci				  "doing recovery for node %u. sending it.\n",
10788c2ecf20Sopenharmony_ci				  dead_node);
10798c2ecf20Sopenharmony_ci			list_move_tail(&res->recovering, list);
10808c2ecf20Sopenharmony_ci		} else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
10818c2ecf20Sopenharmony_ci			mlog(0, "found UNKNOWN owner while doing recovery "
10828c2ecf20Sopenharmony_ci				  "for node %u. sending it.\n", dead_node);
10838c2ecf20Sopenharmony_ci			list_move_tail(&res->recovering, list);
10848c2ecf20Sopenharmony_ci		}
10858c2ecf20Sopenharmony_ci	}
10868c2ecf20Sopenharmony_ci	spin_unlock(&dlm->spinlock);
10878c2ecf20Sopenharmony_ci}
10888c2ecf20Sopenharmony_ci
10898c2ecf20Sopenharmony_cistatic inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res)
10908c2ecf20Sopenharmony_ci{
10918c2ecf20Sopenharmony_ci	int total_locks = 0;
10928c2ecf20Sopenharmony_ci	struct list_head *iter, *queue = &res->granted;
10938c2ecf20Sopenharmony_ci	int i;
10948c2ecf20Sopenharmony_ci
10958c2ecf20Sopenharmony_ci	for (i=0; i<3; i++) {
10968c2ecf20Sopenharmony_ci		list_for_each(iter, queue)
10978c2ecf20Sopenharmony_ci			total_locks++;
10988c2ecf20Sopenharmony_ci		queue++;
10998c2ecf20Sopenharmony_ci	}
11008c2ecf20Sopenharmony_ci	return total_locks;
11018c2ecf20Sopenharmony_ci}
11028c2ecf20Sopenharmony_ci
11038c2ecf20Sopenharmony_ci
11048c2ecf20Sopenharmony_cistatic int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
11058c2ecf20Sopenharmony_ci				      struct dlm_migratable_lockres *mres,
11068c2ecf20Sopenharmony_ci				      u8 send_to,
11078c2ecf20Sopenharmony_ci				      struct dlm_lock_resource *res,
11088c2ecf20Sopenharmony_ci				      int total_locks)
11098c2ecf20Sopenharmony_ci{
11108c2ecf20Sopenharmony_ci	u64 mig_cookie = be64_to_cpu(mres->mig_cookie);
11118c2ecf20Sopenharmony_ci	int mres_total_locks = be32_to_cpu(mres->total_locks);
11128c2ecf20Sopenharmony_ci	int ret = 0, status = 0;
11138c2ecf20Sopenharmony_ci	u8 orig_flags = mres->flags,
11148c2ecf20Sopenharmony_ci	   orig_master = mres->master;
11158c2ecf20Sopenharmony_ci
11168c2ecf20Sopenharmony_ci	BUG_ON(mres->num_locks > DLM_MAX_MIGRATABLE_LOCKS);
11178c2ecf20Sopenharmony_ci	if (!mres->num_locks)
11188c2ecf20Sopenharmony_ci		return 0;
11198c2ecf20Sopenharmony_ci
11208c2ecf20Sopenharmony_ci	/* add an all-done flag if we reached the last lock */
11218c2ecf20Sopenharmony_ci	orig_flags = mres->flags;
11228c2ecf20Sopenharmony_ci	BUG_ON(total_locks > mres_total_locks);
11238c2ecf20Sopenharmony_ci	if (total_locks == mres_total_locks)
11248c2ecf20Sopenharmony_ci		mres->flags |= DLM_MRES_ALL_DONE;
11258c2ecf20Sopenharmony_ci
11268c2ecf20Sopenharmony_ci	mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n",
11278c2ecf20Sopenharmony_ci	     dlm->name, res->lockname.len, res->lockname.name,
11288c2ecf20Sopenharmony_ci	     orig_flags & DLM_MRES_MIGRATION ? "migration" : "recovery",
11298c2ecf20Sopenharmony_ci	     send_to);
11308c2ecf20Sopenharmony_ci
11318c2ecf20Sopenharmony_ci	/* send it */
11328c2ecf20Sopenharmony_ci	ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres,
11338c2ecf20Sopenharmony_ci				 struct_size(mres, ml, mres->num_locks),
11348c2ecf20Sopenharmony_ci				 send_to, &status);
11358c2ecf20Sopenharmony_ci	if (ret < 0) {
11368c2ecf20Sopenharmony_ci		/* XXX: negative status is not handled.
11378c2ecf20Sopenharmony_ci		 * this will end up killing this node. */
11388c2ecf20Sopenharmony_ci		mlog(ML_ERROR, "%s: res %.*s, Error %d send MIG_LOCKRES to "
11398c2ecf20Sopenharmony_ci		     "node %u (%s)\n", dlm->name, mres->lockname_len,
11408c2ecf20Sopenharmony_ci		     mres->lockname, ret, send_to,
11418c2ecf20Sopenharmony_ci		     (orig_flags & DLM_MRES_MIGRATION ?
11428c2ecf20Sopenharmony_ci		      "migration" : "recovery"));
11438c2ecf20Sopenharmony_ci	} else {
11448c2ecf20Sopenharmony_ci		/* might get an -ENOMEM back here */
11458c2ecf20Sopenharmony_ci		ret = status;
11468c2ecf20Sopenharmony_ci		if (ret < 0) {
11478c2ecf20Sopenharmony_ci			mlog_errno(ret);
11488c2ecf20Sopenharmony_ci
11498c2ecf20Sopenharmony_ci			if (ret == -EFAULT) {
11508c2ecf20Sopenharmony_ci				mlog(ML_ERROR, "node %u told me to kill "
11518c2ecf20Sopenharmony_ci				     "myself!\n", send_to);
11528c2ecf20Sopenharmony_ci				BUG();
11538c2ecf20Sopenharmony_ci			}
11548c2ecf20Sopenharmony_ci		}
11558c2ecf20Sopenharmony_ci	}
11568c2ecf20Sopenharmony_ci
11578c2ecf20Sopenharmony_ci	/* zero and reinit the message buffer */
11588c2ecf20Sopenharmony_ci	dlm_init_migratable_lockres(mres, res->lockname.name,
11598c2ecf20Sopenharmony_ci				    res->lockname.len, mres_total_locks,
11608c2ecf20Sopenharmony_ci				    mig_cookie, orig_flags, orig_master);
11618c2ecf20Sopenharmony_ci	return ret;
11628c2ecf20Sopenharmony_ci}
11638c2ecf20Sopenharmony_ci
11648c2ecf20Sopenharmony_cistatic void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
11658c2ecf20Sopenharmony_ci					const char *lockname, int namelen,
11668c2ecf20Sopenharmony_ci					int total_locks, u64 cookie,
11678c2ecf20Sopenharmony_ci					u8 flags, u8 master)
11688c2ecf20Sopenharmony_ci{
11698c2ecf20Sopenharmony_ci	/* mres here is one full page */
11708c2ecf20Sopenharmony_ci	clear_page(mres);
11718c2ecf20Sopenharmony_ci	mres->lockname_len = namelen;
11728c2ecf20Sopenharmony_ci	memcpy(mres->lockname, lockname, namelen);
11738c2ecf20Sopenharmony_ci	mres->num_locks = 0;
11748c2ecf20Sopenharmony_ci	mres->total_locks = cpu_to_be32(total_locks);
11758c2ecf20Sopenharmony_ci	mres->mig_cookie = cpu_to_be64(cookie);
11768c2ecf20Sopenharmony_ci	mres->flags = flags;
11778c2ecf20Sopenharmony_ci	mres->master = master;
11788c2ecf20Sopenharmony_ci}
11798c2ecf20Sopenharmony_ci
11808c2ecf20Sopenharmony_cistatic void dlm_prepare_lvb_for_migration(struct dlm_lock *lock,
11818c2ecf20Sopenharmony_ci					  struct dlm_migratable_lockres *mres,
11828c2ecf20Sopenharmony_ci					  int queue)
11838c2ecf20Sopenharmony_ci{
11848c2ecf20Sopenharmony_ci	if (!lock->lksb)
11858c2ecf20Sopenharmony_ci	       return;
11868c2ecf20Sopenharmony_ci
11878c2ecf20Sopenharmony_ci	/* Ignore lvb in all locks in the blocked list */
11888c2ecf20Sopenharmony_ci	if (queue == DLM_BLOCKED_LIST)
11898c2ecf20Sopenharmony_ci		return;
11908c2ecf20Sopenharmony_ci
11918c2ecf20Sopenharmony_ci	/* Only consider lvbs in locks with granted EX or PR lock levels */
11928c2ecf20Sopenharmony_ci	if (lock->ml.type != LKM_EXMODE && lock->ml.type != LKM_PRMODE)
11938c2ecf20Sopenharmony_ci		return;
11948c2ecf20Sopenharmony_ci
11958c2ecf20Sopenharmony_ci	if (dlm_lvb_is_empty(mres->lvb)) {
11968c2ecf20Sopenharmony_ci		memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN);
11978c2ecf20Sopenharmony_ci		return;
11988c2ecf20Sopenharmony_ci	}
11998c2ecf20Sopenharmony_ci
12008c2ecf20Sopenharmony_ci	/* Ensure the lvb copied for migration matches in other valid locks */
12018c2ecf20Sopenharmony_ci	if (!memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))
12028c2ecf20Sopenharmony_ci		return;
12038c2ecf20Sopenharmony_ci
12048c2ecf20Sopenharmony_ci	mlog(ML_ERROR, "Mismatched lvb in lock cookie=%u:%llu, name=%.*s, "
12058c2ecf20Sopenharmony_ci	     "node=%u\n",
12068c2ecf20Sopenharmony_ci	     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
12078c2ecf20Sopenharmony_ci	     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
12088c2ecf20Sopenharmony_ci	     lock->lockres->lockname.len, lock->lockres->lockname.name,
12098c2ecf20Sopenharmony_ci	     lock->ml.node);
12108c2ecf20Sopenharmony_ci	dlm_print_one_lock_resource(lock->lockres);
12118c2ecf20Sopenharmony_ci	BUG();
12128c2ecf20Sopenharmony_ci}
12138c2ecf20Sopenharmony_ci
12148c2ecf20Sopenharmony_ci/* returns 1 if this lock fills the network structure,
12158c2ecf20Sopenharmony_ci * 0 otherwise */
12168c2ecf20Sopenharmony_cistatic int dlm_add_lock_to_array(struct dlm_lock *lock,
12178c2ecf20Sopenharmony_ci				 struct dlm_migratable_lockres *mres, int queue)
12188c2ecf20Sopenharmony_ci{
12198c2ecf20Sopenharmony_ci	struct dlm_migratable_lock *ml;
12208c2ecf20Sopenharmony_ci	int lock_num = mres->num_locks;
12218c2ecf20Sopenharmony_ci
12228c2ecf20Sopenharmony_ci	ml = &(mres->ml[lock_num]);
12238c2ecf20Sopenharmony_ci	ml->cookie = lock->ml.cookie;
12248c2ecf20Sopenharmony_ci	ml->type = lock->ml.type;
12258c2ecf20Sopenharmony_ci	ml->convert_type = lock->ml.convert_type;
12268c2ecf20Sopenharmony_ci	ml->highest_blocked = lock->ml.highest_blocked;
12278c2ecf20Sopenharmony_ci	ml->list = queue;
12288c2ecf20Sopenharmony_ci	if (lock->lksb) {
12298c2ecf20Sopenharmony_ci		ml->flags = lock->lksb->flags;
12308c2ecf20Sopenharmony_ci		dlm_prepare_lvb_for_migration(lock, mres, queue);
12318c2ecf20Sopenharmony_ci	}
12328c2ecf20Sopenharmony_ci	ml->node = lock->ml.node;
12338c2ecf20Sopenharmony_ci	mres->num_locks++;
12348c2ecf20Sopenharmony_ci	/* we reached the max, send this network message */
12358c2ecf20Sopenharmony_ci	if (mres->num_locks == DLM_MAX_MIGRATABLE_LOCKS)
12368c2ecf20Sopenharmony_ci		return 1;
12378c2ecf20Sopenharmony_ci	return 0;
12388c2ecf20Sopenharmony_ci}
12398c2ecf20Sopenharmony_ci
12408c2ecf20Sopenharmony_cistatic void dlm_add_dummy_lock(struct dlm_ctxt *dlm,
12418c2ecf20Sopenharmony_ci			       struct dlm_migratable_lockres *mres)
12428c2ecf20Sopenharmony_ci{
12438c2ecf20Sopenharmony_ci	struct dlm_lock dummy;
12448c2ecf20Sopenharmony_ci	memset(&dummy, 0, sizeof(dummy));
12458c2ecf20Sopenharmony_ci	dummy.ml.cookie = 0;
12468c2ecf20Sopenharmony_ci	dummy.ml.type = LKM_IVMODE;
12478c2ecf20Sopenharmony_ci	dummy.ml.convert_type = LKM_IVMODE;
12488c2ecf20Sopenharmony_ci	dummy.ml.highest_blocked = LKM_IVMODE;
12498c2ecf20Sopenharmony_ci	dummy.lksb = NULL;
12508c2ecf20Sopenharmony_ci	dummy.ml.node = dlm->node_num;
12518c2ecf20Sopenharmony_ci	dlm_add_lock_to_array(&dummy, mres, DLM_BLOCKED_LIST);
12528c2ecf20Sopenharmony_ci}
12538c2ecf20Sopenharmony_ci
12548c2ecf20Sopenharmony_cistatic inline int dlm_is_dummy_lock(struct dlm_ctxt *dlm,
12558c2ecf20Sopenharmony_ci				    struct dlm_migratable_lock *ml,
12568c2ecf20Sopenharmony_ci				    u8 *nodenum)
12578c2ecf20Sopenharmony_ci{
12588c2ecf20Sopenharmony_ci	if (unlikely(ml->cookie == 0 &&
12598c2ecf20Sopenharmony_ci	    ml->type == LKM_IVMODE &&
12608c2ecf20Sopenharmony_ci	    ml->convert_type == LKM_IVMODE &&
12618c2ecf20Sopenharmony_ci	    ml->highest_blocked == LKM_IVMODE &&
12628c2ecf20Sopenharmony_ci	    ml->list == DLM_BLOCKED_LIST)) {
12638c2ecf20Sopenharmony_ci		*nodenum = ml->node;
12648c2ecf20Sopenharmony_ci		return 1;
12658c2ecf20Sopenharmony_ci	}
12668c2ecf20Sopenharmony_ci	return 0;
12678c2ecf20Sopenharmony_ci}
12688c2ecf20Sopenharmony_ci
12698c2ecf20Sopenharmony_ciint dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
12708c2ecf20Sopenharmony_ci			 struct dlm_migratable_lockres *mres,
12718c2ecf20Sopenharmony_ci			 u8 send_to, u8 flags)
12728c2ecf20Sopenharmony_ci{
12738c2ecf20Sopenharmony_ci	struct list_head *queue;
12748c2ecf20Sopenharmony_ci	int total_locks, i;
12758c2ecf20Sopenharmony_ci	u64 mig_cookie = 0;
12768c2ecf20Sopenharmony_ci	struct dlm_lock *lock;
12778c2ecf20Sopenharmony_ci	int ret = 0;
12788c2ecf20Sopenharmony_ci
12798c2ecf20Sopenharmony_ci	BUG_ON(!(flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION)));
12808c2ecf20Sopenharmony_ci
12818c2ecf20Sopenharmony_ci	mlog(0, "sending to %u\n", send_to);
12828c2ecf20Sopenharmony_ci
12838c2ecf20Sopenharmony_ci	total_locks = dlm_num_locks_in_lockres(res);
12848c2ecf20Sopenharmony_ci	if (total_locks > DLM_MAX_MIGRATABLE_LOCKS) {
12858c2ecf20Sopenharmony_ci		/* rare, but possible */
12868c2ecf20Sopenharmony_ci		mlog(0, "argh.  lockres has %d locks.  this will "
12878c2ecf20Sopenharmony_ci			  "require more than one network packet to "
12888c2ecf20Sopenharmony_ci			  "migrate\n", total_locks);
12898c2ecf20Sopenharmony_ci		mig_cookie = dlm_get_next_mig_cookie();
12908c2ecf20Sopenharmony_ci	}
12918c2ecf20Sopenharmony_ci
12928c2ecf20Sopenharmony_ci	dlm_init_migratable_lockres(mres, res->lockname.name,
12938c2ecf20Sopenharmony_ci				    res->lockname.len, total_locks,
12948c2ecf20Sopenharmony_ci				    mig_cookie, flags, res->owner);
12958c2ecf20Sopenharmony_ci
12968c2ecf20Sopenharmony_ci	total_locks = 0;
12978c2ecf20Sopenharmony_ci	for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) {
12988c2ecf20Sopenharmony_ci		queue = dlm_list_idx_to_ptr(res, i);
12998c2ecf20Sopenharmony_ci		list_for_each_entry(lock, queue, list) {
13008c2ecf20Sopenharmony_ci			/* add another lock. */
13018c2ecf20Sopenharmony_ci			total_locks++;
13028c2ecf20Sopenharmony_ci			if (!dlm_add_lock_to_array(lock, mres, i))
13038c2ecf20Sopenharmony_ci				continue;
13048c2ecf20Sopenharmony_ci
13058c2ecf20Sopenharmony_ci			/* this filled the lock message,
13068c2ecf20Sopenharmony_ci			 * we must send it immediately. */
13078c2ecf20Sopenharmony_ci			ret = dlm_send_mig_lockres_msg(dlm, mres, send_to,
13088c2ecf20Sopenharmony_ci						       res, total_locks);
13098c2ecf20Sopenharmony_ci			if (ret < 0)
13108c2ecf20Sopenharmony_ci				goto error;
13118c2ecf20Sopenharmony_ci		}
13128c2ecf20Sopenharmony_ci	}
13138c2ecf20Sopenharmony_ci	if (total_locks == 0) {
13148c2ecf20Sopenharmony_ci		/* send a dummy lock to indicate a mastery reference only */
13158c2ecf20Sopenharmony_ci		mlog(0, "%s:%.*s: sending dummy lock to %u, %s\n",
13168c2ecf20Sopenharmony_ci		     dlm->name, res->lockname.len, res->lockname.name,
13178c2ecf20Sopenharmony_ci		     send_to, flags & DLM_MRES_RECOVERY ? "recovery" :
13188c2ecf20Sopenharmony_ci		     "migration");
13198c2ecf20Sopenharmony_ci		dlm_add_dummy_lock(dlm, mres);
13208c2ecf20Sopenharmony_ci	}
13218c2ecf20Sopenharmony_ci	/* flush any remaining locks */
13228c2ecf20Sopenharmony_ci	ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks);
13238c2ecf20Sopenharmony_ci	if (ret < 0)
13248c2ecf20Sopenharmony_ci		goto error;
13258c2ecf20Sopenharmony_ci	return ret;
13268c2ecf20Sopenharmony_ci
13278c2ecf20Sopenharmony_cierror:
13288c2ecf20Sopenharmony_ci	mlog(ML_ERROR, "%s: dlm_send_mig_lockres_msg returned %d\n",
13298c2ecf20Sopenharmony_ci	     dlm->name, ret);
13308c2ecf20Sopenharmony_ci	if (!dlm_is_host_down(ret))
13318c2ecf20Sopenharmony_ci		BUG();
13328c2ecf20Sopenharmony_ci	mlog(0, "%s: node %u went down while sending %s "
13338c2ecf20Sopenharmony_ci	     "lockres %.*s\n", dlm->name, send_to,
13348c2ecf20Sopenharmony_ci	     flags & DLM_MRES_RECOVERY ?  "recovery" : "migration",
13358c2ecf20Sopenharmony_ci	     res->lockname.len, res->lockname.name);
13368c2ecf20Sopenharmony_ci	return ret;
13378c2ecf20Sopenharmony_ci}
13388c2ecf20Sopenharmony_ci
13398c2ecf20Sopenharmony_ci
13408c2ecf20Sopenharmony_ci
13418c2ecf20Sopenharmony_ci/*
13428c2ecf20Sopenharmony_ci * this message will contain no more than one page worth of
13438c2ecf20Sopenharmony_ci * recovery data, and it will work on only one lockres.
13448c2ecf20Sopenharmony_ci * there may be many locks in this page, and we may need to wait
13458c2ecf20Sopenharmony_ci * for additional packets to complete all the locks (rare, but
13468c2ecf20Sopenharmony_ci * possible).
13478c2ecf20Sopenharmony_ci */
13488c2ecf20Sopenharmony_ci/*
13498c2ecf20Sopenharmony_ci * NOTE: the allocation error cases here are scary
13508c2ecf20Sopenharmony_ci * we really cannot afford to fail an alloc in recovery
13518c2ecf20Sopenharmony_ci * do we spin?  returning an error only delays the problem really
13528c2ecf20Sopenharmony_ci */
13538c2ecf20Sopenharmony_ci
13548c2ecf20Sopenharmony_ciint dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
13558c2ecf20Sopenharmony_ci			    void **ret_data)
13568c2ecf20Sopenharmony_ci{
13578c2ecf20Sopenharmony_ci	struct dlm_ctxt *dlm = data;
13588c2ecf20Sopenharmony_ci	struct dlm_migratable_lockres *mres =
13598c2ecf20Sopenharmony_ci		(struct dlm_migratable_lockres *)msg->buf;
13608c2ecf20Sopenharmony_ci	int ret = 0;
13618c2ecf20Sopenharmony_ci	u8 real_master;
13628c2ecf20Sopenharmony_ci	u8 extra_refs = 0;
13638c2ecf20Sopenharmony_ci	char *buf = NULL;
13648c2ecf20Sopenharmony_ci	struct dlm_work_item *item = NULL;
13658c2ecf20Sopenharmony_ci	struct dlm_lock_resource *res = NULL;
13668c2ecf20Sopenharmony_ci	unsigned int hash;
13678c2ecf20Sopenharmony_ci
13688c2ecf20Sopenharmony_ci	if (!dlm_grab(dlm))
13698c2ecf20Sopenharmony_ci		return -EINVAL;
13708c2ecf20Sopenharmony_ci
13718c2ecf20Sopenharmony_ci	if (!dlm_joined(dlm)) {
13728c2ecf20Sopenharmony_ci		mlog(ML_ERROR, "Domain %s not joined! "
13738c2ecf20Sopenharmony_ci			  "lockres %.*s, master %u\n",
13748c2ecf20Sopenharmony_ci			  dlm->name, mres->lockname_len,
13758c2ecf20Sopenharmony_ci			  mres->lockname, mres->master);
13768c2ecf20Sopenharmony_ci		dlm_put(dlm);
13778c2ecf20Sopenharmony_ci		return -EINVAL;
13788c2ecf20Sopenharmony_ci	}
13798c2ecf20Sopenharmony_ci
13808c2ecf20Sopenharmony_ci	BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION)));
13818c2ecf20Sopenharmony_ci
13828c2ecf20Sopenharmony_ci	real_master = mres->master;
13838c2ecf20Sopenharmony_ci	if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
13848c2ecf20Sopenharmony_ci		/* cannot migrate a lockres with no master */
13858c2ecf20Sopenharmony_ci		BUG_ON(!(mres->flags & DLM_MRES_RECOVERY));
13868c2ecf20Sopenharmony_ci	}
13878c2ecf20Sopenharmony_ci
13888c2ecf20Sopenharmony_ci	mlog(0, "%s message received from node %u\n",
13898c2ecf20Sopenharmony_ci		  (mres->flags & DLM_MRES_RECOVERY) ?
13908c2ecf20Sopenharmony_ci		  "recovery" : "migration", mres->master);
13918c2ecf20Sopenharmony_ci	if (mres->flags & DLM_MRES_ALL_DONE)
13928c2ecf20Sopenharmony_ci		mlog(0, "all done flag.  all lockres data received!\n");
13938c2ecf20Sopenharmony_ci
13948c2ecf20Sopenharmony_ci	ret = -ENOMEM;
13958c2ecf20Sopenharmony_ci	buf = kmalloc(be16_to_cpu(msg->data_len), GFP_NOFS);
13968c2ecf20Sopenharmony_ci	item = kzalloc(sizeof(*item), GFP_NOFS);
13978c2ecf20Sopenharmony_ci	if (!buf || !item)
13988c2ecf20Sopenharmony_ci		goto leave;
13998c2ecf20Sopenharmony_ci
14008c2ecf20Sopenharmony_ci	/* lookup the lock to see if we have a secondary queue for this
14018c2ecf20Sopenharmony_ci	 * already...  just add the locks in and this will have its owner
14028c2ecf20Sopenharmony_ci	 * and RECOVERY flag changed when it completes. */
14038c2ecf20Sopenharmony_ci	hash = dlm_lockid_hash(mres->lockname, mres->lockname_len);
14048c2ecf20Sopenharmony_ci	spin_lock(&dlm->spinlock);
14058c2ecf20Sopenharmony_ci	res = __dlm_lookup_lockres_full(dlm, mres->lockname, mres->lockname_len,
14068c2ecf20Sopenharmony_ci			hash);
14078c2ecf20Sopenharmony_ci	if (res) {
14088c2ecf20Sopenharmony_ci	 	/* this will get a ref on res */
14098c2ecf20Sopenharmony_ci		/* mark it as recovering/migrating and hash it */
14108c2ecf20Sopenharmony_ci		spin_lock(&res->spinlock);
14118c2ecf20Sopenharmony_ci		if (res->state & DLM_LOCK_RES_DROPPING_REF) {
14128c2ecf20Sopenharmony_ci			mlog(0, "%s: node is attempting to migrate "
14138c2ecf20Sopenharmony_ci				"lockres %.*s, but marked as dropping "
14148c2ecf20Sopenharmony_ci				" ref!\n", dlm->name,
14158c2ecf20Sopenharmony_ci				mres->lockname_len, mres->lockname);
14168c2ecf20Sopenharmony_ci			ret = -EINVAL;
14178c2ecf20Sopenharmony_ci			spin_unlock(&res->spinlock);
14188c2ecf20Sopenharmony_ci			spin_unlock(&dlm->spinlock);
14198c2ecf20Sopenharmony_ci			dlm_lockres_put(res);
14208c2ecf20Sopenharmony_ci			goto leave;
14218c2ecf20Sopenharmony_ci		}
14228c2ecf20Sopenharmony_ci
14238c2ecf20Sopenharmony_ci		if (mres->flags & DLM_MRES_RECOVERY) {
14248c2ecf20Sopenharmony_ci			res->state |= DLM_LOCK_RES_RECOVERING;
14258c2ecf20Sopenharmony_ci		} else {
14268c2ecf20Sopenharmony_ci			if (res->state & DLM_LOCK_RES_MIGRATING) {
14278c2ecf20Sopenharmony_ci				/* this is at least the second
14288c2ecf20Sopenharmony_ci				 * lockres message */
14298c2ecf20Sopenharmony_ci				mlog(0, "lock %.*s is already migrating\n",
14308c2ecf20Sopenharmony_ci					  mres->lockname_len,
14318c2ecf20Sopenharmony_ci					  mres->lockname);
14328c2ecf20Sopenharmony_ci			} else if (res->state & DLM_LOCK_RES_RECOVERING) {
14338c2ecf20Sopenharmony_ci				/* caller should BUG */
14348c2ecf20Sopenharmony_ci				mlog(ML_ERROR, "node is attempting to migrate "
14358c2ecf20Sopenharmony_ci				     "lock %.*s, but marked as recovering!\n",
14368c2ecf20Sopenharmony_ci				     mres->lockname_len, mres->lockname);
14378c2ecf20Sopenharmony_ci				ret = -EFAULT;
14388c2ecf20Sopenharmony_ci				spin_unlock(&res->spinlock);
14398c2ecf20Sopenharmony_ci				spin_unlock(&dlm->spinlock);
14408c2ecf20Sopenharmony_ci				dlm_lockres_put(res);
14418c2ecf20Sopenharmony_ci				goto leave;
14428c2ecf20Sopenharmony_ci			}
14438c2ecf20Sopenharmony_ci			res->state |= DLM_LOCK_RES_MIGRATING;
14448c2ecf20Sopenharmony_ci		}
14458c2ecf20Sopenharmony_ci		spin_unlock(&res->spinlock);
14468c2ecf20Sopenharmony_ci		spin_unlock(&dlm->spinlock);
14478c2ecf20Sopenharmony_ci	} else {
14488c2ecf20Sopenharmony_ci		spin_unlock(&dlm->spinlock);
14498c2ecf20Sopenharmony_ci		/* need to allocate, just like if it was
14508c2ecf20Sopenharmony_ci		 * mastered here normally  */
14518c2ecf20Sopenharmony_ci		res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len);
14528c2ecf20Sopenharmony_ci		if (!res)
14538c2ecf20Sopenharmony_ci			goto leave;
14548c2ecf20Sopenharmony_ci
14558c2ecf20Sopenharmony_ci		/* to match the ref that we would have gotten if
14568c2ecf20Sopenharmony_ci		 * dlm_lookup_lockres had succeeded */
14578c2ecf20Sopenharmony_ci		dlm_lockres_get(res);
14588c2ecf20Sopenharmony_ci
14598c2ecf20Sopenharmony_ci		/* mark it as recovering/migrating and hash it */
14608c2ecf20Sopenharmony_ci		if (mres->flags & DLM_MRES_RECOVERY)
14618c2ecf20Sopenharmony_ci			res->state |= DLM_LOCK_RES_RECOVERING;
14628c2ecf20Sopenharmony_ci		else
14638c2ecf20Sopenharmony_ci			res->state |= DLM_LOCK_RES_MIGRATING;
14648c2ecf20Sopenharmony_ci
14658c2ecf20Sopenharmony_ci		spin_lock(&dlm->spinlock);
14668c2ecf20Sopenharmony_ci		__dlm_insert_lockres(dlm, res);
14678c2ecf20Sopenharmony_ci		spin_unlock(&dlm->spinlock);
14688c2ecf20Sopenharmony_ci
14698c2ecf20Sopenharmony_ci		/* Add an extra ref for this lock-less lockres lest the
14708c2ecf20Sopenharmony_ci		 * dlm_thread purges it before we get the chance to add
14718c2ecf20Sopenharmony_ci		 * locks to it */
14728c2ecf20Sopenharmony_ci		dlm_lockres_get(res);
14738c2ecf20Sopenharmony_ci
14748c2ecf20Sopenharmony_ci		/* There are three refs that need to be put.
14758c2ecf20Sopenharmony_ci		 * 1. Taken above.
14768c2ecf20Sopenharmony_ci		 * 2. kref_init in dlm_new_lockres()->dlm_init_lockres().
14778c2ecf20Sopenharmony_ci		 * 3. dlm_lookup_lockres()
14788c2ecf20Sopenharmony_ci		 * The first one is handled at the end of this function. The
14798c2ecf20Sopenharmony_ci		 * other two are handled in the worker thread after locks have
14808c2ecf20Sopenharmony_ci		 * been attached. Yes, we don't wait for purge time to match
14818c2ecf20Sopenharmony_ci		 * kref_init. The lockres will still have atleast one ref
14828c2ecf20Sopenharmony_ci		 * added because it is in the hash __dlm_insert_lockres() */
14838c2ecf20Sopenharmony_ci		extra_refs++;
14848c2ecf20Sopenharmony_ci
14858c2ecf20Sopenharmony_ci		/* now that the new lockres is inserted,
14868c2ecf20Sopenharmony_ci		 * make it usable by other processes */
14878c2ecf20Sopenharmony_ci		spin_lock(&res->spinlock);
14888c2ecf20Sopenharmony_ci		res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
14898c2ecf20Sopenharmony_ci		spin_unlock(&res->spinlock);
14908c2ecf20Sopenharmony_ci		wake_up(&res->wq);
14918c2ecf20Sopenharmony_ci	}
14928c2ecf20Sopenharmony_ci
14938c2ecf20Sopenharmony_ci	/* at this point we have allocated everything we need,
14948c2ecf20Sopenharmony_ci	 * and we have a hashed lockres with an extra ref and
14958c2ecf20Sopenharmony_ci	 * the proper res->state flags. */
14968c2ecf20Sopenharmony_ci	ret = 0;
14978c2ecf20Sopenharmony_ci	spin_lock(&res->spinlock);
14988c2ecf20Sopenharmony_ci	/* drop this either when master requery finds a different master
14998c2ecf20Sopenharmony_ci	 * or when a lock is added by the recovery worker */
15008c2ecf20Sopenharmony_ci	dlm_lockres_grab_inflight_ref(dlm, res);
15018c2ecf20Sopenharmony_ci	if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) {
15028c2ecf20Sopenharmony_ci		/* migration cannot have an unknown master */
15038c2ecf20Sopenharmony_ci		BUG_ON(!(mres->flags & DLM_MRES_RECOVERY));
15048c2ecf20Sopenharmony_ci		mlog(0, "recovery has passed me a lockres with an "
15058c2ecf20Sopenharmony_ci			  "unknown owner.. will need to requery: "
15068c2ecf20Sopenharmony_ci			  "%.*s\n", mres->lockname_len, mres->lockname);
15078c2ecf20Sopenharmony_ci	} else {
15088c2ecf20Sopenharmony_ci		/* take a reference now to pin the lockres, drop it
15098c2ecf20Sopenharmony_ci		 * when locks are added in the worker */
15108c2ecf20Sopenharmony_ci		dlm_change_lockres_owner(dlm, res, dlm->node_num);
15118c2ecf20Sopenharmony_ci	}
15128c2ecf20Sopenharmony_ci	spin_unlock(&res->spinlock);
15138c2ecf20Sopenharmony_ci
15148c2ecf20Sopenharmony_ci	/* queue up work for dlm_mig_lockres_worker */
15158c2ecf20Sopenharmony_ci	dlm_grab(dlm);  /* get an extra ref for the work item */
15168c2ecf20Sopenharmony_ci	memcpy(buf, msg->buf, be16_to_cpu(msg->data_len));  /* copy the whole message */
15178c2ecf20Sopenharmony_ci	dlm_init_work_item(dlm, item, dlm_mig_lockres_worker, buf);
15188c2ecf20Sopenharmony_ci	item->u.ml.lockres = res; /* already have a ref */
15198c2ecf20Sopenharmony_ci	item->u.ml.real_master = real_master;
15208c2ecf20Sopenharmony_ci	item->u.ml.extra_ref = extra_refs;
15218c2ecf20Sopenharmony_ci	spin_lock(&dlm->work_lock);
15228c2ecf20Sopenharmony_ci	list_add_tail(&item->list, &dlm->work_list);
15238c2ecf20Sopenharmony_ci	spin_unlock(&dlm->work_lock);
15248c2ecf20Sopenharmony_ci	queue_work(dlm->dlm_worker, &dlm->dispatched_work);
15258c2ecf20Sopenharmony_ci
15268c2ecf20Sopenharmony_cileave:
15278c2ecf20Sopenharmony_ci	/* One extra ref taken needs to be put here */
15288c2ecf20Sopenharmony_ci	if (extra_refs)
15298c2ecf20Sopenharmony_ci		dlm_lockres_put(res);
15308c2ecf20Sopenharmony_ci
15318c2ecf20Sopenharmony_ci	dlm_put(dlm);
15328c2ecf20Sopenharmony_ci	if (ret < 0) {
15338c2ecf20Sopenharmony_ci		kfree(buf);
15348c2ecf20Sopenharmony_ci		kfree(item);
15358c2ecf20Sopenharmony_ci		mlog_errno(ret);
15368c2ecf20Sopenharmony_ci	}
15378c2ecf20Sopenharmony_ci
15388c2ecf20Sopenharmony_ci	return ret;
15398c2ecf20Sopenharmony_ci}
15408c2ecf20Sopenharmony_ci
15418c2ecf20Sopenharmony_ci
15428c2ecf20Sopenharmony_cistatic void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data)
15438c2ecf20Sopenharmony_ci{
15448c2ecf20Sopenharmony_ci	struct dlm_ctxt *dlm;
15458c2ecf20Sopenharmony_ci	struct dlm_migratable_lockres *mres;
15468c2ecf20Sopenharmony_ci	int ret = 0;
15478c2ecf20Sopenharmony_ci	struct dlm_lock_resource *res;
15488c2ecf20Sopenharmony_ci	u8 real_master;
15498c2ecf20Sopenharmony_ci	u8 extra_ref;
15508c2ecf20Sopenharmony_ci
15518c2ecf20Sopenharmony_ci	dlm = item->dlm;
15528c2ecf20Sopenharmony_ci	mres = (struct dlm_migratable_lockres *)data;
15538c2ecf20Sopenharmony_ci
15548c2ecf20Sopenharmony_ci	res = item->u.ml.lockres;
15558c2ecf20Sopenharmony_ci	real_master = item->u.ml.real_master;
15568c2ecf20Sopenharmony_ci	extra_ref = item->u.ml.extra_ref;
15578c2ecf20Sopenharmony_ci
15588c2ecf20Sopenharmony_ci	if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
15598c2ecf20Sopenharmony_ci		/* this case is super-rare. only occurs if
15608c2ecf20Sopenharmony_ci		 * node death happens during migration. */
15618c2ecf20Sopenharmony_ciagain:
15628c2ecf20Sopenharmony_ci		ret = dlm_lockres_master_requery(dlm, res, &real_master);
15638c2ecf20Sopenharmony_ci		if (ret < 0) {
15648c2ecf20Sopenharmony_ci			mlog(0, "dlm_lockres_master_requery ret=%d\n",
15658c2ecf20Sopenharmony_ci				  ret);
15668c2ecf20Sopenharmony_ci			goto again;
15678c2ecf20Sopenharmony_ci		}
15688c2ecf20Sopenharmony_ci		if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
15698c2ecf20Sopenharmony_ci			mlog(0, "lockres %.*s not claimed.  "
15708c2ecf20Sopenharmony_ci				   "this node will take it.\n",
15718c2ecf20Sopenharmony_ci				   res->lockname.len, res->lockname.name);
15728c2ecf20Sopenharmony_ci		} else {
15738c2ecf20Sopenharmony_ci			spin_lock(&res->spinlock);
15748c2ecf20Sopenharmony_ci			dlm_lockres_drop_inflight_ref(dlm, res);
15758c2ecf20Sopenharmony_ci			spin_unlock(&res->spinlock);
15768c2ecf20Sopenharmony_ci			mlog(0, "master needs to respond to sender "
15778c2ecf20Sopenharmony_ci				  "that node %u still owns %.*s\n",
15788c2ecf20Sopenharmony_ci				  real_master, res->lockname.len,
15798c2ecf20Sopenharmony_ci				  res->lockname.name);
15808c2ecf20Sopenharmony_ci			/* cannot touch this lockres */
15818c2ecf20Sopenharmony_ci			goto leave;
15828c2ecf20Sopenharmony_ci		}
15838c2ecf20Sopenharmony_ci	}
15848c2ecf20Sopenharmony_ci
15858c2ecf20Sopenharmony_ci	ret = dlm_process_recovery_data(dlm, res, mres);
15868c2ecf20Sopenharmony_ci	if (ret < 0)
15878c2ecf20Sopenharmony_ci		mlog(0, "dlm_process_recovery_data returned  %d\n", ret);
15888c2ecf20Sopenharmony_ci	else
15898c2ecf20Sopenharmony_ci		mlog(0, "dlm_process_recovery_data succeeded\n");
15908c2ecf20Sopenharmony_ci
15918c2ecf20Sopenharmony_ci	if ((mres->flags & (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) ==
15928c2ecf20Sopenharmony_ci	                   (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) {
15938c2ecf20Sopenharmony_ci		ret = dlm_finish_migration(dlm, res, mres->master);
15948c2ecf20Sopenharmony_ci		if (ret < 0)
15958c2ecf20Sopenharmony_ci			mlog_errno(ret);
15968c2ecf20Sopenharmony_ci	}
15978c2ecf20Sopenharmony_ci
15988c2ecf20Sopenharmony_cileave:
15998c2ecf20Sopenharmony_ci	/* See comment in dlm_mig_lockres_handler() */
16008c2ecf20Sopenharmony_ci	if (res) {
16018c2ecf20Sopenharmony_ci		if (extra_ref)
16028c2ecf20Sopenharmony_ci			dlm_lockres_put(res);
16038c2ecf20Sopenharmony_ci		dlm_lockres_put(res);
16048c2ecf20Sopenharmony_ci	}
16058c2ecf20Sopenharmony_ci	kfree(data);
16068c2ecf20Sopenharmony_ci}
16078c2ecf20Sopenharmony_ci
16088c2ecf20Sopenharmony_ci
16098c2ecf20Sopenharmony_ci
16108c2ecf20Sopenharmony_cistatic int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
16118c2ecf20Sopenharmony_ci				      struct dlm_lock_resource *res,
16128c2ecf20Sopenharmony_ci				      u8 *real_master)
16138c2ecf20Sopenharmony_ci{
16148c2ecf20Sopenharmony_ci	struct dlm_node_iter iter;
16158c2ecf20Sopenharmony_ci	int nodenum;
16168c2ecf20Sopenharmony_ci	int ret = 0;
16178c2ecf20Sopenharmony_ci
16188c2ecf20Sopenharmony_ci	*real_master = DLM_LOCK_RES_OWNER_UNKNOWN;
16198c2ecf20Sopenharmony_ci
16208c2ecf20Sopenharmony_ci	/* we only reach here if one of the two nodes in a
16218c2ecf20Sopenharmony_ci	 * migration died while the migration was in progress.
16228c2ecf20Sopenharmony_ci	 * at this point we need to requery the master.  we
16238c2ecf20Sopenharmony_ci	 * know that the new_master got as far as creating
16248c2ecf20Sopenharmony_ci	 * an mle on at least one node, but we do not know
16258c2ecf20Sopenharmony_ci	 * if any nodes had actually cleared the mle and set
16268c2ecf20Sopenharmony_ci	 * the master to the new_master.  the old master
16278c2ecf20Sopenharmony_ci	 * is supposed to set the owner to UNKNOWN in the
16288c2ecf20Sopenharmony_ci	 * event of a new_master death, so the only possible
16298c2ecf20Sopenharmony_ci	 * responses that we can get from nodes here are
16308c2ecf20Sopenharmony_ci	 * that the master is new_master, or that the master
16318c2ecf20Sopenharmony_ci	 * is UNKNOWN.
16328c2ecf20Sopenharmony_ci	 * if all nodes come back with UNKNOWN then we know
16338c2ecf20Sopenharmony_ci	 * the lock needs remastering here.
16348c2ecf20Sopenharmony_ci	 * if any node comes back with a valid master, check
16358c2ecf20Sopenharmony_ci	 * to see if that master is the one that we are
16368c2ecf20Sopenharmony_ci	 * recovering.  if so, then the new_master died and
16378c2ecf20Sopenharmony_ci	 * we need to remaster this lock.  if not, then the
16388c2ecf20Sopenharmony_ci	 * new_master survived and that node will respond to
16398c2ecf20Sopenharmony_ci	 * other nodes about the owner.
16408c2ecf20Sopenharmony_ci	 * if there is an owner, this node needs to dump this
16418c2ecf20Sopenharmony_ci	 * lockres and alert the sender that this lockres
16428c2ecf20Sopenharmony_ci	 * was rejected. */
16438c2ecf20Sopenharmony_ci	spin_lock(&dlm->spinlock);
16448c2ecf20Sopenharmony_ci	dlm_node_iter_init(dlm->domain_map, &iter);
16458c2ecf20Sopenharmony_ci	spin_unlock(&dlm->spinlock);
16468c2ecf20Sopenharmony_ci
16478c2ecf20Sopenharmony_ci	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
16488c2ecf20Sopenharmony_ci		/* do not send to self */
16498c2ecf20Sopenharmony_ci		if (nodenum == dlm->node_num)
16508c2ecf20Sopenharmony_ci			continue;
16518c2ecf20Sopenharmony_ci		ret = dlm_do_master_requery(dlm, res, nodenum, real_master);
16528c2ecf20Sopenharmony_ci		if (ret < 0) {
16538c2ecf20Sopenharmony_ci			mlog_errno(ret);
16548c2ecf20Sopenharmony_ci			if (!dlm_is_host_down(ret))
16558c2ecf20Sopenharmony_ci				BUG();
16568c2ecf20Sopenharmony_ci			/* host is down, so answer for that node would be
16578c2ecf20Sopenharmony_ci			 * DLM_LOCK_RES_OWNER_UNKNOWN.  continue. */
16588c2ecf20Sopenharmony_ci		}
16598c2ecf20Sopenharmony_ci		if (*real_master != DLM_LOCK_RES_OWNER_UNKNOWN) {
16608c2ecf20Sopenharmony_ci			mlog(0, "lock master is %u\n", *real_master);
16618c2ecf20Sopenharmony_ci			break;
16628c2ecf20Sopenharmony_ci		}
16638c2ecf20Sopenharmony_ci	}
16648c2ecf20Sopenharmony_ci	return ret;
16658c2ecf20Sopenharmony_ci}
16668c2ecf20Sopenharmony_ci
16678c2ecf20Sopenharmony_ci
16688c2ecf20Sopenharmony_ciint dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
16698c2ecf20Sopenharmony_ci			  u8 nodenum, u8 *real_master)
16708c2ecf20Sopenharmony_ci{
16718c2ecf20Sopenharmony_ci	int ret;
16728c2ecf20Sopenharmony_ci	struct dlm_master_requery req;
16738c2ecf20Sopenharmony_ci	int status = DLM_LOCK_RES_OWNER_UNKNOWN;
16748c2ecf20Sopenharmony_ci
16758c2ecf20Sopenharmony_ci	memset(&req, 0, sizeof(req));
16768c2ecf20Sopenharmony_ci	req.node_idx = dlm->node_num;
16778c2ecf20Sopenharmony_ci	req.namelen = res->lockname.len;
16788c2ecf20Sopenharmony_ci	memcpy(req.name, res->lockname.name, res->lockname.len);
16798c2ecf20Sopenharmony_ci
16808c2ecf20Sopenharmony_ciresend:
16818c2ecf20Sopenharmony_ci	ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key,
16828c2ecf20Sopenharmony_ci				 &req, sizeof(req), nodenum, &status);
16838c2ecf20Sopenharmony_ci	if (ret < 0)
16848c2ecf20Sopenharmony_ci		mlog(ML_ERROR, "Error %d when sending message %u (key "
16858c2ecf20Sopenharmony_ci		     "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG,
16868c2ecf20Sopenharmony_ci		     dlm->key, nodenum);
16878c2ecf20Sopenharmony_ci	else if (status == -ENOMEM) {
16888c2ecf20Sopenharmony_ci		mlog_errno(status);
16898c2ecf20Sopenharmony_ci		msleep(50);
16908c2ecf20Sopenharmony_ci		goto resend;
16918c2ecf20Sopenharmony_ci	} else {
16928c2ecf20Sopenharmony_ci		BUG_ON(status < 0);
16938c2ecf20Sopenharmony_ci		BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
16948c2ecf20Sopenharmony_ci		*real_master = (u8) (status & 0xff);
16958c2ecf20Sopenharmony_ci		mlog(0, "node %u responded to master requery with %u\n",
16968c2ecf20Sopenharmony_ci			  nodenum, *real_master);
16978c2ecf20Sopenharmony_ci		ret = 0;
16988c2ecf20Sopenharmony_ci	}
16998c2ecf20Sopenharmony_ci	return ret;
17008c2ecf20Sopenharmony_ci}
17018c2ecf20Sopenharmony_ci
17028c2ecf20Sopenharmony_ci
17038c2ecf20Sopenharmony_ci/* this function cannot error, so unless the sending
17048c2ecf20Sopenharmony_ci * or receiving of the message failed, the owner can
17058c2ecf20Sopenharmony_ci * be trusted */
17068c2ecf20Sopenharmony_ciint dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
17078c2ecf20Sopenharmony_ci			       void **ret_data)
17088c2ecf20Sopenharmony_ci{
17098c2ecf20Sopenharmony_ci	struct dlm_ctxt *dlm = data;
17108c2ecf20Sopenharmony_ci	struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf;
17118c2ecf20Sopenharmony_ci	struct dlm_lock_resource *res = NULL;
17128c2ecf20Sopenharmony_ci	unsigned int hash;
17138c2ecf20Sopenharmony_ci	int master = DLM_LOCK_RES_OWNER_UNKNOWN;
17148c2ecf20Sopenharmony_ci	u32 flags = DLM_ASSERT_MASTER_REQUERY;
17158c2ecf20Sopenharmony_ci	int dispatched = 0;
17168c2ecf20Sopenharmony_ci
17178c2ecf20Sopenharmony_ci	if (!dlm_grab(dlm)) {
17188c2ecf20Sopenharmony_ci		/* since the domain has gone away on this
17198c2ecf20Sopenharmony_ci		 * node, the proper response is UNKNOWN */
17208c2ecf20Sopenharmony_ci		return master;
17218c2ecf20Sopenharmony_ci	}
17228c2ecf20Sopenharmony_ci
17238c2ecf20Sopenharmony_ci	hash = dlm_lockid_hash(req->name, req->namelen);
17248c2ecf20Sopenharmony_ci
17258c2ecf20Sopenharmony_ci	spin_lock(&dlm->spinlock);
17268c2ecf20Sopenharmony_ci	res = __dlm_lookup_lockres(dlm, req->name, req->namelen, hash);
17278c2ecf20Sopenharmony_ci	if (res) {
17288c2ecf20Sopenharmony_ci		spin_lock(&res->spinlock);
17298c2ecf20Sopenharmony_ci		master = res->owner;
17308c2ecf20Sopenharmony_ci		if (master == dlm->node_num) {
17318c2ecf20Sopenharmony_ci			int ret = dlm_dispatch_assert_master(dlm, res,
17328c2ecf20Sopenharmony_ci							     0, 0, flags);
17338c2ecf20Sopenharmony_ci			if (ret < 0) {
17348c2ecf20Sopenharmony_ci				mlog_errno(ret);
17358c2ecf20Sopenharmony_ci				spin_unlock(&res->spinlock);
17368c2ecf20Sopenharmony_ci				dlm_lockres_put(res);
17378c2ecf20Sopenharmony_ci				spin_unlock(&dlm->spinlock);
17388c2ecf20Sopenharmony_ci				dlm_put(dlm);
17398c2ecf20Sopenharmony_ci				/* sender will take care of this and retry */
17408c2ecf20Sopenharmony_ci				return ret;
17418c2ecf20Sopenharmony_ci			} else {
17428c2ecf20Sopenharmony_ci				dispatched = 1;
17438c2ecf20Sopenharmony_ci				__dlm_lockres_grab_inflight_worker(dlm, res);
17448c2ecf20Sopenharmony_ci				spin_unlock(&res->spinlock);
17458c2ecf20Sopenharmony_ci			}
17468c2ecf20Sopenharmony_ci		} else {
17478c2ecf20Sopenharmony_ci			/* put.. incase we are not the master */
17488c2ecf20Sopenharmony_ci			spin_unlock(&res->spinlock);
17498c2ecf20Sopenharmony_ci			dlm_lockres_put(res);
17508c2ecf20Sopenharmony_ci		}
17518c2ecf20Sopenharmony_ci	}
17528c2ecf20Sopenharmony_ci	spin_unlock(&dlm->spinlock);
17538c2ecf20Sopenharmony_ci
17548c2ecf20Sopenharmony_ci	if (!dispatched)
17558c2ecf20Sopenharmony_ci		dlm_put(dlm);
17568c2ecf20Sopenharmony_ci	return master;
17578c2ecf20Sopenharmony_ci}
17588c2ecf20Sopenharmony_ci
17598c2ecf20Sopenharmony_cistatic inline struct list_head *
17608c2ecf20Sopenharmony_cidlm_list_num_to_pointer(struct dlm_lock_resource *res, int list_num)
17618c2ecf20Sopenharmony_ci{
17628c2ecf20Sopenharmony_ci	struct list_head *ret;
17638c2ecf20Sopenharmony_ci	BUG_ON(list_num < 0);
17648c2ecf20Sopenharmony_ci	BUG_ON(list_num > 2);
17658c2ecf20Sopenharmony_ci	ret = &(res->granted);
17668c2ecf20Sopenharmony_ci	ret += list_num;
17678c2ecf20Sopenharmony_ci	return ret;
17688c2ecf20Sopenharmony_ci}
17698c2ecf20Sopenharmony_ci/* TODO: do ast flush business
17708c2ecf20Sopenharmony_ci * TODO: do MIGRATING and RECOVERING spinning
17718c2ecf20Sopenharmony_ci */
17728c2ecf20Sopenharmony_ci
17738c2ecf20Sopenharmony_ci/*
17748c2ecf20Sopenharmony_ci* NOTE about in-flight requests during migration:
17758c2ecf20Sopenharmony_ci*
17768c2ecf20Sopenharmony_ci* Before attempting the migrate, the master has marked the lockres as
17778c2ecf20Sopenharmony_ci* MIGRATING and then flushed all of its pending ASTS.  So any in-flight
17788c2ecf20Sopenharmony_ci* requests either got queued before the MIGRATING flag got set, in which
17798c2ecf20Sopenharmony_ci* case the lock data will reflect the change and a return message is on
17808c2ecf20Sopenharmony_ci* the way, or the request failed to get in before MIGRATING got set.  In
17818c2ecf20Sopenharmony_ci* this case, the caller will be told to spin and wait for the MIGRATING
17828c2ecf20Sopenharmony_ci* flag to be dropped, then recheck the master.
17838c2ecf20Sopenharmony_ci* This holds true for the convert, cancel and unlock cases, and since lvb
17848c2ecf20Sopenharmony_ci* updates are tied to these same messages, it applies to lvb updates as
17858c2ecf20Sopenharmony_ci* well.  For the lock case, there is no way a lock can be on the master
17868c2ecf20Sopenharmony_ci* queue and not be on the secondary queue since the lock is always added
17878c2ecf20Sopenharmony_ci* locally first.  This means that the new target node will never be sent
17888c2ecf20Sopenharmony_ci* a lock that he doesn't already have on the list.
17898c2ecf20Sopenharmony_ci* In total, this means that the local lock is correct and should not be
17908c2ecf20Sopenharmony_ci* updated to match the one sent by the master.  Any messages sent back
17918c2ecf20Sopenharmony_ci* from the master before the MIGRATING flag will bring the lock properly
17928c2ecf20Sopenharmony_ci* up-to-date, and the change will be ordered properly for the waiter.
17938c2ecf20Sopenharmony_ci* We will *not* attempt to modify the lock underneath the waiter.
17948c2ecf20Sopenharmony_ci*/
17958c2ecf20Sopenharmony_ci
17968c2ecf20Sopenharmony_cistatic int dlm_process_recovery_data(struct dlm_ctxt *dlm,
17978c2ecf20Sopenharmony_ci				     struct dlm_lock_resource *res,
17988c2ecf20Sopenharmony_ci				     struct dlm_migratable_lockres *mres)
17998c2ecf20Sopenharmony_ci{
18008c2ecf20Sopenharmony_ci	struct dlm_migratable_lock *ml;
18018c2ecf20Sopenharmony_ci	struct list_head *queue, *iter;
18028c2ecf20Sopenharmony_ci	struct list_head *tmpq = NULL;
18038c2ecf20Sopenharmony_ci	struct dlm_lock *newlock = NULL;
18048c2ecf20Sopenharmony_ci	struct dlm_lockstatus *lksb = NULL;
18058c2ecf20Sopenharmony_ci	int ret = 0;
18068c2ecf20Sopenharmony_ci	int i, j, bad;
18078c2ecf20Sopenharmony_ci	struct dlm_lock *lock;
18088c2ecf20Sopenharmony_ci	u8 from = O2NM_MAX_NODES;
18098c2ecf20Sopenharmony_ci	__be64 c;
18108c2ecf20Sopenharmony_ci
18118c2ecf20Sopenharmony_ci	mlog(0, "running %d locks for this lockres\n", mres->num_locks);
18128c2ecf20Sopenharmony_ci	for (i=0; i<mres->num_locks; i++) {
18138c2ecf20Sopenharmony_ci		ml = &(mres->ml[i]);
18148c2ecf20Sopenharmony_ci
18158c2ecf20Sopenharmony_ci		if (dlm_is_dummy_lock(dlm, ml, &from)) {
18168c2ecf20Sopenharmony_ci			/* placeholder, just need to set the refmap bit */
18178c2ecf20Sopenharmony_ci			BUG_ON(mres->num_locks != 1);
18188c2ecf20Sopenharmony_ci			mlog(0, "%s:%.*s: dummy lock for %u\n",
18198c2ecf20Sopenharmony_ci			     dlm->name, mres->lockname_len, mres->lockname,
18208c2ecf20Sopenharmony_ci			     from);
18218c2ecf20Sopenharmony_ci			spin_lock(&res->spinlock);
18228c2ecf20Sopenharmony_ci			dlm_lockres_set_refmap_bit(dlm, res, from);
18238c2ecf20Sopenharmony_ci			spin_unlock(&res->spinlock);
18248c2ecf20Sopenharmony_ci			break;
18258c2ecf20Sopenharmony_ci		}
18268c2ecf20Sopenharmony_ci		BUG_ON(ml->highest_blocked != LKM_IVMODE);
18278c2ecf20Sopenharmony_ci		newlock = NULL;
18288c2ecf20Sopenharmony_ci		lksb = NULL;
18298c2ecf20Sopenharmony_ci
18308c2ecf20Sopenharmony_ci		queue = dlm_list_num_to_pointer(res, ml->list);
18318c2ecf20Sopenharmony_ci		tmpq = NULL;
18328c2ecf20Sopenharmony_ci
18338c2ecf20Sopenharmony_ci		/* if the lock is for the local node it needs to
18348c2ecf20Sopenharmony_ci		 * be moved to the proper location within the queue.
18358c2ecf20Sopenharmony_ci		 * do not allocate a new lock structure. */
18368c2ecf20Sopenharmony_ci		if (ml->node == dlm->node_num) {
18378c2ecf20Sopenharmony_ci			/* MIGRATION ONLY! */
18388c2ecf20Sopenharmony_ci			BUG_ON(!(mres->flags & DLM_MRES_MIGRATION));
18398c2ecf20Sopenharmony_ci
18408c2ecf20Sopenharmony_ci			lock = NULL;
18418c2ecf20Sopenharmony_ci			spin_lock(&res->spinlock);
18428c2ecf20Sopenharmony_ci			for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
18438c2ecf20Sopenharmony_ci				tmpq = dlm_list_idx_to_ptr(res, j);
18448c2ecf20Sopenharmony_ci				list_for_each(iter, tmpq) {
18458c2ecf20Sopenharmony_ci					lock = list_entry(iter,
18468c2ecf20Sopenharmony_ci						  struct dlm_lock, list);
18478c2ecf20Sopenharmony_ci					if (lock->ml.cookie == ml->cookie)
18488c2ecf20Sopenharmony_ci						break;
18498c2ecf20Sopenharmony_ci					lock = NULL;
18508c2ecf20Sopenharmony_ci				}
18518c2ecf20Sopenharmony_ci				if (lock)
18528c2ecf20Sopenharmony_ci					break;
18538c2ecf20Sopenharmony_ci			}
18548c2ecf20Sopenharmony_ci
18558c2ecf20Sopenharmony_ci			/* lock is always created locally first, and
18568c2ecf20Sopenharmony_ci			 * destroyed locally last.  it must be on the list */
18578c2ecf20Sopenharmony_ci			if (!lock) {
18588c2ecf20Sopenharmony_ci				c = ml->cookie;
18598c2ecf20Sopenharmony_ci				mlog(ML_ERROR, "Could not find local lock "
18608c2ecf20Sopenharmony_ci					       "with cookie %u:%llu, node %u, "
18618c2ecf20Sopenharmony_ci					       "list %u, flags 0x%x, type %d, "
18628c2ecf20Sopenharmony_ci					       "conv %d, highest blocked %d\n",
18638c2ecf20Sopenharmony_ci				     dlm_get_lock_cookie_node(be64_to_cpu(c)),
18648c2ecf20Sopenharmony_ci				     dlm_get_lock_cookie_seq(be64_to_cpu(c)),
18658c2ecf20Sopenharmony_ci				     ml->node, ml->list, ml->flags, ml->type,
18668c2ecf20Sopenharmony_ci				     ml->convert_type, ml->highest_blocked);
18678c2ecf20Sopenharmony_ci				__dlm_print_one_lock_resource(res);
18688c2ecf20Sopenharmony_ci				BUG();
18698c2ecf20Sopenharmony_ci			}
18708c2ecf20Sopenharmony_ci
18718c2ecf20Sopenharmony_ci			if (lock->ml.node != ml->node) {
18728c2ecf20Sopenharmony_ci				c = lock->ml.cookie;
18738c2ecf20Sopenharmony_ci				mlog(ML_ERROR, "Mismatched node# in lock "
18748c2ecf20Sopenharmony_ci				     "cookie %u:%llu, name %.*s, node %u\n",
18758c2ecf20Sopenharmony_ci				     dlm_get_lock_cookie_node(be64_to_cpu(c)),
18768c2ecf20Sopenharmony_ci				     dlm_get_lock_cookie_seq(be64_to_cpu(c)),
18778c2ecf20Sopenharmony_ci				     res->lockname.len, res->lockname.name,
18788c2ecf20Sopenharmony_ci				     lock->ml.node);
18798c2ecf20Sopenharmony_ci				c = ml->cookie;
18808c2ecf20Sopenharmony_ci				mlog(ML_ERROR, "Migrate lock cookie %u:%llu, "
18818c2ecf20Sopenharmony_ci				     "node %u, list %u, flags 0x%x, type %d, "
18828c2ecf20Sopenharmony_ci				     "conv %d, highest blocked %d\n",
18838c2ecf20Sopenharmony_ci				     dlm_get_lock_cookie_node(be64_to_cpu(c)),
18848c2ecf20Sopenharmony_ci				     dlm_get_lock_cookie_seq(be64_to_cpu(c)),
18858c2ecf20Sopenharmony_ci				     ml->node, ml->list, ml->flags, ml->type,
18868c2ecf20Sopenharmony_ci				     ml->convert_type, ml->highest_blocked);
18878c2ecf20Sopenharmony_ci				__dlm_print_one_lock_resource(res);
18888c2ecf20Sopenharmony_ci				BUG();
18898c2ecf20Sopenharmony_ci			}
18908c2ecf20Sopenharmony_ci
18918c2ecf20Sopenharmony_ci			if (tmpq != queue) {
18928c2ecf20Sopenharmony_ci				c = ml->cookie;
18938c2ecf20Sopenharmony_ci				mlog(0, "Lock cookie %u:%llu was on list %u "
18948c2ecf20Sopenharmony_ci				     "instead of list %u for %.*s\n",
18958c2ecf20Sopenharmony_ci				     dlm_get_lock_cookie_node(be64_to_cpu(c)),
18968c2ecf20Sopenharmony_ci				     dlm_get_lock_cookie_seq(be64_to_cpu(c)),
18978c2ecf20Sopenharmony_ci				     j, ml->list, res->lockname.len,
18988c2ecf20Sopenharmony_ci				     res->lockname.name);
18998c2ecf20Sopenharmony_ci				__dlm_print_one_lock_resource(res);
19008c2ecf20Sopenharmony_ci				spin_unlock(&res->spinlock);
19018c2ecf20Sopenharmony_ci				continue;
19028c2ecf20Sopenharmony_ci			}
19038c2ecf20Sopenharmony_ci
19048c2ecf20Sopenharmony_ci			/* see NOTE above about why we do not update
19058c2ecf20Sopenharmony_ci			 * to match the master here */
19068c2ecf20Sopenharmony_ci
19078c2ecf20Sopenharmony_ci			/* move the lock to its proper place */
19088c2ecf20Sopenharmony_ci			/* do not alter lock refcount.  switching lists. */
19098c2ecf20Sopenharmony_ci			list_move_tail(&lock->list, queue);
19108c2ecf20Sopenharmony_ci			spin_unlock(&res->spinlock);
19118c2ecf20Sopenharmony_ci
19128c2ecf20Sopenharmony_ci			mlog(0, "just reordered a local lock!\n");
19138c2ecf20Sopenharmony_ci			continue;
19148c2ecf20Sopenharmony_ci		}
19158c2ecf20Sopenharmony_ci
19168c2ecf20Sopenharmony_ci		/* lock is for another node. */
19178c2ecf20Sopenharmony_ci		newlock = dlm_new_lock(ml->type, ml->node,
19188c2ecf20Sopenharmony_ci				       be64_to_cpu(ml->cookie), NULL);
19198c2ecf20Sopenharmony_ci		if (!newlock) {
19208c2ecf20Sopenharmony_ci			ret = -ENOMEM;
19218c2ecf20Sopenharmony_ci			goto leave;
19228c2ecf20Sopenharmony_ci		}
19238c2ecf20Sopenharmony_ci		lksb = newlock->lksb;
19248c2ecf20Sopenharmony_ci		dlm_lock_attach_lockres(newlock, res);
19258c2ecf20Sopenharmony_ci
19268c2ecf20Sopenharmony_ci		if (ml->convert_type != LKM_IVMODE) {
19278c2ecf20Sopenharmony_ci			BUG_ON(queue != &res->converting);
19288c2ecf20Sopenharmony_ci			newlock->ml.convert_type = ml->convert_type;
19298c2ecf20Sopenharmony_ci		}
19308c2ecf20Sopenharmony_ci		lksb->flags |= (ml->flags &
19318c2ecf20Sopenharmony_ci				(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
19328c2ecf20Sopenharmony_ci
19338c2ecf20Sopenharmony_ci		if (ml->type == LKM_NLMODE)
19348c2ecf20Sopenharmony_ci			goto skip_lvb;
19358c2ecf20Sopenharmony_ci
19368c2ecf20Sopenharmony_ci		/*
19378c2ecf20Sopenharmony_ci		 * If the lock is in the blocked list it can't have a valid lvb,
19388c2ecf20Sopenharmony_ci		 * so skip it
19398c2ecf20Sopenharmony_ci		 */
19408c2ecf20Sopenharmony_ci		if (ml->list == DLM_BLOCKED_LIST)
19418c2ecf20Sopenharmony_ci			goto skip_lvb;
19428c2ecf20Sopenharmony_ci
19438c2ecf20Sopenharmony_ci		if (!dlm_lvb_is_empty(mres->lvb)) {
19448c2ecf20Sopenharmony_ci			if (lksb->flags & DLM_LKSB_PUT_LVB) {
19458c2ecf20Sopenharmony_ci				/* other node was trying to update
19468c2ecf20Sopenharmony_ci				 * lvb when node died.  recreate the
19478c2ecf20Sopenharmony_ci				 * lksb with the updated lvb. */
19488c2ecf20Sopenharmony_ci				memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN);
19498c2ecf20Sopenharmony_ci				/* the lock resource lvb update must happen
19508c2ecf20Sopenharmony_ci				 * NOW, before the spinlock is dropped.
19518c2ecf20Sopenharmony_ci				 * we no longer wait for the AST to update
19528c2ecf20Sopenharmony_ci				 * the lvb. */
19538c2ecf20Sopenharmony_ci				memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
19548c2ecf20Sopenharmony_ci			} else {
19558c2ecf20Sopenharmony_ci				/* otherwise, the node is sending its
19568c2ecf20Sopenharmony_ci				 * most recent valid lvb info */
19578c2ecf20Sopenharmony_ci				BUG_ON(ml->type != LKM_EXMODE &&
19588c2ecf20Sopenharmony_ci				       ml->type != LKM_PRMODE);
19598c2ecf20Sopenharmony_ci				if (!dlm_lvb_is_empty(res->lvb) &&
19608c2ecf20Sopenharmony_ci 				    (ml->type == LKM_EXMODE ||
19618c2ecf20Sopenharmony_ci 				     memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
19628c2ecf20Sopenharmony_ci 					int i;
19638c2ecf20Sopenharmony_ci 					mlog(ML_ERROR, "%s:%.*s: received bad "
19648c2ecf20Sopenharmony_ci 					     "lvb! type=%d\n", dlm->name,
19658c2ecf20Sopenharmony_ci 					     res->lockname.len,
19668c2ecf20Sopenharmony_ci 					     res->lockname.name, ml->type);
19678c2ecf20Sopenharmony_ci 					printk("lockres lvb=[");
19688c2ecf20Sopenharmony_ci 					for (i=0; i<DLM_LVB_LEN; i++)
19698c2ecf20Sopenharmony_ci 						printk("%02x", res->lvb[i]);
19708c2ecf20Sopenharmony_ci 					printk("]\nmigrated lvb=[");
19718c2ecf20Sopenharmony_ci 					for (i=0; i<DLM_LVB_LEN; i++)
19728c2ecf20Sopenharmony_ci 						printk("%02x", mres->lvb[i]);
19738c2ecf20Sopenharmony_ci 					printk("]\n");
19748c2ecf20Sopenharmony_ci 					dlm_print_one_lock_resource(res);
19758c2ecf20Sopenharmony_ci 					BUG();
19768c2ecf20Sopenharmony_ci				}
19778c2ecf20Sopenharmony_ci				memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
19788c2ecf20Sopenharmony_ci			}
19798c2ecf20Sopenharmony_ci		}
19808c2ecf20Sopenharmony_ciskip_lvb:
19818c2ecf20Sopenharmony_ci
19828c2ecf20Sopenharmony_ci		/* NOTE:
19838c2ecf20Sopenharmony_ci		 * wrt lock queue ordering and recovery:
19848c2ecf20Sopenharmony_ci		 *    1. order of locks on granted queue is
19858c2ecf20Sopenharmony_ci		 *       meaningless.
19868c2ecf20Sopenharmony_ci		 *    2. order of locks on converting queue is
19878c2ecf20Sopenharmony_ci		 *       LOST with the node death.  sorry charlie.
19888c2ecf20Sopenharmony_ci		 *    3. order of locks on the blocked queue is
19898c2ecf20Sopenharmony_ci		 *       also LOST.
19908c2ecf20Sopenharmony_ci		 * order of locks does not affect integrity, it
19918c2ecf20Sopenharmony_ci		 * just means that a lock request may get pushed
19928c2ecf20Sopenharmony_ci		 * back in line as a result of the node death.
19938c2ecf20Sopenharmony_ci		 * also note that for a given node the lock order
19948c2ecf20Sopenharmony_ci		 * for its secondary queue locks is preserved
19958c2ecf20Sopenharmony_ci		 * relative to each other, but clearly *not*
19968c2ecf20Sopenharmony_ci		 * preserved relative to locks from other nodes.
19978c2ecf20Sopenharmony_ci		 */
19988c2ecf20Sopenharmony_ci		bad = 0;
19998c2ecf20Sopenharmony_ci		spin_lock(&res->spinlock);
20008c2ecf20Sopenharmony_ci		list_for_each_entry(lock, queue, list) {
20018c2ecf20Sopenharmony_ci			if (lock->ml.cookie == ml->cookie) {
20028c2ecf20Sopenharmony_ci				c = lock->ml.cookie;
20038c2ecf20Sopenharmony_ci				mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already "
20048c2ecf20Sopenharmony_ci				     "exists on this lockres!\n", dlm->name,
20058c2ecf20Sopenharmony_ci				     res->lockname.len, res->lockname.name,
20068c2ecf20Sopenharmony_ci				     dlm_get_lock_cookie_node(be64_to_cpu(c)),
20078c2ecf20Sopenharmony_ci				     dlm_get_lock_cookie_seq(be64_to_cpu(c)));
20088c2ecf20Sopenharmony_ci
20098c2ecf20Sopenharmony_ci				mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, "
20108c2ecf20Sopenharmony_ci				     "node=%u, cookie=%u:%llu, queue=%d\n",
20118c2ecf20Sopenharmony_ci	      			     ml->type, ml->convert_type, ml->node,
20128c2ecf20Sopenharmony_ci				     dlm_get_lock_cookie_node(be64_to_cpu(ml->cookie)),
20138c2ecf20Sopenharmony_ci				     dlm_get_lock_cookie_seq(be64_to_cpu(ml->cookie)),
20148c2ecf20Sopenharmony_ci				     ml->list);
20158c2ecf20Sopenharmony_ci
20168c2ecf20Sopenharmony_ci				__dlm_print_one_lock_resource(res);
20178c2ecf20Sopenharmony_ci				bad = 1;
20188c2ecf20Sopenharmony_ci				break;
20198c2ecf20Sopenharmony_ci			}
20208c2ecf20Sopenharmony_ci		}
20218c2ecf20Sopenharmony_ci		if (!bad) {
20228c2ecf20Sopenharmony_ci			dlm_lock_get(newlock);
20238c2ecf20Sopenharmony_ci			if (mres->flags & DLM_MRES_RECOVERY &&
20248c2ecf20Sopenharmony_ci					ml->list == DLM_CONVERTING_LIST &&
20258c2ecf20Sopenharmony_ci					newlock->ml.type >
20268c2ecf20Sopenharmony_ci					newlock->ml.convert_type) {
20278c2ecf20Sopenharmony_ci				/* newlock is doing downconvert, add it to the
20288c2ecf20Sopenharmony_ci				 * head of converting list */
20298c2ecf20Sopenharmony_ci				list_add(&newlock->list, queue);
20308c2ecf20Sopenharmony_ci			} else
20318c2ecf20Sopenharmony_ci				list_add_tail(&newlock->list, queue);
20328c2ecf20Sopenharmony_ci			mlog(0, "%s:%.*s: added lock for node %u, "
20338c2ecf20Sopenharmony_ci			     "setting refmap bit\n", dlm->name,
20348c2ecf20Sopenharmony_ci			     res->lockname.len, res->lockname.name, ml->node);
20358c2ecf20Sopenharmony_ci			dlm_lockres_set_refmap_bit(dlm, res, ml->node);
20368c2ecf20Sopenharmony_ci		}
20378c2ecf20Sopenharmony_ci		spin_unlock(&res->spinlock);
20388c2ecf20Sopenharmony_ci	}
20398c2ecf20Sopenharmony_ci	mlog(0, "done running all the locks\n");
20408c2ecf20Sopenharmony_ci
20418c2ecf20Sopenharmony_cileave:
20428c2ecf20Sopenharmony_ci	/* balance the ref taken when the work was queued */
20438c2ecf20Sopenharmony_ci	spin_lock(&res->spinlock);
20448c2ecf20Sopenharmony_ci	dlm_lockres_drop_inflight_ref(dlm, res);
20458c2ecf20Sopenharmony_ci	spin_unlock(&res->spinlock);
20468c2ecf20Sopenharmony_ci
20478c2ecf20Sopenharmony_ci	if (ret < 0)
20488c2ecf20Sopenharmony_ci		mlog_errno(ret);
20498c2ecf20Sopenharmony_ci
20508c2ecf20Sopenharmony_ci	return ret;
20518c2ecf20Sopenharmony_ci}
20528c2ecf20Sopenharmony_ci
20538c2ecf20Sopenharmony_civoid dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
20548c2ecf20Sopenharmony_ci				       struct dlm_lock_resource *res)
20558c2ecf20Sopenharmony_ci{
20568c2ecf20Sopenharmony_ci	int i;
20578c2ecf20Sopenharmony_ci	struct list_head *queue;
20588c2ecf20Sopenharmony_ci	struct dlm_lock *lock, *next;
20598c2ecf20Sopenharmony_ci
20608c2ecf20Sopenharmony_ci	assert_spin_locked(&dlm->spinlock);
20618c2ecf20Sopenharmony_ci	assert_spin_locked(&res->spinlock);
20628c2ecf20Sopenharmony_ci	res->state |= DLM_LOCK_RES_RECOVERING;
20638c2ecf20Sopenharmony_ci	if (!list_empty(&res->recovering)) {
20648c2ecf20Sopenharmony_ci		mlog(0,
20658c2ecf20Sopenharmony_ci		     "Recovering res %s:%.*s, is already on recovery list!\n",
20668c2ecf20Sopenharmony_ci		     dlm->name, res->lockname.len, res->lockname.name);
20678c2ecf20Sopenharmony_ci		list_del_init(&res->recovering);
20688c2ecf20Sopenharmony_ci		dlm_lockres_put(res);
20698c2ecf20Sopenharmony_ci	}
20708c2ecf20Sopenharmony_ci	/* We need to hold a reference while on the recovery list */
20718c2ecf20Sopenharmony_ci	dlm_lockres_get(res);
20728c2ecf20Sopenharmony_ci	list_add_tail(&res->recovering, &dlm->reco.resources);
20738c2ecf20Sopenharmony_ci
20748c2ecf20Sopenharmony_ci	/* find any pending locks and put them back on proper list */
20758c2ecf20Sopenharmony_ci	for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) {
20768c2ecf20Sopenharmony_ci		queue = dlm_list_idx_to_ptr(res, i);
20778c2ecf20Sopenharmony_ci		list_for_each_entry_safe(lock, next, queue, list) {
20788c2ecf20Sopenharmony_ci			dlm_lock_get(lock);
20798c2ecf20Sopenharmony_ci			if (lock->convert_pending) {
20808c2ecf20Sopenharmony_ci				/* move converting lock back to granted */
20818c2ecf20Sopenharmony_ci				mlog(0, "node died with convert pending "
20828c2ecf20Sopenharmony_ci				     "on %.*s. move back to granted list.\n",
20838c2ecf20Sopenharmony_ci				     res->lockname.len, res->lockname.name);
20848c2ecf20Sopenharmony_ci				dlm_revert_pending_convert(res, lock);
20858c2ecf20Sopenharmony_ci				lock->convert_pending = 0;
20868c2ecf20Sopenharmony_ci			} else if (lock->lock_pending) {
20878c2ecf20Sopenharmony_ci				/* remove pending lock requests completely */
20888c2ecf20Sopenharmony_ci				BUG_ON(i != DLM_BLOCKED_LIST);
20898c2ecf20Sopenharmony_ci				mlog(0, "node died with lock pending "
20908c2ecf20Sopenharmony_ci				     "on %.*s. remove from blocked list and skip.\n",
20918c2ecf20Sopenharmony_ci				     res->lockname.len, res->lockname.name);
20928c2ecf20Sopenharmony_ci				/* lock will be floating until ref in
20938c2ecf20Sopenharmony_ci				 * dlmlock_remote is freed after the network
20948c2ecf20Sopenharmony_ci				 * call returns.  ok for it to not be on any
20958c2ecf20Sopenharmony_ci				 * list since no ast can be called
20968c2ecf20Sopenharmony_ci				 * (the master is dead). */
20978c2ecf20Sopenharmony_ci				dlm_revert_pending_lock(res, lock);
20988c2ecf20Sopenharmony_ci				lock->lock_pending = 0;
20998c2ecf20Sopenharmony_ci			} else if (lock->unlock_pending) {
21008c2ecf20Sopenharmony_ci				/* if an unlock was in progress, treat as
21018c2ecf20Sopenharmony_ci				 * if this had completed successfully
21028c2ecf20Sopenharmony_ci				 * before sending this lock state to the
21038c2ecf20Sopenharmony_ci				 * new master.  note that the dlm_unlock
21048c2ecf20Sopenharmony_ci				 * call is still responsible for calling
21058c2ecf20Sopenharmony_ci				 * the unlockast.  that will happen after
21068c2ecf20Sopenharmony_ci				 * the network call times out.  for now,
21078c2ecf20Sopenharmony_ci				 * just move lists to prepare the new
21088c2ecf20Sopenharmony_ci				 * recovery master.  */
21098c2ecf20Sopenharmony_ci				BUG_ON(i != DLM_GRANTED_LIST);
21108c2ecf20Sopenharmony_ci				mlog(0, "node died with unlock pending "
21118c2ecf20Sopenharmony_ci				     "on %.*s. remove from blocked list and skip.\n",
21128c2ecf20Sopenharmony_ci				     res->lockname.len, res->lockname.name);
21138c2ecf20Sopenharmony_ci				dlm_commit_pending_unlock(res, lock);
21148c2ecf20Sopenharmony_ci				lock->unlock_pending = 0;
21158c2ecf20Sopenharmony_ci			} else if (lock->cancel_pending) {
21168c2ecf20Sopenharmony_ci				/* if a cancel was in progress, treat as
21178c2ecf20Sopenharmony_ci				 * if this had completed successfully
21188c2ecf20Sopenharmony_ci				 * before sending this lock state to the
21198c2ecf20Sopenharmony_ci				 * new master */
21208c2ecf20Sopenharmony_ci				BUG_ON(i != DLM_CONVERTING_LIST);
21218c2ecf20Sopenharmony_ci				mlog(0, "node died with cancel pending "
21228c2ecf20Sopenharmony_ci				     "on %.*s. move back to granted list.\n",
21238c2ecf20Sopenharmony_ci				     res->lockname.len, res->lockname.name);
21248c2ecf20Sopenharmony_ci				dlm_commit_pending_cancel(res, lock);
21258c2ecf20Sopenharmony_ci				lock->cancel_pending = 0;
21268c2ecf20Sopenharmony_ci			}
21278c2ecf20Sopenharmony_ci			dlm_lock_put(lock);
21288c2ecf20Sopenharmony_ci		}
21298c2ecf20Sopenharmony_ci	}
21308c2ecf20Sopenharmony_ci}
21318c2ecf20Sopenharmony_ci
21328c2ecf20Sopenharmony_ci
21338c2ecf20Sopenharmony_ci
21348c2ecf20Sopenharmony_ci/* removes all recovered locks from the recovery list.
21358c2ecf20Sopenharmony_ci * sets the res->owner to the new master.
21368c2ecf20Sopenharmony_ci * unsets the RECOVERY flag and wakes waiters. */
21378c2ecf20Sopenharmony_cistatic void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
21388c2ecf20Sopenharmony_ci					      u8 dead_node, u8 new_master)
21398c2ecf20Sopenharmony_ci{
21408c2ecf20Sopenharmony_ci	int i;
21418c2ecf20Sopenharmony_ci	struct hlist_head *bucket;
21428c2ecf20Sopenharmony_ci	struct dlm_lock_resource *res, *next;
21438c2ecf20Sopenharmony_ci
21448c2ecf20Sopenharmony_ci	assert_spin_locked(&dlm->spinlock);
21458c2ecf20Sopenharmony_ci
21468c2ecf20Sopenharmony_ci	list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
21478c2ecf20Sopenharmony_ci		if (res->owner == dead_node) {
21488c2ecf20Sopenharmony_ci			mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
21498c2ecf20Sopenharmony_ci			     dlm->name, res->lockname.len, res->lockname.name,
21508c2ecf20Sopenharmony_ci			     res->owner, new_master);
21518c2ecf20Sopenharmony_ci			list_del_init(&res->recovering);
21528c2ecf20Sopenharmony_ci			spin_lock(&res->spinlock);
21538c2ecf20Sopenharmony_ci			/* new_master has our reference from
21548c2ecf20Sopenharmony_ci			 * the lock state sent during recovery */
21558c2ecf20Sopenharmony_ci			dlm_change_lockres_owner(dlm, res, new_master);
21568c2ecf20Sopenharmony_ci			res->state &= ~DLM_LOCK_RES_RECOVERING;
21578c2ecf20Sopenharmony_ci			if (__dlm_lockres_has_locks(res))
21588c2ecf20Sopenharmony_ci				__dlm_dirty_lockres(dlm, res);
21598c2ecf20Sopenharmony_ci			spin_unlock(&res->spinlock);
21608c2ecf20Sopenharmony_ci			wake_up(&res->wq);
21618c2ecf20Sopenharmony_ci			dlm_lockres_put(res);
21628c2ecf20Sopenharmony_ci		}
21638c2ecf20Sopenharmony_ci	}
21648c2ecf20Sopenharmony_ci
21658c2ecf20Sopenharmony_ci	/* this will become unnecessary eventually, but
21668c2ecf20Sopenharmony_ci	 * for now we need to run the whole hash, clear
21678c2ecf20Sopenharmony_ci	 * the RECOVERING state and set the owner
21688c2ecf20Sopenharmony_ci	 * if necessary */
21698c2ecf20Sopenharmony_ci	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
21708c2ecf20Sopenharmony_ci		bucket = dlm_lockres_hash(dlm, i);
21718c2ecf20Sopenharmony_ci		hlist_for_each_entry(res, bucket, hash_node) {
21728c2ecf20Sopenharmony_ci			if (res->state & DLM_LOCK_RES_RECOVERY_WAITING) {
21738c2ecf20Sopenharmony_ci				spin_lock(&res->spinlock);
21748c2ecf20Sopenharmony_ci				res->state &= ~DLM_LOCK_RES_RECOVERY_WAITING;
21758c2ecf20Sopenharmony_ci				spin_unlock(&res->spinlock);
21768c2ecf20Sopenharmony_ci				wake_up(&res->wq);
21778c2ecf20Sopenharmony_ci			}
21788c2ecf20Sopenharmony_ci
21798c2ecf20Sopenharmony_ci			if (!(res->state & DLM_LOCK_RES_RECOVERING))
21808c2ecf20Sopenharmony_ci				continue;
21818c2ecf20Sopenharmony_ci
21828c2ecf20Sopenharmony_ci			if (res->owner != dead_node &&
21838c2ecf20Sopenharmony_ci			    res->owner != dlm->node_num)
21848c2ecf20Sopenharmony_ci				continue;
21858c2ecf20Sopenharmony_ci
21868c2ecf20Sopenharmony_ci			if (!list_empty(&res->recovering)) {
21878c2ecf20Sopenharmony_ci				list_del_init(&res->recovering);
21888c2ecf20Sopenharmony_ci				dlm_lockres_put(res);
21898c2ecf20Sopenharmony_ci			}
21908c2ecf20Sopenharmony_ci
21918c2ecf20Sopenharmony_ci			/* new_master has our reference from
21928c2ecf20Sopenharmony_ci			 * the lock state sent during recovery */
21938c2ecf20Sopenharmony_ci			mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
21948c2ecf20Sopenharmony_ci			     dlm->name, res->lockname.len, res->lockname.name,
21958c2ecf20Sopenharmony_ci			     res->owner, new_master);
21968c2ecf20Sopenharmony_ci			spin_lock(&res->spinlock);
21978c2ecf20Sopenharmony_ci			dlm_change_lockres_owner(dlm, res, new_master);
21988c2ecf20Sopenharmony_ci			res->state &= ~DLM_LOCK_RES_RECOVERING;
21998c2ecf20Sopenharmony_ci			if (__dlm_lockres_has_locks(res))
22008c2ecf20Sopenharmony_ci				__dlm_dirty_lockres(dlm, res);
22018c2ecf20Sopenharmony_ci			spin_unlock(&res->spinlock);
22028c2ecf20Sopenharmony_ci			wake_up(&res->wq);
22038c2ecf20Sopenharmony_ci		}
22048c2ecf20Sopenharmony_ci	}
22058c2ecf20Sopenharmony_ci}
22068c2ecf20Sopenharmony_ci
22078c2ecf20Sopenharmony_cistatic inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local)
22088c2ecf20Sopenharmony_ci{
22098c2ecf20Sopenharmony_ci	if (local) {
22108c2ecf20Sopenharmony_ci		if (lock->ml.type != LKM_EXMODE &&
22118c2ecf20Sopenharmony_ci		    lock->ml.type != LKM_PRMODE)
22128c2ecf20Sopenharmony_ci			return 1;
22138c2ecf20Sopenharmony_ci	} else if (lock->ml.type == LKM_EXMODE)
22148c2ecf20Sopenharmony_ci		return 1;
22158c2ecf20Sopenharmony_ci	return 0;
22168c2ecf20Sopenharmony_ci}
22178c2ecf20Sopenharmony_ci
22188c2ecf20Sopenharmony_cistatic void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
22198c2ecf20Sopenharmony_ci			       struct dlm_lock_resource *res, u8 dead_node)
22208c2ecf20Sopenharmony_ci{
22218c2ecf20Sopenharmony_ci	struct list_head *queue;
22228c2ecf20Sopenharmony_ci	struct dlm_lock *lock;
22238c2ecf20Sopenharmony_ci	int blank_lvb = 0, local = 0;
22248c2ecf20Sopenharmony_ci	int i;
22258c2ecf20Sopenharmony_ci	u8 search_node;
22268c2ecf20Sopenharmony_ci
22278c2ecf20Sopenharmony_ci	assert_spin_locked(&dlm->spinlock);
22288c2ecf20Sopenharmony_ci	assert_spin_locked(&res->spinlock);
22298c2ecf20Sopenharmony_ci
22308c2ecf20Sopenharmony_ci	if (res->owner == dlm->node_num)
22318c2ecf20Sopenharmony_ci		/* if this node owned the lockres, and if the dead node
22328c2ecf20Sopenharmony_ci		 * had an EX when he died, blank out the lvb */
22338c2ecf20Sopenharmony_ci		search_node = dead_node;
22348c2ecf20Sopenharmony_ci	else {
22358c2ecf20Sopenharmony_ci		/* if this is a secondary lockres, and we had no EX or PR
22368c2ecf20Sopenharmony_ci		 * locks granted, we can no longer trust the lvb */
22378c2ecf20Sopenharmony_ci		search_node = dlm->node_num;
22388c2ecf20Sopenharmony_ci		local = 1;  /* check local state for valid lvb */
22398c2ecf20Sopenharmony_ci	}
22408c2ecf20Sopenharmony_ci
22418c2ecf20Sopenharmony_ci	for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) {
22428c2ecf20Sopenharmony_ci		queue = dlm_list_idx_to_ptr(res, i);
22438c2ecf20Sopenharmony_ci		list_for_each_entry(lock, queue, list) {
22448c2ecf20Sopenharmony_ci			if (lock->ml.node == search_node) {
22458c2ecf20Sopenharmony_ci				if (dlm_lvb_needs_invalidation(lock, local)) {
22468c2ecf20Sopenharmony_ci					/* zero the lksb lvb and lockres lvb */
22478c2ecf20Sopenharmony_ci					blank_lvb = 1;
22488c2ecf20Sopenharmony_ci					memset(lock->lksb->lvb, 0, DLM_LVB_LEN);
22498c2ecf20Sopenharmony_ci				}
22508c2ecf20Sopenharmony_ci			}
22518c2ecf20Sopenharmony_ci		}
22528c2ecf20Sopenharmony_ci	}
22538c2ecf20Sopenharmony_ci
22548c2ecf20Sopenharmony_ci	if (blank_lvb) {
22558c2ecf20Sopenharmony_ci		mlog(0, "clearing %.*s lvb, dead node %u had EX\n",
22568c2ecf20Sopenharmony_ci		     res->lockname.len, res->lockname.name, dead_node);
22578c2ecf20Sopenharmony_ci		memset(res->lvb, 0, DLM_LVB_LEN);
22588c2ecf20Sopenharmony_ci	}
22598c2ecf20Sopenharmony_ci}
22608c2ecf20Sopenharmony_ci
22618c2ecf20Sopenharmony_cistatic void dlm_free_dead_locks(struct dlm_ctxt *dlm,
22628c2ecf20Sopenharmony_ci				struct dlm_lock_resource *res, u8 dead_node)
22638c2ecf20Sopenharmony_ci{
22648c2ecf20Sopenharmony_ci	struct dlm_lock *lock, *next;
22658c2ecf20Sopenharmony_ci	unsigned int freed = 0;
22668c2ecf20Sopenharmony_ci
22678c2ecf20Sopenharmony_ci	/* this node is the lockres master:
22688c2ecf20Sopenharmony_ci	 * 1) remove any stale locks for the dead node
22698c2ecf20Sopenharmony_ci	 * 2) if the dead node had an EX when he died, blank out the lvb
22708c2ecf20Sopenharmony_ci	 */
22718c2ecf20Sopenharmony_ci	assert_spin_locked(&dlm->spinlock);
22728c2ecf20Sopenharmony_ci	assert_spin_locked(&res->spinlock);
22738c2ecf20Sopenharmony_ci
22748c2ecf20Sopenharmony_ci	/* We do two dlm_lock_put(). One for removing from list and the other is
22758c2ecf20Sopenharmony_ci	 * to force the DLM_UNLOCK_FREE_LOCK action so as to free the locks */
22768c2ecf20Sopenharmony_ci
22778c2ecf20Sopenharmony_ci	/* TODO: check pending_asts, pending_basts here */
22788c2ecf20Sopenharmony_ci	list_for_each_entry_safe(lock, next, &res->granted, list) {
22798c2ecf20Sopenharmony_ci		if (lock->ml.node == dead_node) {
22808c2ecf20Sopenharmony_ci			list_del_init(&lock->list);
22818c2ecf20Sopenharmony_ci			dlm_lock_put(lock);
22828c2ecf20Sopenharmony_ci			/* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */
22838c2ecf20Sopenharmony_ci			dlm_lock_put(lock);
22848c2ecf20Sopenharmony_ci			freed++;
22858c2ecf20Sopenharmony_ci		}
22868c2ecf20Sopenharmony_ci	}
22878c2ecf20Sopenharmony_ci	list_for_each_entry_safe(lock, next, &res->converting, list) {
22888c2ecf20Sopenharmony_ci		if (lock->ml.node == dead_node) {
22898c2ecf20Sopenharmony_ci			list_del_init(&lock->list);
22908c2ecf20Sopenharmony_ci			dlm_lock_put(lock);
22918c2ecf20Sopenharmony_ci			/* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */
22928c2ecf20Sopenharmony_ci			dlm_lock_put(lock);
22938c2ecf20Sopenharmony_ci			freed++;
22948c2ecf20Sopenharmony_ci		}
22958c2ecf20Sopenharmony_ci	}
22968c2ecf20Sopenharmony_ci	list_for_each_entry_safe(lock, next, &res->blocked, list) {
22978c2ecf20Sopenharmony_ci		if (lock->ml.node == dead_node) {
22988c2ecf20Sopenharmony_ci			list_del_init(&lock->list);
22998c2ecf20Sopenharmony_ci			dlm_lock_put(lock);
23008c2ecf20Sopenharmony_ci			/* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */
23018c2ecf20Sopenharmony_ci			dlm_lock_put(lock);
23028c2ecf20Sopenharmony_ci			freed++;
23038c2ecf20Sopenharmony_ci		}
23048c2ecf20Sopenharmony_ci	}
23058c2ecf20Sopenharmony_ci
23068c2ecf20Sopenharmony_ci	if (freed) {
23078c2ecf20Sopenharmony_ci		mlog(0, "%s:%.*s: freed %u locks for dead node %u, "
23088c2ecf20Sopenharmony_ci		     "dropping ref from lockres\n", dlm->name,
23098c2ecf20Sopenharmony_ci		     res->lockname.len, res->lockname.name, freed, dead_node);
23108c2ecf20Sopenharmony_ci		if(!test_bit(dead_node, res->refmap)) {
23118c2ecf20Sopenharmony_ci			mlog(ML_ERROR, "%s:%.*s: freed %u locks for dead node %u, "
23128c2ecf20Sopenharmony_ci			     "but ref was not set\n", dlm->name,
23138c2ecf20Sopenharmony_ci			     res->lockname.len, res->lockname.name, freed, dead_node);
23148c2ecf20Sopenharmony_ci			__dlm_print_one_lock_resource(res);
23158c2ecf20Sopenharmony_ci		}
23168c2ecf20Sopenharmony_ci		res->state |= DLM_LOCK_RES_RECOVERY_WAITING;
23178c2ecf20Sopenharmony_ci		dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
23188c2ecf20Sopenharmony_ci	} else if (test_bit(dead_node, res->refmap)) {
23198c2ecf20Sopenharmony_ci		mlog(0, "%s:%.*s: dead node %u had a ref, but had "
23208c2ecf20Sopenharmony_ci		     "no locks and had not purged before dying\n", dlm->name,
23218c2ecf20Sopenharmony_ci		     res->lockname.len, res->lockname.name, dead_node);
23228c2ecf20Sopenharmony_ci		dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
23238c2ecf20Sopenharmony_ci	}
23248c2ecf20Sopenharmony_ci
23258c2ecf20Sopenharmony_ci	/* do not kick thread yet */
23268c2ecf20Sopenharmony_ci	__dlm_dirty_lockres(dlm, res);
23278c2ecf20Sopenharmony_ci}
23288c2ecf20Sopenharmony_ci
23298c2ecf20Sopenharmony_cistatic void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
23308c2ecf20Sopenharmony_ci{
23318c2ecf20Sopenharmony_ci	struct dlm_lock_resource *res;
23328c2ecf20Sopenharmony_ci	int i;
23338c2ecf20Sopenharmony_ci	struct hlist_head *bucket;
23348c2ecf20Sopenharmony_ci	struct hlist_node *tmp;
23358c2ecf20Sopenharmony_ci	struct dlm_lock *lock;
23368c2ecf20Sopenharmony_ci
23378c2ecf20Sopenharmony_ci
23388c2ecf20Sopenharmony_ci	/* purge any stale mles */
23398c2ecf20Sopenharmony_ci	dlm_clean_master_list(dlm, dead_node);
23408c2ecf20Sopenharmony_ci
23418c2ecf20Sopenharmony_ci	/*
23428c2ecf20Sopenharmony_ci	 * now clean up all lock resources.  there are two rules:
23438c2ecf20Sopenharmony_ci	 *
23448c2ecf20Sopenharmony_ci	 * 1) if the dead node was the master, move the lockres
23458c2ecf20Sopenharmony_ci	 *    to the recovering list.  set the RECOVERING flag.
23468c2ecf20Sopenharmony_ci	 *    this lockres needs to be cleaned up before it can
23478c2ecf20Sopenharmony_ci	 *    be used further.
23488c2ecf20Sopenharmony_ci	 *
23498c2ecf20Sopenharmony_ci	 * 2) if this node was the master, remove all locks from
23508c2ecf20Sopenharmony_ci	 *    each of the lockres queues that were owned by the
23518c2ecf20Sopenharmony_ci	 *    dead node.  once recovery finishes, the dlm thread
23528c2ecf20Sopenharmony_ci	 *    can be kicked again to see if any ASTs or BASTs
23538c2ecf20Sopenharmony_ci	 *    need to be fired as a result.
23548c2ecf20Sopenharmony_ci	 */
23558c2ecf20Sopenharmony_ci	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
23568c2ecf20Sopenharmony_ci		bucket = dlm_lockres_hash(dlm, i);
23578c2ecf20Sopenharmony_ci		hlist_for_each_entry_safe(res, tmp, bucket, hash_node) {
23588c2ecf20Sopenharmony_ci 			/* always prune any $RECOVERY entries for dead nodes,
23598c2ecf20Sopenharmony_ci 			 * otherwise hangs can occur during later recovery */
23608c2ecf20Sopenharmony_ci			if (dlm_is_recovery_lock(res->lockname.name,
23618c2ecf20Sopenharmony_ci						 res->lockname.len)) {
23628c2ecf20Sopenharmony_ci				spin_lock(&res->spinlock);
23638c2ecf20Sopenharmony_ci				list_for_each_entry(lock, &res->granted, list) {
23648c2ecf20Sopenharmony_ci					if (lock->ml.node == dead_node) {
23658c2ecf20Sopenharmony_ci						mlog(0, "AHA! there was "
23668c2ecf20Sopenharmony_ci						     "a $RECOVERY lock for dead "
23678c2ecf20Sopenharmony_ci						     "node %u (%s)!\n",
23688c2ecf20Sopenharmony_ci						     dead_node, dlm->name);
23698c2ecf20Sopenharmony_ci						list_del_init(&lock->list);
23708c2ecf20Sopenharmony_ci						dlm_lock_put(lock);
23718c2ecf20Sopenharmony_ci						/* Can't schedule
23728c2ecf20Sopenharmony_ci						 * DLM_UNLOCK_FREE_LOCK
23738c2ecf20Sopenharmony_ci						 * - do manually */
23748c2ecf20Sopenharmony_ci						dlm_lock_put(lock);
23758c2ecf20Sopenharmony_ci						break;
23768c2ecf20Sopenharmony_ci					}
23778c2ecf20Sopenharmony_ci				}
23788c2ecf20Sopenharmony_ci
23798c2ecf20Sopenharmony_ci				if ((res->owner == dead_node) &&
23808c2ecf20Sopenharmony_ci							(res->state & DLM_LOCK_RES_DROPPING_REF)) {
23818c2ecf20Sopenharmony_ci					dlm_lockres_get(res);
23828c2ecf20Sopenharmony_ci					__dlm_do_purge_lockres(dlm, res);
23838c2ecf20Sopenharmony_ci					spin_unlock(&res->spinlock);
23848c2ecf20Sopenharmony_ci					wake_up(&res->wq);
23858c2ecf20Sopenharmony_ci					dlm_lockres_put(res);
23868c2ecf20Sopenharmony_ci					continue;
23878c2ecf20Sopenharmony_ci				} else if (res->owner == dlm->node_num)
23888c2ecf20Sopenharmony_ci					dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
23898c2ecf20Sopenharmony_ci				spin_unlock(&res->spinlock);
23908c2ecf20Sopenharmony_ci				continue;
23918c2ecf20Sopenharmony_ci			}
23928c2ecf20Sopenharmony_ci			spin_lock(&res->spinlock);
23938c2ecf20Sopenharmony_ci			/* zero the lvb if necessary */
23948c2ecf20Sopenharmony_ci			dlm_revalidate_lvb(dlm, res, dead_node);
23958c2ecf20Sopenharmony_ci			if (res->owner == dead_node) {
23968c2ecf20Sopenharmony_ci				if (res->state & DLM_LOCK_RES_DROPPING_REF) {
23978c2ecf20Sopenharmony_ci					mlog(0, "%s:%.*s: owned by "
23988c2ecf20Sopenharmony_ci						"dead node %u, this node was "
23998c2ecf20Sopenharmony_ci						"dropping its ref when master died. "
24008c2ecf20Sopenharmony_ci						"continue, purging the lockres.\n",
24018c2ecf20Sopenharmony_ci						dlm->name, res->lockname.len,
24028c2ecf20Sopenharmony_ci						res->lockname.name, dead_node);
24038c2ecf20Sopenharmony_ci					dlm_lockres_get(res);
24048c2ecf20Sopenharmony_ci					__dlm_do_purge_lockres(dlm, res);
24058c2ecf20Sopenharmony_ci					spin_unlock(&res->spinlock);
24068c2ecf20Sopenharmony_ci					wake_up(&res->wq);
24078c2ecf20Sopenharmony_ci					dlm_lockres_put(res);
24088c2ecf20Sopenharmony_ci					continue;
24098c2ecf20Sopenharmony_ci				}
24108c2ecf20Sopenharmony_ci				dlm_move_lockres_to_recovery_list(dlm, res);
24118c2ecf20Sopenharmony_ci			} else if (res->owner == dlm->node_num) {
24128c2ecf20Sopenharmony_ci				dlm_free_dead_locks(dlm, res, dead_node);
24138c2ecf20Sopenharmony_ci				__dlm_lockres_calc_usage(dlm, res);
24148c2ecf20Sopenharmony_ci			} else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
24158c2ecf20Sopenharmony_ci				if (test_bit(dead_node, res->refmap)) {
24168c2ecf20Sopenharmony_ci					mlog(0, "%s:%.*s: dead node %u had a ref, but had "
24178c2ecf20Sopenharmony_ci						"no locks and had not purged before dying\n",
24188c2ecf20Sopenharmony_ci						dlm->name, res->lockname.len,
24198c2ecf20Sopenharmony_ci						res->lockname.name, dead_node);
24208c2ecf20Sopenharmony_ci					dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
24218c2ecf20Sopenharmony_ci				}
24228c2ecf20Sopenharmony_ci			}
24238c2ecf20Sopenharmony_ci			spin_unlock(&res->spinlock);
24248c2ecf20Sopenharmony_ci		}
24258c2ecf20Sopenharmony_ci	}
24268c2ecf20Sopenharmony_ci
24278c2ecf20Sopenharmony_ci}
24288c2ecf20Sopenharmony_ci
24298c2ecf20Sopenharmony_cistatic void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
24308c2ecf20Sopenharmony_ci{
24318c2ecf20Sopenharmony_ci	assert_spin_locked(&dlm->spinlock);
24328c2ecf20Sopenharmony_ci
24338c2ecf20Sopenharmony_ci	if (dlm->reco.new_master == idx) {
24348c2ecf20Sopenharmony_ci		mlog(0, "%s: recovery master %d just died\n",
24358c2ecf20Sopenharmony_ci		     dlm->name, idx);
24368c2ecf20Sopenharmony_ci		if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
24378c2ecf20Sopenharmony_ci			/* finalize1 was reached, so it is safe to clear
24388c2ecf20Sopenharmony_ci			 * the new_master and dead_node.  that recovery
24398c2ecf20Sopenharmony_ci			 * is complete. */
24408c2ecf20Sopenharmony_ci			mlog(0, "%s: dead master %d had reached "
24418c2ecf20Sopenharmony_ci			     "finalize1 state, clearing\n", dlm->name, idx);
24428c2ecf20Sopenharmony_ci			dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
24438c2ecf20Sopenharmony_ci			__dlm_reset_recovery(dlm);
24448c2ecf20Sopenharmony_ci		}
24458c2ecf20Sopenharmony_ci	}
24468c2ecf20Sopenharmony_ci
24478c2ecf20Sopenharmony_ci	/* Clean up join state on node death. */
24488c2ecf20Sopenharmony_ci	if (dlm->joining_node == idx) {
24498c2ecf20Sopenharmony_ci		mlog(0, "Clearing join state for node %u\n", idx);
24508c2ecf20Sopenharmony_ci		__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
24518c2ecf20Sopenharmony_ci	}
24528c2ecf20Sopenharmony_ci
24538c2ecf20Sopenharmony_ci	/* check to see if the node is already considered dead */
24548c2ecf20Sopenharmony_ci	if (!test_bit(idx, dlm->live_nodes_map)) {
24558c2ecf20Sopenharmony_ci		mlog(0, "for domain %s, node %d is already dead. "
24568c2ecf20Sopenharmony_ci		     "another node likely did recovery already.\n",
24578c2ecf20Sopenharmony_ci		     dlm->name, idx);
24588c2ecf20Sopenharmony_ci		return;
24598c2ecf20Sopenharmony_ci	}
24608c2ecf20Sopenharmony_ci
24618c2ecf20Sopenharmony_ci	/* check to see if we do not care about this node */
24628c2ecf20Sopenharmony_ci	if (!test_bit(idx, dlm->domain_map)) {
24638c2ecf20Sopenharmony_ci		/* This also catches the case that we get a node down
24648c2ecf20Sopenharmony_ci		 * but haven't joined the domain yet. */
24658c2ecf20Sopenharmony_ci		mlog(0, "node %u already removed from domain!\n", idx);
24668c2ecf20Sopenharmony_ci		return;
24678c2ecf20Sopenharmony_ci	}
24688c2ecf20Sopenharmony_ci
24698c2ecf20Sopenharmony_ci	clear_bit(idx, dlm->live_nodes_map);
24708c2ecf20Sopenharmony_ci
24718c2ecf20Sopenharmony_ci	/* make sure local cleanup occurs before the heartbeat events */
24728c2ecf20Sopenharmony_ci	if (!test_bit(idx, dlm->recovery_map))
24738c2ecf20Sopenharmony_ci		dlm_do_local_recovery_cleanup(dlm, idx);
24748c2ecf20Sopenharmony_ci
24758c2ecf20Sopenharmony_ci	/* notify anything attached to the heartbeat events */
24768c2ecf20Sopenharmony_ci	dlm_hb_event_notify_attached(dlm, idx, 0);
24778c2ecf20Sopenharmony_ci
24788c2ecf20Sopenharmony_ci	mlog(0, "node %u being removed from domain map!\n", idx);
24798c2ecf20Sopenharmony_ci	clear_bit(idx, dlm->domain_map);
24808c2ecf20Sopenharmony_ci	clear_bit(idx, dlm->exit_domain_map);
24818c2ecf20Sopenharmony_ci	/* wake up migration waiters if a node goes down.
24828c2ecf20Sopenharmony_ci	 * perhaps later we can genericize this for other waiters. */
24838c2ecf20Sopenharmony_ci	wake_up(&dlm->migration_wq);
24848c2ecf20Sopenharmony_ci
24858c2ecf20Sopenharmony_ci	set_bit(idx, dlm->recovery_map);
24868c2ecf20Sopenharmony_ci}
24878c2ecf20Sopenharmony_ci
24888c2ecf20Sopenharmony_civoid dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
24898c2ecf20Sopenharmony_ci{
24908c2ecf20Sopenharmony_ci	struct dlm_ctxt *dlm = data;
24918c2ecf20Sopenharmony_ci
24928c2ecf20Sopenharmony_ci	if (!dlm_grab(dlm))
24938c2ecf20Sopenharmony_ci		return;
24948c2ecf20Sopenharmony_ci
24958c2ecf20Sopenharmony_ci	/*
24968c2ecf20Sopenharmony_ci	 * This will notify any dlm users that a node in our domain
24978c2ecf20Sopenharmony_ci	 * went away without notifying us first.
24988c2ecf20Sopenharmony_ci	 */
24998c2ecf20Sopenharmony_ci	if (test_bit(idx, dlm->domain_map))
25008c2ecf20Sopenharmony_ci		dlm_fire_domain_eviction_callbacks(dlm, idx);
25018c2ecf20Sopenharmony_ci
25028c2ecf20Sopenharmony_ci	spin_lock(&dlm->spinlock);
25038c2ecf20Sopenharmony_ci	__dlm_hb_node_down(dlm, idx);
25048c2ecf20Sopenharmony_ci	spin_unlock(&dlm->spinlock);
25058c2ecf20Sopenharmony_ci
25068c2ecf20Sopenharmony_ci	dlm_put(dlm);
25078c2ecf20Sopenharmony_ci}
25088c2ecf20Sopenharmony_ci
25098c2ecf20Sopenharmony_civoid dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data)
25108c2ecf20Sopenharmony_ci{
25118c2ecf20Sopenharmony_ci	struct dlm_ctxt *dlm = data;
25128c2ecf20Sopenharmony_ci
25138c2ecf20Sopenharmony_ci	if (!dlm_grab(dlm))
25148c2ecf20Sopenharmony_ci		return;
25158c2ecf20Sopenharmony_ci
25168c2ecf20Sopenharmony_ci	spin_lock(&dlm->spinlock);
25178c2ecf20Sopenharmony_ci	set_bit(idx, dlm->live_nodes_map);
25188c2ecf20Sopenharmony_ci	/* do NOT notify mle attached to the heartbeat events.
25198c2ecf20Sopenharmony_ci	 * new nodes are not interesting in mastery until joined. */
25208c2ecf20Sopenharmony_ci	spin_unlock(&dlm->spinlock);
25218c2ecf20Sopenharmony_ci
25228c2ecf20Sopenharmony_ci	dlm_put(dlm);
25238c2ecf20Sopenharmony_ci}
25248c2ecf20Sopenharmony_ci
25258c2ecf20Sopenharmony_cistatic void dlm_reco_ast(void *astdata)
25268c2ecf20Sopenharmony_ci{
25278c2ecf20Sopenharmony_ci	struct dlm_ctxt *dlm = astdata;
25288c2ecf20Sopenharmony_ci	mlog(0, "ast for recovery lock fired!, this=%u, dlm=%s\n",
25298c2ecf20Sopenharmony_ci	     dlm->node_num, dlm->name);
25308c2ecf20Sopenharmony_ci}
25318c2ecf20Sopenharmony_cistatic void dlm_reco_bast(void *astdata, int blocked_type)
25328c2ecf20Sopenharmony_ci{
25338c2ecf20Sopenharmony_ci	struct dlm_ctxt *dlm = astdata;
25348c2ecf20Sopenharmony_ci	mlog(0, "bast for recovery lock fired!, this=%u, dlm=%s\n",
25358c2ecf20Sopenharmony_ci	     dlm->node_num, dlm->name);
25368c2ecf20Sopenharmony_ci}
25378c2ecf20Sopenharmony_cistatic void dlm_reco_unlock_ast(void *astdata, enum dlm_status st)
25388c2ecf20Sopenharmony_ci{
25398c2ecf20Sopenharmony_ci	mlog(0, "unlockast for recovery lock fired!\n");
25408c2ecf20Sopenharmony_ci}
25418c2ecf20Sopenharmony_ci
25428c2ecf20Sopenharmony_ci/*
25438c2ecf20Sopenharmony_ci * dlm_pick_recovery_master will continually attempt to use
25448c2ecf20Sopenharmony_ci * dlmlock() on the special "$RECOVERY" lockres with the
25458c2ecf20Sopenharmony_ci * LKM_NOQUEUE flag to get an EX.  every thread that enters
25468c2ecf20Sopenharmony_ci * this function on each node racing to become the recovery
25478c2ecf20Sopenharmony_ci * master will not stop attempting this until either:
25488c2ecf20Sopenharmony_ci * a) this node gets the EX (and becomes the recovery master),
25498c2ecf20Sopenharmony_ci * or b) dlm->reco.new_master gets set to some nodenum
25508c2ecf20Sopenharmony_ci * != O2NM_INVALID_NODE_NUM (another node will do the reco).
25518c2ecf20Sopenharmony_ci * so each time a recovery master is needed, the entire cluster
25528c2ecf20Sopenharmony_ci * will sync at this point.  if the new master dies, that will
25538c2ecf20Sopenharmony_ci * be detected in dlm_do_recovery */
25548c2ecf20Sopenharmony_cistatic int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
25558c2ecf20Sopenharmony_ci{
25568c2ecf20Sopenharmony_ci	enum dlm_status ret;
25578c2ecf20Sopenharmony_ci	struct dlm_lockstatus lksb;
25588c2ecf20Sopenharmony_ci	int status = -EINVAL;
25598c2ecf20Sopenharmony_ci
25608c2ecf20Sopenharmony_ci	mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n",
25618c2ecf20Sopenharmony_ci	     dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num);
25628c2ecf20Sopenharmony_ciagain:
25638c2ecf20Sopenharmony_ci	memset(&lksb, 0, sizeof(lksb));
25648c2ecf20Sopenharmony_ci
25658c2ecf20Sopenharmony_ci	ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
25668c2ecf20Sopenharmony_ci		      DLM_RECOVERY_LOCK_NAME, DLM_RECOVERY_LOCK_NAME_LEN,
25678c2ecf20Sopenharmony_ci		      dlm_reco_ast, dlm, dlm_reco_bast);
25688c2ecf20Sopenharmony_ci
25698c2ecf20Sopenharmony_ci	mlog(0, "%s: dlmlock($RECOVERY) returned %d, lksb=%d\n",
25708c2ecf20Sopenharmony_ci	     dlm->name, ret, lksb.status);
25718c2ecf20Sopenharmony_ci
25728c2ecf20Sopenharmony_ci	if (ret == DLM_NORMAL) {
25738c2ecf20Sopenharmony_ci		mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n",
25748c2ecf20Sopenharmony_ci		     dlm->name, dlm->node_num);
25758c2ecf20Sopenharmony_ci
25768c2ecf20Sopenharmony_ci		/* got the EX lock.  check to see if another node
25778c2ecf20Sopenharmony_ci		 * just became the reco master */
25788c2ecf20Sopenharmony_ci		if (dlm_reco_master_ready(dlm)) {
25798c2ecf20Sopenharmony_ci			mlog(0, "%s: got reco EX lock, but %u will "
25808c2ecf20Sopenharmony_ci			     "do the recovery\n", dlm->name,
25818c2ecf20Sopenharmony_ci			     dlm->reco.new_master);
25828c2ecf20Sopenharmony_ci			status = -EEXIST;
25838c2ecf20Sopenharmony_ci		} else {
25848c2ecf20Sopenharmony_ci			status = 0;
25858c2ecf20Sopenharmony_ci
25868c2ecf20Sopenharmony_ci			/* see if recovery was already finished elsewhere */
25878c2ecf20Sopenharmony_ci			spin_lock(&dlm->spinlock);
25888c2ecf20Sopenharmony_ci			if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
25898c2ecf20Sopenharmony_ci				status = -EINVAL;
25908c2ecf20Sopenharmony_ci				mlog(0, "%s: got reco EX lock, but "
25918c2ecf20Sopenharmony_ci				     "node got recovered already\n", dlm->name);
25928c2ecf20Sopenharmony_ci				if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
25938c2ecf20Sopenharmony_ci					mlog(ML_ERROR, "%s: new master is %u "
25948c2ecf20Sopenharmony_ci					     "but no dead node!\n",
25958c2ecf20Sopenharmony_ci					     dlm->name, dlm->reco.new_master);
25968c2ecf20Sopenharmony_ci					BUG();
25978c2ecf20Sopenharmony_ci				}
25988c2ecf20Sopenharmony_ci			}
25998c2ecf20Sopenharmony_ci			spin_unlock(&dlm->spinlock);
26008c2ecf20Sopenharmony_ci		}
26018c2ecf20Sopenharmony_ci
26028c2ecf20Sopenharmony_ci		/* if this node has actually become the recovery master,
26038c2ecf20Sopenharmony_ci		 * set the master and send the messages to begin recovery */
26048c2ecf20Sopenharmony_ci		if (!status) {
26058c2ecf20Sopenharmony_ci			mlog(0, "%s: dead=%u, this=%u, sending "
26068c2ecf20Sopenharmony_ci			     "begin_reco now\n", dlm->name,
26078c2ecf20Sopenharmony_ci			     dlm->reco.dead_node, dlm->node_num);
26088c2ecf20Sopenharmony_ci			status = dlm_send_begin_reco_message(dlm,
26098c2ecf20Sopenharmony_ci				      dlm->reco.dead_node);
26108c2ecf20Sopenharmony_ci			/* this always succeeds */
26118c2ecf20Sopenharmony_ci			BUG_ON(status);
26128c2ecf20Sopenharmony_ci
26138c2ecf20Sopenharmony_ci			/* set the new_master to this node */
26148c2ecf20Sopenharmony_ci			spin_lock(&dlm->spinlock);
26158c2ecf20Sopenharmony_ci			dlm_set_reco_master(dlm, dlm->node_num);
26168c2ecf20Sopenharmony_ci			spin_unlock(&dlm->spinlock);
26178c2ecf20Sopenharmony_ci		}
26188c2ecf20Sopenharmony_ci
26198c2ecf20Sopenharmony_ci		/* recovery lock is a special case.  ast will not get fired,
26208c2ecf20Sopenharmony_ci		 * so just go ahead and unlock it. */
26218c2ecf20Sopenharmony_ci		ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm);
26228c2ecf20Sopenharmony_ci		if (ret == DLM_DENIED) {
26238c2ecf20Sopenharmony_ci			mlog(0, "got DLM_DENIED, trying LKM_CANCEL\n");
26248c2ecf20Sopenharmony_ci			ret = dlmunlock(dlm, &lksb, LKM_CANCEL, dlm_reco_unlock_ast, dlm);
26258c2ecf20Sopenharmony_ci		}
26268c2ecf20Sopenharmony_ci		if (ret != DLM_NORMAL) {
26278c2ecf20Sopenharmony_ci			/* this would really suck. this could only happen
26288c2ecf20Sopenharmony_ci			 * if there was a network error during the unlock
26298c2ecf20Sopenharmony_ci			 * because of node death.  this means the unlock
26308c2ecf20Sopenharmony_ci			 * is actually "done" and the lock structure is
26318c2ecf20Sopenharmony_ci			 * even freed.  we can continue, but only
26328c2ecf20Sopenharmony_ci			 * because this specific lock name is special. */
26338c2ecf20Sopenharmony_ci			mlog(ML_ERROR, "dlmunlock returned %d\n", ret);
26348c2ecf20Sopenharmony_ci		}
26358c2ecf20Sopenharmony_ci	} else if (ret == DLM_NOTQUEUED) {
26368c2ecf20Sopenharmony_ci		mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n",
26378c2ecf20Sopenharmony_ci		     dlm->name, dlm->node_num);
26388c2ecf20Sopenharmony_ci		/* another node is master. wait on
26398c2ecf20Sopenharmony_ci		 * reco.new_master != O2NM_INVALID_NODE_NUM
26408c2ecf20Sopenharmony_ci		 * for at most one second */
26418c2ecf20Sopenharmony_ci		wait_event_timeout(dlm->dlm_reco_thread_wq,
26428c2ecf20Sopenharmony_ci					 dlm_reco_master_ready(dlm),
26438c2ecf20Sopenharmony_ci					 msecs_to_jiffies(1000));
26448c2ecf20Sopenharmony_ci		if (!dlm_reco_master_ready(dlm)) {
26458c2ecf20Sopenharmony_ci			mlog(0, "%s: reco master taking awhile\n",
26468c2ecf20Sopenharmony_ci			     dlm->name);
26478c2ecf20Sopenharmony_ci			goto again;
26488c2ecf20Sopenharmony_ci		}
26498c2ecf20Sopenharmony_ci		/* another node has informed this one that it is reco master */
26508c2ecf20Sopenharmony_ci		mlog(0, "%s: reco master %u is ready to recover %u\n",
26518c2ecf20Sopenharmony_ci		     dlm->name, dlm->reco.new_master, dlm->reco.dead_node);
26528c2ecf20Sopenharmony_ci		status = -EEXIST;
26538c2ecf20Sopenharmony_ci	} else if (ret == DLM_RECOVERING) {
26548c2ecf20Sopenharmony_ci		mlog(0, "dlm=%s dlmlock says master node died (this=%u)\n",
26558c2ecf20Sopenharmony_ci		     dlm->name, dlm->node_num);
26568c2ecf20Sopenharmony_ci		goto again;
26578c2ecf20Sopenharmony_ci	} else {
26588c2ecf20Sopenharmony_ci		struct dlm_lock_resource *res;
26598c2ecf20Sopenharmony_ci
26608c2ecf20Sopenharmony_ci		/* dlmlock returned something other than NOTQUEUED or NORMAL */
26618c2ecf20Sopenharmony_ci		mlog(ML_ERROR, "%s: got %s from dlmlock($RECOVERY), "
26628c2ecf20Sopenharmony_ci		     "lksb.status=%s\n", dlm->name, dlm_errname(ret),
26638c2ecf20Sopenharmony_ci		     dlm_errname(lksb.status));
26648c2ecf20Sopenharmony_ci		res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
26658c2ecf20Sopenharmony_ci					 DLM_RECOVERY_LOCK_NAME_LEN);
26668c2ecf20Sopenharmony_ci		if (res) {
26678c2ecf20Sopenharmony_ci			dlm_print_one_lock_resource(res);
26688c2ecf20Sopenharmony_ci			dlm_lockres_put(res);
26698c2ecf20Sopenharmony_ci		} else {
26708c2ecf20Sopenharmony_ci			mlog(ML_ERROR, "recovery lock not found\n");
26718c2ecf20Sopenharmony_ci		}
26728c2ecf20Sopenharmony_ci		BUG();
26738c2ecf20Sopenharmony_ci	}
26748c2ecf20Sopenharmony_ci
26758c2ecf20Sopenharmony_ci	return status;
26768c2ecf20Sopenharmony_ci}
26778c2ecf20Sopenharmony_ci
26788c2ecf20Sopenharmony_cistatic int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
26798c2ecf20Sopenharmony_ci{
26808c2ecf20Sopenharmony_ci	struct dlm_begin_reco br;
26818c2ecf20Sopenharmony_ci	int ret = 0;
26828c2ecf20Sopenharmony_ci	struct dlm_node_iter iter;
26838c2ecf20Sopenharmony_ci	int nodenum;
26848c2ecf20Sopenharmony_ci	int status;
26858c2ecf20Sopenharmony_ci
26868c2ecf20Sopenharmony_ci	mlog(0, "%s: dead node is %u\n", dlm->name, dead_node);
26878c2ecf20Sopenharmony_ci
26888c2ecf20Sopenharmony_ci	spin_lock(&dlm->spinlock);
26898c2ecf20Sopenharmony_ci	dlm_node_iter_init(dlm->domain_map, &iter);
26908c2ecf20Sopenharmony_ci	spin_unlock(&dlm->spinlock);
26918c2ecf20Sopenharmony_ci
26928c2ecf20Sopenharmony_ci	clear_bit(dead_node, iter.node_map);
26938c2ecf20Sopenharmony_ci
26948c2ecf20Sopenharmony_ci	memset(&br, 0, sizeof(br));
26958c2ecf20Sopenharmony_ci	br.node_idx = dlm->node_num;
26968c2ecf20Sopenharmony_ci	br.dead_node = dead_node;
26978c2ecf20Sopenharmony_ci
26988c2ecf20Sopenharmony_ci	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
26998c2ecf20Sopenharmony_ci		ret = 0;
27008c2ecf20Sopenharmony_ci		if (nodenum == dead_node) {
27018c2ecf20Sopenharmony_ci			mlog(0, "not sending begin reco to dead node "
27028c2ecf20Sopenharmony_ci				  "%u\n", dead_node);
27038c2ecf20Sopenharmony_ci			continue;
27048c2ecf20Sopenharmony_ci		}
27058c2ecf20Sopenharmony_ci		if (nodenum == dlm->node_num) {
27068c2ecf20Sopenharmony_ci			mlog(0, "not sending begin reco to self\n");
27078c2ecf20Sopenharmony_ci			continue;
27088c2ecf20Sopenharmony_ci		}
27098c2ecf20Sopenharmony_ciretry:
27108c2ecf20Sopenharmony_ci		ret = -EINVAL;
27118c2ecf20Sopenharmony_ci		mlog(0, "attempting to send begin reco msg to %d\n",
27128c2ecf20Sopenharmony_ci			  nodenum);
27138c2ecf20Sopenharmony_ci		ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key,
27148c2ecf20Sopenharmony_ci					 &br, sizeof(br), nodenum, &status);
27158c2ecf20Sopenharmony_ci		/* negative status is handled ok by caller here */
27168c2ecf20Sopenharmony_ci		if (ret >= 0)
27178c2ecf20Sopenharmony_ci			ret = status;
27188c2ecf20Sopenharmony_ci		if (dlm_is_host_down(ret)) {
27198c2ecf20Sopenharmony_ci			/* node is down.  not involved in recovery
27208c2ecf20Sopenharmony_ci			 * so just keep going */
27218c2ecf20Sopenharmony_ci			mlog(ML_NOTICE, "%s: node %u was down when sending "
27228c2ecf20Sopenharmony_ci			     "begin reco msg (%d)\n", dlm->name, nodenum, ret);
27238c2ecf20Sopenharmony_ci			ret = 0;
27248c2ecf20Sopenharmony_ci		}
27258c2ecf20Sopenharmony_ci
27268c2ecf20Sopenharmony_ci		/*
27278c2ecf20Sopenharmony_ci		 * Prior to commit aad1b15310b9bcd59fa81ab8f2b1513b59553ea8,
27288c2ecf20Sopenharmony_ci		 * dlm_begin_reco_handler() returned EAGAIN and not -EAGAIN.
27298c2ecf20Sopenharmony_ci		 * We are handling both for compatibility reasons.
27308c2ecf20Sopenharmony_ci		 */
27318c2ecf20Sopenharmony_ci		if (ret == -EAGAIN || ret == EAGAIN) {
27328c2ecf20Sopenharmony_ci			mlog(0, "%s: trying to start recovery of node "
27338c2ecf20Sopenharmony_ci			     "%u, but node %u is waiting for last recovery "
27348c2ecf20Sopenharmony_ci			     "to complete, backoff for a bit\n", dlm->name,
27358c2ecf20Sopenharmony_ci			     dead_node, nodenum);
27368c2ecf20Sopenharmony_ci			msleep(100);
27378c2ecf20Sopenharmony_ci			goto retry;
27388c2ecf20Sopenharmony_ci		}
27398c2ecf20Sopenharmony_ci		if (ret < 0) {
27408c2ecf20Sopenharmony_ci			struct dlm_lock_resource *res;
27418c2ecf20Sopenharmony_ci
27428c2ecf20Sopenharmony_ci			/* this is now a serious problem, possibly ENOMEM
27438c2ecf20Sopenharmony_ci			 * in the network stack.  must retry */
27448c2ecf20Sopenharmony_ci			mlog_errno(ret);
27458c2ecf20Sopenharmony_ci			mlog(ML_ERROR, "begin reco of dlm %s to node %u "
27468c2ecf20Sopenharmony_ci			     "returned %d\n", dlm->name, nodenum, ret);
27478c2ecf20Sopenharmony_ci			res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
27488c2ecf20Sopenharmony_ci						 DLM_RECOVERY_LOCK_NAME_LEN);
27498c2ecf20Sopenharmony_ci			if (res) {
27508c2ecf20Sopenharmony_ci				dlm_print_one_lock_resource(res);
27518c2ecf20Sopenharmony_ci				dlm_lockres_put(res);
27528c2ecf20Sopenharmony_ci			} else {
27538c2ecf20Sopenharmony_ci				mlog(ML_ERROR, "recovery lock not found\n");
27548c2ecf20Sopenharmony_ci			}
27558c2ecf20Sopenharmony_ci			/* sleep for a bit in hopes that we can avoid
27568c2ecf20Sopenharmony_ci			 * another ENOMEM */
27578c2ecf20Sopenharmony_ci			msleep(100);
27588c2ecf20Sopenharmony_ci			goto retry;
27598c2ecf20Sopenharmony_ci		}
27608c2ecf20Sopenharmony_ci	}
27618c2ecf20Sopenharmony_ci
27628c2ecf20Sopenharmony_ci	return ret;
27638c2ecf20Sopenharmony_ci}
27648c2ecf20Sopenharmony_ci
27658c2ecf20Sopenharmony_ciint dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
27668c2ecf20Sopenharmony_ci			   void **ret_data)
27678c2ecf20Sopenharmony_ci{
27688c2ecf20Sopenharmony_ci	struct dlm_ctxt *dlm = data;
27698c2ecf20Sopenharmony_ci	struct dlm_begin_reco *br = (struct dlm_begin_reco *)msg->buf;
27708c2ecf20Sopenharmony_ci
27718c2ecf20Sopenharmony_ci	/* ok to return 0, domain has gone away */
27728c2ecf20Sopenharmony_ci	if (!dlm_grab(dlm))
27738c2ecf20Sopenharmony_ci		return 0;
27748c2ecf20Sopenharmony_ci
27758c2ecf20Sopenharmony_ci	spin_lock(&dlm->spinlock);
27768c2ecf20Sopenharmony_ci	if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
27778c2ecf20Sopenharmony_ci		mlog(0, "%s: node %u wants to recover node %u (%u:%u) "
27788c2ecf20Sopenharmony_ci		     "but this node is in finalize state, waiting on finalize2\n",
27798c2ecf20Sopenharmony_ci		     dlm->name, br->node_idx, br->dead_node,
27808c2ecf20Sopenharmony_ci		     dlm->reco.dead_node, dlm->reco.new_master);
27818c2ecf20Sopenharmony_ci		spin_unlock(&dlm->spinlock);
27828c2ecf20Sopenharmony_ci		dlm_put(dlm);
27838c2ecf20Sopenharmony_ci		return -EAGAIN;
27848c2ecf20Sopenharmony_ci	}
27858c2ecf20Sopenharmony_ci	spin_unlock(&dlm->spinlock);
27868c2ecf20Sopenharmony_ci
27878c2ecf20Sopenharmony_ci	mlog(0, "%s: node %u wants to recover node %u (%u:%u)\n",
27888c2ecf20Sopenharmony_ci	     dlm->name, br->node_idx, br->dead_node,
27898c2ecf20Sopenharmony_ci	     dlm->reco.dead_node, dlm->reco.new_master);
27908c2ecf20Sopenharmony_ci
27918c2ecf20Sopenharmony_ci	dlm_fire_domain_eviction_callbacks(dlm, br->dead_node);
27928c2ecf20Sopenharmony_ci
27938c2ecf20Sopenharmony_ci	spin_lock(&dlm->spinlock);
27948c2ecf20Sopenharmony_ci	if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
27958c2ecf20Sopenharmony_ci		if (test_bit(dlm->reco.new_master, dlm->recovery_map)) {
27968c2ecf20Sopenharmony_ci			mlog(0, "%s: new_master %u died, changing "
27978c2ecf20Sopenharmony_ci			     "to %u\n", dlm->name, dlm->reco.new_master,
27988c2ecf20Sopenharmony_ci			     br->node_idx);
27998c2ecf20Sopenharmony_ci		} else {
28008c2ecf20Sopenharmony_ci			mlog(0, "%s: new_master %u NOT DEAD, changing "
28018c2ecf20Sopenharmony_ci			     "to %u\n", dlm->name, dlm->reco.new_master,
28028c2ecf20Sopenharmony_ci			     br->node_idx);
28038c2ecf20Sopenharmony_ci			/* may not have seen the new master as dead yet */
28048c2ecf20Sopenharmony_ci		}
28058c2ecf20Sopenharmony_ci	}
28068c2ecf20Sopenharmony_ci	if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) {
28078c2ecf20Sopenharmony_ci		mlog(ML_NOTICE, "%s: dead_node previously set to %u, "
28088c2ecf20Sopenharmony_ci		     "node %u changing it to %u\n", dlm->name,
28098c2ecf20Sopenharmony_ci		     dlm->reco.dead_node, br->node_idx, br->dead_node);
28108c2ecf20Sopenharmony_ci	}
28118c2ecf20Sopenharmony_ci	dlm_set_reco_master(dlm, br->node_idx);
28128c2ecf20Sopenharmony_ci	dlm_set_reco_dead_node(dlm, br->dead_node);
28138c2ecf20Sopenharmony_ci	if (!test_bit(br->dead_node, dlm->recovery_map)) {
28148c2ecf20Sopenharmony_ci		mlog(0, "recovery master %u sees %u as dead, but this "
28158c2ecf20Sopenharmony_ci		     "node has not yet.  marking %u as dead\n",
28168c2ecf20Sopenharmony_ci		     br->node_idx, br->dead_node, br->dead_node);
28178c2ecf20Sopenharmony_ci		if (!test_bit(br->dead_node, dlm->domain_map) ||
28188c2ecf20Sopenharmony_ci		    !test_bit(br->dead_node, dlm->live_nodes_map))
28198c2ecf20Sopenharmony_ci			mlog(0, "%u not in domain/live_nodes map "
28208c2ecf20Sopenharmony_ci			     "so setting it in reco map manually\n",
28218c2ecf20Sopenharmony_ci			     br->dead_node);
28228c2ecf20Sopenharmony_ci		/* force the recovery cleanup in __dlm_hb_node_down
28238c2ecf20Sopenharmony_ci		 * both of these will be cleared in a moment */
28248c2ecf20Sopenharmony_ci		set_bit(br->dead_node, dlm->domain_map);
28258c2ecf20Sopenharmony_ci		set_bit(br->dead_node, dlm->live_nodes_map);
28268c2ecf20Sopenharmony_ci		__dlm_hb_node_down(dlm, br->dead_node);
28278c2ecf20Sopenharmony_ci	}
28288c2ecf20Sopenharmony_ci	spin_unlock(&dlm->spinlock);
28298c2ecf20Sopenharmony_ci
28308c2ecf20Sopenharmony_ci	dlm_kick_recovery_thread(dlm);
28318c2ecf20Sopenharmony_ci
28328c2ecf20Sopenharmony_ci	mlog(0, "%s: recovery started by node %u, for %u (%u:%u)\n",
28338c2ecf20Sopenharmony_ci	     dlm->name, br->node_idx, br->dead_node,
28348c2ecf20Sopenharmony_ci	     dlm->reco.dead_node, dlm->reco.new_master);
28358c2ecf20Sopenharmony_ci
28368c2ecf20Sopenharmony_ci	dlm_put(dlm);
28378c2ecf20Sopenharmony_ci	return 0;
28388c2ecf20Sopenharmony_ci}
28398c2ecf20Sopenharmony_ci
28408c2ecf20Sopenharmony_ci#define DLM_FINALIZE_STAGE2  0x01
28418c2ecf20Sopenharmony_cistatic int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
28428c2ecf20Sopenharmony_ci{
28438c2ecf20Sopenharmony_ci	int ret = 0;
28448c2ecf20Sopenharmony_ci	struct dlm_finalize_reco fr;
28458c2ecf20Sopenharmony_ci	struct dlm_node_iter iter;
28468c2ecf20Sopenharmony_ci	int nodenum;
28478c2ecf20Sopenharmony_ci	int status;
28488c2ecf20Sopenharmony_ci	int stage = 1;
28498c2ecf20Sopenharmony_ci
28508c2ecf20Sopenharmony_ci	mlog(0, "finishing recovery for node %s:%u, "
28518c2ecf20Sopenharmony_ci	     "stage %d\n", dlm->name, dlm->reco.dead_node, stage);
28528c2ecf20Sopenharmony_ci
28538c2ecf20Sopenharmony_ci	spin_lock(&dlm->spinlock);
28548c2ecf20Sopenharmony_ci	dlm_node_iter_init(dlm->domain_map, &iter);
28558c2ecf20Sopenharmony_ci	spin_unlock(&dlm->spinlock);
28568c2ecf20Sopenharmony_ci
28578c2ecf20Sopenharmony_cistage2:
28588c2ecf20Sopenharmony_ci	memset(&fr, 0, sizeof(fr));
28598c2ecf20Sopenharmony_ci	fr.node_idx = dlm->node_num;
28608c2ecf20Sopenharmony_ci	fr.dead_node = dlm->reco.dead_node;
28618c2ecf20Sopenharmony_ci	if (stage == 2)
28628c2ecf20Sopenharmony_ci		fr.flags |= DLM_FINALIZE_STAGE2;
28638c2ecf20Sopenharmony_ci
28648c2ecf20Sopenharmony_ci	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
28658c2ecf20Sopenharmony_ci		if (nodenum == dlm->node_num)
28668c2ecf20Sopenharmony_ci			continue;
28678c2ecf20Sopenharmony_ci		ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key,
28688c2ecf20Sopenharmony_ci					 &fr, sizeof(fr), nodenum, &status);
28698c2ecf20Sopenharmony_ci		if (ret >= 0)
28708c2ecf20Sopenharmony_ci			ret = status;
28718c2ecf20Sopenharmony_ci		if (ret < 0) {
28728c2ecf20Sopenharmony_ci			mlog(ML_ERROR, "Error %d when sending message %u (key "
28738c2ecf20Sopenharmony_ci			     "0x%x) to node %u\n", ret, DLM_FINALIZE_RECO_MSG,
28748c2ecf20Sopenharmony_ci			     dlm->key, nodenum);
28758c2ecf20Sopenharmony_ci			if (dlm_is_host_down(ret)) {
28768c2ecf20Sopenharmony_ci				/* this has no effect on this recovery
28778c2ecf20Sopenharmony_ci				 * session, so set the status to zero to
28788c2ecf20Sopenharmony_ci				 * finish out the last recovery */
28798c2ecf20Sopenharmony_ci				mlog(ML_ERROR, "node %u went down after this "
28808c2ecf20Sopenharmony_ci				     "node finished recovery.\n", nodenum);
28818c2ecf20Sopenharmony_ci				ret = 0;
28828c2ecf20Sopenharmony_ci				continue;
28838c2ecf20Sopenharmony_ci			}
28848c2ecf20Sopenharmony_ci			break;
28858c2ecf20Sopenharmony_ci		}
28868c2ecf20Sopenharmony_ci	}
28878c2ecf20Sopenharmony_ci	if (stage == 1) {
28888c2ecf20Sopenharmony_ci		/* reset the node_iter back to the top and send finalize2 */
28898c2ecf20Sopenharmony_ci		iter.curnode = -1;
28908c2ecf20Sopenharmony_ci		stage = 2;
28918c2ecf20Sopenharmony_ci		goto stage2;
28928c2ecf20Sopenharmony_ci	}
28938c2ecf20Sopenharmony_ci
28948c2ecf20Sopenharmony_ci	return ret;
28958c2ecf20Sopenharmony_ci}
28968c2ecf20Sopenharmony_ci
28978c2ecf20Sopenharmony_ciint dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
28988c2ecf20Sopenharmony_ci			      void **ret_data)
28998c2ecf20Sopenharmony_ci{
29008c2ecf20Sopenharmony_ci	struct dlm_ctxt *dlm = data;
29018c2ecf20Sopenharmony_ci	struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf;
29028c2ecf20Sopenharmony_ci	int stage = 1;
29038c2ecf20Sopenharmony_ci
29048c2ecf20Sopenharmony_ci	/* ok to return 0, domain has gone away */
29058c2ecf20Sopenharmony_ci	if (!dlm_grab(dlm))
29068c2ecf20Sopenharmony_ci		return 0;
29078c2ecf20Sopenharmony_ci
29088c2ecf20Sopenharmony_ci	if (fr->flags & DLM_FINALIZE_STAGE2)
29098c2ecf20Sopenharmony_ci		stage = 2;
29108c2ecf20Sopenharmony_ci
29118c2ecf20Sopenharmony_ci	mlog(0, "%s: node %u finalizing recovery stage%d of "
29128c2ecf20Sopenharmony_ci	     "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage,
29138c2ecf20Sopenharmony_ci	     fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master);
29148c2ecf20Sopenharmony_ci
29158c2ecf20Sopenharmony_ci	spin_lock(&dlm->spinlock);
29168c2ecf20Sopenharmony_ci
29178c2ecf20Sopenharmony_ci	if (dlm->reco.new_master != fr->node_idx) {
29188c2ecf20Sopenharmony_ci		mlog(ML_ERROR, "node %u sent recovery finalize msg, but node "
29198c2ecf20Sopenharmony_ci		     "%u is supposed to be the new master, dead=%u\n",
29208c2ecf20Sopenharmony_ci		     fr->node_idx, dlm->reco.new_master, fr->dead_node);
29218c2ecf20Sopenharmony_ci		BUG();
29228c2ecf20Sopenharmony_ci	}
29238c2ecf20Sopenharmony_ci	if (dlm->reco.dead_node != fr->dead_node) {
29248c2ecf20Sopenharmony_ci		mlog(ML_ERROR, "node %u sent recovery finalize msg for dead "
29258c2ecf20Sopenharmony_ci		     "node %u, but node %u is supposed to be dead\n",
29268c2ecf20Sopenharmony_ci		     fr->node_idx, fr->dead_node, dlm->reco.dead_node);
29278c2ecf20Sopenharmony_ci		BUG();
29288c2ecf20Sopenharmony_ci	}
29298c2ecf20Sopenharmony_ci
29308c2ecf20Sopenharmony_ci	switch (stage) {
29318c2ecf20Sopenharmony_ci		case 1:
29328c2ecf20Sopenharmony_ci			dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx);
29338c2ecf20Sopenharmony_ci			if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
29348c2ecf20Sopenharmony_ci				mlog(ML_ERROR, "%s: received finalize1 from "
29358c2ecf20Sopenharmony_ci				     "new master %u for dead node %u, but "
29368c2ecf20Sopenharmony_ci				     "this node has already received it!\n",
29378c2ecf20Sopenharmony_ci				     dlm->name, fr->node_idx, fr->dead_node);
29388c2ecf20Sopenharmony_ci				dlm_print_reco_node_status(dlm);
29398c2ecf20Sopenharmony_ci				BUG();
29408c2ecf20Sopenharmony_ci			}
29418c2ecf20Sopenharmony_ci			dlm->reco.state |= DLM_RECO_STATE_FINALIZE;
29428c2ecf20Sopenharmony_ci			spin_unlock(&dlm->spinlock);
29438c2ecf20Sopenharmony_ci			break;
29448c2ecf20Sopenharmony_ci		case 2:
29458c2ecf20Sopenharmony_ci			if (!(dlm->reco.state & DLM_RECO_STATE_FINALIZE)) {
29468c2ecf20Sopenharmony_ci				mlog(ML_ERROR, "%s: received finalize2 from "
29478c2ecf20Sopenharmony_ci				     "new master %u for dead node %u, but "
29488c2ecf20Sopenharmony_ci				     "this node did not have finalize1!\n",
29498c2ecf20Sopenharmony_ci				     dlm->name, fr->node_idx, fr->dead_node);
29508c2ecf20Sopenharmony_ci				dlm_print_reco_node_status(dlm);
29518c2ecf20Sopenharmony_ci				BUG();
29528c2ecf20Sopenharmony_ci			}
29538c2ecf20Sopenharmony_ci			dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
29548c2ecf20Sopenharmony_ci			__dlm_reset_recovery(dlm);
29558c2ecf20Sopenharmony_ci			spin_unlock(&dlm->spinlock);
29568c2ecf20Sopenharmony_ci			dlm_kick_recovery_thread(dlm);
29578c2ecf20Sopenharmony_ci			break;
29588c2ecf20Sopenharmony_ci	}
29598c2ecf20Sopenharmony_ci
29608c2ecf20Sopenharmony_ci	mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n",
29618c2ecf20Sopenharmony_ci	     dlm->name, fr->node_idx, dlm->reco.dead_node, dlm->reco.new_master);
29628c2ecf20Sopenharmony_ci
29638c2ecf20Sopenharmony_ci	dlm_put(dlm);
29648c2ecf20Sopenharmony_ci	return 0;
29658c2ecf20Sopenharmony_ci}
2966