18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later 28c2ecf20Sopenharmony_ci/* -*- mode: c; c-basic-offset: 8; -*- 38c2ecf20Sopenharmony_ci * vim: noexpandtab sw=8 ts=8 sts=0: 48c2ecf20Sopenharmony_ci * 58c2ecf20Sopenharmony_ci * dlmmod.c 68c2ecf20Sopenharmony_ci * 78c2ecf20Sopenharmony_ci * standalone DLM module 88c2ecf20Sopenharmony_ci * 98c2ecf20Sopenharmony_ci * Copyright (C) 2004 Oracle. All rights reserved. 108c2ecf20Sopenharmony_ci */ 118c2ecf20Sopenharmony_ci 128c2ecf20Sopenharmony_ci 138c2ecf20Sopenharmony_ci#include <linux/module.h> 148c2ecf20Sopenharmony_ci#include <linux/fs.h> 158c2ecf20Sopenharmony_ci#include <linux/types.h> 168c2ecf20Sopenharmony_ci#include <linux/slab.h> 178c2ecf20Sopenharmony_ci#include <linux/highmem.h> 188c2ecf20Sopenharmony_ci#include <linux/init.h> 198c2ecf20Sopenharmony_ci#include <linux/sysctl.h> 208c2ecf20Sopenharmony_ci#include <linux/random.h> 218c2ecf20Sopenharmony_ci#include <linux/blkdev.h> 228c2ecf20Sopenharmony_ci#include <linux/socket.h> 238c2ecf20Sopenharmony_ci#include <linux/inet.h> 248c2ecf20Sopenharmony_ci#include <linux/spinlock.h> 258c2ecf20Sopenharmony_ci#include <linux/delay.h> 268c2ecf20Sopenharmony_ci 278c2ecf20Sopenharmony_ci 288c2ecf20Sopenharmony_ci#include "../cluster/heartbeat.h" 298c2ecf20Sopenharmony_ci#include "../cluster/nodemanager.h" 308c2ecf20Sopenharmony_ci#include "../cluster/tcp.h" 318c2ecf20Sopenharmony_ci 328c2ecf20Sopenharmony_ci#include "dlmapi.h" 338c2ecf20Sopenharmony_ci#include "dlmcommon.h" 348c2ecf20Sopenharmony_ci#include "dlmdomain.h" 358c2ecf20Sopenharmony_ci#include "dlmdebug.h" 368c2ecf20Sopenharmony_ci 378c2ecf20Sopenharmony_ci#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) 388c2ecf20Sopenharmony_ci#include "../cluster/masklog.h" 398c2ecf20Sopenharmony_ci 408c2ecf20Sopenharmony_cistatic void dlm_mle_node_down(struct dlm_ctxt *dlm, 418c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle, 428c2ecf20Sopenharmony_ci struct o2nm_node *node, 438c2ecf20Sopenharmony_ci int idx); 448c2ecf20Sopenharmony_cistatic void dlm_mle_node_up(struct dlm_ctxt *dlm, 458c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle, 468c2ecf20Sopenharmony_ci struct o2nm_node *node, 478c2ecf20Sopenharmony_ci int idx); 488c2ecf20Sopenharmony_ci 498c2ecf20Sopenharmony_cistatic void dlm_assert_master_worker(struct dlm_work_item *item, void *data); 508c2ecf20Sopenharmony_cistatic int dlm_do_assert_master(struct dlm_ctxt *dlm, 518c2ecf20Sopenharmony_ci struct dlm_lock_resource *res, 528c2ecf20Sopenharmony_ci void *nodemap, u32 flags); 538c2ecf20Sopenharmony_cistatic void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data); 548c2ecf20Sopenharmony_ci 558c2ecf20Sopenharmony_cistatic inline int dlm_mle_equal(struct dlm_ctxt *dlm, 568c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle, 578c2ecf20Sopenharmony_ci const char *name, 588c2ecf20Sopenharmony_ci unsigned int namelen) 598c2ecf20Sopenharmony_ci{ 608c2ecf20Sopenharmony_ci if (dlm != mle->dlm) 618c2ecf20Sopenharmony_ci return 0; 628c2ecf20Sopenharmony_ci 638c2ecf20Sopenharmony_ci if (namelen != mle->mnamelen || 648c2ecf20Sopenharmony_ci memcmp(name, mle->mname, namelen) != 0) 658c2ecf20Sopenharmony_ci return 0; 668c2ecf20Sopenharmony_ci 678c2ecf20Sopenharmony_ci return 1; 688c2ecf20Sopenharmony_ci} 698c2ecf20Sopenharmony_ci 708c2ecf20Sopenharmony_cistatic struct kmem_cache *dlm_lockres_cache; 718c2ecf20Sopenharmony_cistatic struct kmem_cache *dlm_lockname_cache; 728c2ecf20Sopenharmony_cistatic struct kmem_cache *dlm_mle_cache; 738c2ecf20Sopenharmony_ci 748c2ecf20Sopenharmony_cistatic void dlm_mle_release(struct kref *kref); 758c2ecf20Sopenharmony_cistatic void dlm_init_mle(struct dlm_master_list_entry *mle, 768c2ecf20Sopenharmony_ci enum dlm_mle_type type, 778c2ecf20Sopenharmony_ci struct dlm_ctxt *dlm, 788c2ecf20Sopenharmony_ci struct dlm_lock_resource *res, 798c2ecf20Sopenharmony_ci const char *name, 808c2ecf20Sopenharmony_ci unsigned int namelen); 818c2ecf20Sopenharmony_cistatic void dlm_put_mle(struct dlm_master_list_entry *mle); 828c2ecf20Sopenharmony_cistatic void __dlm_put_mle(struct dlm_master_list_entry *mle); 838c2ecf20Sopenharmony_cistatic int dlm_find_mle(struct dlm_ctxt *dlm, 848c2ecf20Sopenharmony_ci struct dlm_master_list_entry **mle, 858c2ecf20Sopenharmony_ci char *name, unsigned int namelen); 868c2ecf20Sopenharmony_ci 878c2ecf20Sopenharmony_cistatic int dlm_do_master_request(struct dlm_lock_resource *res, 888c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle, int to); 898c2ecf20Sopenharmony_ci 908c2ecf20Sopenharmony_ci 918c2ecf20Sopenharmony_cistatic int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, 928c2ecf20Sopenharmony_ci struct dlm_lock_resource *res, 938c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle, 948c2ecf20Sopenharmony_ci int *blocked); 958c2ecf20Sopenharmony_cistatic int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, 968c2ecf20Sopenharmony_ci struct dlm_lock_resource *res, 978c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle, 988c2ecf20Sopenharmony_ci int blocked); 998c2ecf20Sopenharmony_cistatic int dlm_add_migration_mle(struct dlm_ctxt *dlm, 1008c2ecf20Sopenharmony_ci struct dlm_lock_resource *res, 1018c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle, 1028c2ecf20Sopenharmony_ci struct dlm_master_list_entry **oldmle, 1038c2ecf20Sopenharmony_ci const char *name, unsigned int namelen, 1048c2ecf20Sopenharmony_ci u8 new_master, u8 master); 1058c2ecf20Sopenharmony_ci 1068c2ecf20Sopenharmony_cistatic u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, 1078c2ecf20Sopenharmony_ci struct dlm_lock_resource *res); 1088c2ecf20Sopenharmony_cistatic void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, 1098c2ecf20Sopenharmony_ci struct dlm_lock_resource *res); 1108c2ecf20Sopenharmony_cistatic int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, 1118c2ecf20Sopenharmony_ci struct dlm_lock_resource *res, 1128c2ecf20Sopenharmony_ci u8 target); 1138c2ecf20Sopenharmony_cistatic int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, 1148c2ecf20Sopenharmony_ci struct dlm_lock_resource *res); 1158c2ecf20Sopenharmony_ci 1168c2ecf20Sopenharmony_ci 1178c2ecf20Sopenharmony_ciint dlm_is_host_down(int errno) 1188c2ecf20Sopenharmony_ci{ 1198c2ecf20Sopenharmony_ci switch (errno) { 1208c2ecf20Sopenharmony_ci case -EBADF: 1218c2ecf20Sopenharmony_ci case -ECONNREFUSED: 1228c2ecf20Sopenharmony_ci case -ENOTCONN: 1238c2ecf20Sopenharmony_ci case -ECONNRESET: 1248c2ecf20Sopenharmony_ci case -EPIPE: 1258c2ecf20Sopenharmony_ci case -EHOSTDOWN: 1268c2ecf20Sopenharmony_ci case -EHOSTUNREACH: 1278c2ecf20Sopenharmony_ci case -ETIMEDOUT: 1288c2ecf20Sopenharmony_ci case -ECONNABORTED: 1298c2ecf20Sopenharmony_ci case -ENETDOWN: 1308c2ecf20Sopenharmony_ci case -ENETUNREACH: 1318c2ecf20Sopenharmony_ci case -ENETRESET: 1328c2ecf20Sopenharmony_ci case -ESHUTDOWN: 1338c2ecf20Sopenharmony_ci case -ENOPROTOOPT: 1348c2ecf20Sopenharmony_ci case -EINVAL: /* if returned from our tcp code, 1358c2ecf20Sopenharmony_ci this means there is no socket */ 1368c2ecf20Sopenharmony_ci return 1; 1378c2ecf20Sopenharmony_ci } 1388c2ecf20Sopenharmony_ci return 0; 1398c2ecf20Sopenharmony_ci} 1408c2ecf20Sopenharmony_ci 1418c2ecf20Sopenharmony_ci 1428c2ecf20Sopenharmony_ci/* 1438c2ecf20Sopenharmony_ci * MASTER LIST FUNCTIONS 1448c2ecf20Sopenharmony_ci */ 1458c2ecf20Sopenharmony_ci 1468c2ecf20Sopenharmony_ci 1478c2ecf20Sopenharmony_ci/* 1488c2ecf20Sopenharmony_ci * regarding master list entries and heartbeat callbacks: 1498c2ecf20Sopenharmony_ci * 1508c2ecf20Sopenharmony_ci * in order to avoid sleeping and allocation that occurs in 1518c2ecf20Sopenharmony_ci * heartbeat, master list entries are simply attached to the 1528c2ecf20Sopenharmony_ci * dlm's established heartbeat callbacks. the mle is attached 1538c2ecf20Sopenharmony_ci * when it is created, and since the dlm->spinlock is held at 1548c2ecf20Sopenharmony_ci * that time, any heartbeat event will be properly discovered 1558c2ecf20Sopenharmony_ci * by the mle. the mle needs to be detached from the 1568c2ecf20Sopenharmony_ci * dlm->mle_hb_events list as soon as heartbeat events are no 1578c2ecf20Sopenharmony_ci * longer useful to the mle, and before the mle is freed. 1588c2ecf20Sopenharmony_ci * 1598c2ecf20Sopenharmony_ci * as a general rule, heartbeat events are no longer needed by 1608c2ecf20Sopenharmony_ci * the mle once an "answer" regarding the lock master has been 1618c2ecf20Sopenharmony_ci * received. 1628c2ecf20Sopenharmony_ci */ 1638c2ecf20Sopenharmony_cistatic inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm, 1648c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle) 1658c2ecf20Sopenharmony_ci{ 1668c2ecf20Sopenharmony_ci assert_spin_locked(&dlm->spinlock); 1678c2ecf20Sopenharmony_ci 1688c2ecf20Sopenharmony_ci list_add_tail(&mle->hb_events, &dlm->mle_hb_events); 1698c2ecf20Sopenharmony_ci} 1708c2ecf20Sopenharmony_ci 1718c2ecf20Sopenharmony_ci 1728c2ecf20Sopenharmony_cistatic inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, 1738c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle) 1748c2ecf20Sopenharmony_ci{ 1758c2ecf20Sopenharmony_ci if (!list_empty(&mle->hb_events)) 1768c2ecf20Sopenharmony_ci list_del_init(&mle->hb_events); 1778c2ecf20Sopenharmony_ci} 1788c2ecf20Sopenharmony_ci 1798c2ecf20Sopenharmony_ci 1808c2ecf20Sopenharmony_cistatic inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, 1818c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle) 1828c2ecf20Sopenharmony_ci{ 1838c2ecf20Sopenharmony_ci spin_lock(&dlm->spinlock); 1848c2ecf20Sopenharmony_ci __dlm_mle_detach_hb_events(dlm, mle); 1858c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 1868c2ecf20Sopenharmony_ci} 1878c2ecf20Sopenharmony_ci 1888c2ecf20Sopenharmony_cistatic void dlm_get_mle_inuse(struct dlm_master_list_entry *mle) 1898c2ecf20Sopenharmony_ci{ 1908c2ecf20Sopenharmony_ci struct dlm_ctxt *dlm; 1918c2ecf20Sopenharmony_ci dlm = mle->dlm; 1928c2ecf20Sopenharmony_ci 1938c2ecf20Sopenharmony_ci assert_spin_locked(&dlm->spinlock); 1948c2ecf20Sopenharmony_ci assert_spin_locked(&dlm->master_lock); 1958c2ecf20Sopenharmony_ci mle->inuse++; 1968c2ecf20Sopenharmony_ci kref_get(&mle->mle_refs); 1978c2ecf20Sopenharmony_ci} 1988c2ecf20Sopenharmony_ci 1998c2ecf20Sopenharmony_cistatic void dlm_put_mle_inuse(struct dlm_master_list_entry *mle) 2008c2ecf20Sopenharmony_ci{ 2018c2ecf20Sopenharmony_ci struct dlm_ctxt *dlm; 2028c2ecf20Sopenharmony_ci dlm = mle->dlm; 2038c2ecf20Sopenharmony_ci 2048c2ecf20Sopenharmony_ci spin_lock(&dlm->spinlock); 2058c2ecf20Sopenharmony_ci spin_lock(&dlm->master_lock); 2068c2ecf20Sopenharmony_ci mle->inuse--; 2078c2ecf20Sopenharmony_ci __dlm_put_mle(mle); 2088c2ecf20Sopenharmony_ci spin_unlock(&dlm->master_lock); 2098c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 2108c2ecf20Sopenharmony_ci 2118c2ecf20Sopenharmony_ci} 2128c2ecf20Sopenharmony_ci 2138c2ecf20Sopenharmony_ci/* remove from list and free */ 2148c2ecf20Sopenharmony_cistatic void __dlm_put_mle(struct dlm_master_list_entry *mle) 2158c2ecf20Sopenharmony_ci{ 2168c2ecf20Sopenharmony_ci struct dlm_ctxt *dlm; 2178c2ecf20Sopenharmony_ci dlm = mle->dlm; 2188c2ecf20Sopenharmony_ci 2198c2ecf20Sopenharmony_ci assert_spin_locked(&dlm->spinlock); 2208c2ecf20Sopenharmony_ci assert_spin_locked(&dlm->master_lock); 2218c2ecf20Sopenharmony_ci if (!kref_read(&mle->mle_refs)) { 2228c2ecf20Sopenharmony_ci /* this may or may not crash, but who cares. 2238c2ecf20Sopenharmony_ci * it's a BUG. */ 2248c2ecf20Sopenharmony_ci mlog(ML_ERROR, "bad mle: %p\n", mle); 2258c2ecf20Sopenharmony_ci dlm_print_one_mle(mle); 2268c2ecf20Sopenharmony_ci BUG(); 2278c2ecf20Sopenharmony_ci } else 2288c2ecf20Sopenharmony_ci kref_put(&mle->mle_refs, dlm_mle_release); 2298c2ecf20Sopenharmony_ci} 2308c2ecf20Sopenharmony_ci 2318c2ecf20Sopenharmony_ci 2328c2ecf20Sopenharmony_ci/* must not have any spinlocks coming in */ 2338c2ecf20Sopenharmony_cistatic void dlm_put_mle(struct dlm_master_list_entry *mle) 2348c2ecf20Sopenharmony_ci{ 2358c2ecf20Sopenharmony_ci struct dlm_ctxt *dlm; 2368c2ecf20Sopenharmony_ci dlm = mle->dlm; 2378c2ecf20Sopenharmony_ci 2388c2ecf20Sopenharmony_ci spin_lock(&dlm->spinlock); 2398c2ecf20Sopenharmony_ci spin_lock(&dlm->master_lock); 2408c2ecf20Sopenharmony_ci __dlm_put_mle(mle); 2418c2ecf20Sopenharmony_ci spin_unlock(&dlm->master_lock); 2428c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 2438c2ecf20Sopenharmony_ci} 2448c2ecf20Sopenharmony_ci 2458c2ecf20Sopenharmony_cistatic inline void dlm_get_mle(struct dlm_master_list_entry *mle) 2468c2ecf20Sopenharmony_ci{ 2478c2ecf20Sopenharmony_ci kref_get(&mle->mle_refs); 2488c2ecf20Sopenharmony_ci} 2498c2ecf20Sopenharmony_ci 2508c2ecf20Sopenharmony_cistatic void dlm_init_mle(struct dlm_master_list_entry *mle, 2518c2ecf20Sopenharmony_ci enum dlm_mle_type type, 2528c2ecf20Sopenharmony_ci struct dlm_ctxt *dlm, 2538c2ecf20Sopenharmony_ci struct dlm_lock_resource *res, 2548c2ecf20Sopenharmony_ci const char *name, 2558c2ecf20Sopenharmony_ci unsigned int namelen) 2568c2ecf20Sopenharmony_ci{ 2578c2ecf20Sopenharmony_ci assert_spin_locked(&dlm->spinlock); 2588c2ecf20Sopenharmony_ci 2598c2ecf20Sopenharmony_ci mle->dlm = dlm; 2608c2ecf20Sopenharmony_ci mle->type = type; 2618c2ecf20Sopenharmony_ci INIT_HLIST_NODE(&mle->master_hash_node); 2628c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&mle->hb_events); 2638c2ecf20Sopenharmony_ci memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); 2648c2ecf20Sopenharmony_ci spin_lock_init(&mle->spinlock); 2658c2ecf20Sopenharmony_ci init_waitqueue_head(&mle->wq); 2668c2ecf20Sopenharmony_ci atomic_set(&mle->woken, 0); 2678c2ecf20Sopenharmony_ci kref_init(&mle->mle_refs); 2688c2ecf20Sopenharmony_ci memset(mle->response_map, 0, sizeof(mle->response_map)); 2698c2ecf20Sopenharmony_ci mle->master = O2NM_MAX_NODES; 2708c2ecf20Sopenharmony_ci mle->new_master = O2NM_MAX_NODES; 2718c2ecf20Sopenharmony_ci mle->inuse = 0; 2728c2ecf20Sopenharmony_ci 2738c2ecf20Sopenharmony_ci BUG_ON(mle->type != DLM_MLE_BLOCK && 2748c2ecf20Sopenharmony_ci mle->type != DLM_MLE_MASTER && 2758c2ecf20Sopenharmony_ci mle->type != DLM_MLE_MIGRATION); 2768c2ecf20Sopenharmony_ci 2778c2ecf20Sopenharmony_ci if (mle->type == DLM_MLE_MASTER) { 2788c2ecf20Sopenharmony_ci BUG_ON(!res); 2798c2ecf20Sopenharmony_ci mle->mleres = res; 2808c2ecf20Sopenharmony_ci memcpy(mle->mname, res->lockname.name, res->lockname.len); 2818c2ecf20Sopenharmony_ci mle->mnamelen = res->lockname.len; 2828c2ecf20Sopenharmony_ci mle->mnamehash = res->lockname.hash; 2838c2ecf20Sopenharmony_ci } else { 2848c2ecf20Sopenharmony_ci BUG_ON(!name); 2858c2ecf20Sopenharmony_ci mle->mleres = NULL; 2868c2ecf20Sopenharmony_ci memcpy(mle->mname, name, namelen); 2878c2ecf20Sopenharmony_ci mle->mnamelen = namelen; 2888c2ecf20Sopenharmony_ci mle->mnamehash = dlm_lockid_hash(name, namelen); 2898c2ecf20Sopenharmony_ci } 2908c2ecf20Sopenharmony_ci 2918c2ecf20Sopenharmony_ci atomic_inc(&dlm->mle_tot_count[mle->type]); 2928c2ecf20Sopenharmony_ci atomic_inc(&dlm->mle_cur_count[mle->type]); 2938c2ecf20Sopenharmony_ci 2948c2ecf20Sopenharmony_ci /* copy off the node_map and register hb callbacks on our copy */ 2958c2ecf20Sopenharmony_ci memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map)); 2968c2ecf20Sopenharmony_ci memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map)); 2978c2ecf20Sopenharmony_ci clear_bit(dlm->node_num, mle->vote_map); 2988c2ecf20Sopenharmony_ci clear_bit(dlm->node_num, mle->node_map); 2998c2ecf20Sopenharmony_ci 3008c2ecf20Sopenharmony_ci /* attach the mle to the domain node up/down events */ 3018c2ecf20Sopenharmony_ci __dlm_mle_attach_hb_events(dlm, mle); 3028c2ecf20Sopenharmony_ci} 3038c2ecf20Sopenharmony_ci 3048c2ecf20Sopenharmony_civoid __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle) 3058c2ecf20Sopenharmony_ci{ 3068c2ecf20Sopenharmony_ci assert_spin_locked(&dlm->spinlock); 3078c2ecf20Sopenharmony_ci assert_spin_locked(&dlm->master_lock); 3088c2ecf20Sopenharmony_ci 3098c2ecf20Sopenharmony_ci if (!hlist_unhashed(&mle->master_hash_node)) 3108c2ecf20Sopenharmony_ci hlist_del_init(&mle->master_hash_node); 3118c2ecf20Sopenharmony_ci} 3128c2ecf20Sopenharmony_ci 3138c2ecf20Sopenharmony_civoid __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle) 3148c2ecf20Sopenharmony_ci{ 3158c2ecf20Sopenharmony_ci struct hlist_head *bucket; 3168c2ecf20Sopenharmony_ci 3178c2ecf20Sopenharmony_ci assert_spin_locked(&dlm->master_lock); 3188c2ecf20Sopenharmony_ci 3198c2ecf20Sopenharmony_ci bucket = dlm_master_hash(dlm, mle->mnamehash); 3208c2ecf20Sopenharmony_ci hlist_add_head(&mle->master_hash_node, bucket); 3218c2ecf20Sopenharmony_ci} 3228c2ecf20Sopenharmony_ci 3238c2ecf20Sopenharmony_ci/* returns 1 if found, 0 if not */ 3248c2ecf20Sopenharmony_cistatic int dlm_find_mle(struct dlm_ctxt *dlm, 3258c2ecf20Sopenharmony_ci struct dlm_master_list_entry **mle, 3268c2ecf20Sopenharmony_ci char *name, unsigned int namelen) 3278c2ecf20Sopenharmony_ci{ 3288c2ecf20Sopenharmony_ci struct dlm_master_list_entry *tmpmle; 3298c2ecf20Sopenharmony_ci struct hlist_head *bucket; 3308c2ecf20Sopenharmony_ci unsigned int hash; 3318c2ecf20Sopenharmony_ci 3328c2ecf20Sopenharmony_ci assert_spin_locked(&dlm->master_lock); 3338c2ecf20Sopenharmony_ci 3348c2ecf20Sopenharmony_ci hash = dlm_lockid_hash(name, namelen); 3358c2ecf20Sopenharmony_ci bucket = dlm_master_hash(dlm, hash); 3368c2ecf20Sopenharmony_ci hlist_for_each_entry(tmpmle, bucket, master_hash_node) { 3378c2ecf20Sopenharmony_ci if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) 3388c2ecf20Sopenharmony_ci continue; 3398c2ecf20Sopenharmony_ci dlm_get_mle(tmpmle); 3408c2ecf20Sopenharmony_ci *mle = tmpmle; 3418c2ecf20Sopenharmony_ci return 1; 3428c2ecf20Sopenharmony_ci } 3438c2ecf20Sopenharmony_ci return 0; 3448c2ecf20Sopenharmony_ci} 3458c2ecf20Sopenharmony_ci 3468c2ecf20Sopenharmony_civoid dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up) 3478c2ecf20Sopenharmony_ci{ 3488c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle; 3498c2ecf20Sopenharmony_ci 3508c2ecf20Sopenharmony_ci assert_spin_locked(&dlm->spinlock); 3518c2ecf20Sopenharmony_ci 3528c2ecf20Sopenharmony_ci list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) { 3538c2ecf20Sopenharmony_ci if (node_up) 3548c2ecf20Sopenharmony_ci dlm_mle_node_up(dlm, mle, NULL, idx); 3558c2ecf20Sopenharmony_ci else 3568c2ecf20Sopenharmony_ci dlm_mle_node_down(dlm, mle, NULL, idx); 3578c2ecf20Sopenharmony_ci } 3588c2ecf20Sopenharmony_ci} 3598c2ecf20Sopenharmony_ci 3608c2ecf20Sopenharmony_cistatic void dlm_mle_node_down(struct dlm_ctxt *dlm, 3618c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle, 3628c2ecf20Sopenharmony_ci struct o2nm_node *node, int idx) 3638c2ecf20Sopenharmony_ci{ 3648c2ecf20Sopenharmony_ci spin_lock(&mle->spinlock); 3658c2ecf20Sopenharmony_ci 3668c2ecf20Sopenharmony_ci if (!test_bit(idx, mle->node_map)) 3678c2ecf20Sopenharmony_ci mlog(0, "node %u already removed from nodemap!\n", idx); 3688c2ecf20Sopenharmony_ci else 3698c2ecf20Sopenharmony_ci clear_bit(idx, mle->node_map); 3708c2ecf20Sopenharmony_ci 3718c2ecf20Sopenharmony_ci spin_unlock(&mle->spinlock); 3728c2ecf20Sopenharmony_ci} 3738c2ecf20Sopenharmony_ci 3748c2ecf20Sopenharmony_cistatic void dlm_mle_node_up(struct dlm_ctxt *dlm, 3758c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle, 3768c2ecf20Sopenharmony_ci struct o2nm_node *node, int idx) 3778c2ecf20Sopenharmony_ci{ 3788c2ecf20Sopenharmony_ci spin_lock(&mle->spinlock); 3798c2ecf20Sopenharmony_ci 3808c2ecf20Sopenharmony_ci if (test_bit(idx, mle->node_map)) 3818c2ecf20Sopenharmony_ci mlog(0, "node %u already in node map!\n", idx); 3828c2ecf20Sopenharmony_ci else 3838c2ecf20Sopenharmony_ci set_bit(idx, mle->node_map); 3848c2ecf20Sopenharmony_ci 3858c2ecf20Sopenharmony_ci spin_unlock(&mle->spinlock); 3868c2ecf20Sopenharmony_ci} 3878c2ecf20Sopenharmony_ci 3888c2ecf20Sopenharmony_ci 3898c2ecf20Sopenharmony_ciint dlm_init_mle_cache(void) 3908c2ecf20Sopenharmony_ci{ 3918c2ecf20Sopenharmony_ci dlm_mle_cache = kmem_cache_create("o2dlm_mle", 3928c2ecf20Sopenharmony_ci sizeof(struct dlm_master_list_entry), 3938c2ecf20Sopenharmony_ci 0, SLAB_HWCACHE_ALIGN, 3948c2ecf20Sopenharmony_ci NULL); 3958c2ecf20Sopenharmony_ci if (dlm_mle_cache == NULL) 3968c2ecf20Sopenharmony_ci return -ENOMEM; 3978c2ecf20Sopenharmony_ci return 0; 3988c2ecf20Sopenharmony_ci} 3998c2ecf20Sopenharmony_ci 4008c2ecf20Sopenharmony_civoid dlm_destroy_mle_cache(void) 4018c2ecf20Sopenharmony_ci{ 4028c2ecf20Sopenharmony_ci kmem_cache_destroy(dlm_mle_cache); 4038c2ecf20Sopenharmony_ci} 4048c2ecf20Sopenharmony_ci 4058c2ecf20Sopenharmony_cistatic void dlm_mle_release(struct kref *kref) 4068c2ecf20Sopenharmony_ci{ 4078c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle; 4088c2ecf20Sopenharmony_ci struct dlm_ctxt *dlm; 4098c2ecf20Sopenharmony_ci 4108c2ecf20Sopenharmony_ci mle = container_of(kref, struct dlm_master_list_entry, mle_refs); 4118c2ecf20Sopenharmony_ci dlm = mle->dlm; 4128c2ecf20Sopenharmony_ci 4138c2ecf20Sopenharmony_ci assert_spin_locked(&dlm->spinlock); 4148c2ecf20Sopenharmony_ci assert_spin_locked(&dlm->master_lock); 4158c2ecf20Sopenharmony_ci 4168c2ecf20Sopenharmony_ci mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname, 4178c2ecf20Sopenharmony_ci mle->type); 4188c2ecf20Sopenharmony_ci 4198c2ecf20Sopenharmony_ci /* remove from list if not already */ 4208c2ecf20Sopenharmony_ci __dlm_unlink_mle(dlm, mle); 4218c2ecf20Sopenharmony_ci 4228c2ecf20Sopenharmony_ci /* detach the mle from the domain node up/down events */ 4238c2ecf20Sopenharmony_ci __dlm_mle_detach_hb_events(dlm, mle); 4248c2ecf20Sopenharmony_ci 4258c2ecf20Sopenharmony_ci atomic_dec(&dlm->mle_cur_count[mle->type]); 4268c2ecf20Sopenharmony_ci 4278c2ecf20Sopenharmony_ci /* NOTE: kfree under spinlock here. 4288c2ecf20Sopenharmony_ci * if this is bad, we can move this to a freelist. */ 4298c2ecf20Sopenharmony_ci kmem_cache_free(dlm_mle_cache, mle); 4308c2ecf20Sopenharmony_ci} 4318c2ecf20Sopenharmony_ci 4328c2ecf20Sopenharmony_ci 4338c2ecf20Sopenharmony_ci/* 4348c2ecf20Sopenharmony_ci * LOCK RESOURCE FUNCTIONS 4358c2ecf20Sopenharmony_ci */ 4368c2ecf20Sopenharmony_ci 4378c2ecf20Sopenharmony_ciint dlm_init_master_caches(void) 4388c2ecf20Sopenharmony_ci{ 4398c2ecf20Sopenharmony_ci dlm_lockres_cache = kmem_cache_create("o2dlm_lockres", 4408c2ecf20Sopenharmony_ci sizeof(struct dlm_lock_resource), 4418c2ecf20Sopenharmony_ci 0, SLAB_HWCACHE_ALIGN, NULL); 4428c2ecf20Sopenharmony_ci if (!dlm_lockres_cache) 4438c2ecf20Sopenharmony_ci goto bail; 4448c2ecf20Sopenharmony_ci 4458c2ecf20Sopenharmony_ci dlm_lockname_cache = kmem_cache_create("o2dlm_lockname", 4468c2ecf20Sopenharmony_ci DLM_LOCKID_NAME_MAX, 0, 4478c2ecf20Sopenharmony_ci SLAB_HWCACHE_ALIGN, NULL); 4488c2ecf20Sopenharmony_ci if (!dlm_lockname_cache) 4498c2ecf20Sopenharmony_ci goto bail; 4508c2ecf20Sopenharmony_ci 4518c2ecf20Sopenharmony_ci return 0; 4528c2ecf20Sopenharmony_cibail: 4538c2ecf20Sopenharmony_ci dlm_destroy_master_caches(); 4548c2ecf20Sopenharmony_ci return -ENOMEM; 4558c2ecf20Sopenharmony_ci} 4568c2ecf20Sopenharmony_ci 4578c2ecf20Sopenharmony_civoid dlm_destroy_master_caches(void) 4588c2ecf20Sopenharmony_ci{ 4598c2ecf20Sopenharmony_ci kmem_cache_destroy(dlm_lockname_cache); 4608c2ecf20Sopenharmony_ci dlm_lockname_cache = NULL; 4618c2ecf20Sopenharmony_ci 4628c2ecf20Sopenharmony_ci kmem_cache_destroy(dlm_lockres_cache); 4638c2ecf20Sopenharmony_ci dlm_lockres_cache = NULL; 4648c2ecf20Sopenharmony_ci} 4658c2ecf20Sopenharmony_ci 4668c2ecf20Sopenharmony_cistatic void dlm_lockres_release(struct kref *kref) 4678c2ecf20Sopenharmony_ci{ 4688c2ecf20Sopenharmony_ci struct dlm_lock_resource *res; 4698c2ecf20Sopenharmony_ci struct dlm_ctxt *dlm; 4708c2ecf20Sopenharmony_ci 4718c2ecf20Sopenharmony_ci res = container_of(kref, struct dlm_lock_resource, refs); 4728c2ecf20Sopenharmony_ci dlm = res->dlm; 4738c2ecf20Sopenharmony_ci 4748c2ecf20Sopenharmony_ci /* This should not happen -- all lockres' have a name 4758c2ecf20Sopenharmony_ci * associated with them at init time. */ 4768c2ecf20Sopenharmony_ci BUG_ON(!res->lockname.name); 4778c2ecf20Sopenharmony_ci 4788c2ecf20Sopenharmony_ci mlog(0, "destroying lockres %.*s\n", res->lockname.len, 4798c2ecf20Sopenharmony_ci res->lockname.name); 4808c2ecf20Sopenharmony_ci 4818c2ecf20Sopenharmony_ci atomic_dec(&dlm->res_cur_count); 4828c2ecf20Sopenharmony_ci 4838c2ecf20Sopenharmony_ci if (!hlist_unhashed(&res->hash_node) || 4848c2ecf20Sopenharmony_ci !list_empty(&res->granted) || 4858c2ecf20Sopenharmony_ci !list_empty(&res->converting) || 4868c2ecf20Sopenharmony_ci !list_empty(&res->blocked) || 4878c2ecf20Sopenharmony_ci !list_empty(&res->dirty) || 4888c2ecf20Sopenharmony_ci !list_empty(&res->recovering) || 4898c2ecf20Sopenharmony_ci !list_empty(&res->purge)) { 4908c2ecf20Sopenharmony_ci mlog(ML_ERROR, 4918c2ecf20Sopenharmony_ci "Going to BUG for resource %.*s." 4928c2ecf20Sopenharmony_ci " We're on a list! [%c%c%c%c%c%c%c]\n", 4938c2ecf20Sopenharmony_ci res->lockname.len, res->lockname.name, 4948c2ecf20Sopenharmony_ci !hlist_unhashed(&res->hash_node) ? 'H' : ' ', 4958c2ecf20Sopenharmony_ci !list_empty(&res->granted) ? 'G' : ' ', 4968c2ecf20Sopenharmony_ci !list_empty(&res->converting) ? 'C' : ' ', 4978c2ecf20Sopenharmony_ci !list_empty(&res->blocked) ? 'B' : ' ', 4988c2ecf20Sopenharmony_ci !list_empty(&res->dirty) ? 'D' : ' ', 4998c2ecf20Sopenharmony_ci !list_empty(&res->recovering) ? 'R' : ' ', 5008c2ecf20Sopenharmony_ci !list_empty(&res->purge) ? 'P' : ' '); 5018c2ecf20Sopenharmony_ci 5028c2ecf20Sopenharmony_ci dlm_print_one_lock_resource(res); 5038c2ecf20Sopenharmony_ci } 5048c2ecf20Sopenharmony_ci 5058c2ecf20Sopenharmony_ci /* By the time we're ready to blow this guy away, we shouldn't 5068c2ecf20Sopenharmony_ci * be on any lists. */ 5078c2ecf20Sopenharmony_ci BUG_ON(!hlist_unhashed(&res->hash_node)); 5088c2ecf20Sopenharmony_ci BUG_ON(!list_empty(&res->granted)); 5098c2ecf20Sopenharmony_ci BUG_ON(!list_empty(&res->converting)); 5108c2ecf20Sopenharmony_ci BUG_ON(!list_empty(&res->blocked)); 5118c2ecf20Sopenharmony_ci BUG_ON(!list_empty(&res->dirty)); 5128c2ecf20Sopenharmony_ci BUG_ON(!list_empty(&res->recovering)); 5138c2ecf20Sopenharmony_ci BUG_ON(!list_empty(&res->purge)); 5148c2ecf20Sopenharmony_ci 5158c2ecf20Sopenharmony_ci kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name); 5168c2ecf20Sopenharmony_ci 5178c2ecf20Sopenharmony_ci kmem_cache_free(dlm_lockres_cache, res); 5188c2ecf20Sopenharmony_ci} 5198c2ecf20Sopenharmony_ci 5208c2ecf20Sopenharmony_civoid dlm_lockres_put(struct dlm_lock_resource *res) 5218c2ecf20Sopenharmony_ci{ 5228c2ecf20Sopenharmony_ci kref_put(&res->refs, dlm_lockres_release); 5238c2ecf20Sopenharmony_ci} 5248c2ecf20Sopenharmony_ci 5258c2ecf20Sopenharmony_cistatic void dlm_init_lockres(struct dlm_ctxt *dlm, 5268c2ecf20Sopenharmony_ci struct dlm_lock_resource *res, 5278c2ecf20Sopenharmony_ci const char *name, unsigned int namelen) 5288c2ecf20Sopenharmony_ci{ 5298c2ecf20Sopenharmony_ci char *qname; 5308c2ecf20Sopenharmony_ci 5318c2ecf20Sopenharmony_ci /* If we memset here, we lose our reference to the kmalloc'd 5328c2ecf20Sopenharmony_ci * res->lockname.name, so be sure to init every field 5338c2ecf20Sopenharmony_ci * correctly! */ 5348c2ecf20Sopenharmony_ci 5358c2ecf20Sopenharmony_ci qname = (char *) res->lockname.name; 5368c2ecf20Sopenharmony_ci memcpy(qname, name, namelen); 5378c2ecf20Sopenharmony_ci 5388c2ecf20Sopenharmony_ci res->lockname.len = namelen; 5398c2ecf20Sopenharmony_ci res->lockname.hash = dlm_lockid_hash(name, namelen); 5408c2ecf20Sopenharmony_ci 5418c2ecf20Sopenharmony_ci init_waitqueue_head(&res->wq); 5428c2ecf20Sopenharmony_ci spin_lock_init(&res->spinlock); 5438c2ecf20Sopenharmony_ci INIT_HLIST_NODE(&res->hash_node); 5448c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&res->granted); 5458c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&res->converting); 5468c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&res->blocked); 5478c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&res->dirty); 5488c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&res->recovering); 5498c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&res->purge); 5508c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&res->tracking); 5518c2ecf20Sopenharmony_ci atomic_set(&res->asts_reserved, 0); 5528c2ecf20Sopenharmony_ci res->migration_pending = 0; 5538c2ecf20Sopenharmony_ci res->inflight_locks = 0; 5548c2ecf20Sopenharmony_ci res->inflight_assert_workers = 0; 5558c2ecf20Sopenharmony_ci 5568c2ecf20Sopenharmony_ci res->dlm = dlm; 5578c2ecf20Sopenharmony_ci 5588c2ecf20Sopenharmony_ci kref_init(&res->refs); 5598c2ecf20Sopenharmony_ci 5608c2ecf20Sopenharmony_ci atomic_inc(&dlm->res_tot_count); 5618c2ecf20Sopenharmony_ci atomic_inc(&dlm->res_cur_count); 5628c2ecf20Sopenharmony_ci 5638c2ecf20Sopenharmony_ci /* just for consistency */ 5648c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 5658c2ecf20Sopenharmony_ci dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); 5668c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 5678c2ecf20Sopenharmony_ci 5688c2ecf20Sopenharmony_ci res->state = DLM_LOCK_RES_IN_PROGRESS; 5698c2ecf20Sopenharmony_ci 5708c2ecf20Sopenharmony_ci res->last_used = 0; 5718c2ecf20Sopenharmony_ci 5728c2ecf20Sopenharmony_ci spin_lock(&dlm->track_lock); 5738c2ecf20Sopenharmony_ci list_add_tail(&res->tracking, &dlm->tracking_list); 5748c2ecf20Sopenharmony_ci spin_unlock(&dlm->track_lock); 5758c2ecf20Sopenharmony_ci 5768c2ecf20Sopenharmony_ci memset(res->lvb, 0, DLM_LVB_LEN); 5778c2ecf20Sopenharmony_ci memset(res->refmap, 0, sizeof(res->refmap)); 5788c2ecf20Sopenharmony_ci} 5798c2ecf20Sopenharmony_ci 5808c2ecf20Sopenharmony_cistruct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, 5818c2ecf20Sopenharmony_ci const char *name, 5828c2ecf20Sopenharmony_ci unsigned int namelen) 5838c2ecf20Sopenharmony_ci{ 5848c2ecf20Sopenharmony_ci struct dlm_lock_resource *res = NULL; 5858c2ecf20Sopenharmony_ci 5868c2ecf20Sopenharmony_ci res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS); 5878c2ecf20Sopenharmony_ci if (!res) 5888c2ecf20Sopenharmony_ci goto error; 5898c2ecf20Sopenharmony_ci 5908c2ecf20Sopenharmony_ci res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS); 5918c2ecf20Sopenharmony_ci if (!res->lockname.name) 5928c2ecf20Sopenharmony_ci goto error; 5938c2ecf20Sopenharmony_ci 5948c2ecf20Sopenharmony_ci dlm_init_lockres(dlm, res, name, namelen); 5958c2ecf20Sopenharmony_ci return res; 5968c2ecf20Sopenharmony_ci 5978c2ecf20Sopenharmony_cierror: 5988c2ecf20Sopenharmony_ci if (res) 5998c2ecf20Sopenharmony_ci kmem_cache_free(dlm_lockres_cache, res); 6008c2ecf20Sopenharmony_ci return NULL; 6018c2ecf20Sopenharmony_ci} 6028c2ecf20Sopenharmony_ci 6038c2ecf20Sopenharmony_civoid dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm, 6048c2ecf20Sopenharmony_ci struct dlm_lock_resource *res, int bit) 6058c2ecf20Sopenharmony_ci{ 6068c2ecf20Sopenharmony_ci assert_spin_locked(&res->spinlock); 6078c2ecf20Sopenharmony_ci 6088c2ecf20Sopenharmony_ci mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len, 6098c2ecf20Sopenharmony_ci res->lockname.name, bit, __builtin_return_address(0)); 6108c2ecf20Sopenharmony_ci 6118c2ecf20Sopenharmony_ci set_bit(bit, res->refmap); 6128c2ecf20Sopenharmony_ci} 6138c2ecf20Sopenharmony_ci 6148c2ecf20Sopenharmony_civoid dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm, 6158c2ecf20Sopenharmony_ci struct dlm_lock_resource *res, int bit) 6168c2ecf20Sopenharmony_ci{ 6178c2ecf20Sopenharmony_ci assert_spin_locked(&res->spinlock); 6188c2ecf20Sopenharmony_ci 6198c2ecf20Sopenharmony_ci mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len, 6208c2ecf20Sopenharmony_ci res->lockname.name, bit, __builtin_return_address(0)); 6218c2ecf20Sopenharmony_ci 6228c2ecf20Sopenharmony_ci clear_bit(bit, res->refmap); 6238c2ecf20Sopenharmony_ci} 6248c2ecf20Sopenharmony_ci 6258c2ecf20Sopenharmony_cistatic void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, 6268c2ecf20Sopenharmony_ci struct dlm_lock_resource *res) 6278c2ecf20Sopenharmony_ci{ 6288c2ecf20Sopenharmony_ci res->inflight_locks++; 6298c2ecf20Sopenharmony_ci 6308c2ecf20Sopenharmony_ci mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name, 6318c2ecf20Sopenharmony_ci res->lockname.len, res->lockname.name, res->inflight_locks, 6328c2ecf20Sopenharmony_ci __builtin_return_address(0)); 6338c2ecf20Sopenharmony_ci} 6348c2ecf20Sopenharmony_ci 6358c2ecf20Sopenharmony_civoid dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, 6368c2ecf20Sopenharmony_ci struct dlm_lock_resource *res) 6378c2ecf20Sopenharmony_ci{ 6388c2ecf20Sopenharmony_ci assert_spin_locked(&res->spinlock); 6398c2ecf20Sopenharmony_ci __dlm_lockres_grab_inflight_ref(dlm, res); 6408c2ecf20Sopenharmony_ci} 6418c2ecf20Sopenharmony_ci 6428c2ecf20Sopenharmony_civoid dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, 6438c2ecf20Sopenharmony_ci struct dlm_lock_resource *res) 6448c2ecf20Sopenharmony_ci{ 6458c2ecf20Sopenharmony_ci assert_spin_locked(&res->spinlock); 6468c2ecf20Sopenharmony_ci 6478c2ecf20Sopenharmony_ci BUG_ON(res->inflight_locks == 0); 6488c2ecf20Sopenharmony_ci 6498c2ecf20Sopenharmony_ci res->inflight_locks--; 6508c2ecf20Sopenharmony_ci 6518c2ecf20Sopenharmony_ci mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name, 6528c2ecf20Sopenharmony_ci res->lockname.len, res->lockname.name, res->inflight_locks, 6538c2ecf20Sopenharmony_ci __builtin_return_address(0)); 6548c2ecf20Sopenharmony_ci 6558c2ecf20Sopenharmony_ci wake_up(&res->wq); 6568c2ecf20Sopenharmony_ci} 6578c2ecf20Sopenharmony_ci 6588c2ecf20Sopenharmony_civoid __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, 6598c2ecf20Sopenharmony_ci struct dlm_lock_resource *res) 6608c2ecf20Sopenharmony_ci{ 6618c2ecf20Sopenharmony_ci assert_spin_locked(&res->spinlock); 6628c2ecf20Sopenharmony_ci res->inflight_assert_workers++; 6638c2ecf20Sopenharmony_ci mlog(0, "%s:%.*s: inflight assert worker++: now %u\n", 6648c2ecf20Sopenharmony_ci dlm->name, res->lockname.len, res->lockname.name, 6658c2ecf20Sopenharmony_ci res->inflight_assert_workers); 6668c2ecf20Sopenharmony_ci} 6678c2ecf20Sopenharmony_ci 6688c2ecf20Sopenharmony_cistatic void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, 6698c2ecf20Sopenharmony_ci struct dlm_lock_resource *res) 6708c2ecf20Sopenharmony_ci{ 6718c2ecf20Sopenharmony_ci assert_spin_locked(&res->spinlock); 6728c2ecf20Sopenharmony_ci BUG_ON(res->inflight_assert_workers == 0); 6738c2ecf20Sopenharmony_ci res->inflight_assert_workers--; 6748c2ecf20Sopenharmony_ci mlog(0, "%s:%.*s: inflight assert worker--: now %u\n", 6758c2ecf20Sopenharmony_ci dlm->name, res->lockname.len, res->lockname.name, 6768c2ecf20Sopenharmony_ci res->inflight_assert_workers); 6778c2ecf20Sopenharmony_ci} 6788c2ecf20Sopenharmony_ci 6798c2ecf20Sopenharmony_cistatic void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, 6808c2ecf20Sopenharmony_ci struct dlm_lock_resource *res) 6818c2ecf20Sopenharmony_ci{ 6828c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 6838c2ecf20Sopenharmony_ci __dlm_lockres_drop_inflight_worker(dlm, res); 6848c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 6858c2ecf20Sopenharmony_ci} 6868c2ecf20Sopenharmony_ci 6878c2ecf20Sopenharmony_ci/* 6888c2ecf20Sopenharmony_ci * lookup a lock resource by name. 6898c2ecf20Sopenharmony_ci * may already exist in the hashtable. 6908c2ecf20Sopenharmony_ci * lockid is null terminated 6918c2ecf20Sopenharmony_ci * 6928c2ecf20Sopenharmony_ci * if not, allocate enough for the lockres and for 6938c2ecf20Sopenharmony_ci * the temporary structure used in doing the mastering. 6948c2ecf20Sopenharmony_ci * 6958c2ecf20Sopenharmony_ci * also, do a lookup in the dlm->master_list to see 6968c2ecf20Sopenharmony_ci * if another node has begun mastering the same lock. 6978c2ecf20Sopenharmony_ci * if so, there should be a block entry in there 6988c2ecf20Sopenharmony_ci * for this name, and we should *not* attempt to master 6998c2ecf20Sopenharmony_ci * the lock here. need to wait around for that node 7008c2ecf20Sopenharmony_ci * to assert_master (or die). 7018c2ecf20Sopenharmony_ci * 7028c2ecf20Sopenharmony_ci */ 7038c2ecf20Sopenharmony_cistruct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, 7048c2ecf20Sopenharmony_ci const char *lockid, 7058c2ecf20Sopenharmony_ci int namelen, 7068c2ecf20Sopenharmony_ci int flags) 7078c2ecf20Sopenharmony_ci{ 7088c2ecf20Sopenharmony_ci struct dlm_lock_resource *tmpres=NULL, *res=NULL; 7098c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle = NULL; 7108c2ecf20Sopenharmony_ci struct dlm_master_list_entry *alloc_mle = NULL; 7118c2ecf20Sopenharmony_ci int blocked = 0; 7128c2ecf20Sopenharmony_ci int ret, nodenum; 7138c2ecf20Sopenharmony_ci struct dlm_node_iter iter; 7148c2ecf20Sopenharmony_ci unsigned int hash; 7158c2ecf20Sopenharmony_ci int tries = 0; 7168c2ecf20Sopenharmony_ci int bit, wait_on_recovery = 0; 7178c2ecf20Sopenharmony_ci 7188c2ecf20Sopenharmony_ci BUG_ON(!lockid); 7198c2ecf20Sopenharmony_ci 7208c2ecf20Sopenharmony_ci hash = dlm_lockid_hash(lockid, namelen); 7218c2ecf20Sopenharmony_ci 7228c2ecf20Sopenharmony_ci mlog(0, "get lockres %s (len %d)\n", lockid, namelen); 7238c2ecf20Sopenharmony_ci 7248c2ecf20Sopenharmony_cilookup: 7258c2ecf20Sopenharmony_ci spin_lock(&dlm->spinlock); 7268c2ecf20Sopenharmony_ci tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash); 7278c2ecf20Sopenharmony_ci if (tmpres) { 7288c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 7298c2ecf20Sopenharmony_ci spin_lock(&tmpres->spinlock); 7308c2ecf20Sopenharmony_ci 7318c2ecf20Sopenharmony_ci /* 7328c2ecf20Sopenharmony_ci * Right after dlm spinlock was released, dlm_thread could have 7338c2ecf20Sopenharmony_ci * purged the lockres. Check if lockres got unhashed. If so 7348c2ecf20Sopenharmony_ci * start over. 7358c2ecf20Sopenharmony_ci */ 7368c2ecf20Sopenharmony_ci if (hlist_unhashed(&tmpres->hash_node)) { 7378c2ecf20Sopenharmony_ci spin_unlock(&tmpres->spinlock); 7388c2ecf20Sopenharmony_ci dlm_lockres_put(tmpres); 7398c2ecf20Sopenharmony_ci tmpres = NULL; 7408c2ecf20Sopenharmony_ci goto lookup; 7418c2ecf20Sopenharmony_ci } 7428c2ecf20Sopenharmony_ci 7438c2ecf20Sopenharmony_ci /* Wait on the thread that is mastering the resource */ 7448c2ecf20Sopenharmony_ci if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { 7458c2ecf20Sopenharmony_ci __dlm_wait_on_lockres(tmpres); 7468c2ecf20Sopenharmony_ci BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN); 7478c2ecf20Sopenharmony_ci spin_unlock(&tmpres->spinlock); 7488c2ecf20Sopenharmony_ci dlm_lockres_put(tmpres); 7498c2ecf20Sopenharmony_ci tmpres = NULL; 7508c2ecf20Sopenharmony_ci goto lookup; 7518c2ecf20Sopenharmony_ci } 7528c2ecf20Sopenharmony_ci 7538c2ecf20Sopenharmony_ci /* Wait on the resource purge to complete before continuing */ 7548c2ecf20Sopenharmony_ci if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) { 7558c2ecf20Sopenharmony_ci BUG_ON(tmpres->owner == dlm->node_num); 7568c2ecf20Sopenharmony_ci __dlm_wait_on_lockres_flags(tmpres, 7578c2ecf20Sopenharmony_ci DLM_LOCK_RES_DROPPING_REF); 7588c2ecf20Sopenharmony_ci spin_unlock(&tmpres->spinlock); 7598c2ecf20Sopenharmony_ci dlm_lockres_put(tmpres); 7608c2ecf20Sopenharmony_ci tmpres = NULL; 7618c2ecf20Sopenharmony_ci goto lookup; 7628c2ecf20Sopenharmony_ci } 7638c2ecf20Sopenharmony_ci 7648c2ecf20Sopenharmony_ci /* Grab inflight ref to pin the resource */ 7658c2ecf20Sopenharmony_ci dlm_lockres_grab_inflight_ref(dlm, tmpres); 7668c2ecf20Sopenharmony_ci 7678c2ecf20Sopenharmony_ci spin_unlock(&tmpres->spinlock); 7688c2ecf20Sopenharmony_ci if (res) { 7698c2ecf20Sopenharmony_ci spin_lock(&dlm->track_lock); 7708c2ecf20Sopenharmony_ci if (!list_empty(&res->tracking)) 7718c2ecf20Sopenharmony_ci list_del_init(&res->tracking); 7728c2ecf20Sopenharmony_ci else 7738c2ecf20Sopenharmony_ci mlog(ML_ERROR, "Resource %.*s not " 7748c2ecf20Sopenharmony_ci "on the Tracking list\n", 7758c2ecf20Sopenharmony_ci res->lockname.len, 7768c2ecf20Sopenharmony_ci res->lockname.name); 7778c2ecf20Sopenharmony_ci spin_unlock(&dlm->track_lock); 7788c2ecf20Sopenharmony_ci dlm_lockres_put(res); 7798c2ecf20Sopenharmony_ci } 7808c2ecf20Sopenharmony_ci res = tmpres; 7818c2ecf20Sopenharmony_ci goto leave; 7828c2ecf20Sopenharmony_ci } 7838c2ecf20Sopenharmony_ci 7848c2ecf20Sopenharmony_ci if (!res) { 7858c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 7868c2ecf20Sopenharmony_ci mlog(0, "allocating a new resource\n"); 7878c2ecf20Sopenharmony_ci /* nothing found and we need to allocate one. */ 7888c2ecf20Sopenharmony_ci alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); 7898c2ecf20Sopenharmony_ci if (!alloc_mle) 7908c2ecf20Sopenharmony_ci goto leave; 7918c2ecf20Sopenharmony_ci res = dlm_new_lockres(dlm, lockid, namelen); 7928c2ecf20Sopenharmony_ci if (!res) 7938c2ecf20Sopenharmony_ci goto leave; 7948c2ecf20Sopenharmony_ci goto lookup; 7958c2ecf20Sopenharmony_ci } 7968c2ecf20Sopenharmony_ci 7978c2ecf20Sopenharmony_ci mlog(0, "no lockres found, allocated our own: %p\n", res); 7988c2ecf20Sopenharmony_ci 7998c2ecf20Sopenharmony_ci if (flags & LKM_LOCAL) { 8008c2ecf20Sopenharmony_ci /* caller knows it's safe to assume it's not mastered elsewhere 8018c2ecf20Sopenharmony_ci * DONE! return right away */ 8028c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 8038c2ecf20Sopenharmony_ci dlm_change_lockres_owner(dlm, res, dlm->node_num); 8048c2ecf20Sopenharmony_ci __dlm_insert_lockres(dlm, res); 8058c2ecf20Sopenharmony_ci dlm_lockres_grab_inflight_ref(dlm, res); 8068c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 8078c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 8088c2ecf20Sopenharmony_ci /* lockres still marked IN_PROGRESS */ 8098c2ecf20Sopenharmony_ci goto wake_waiters; 8108c2ecf20Sopenharmony_ci } 8118c2ecf20Sopenharmony_ci 8128c2ecf20Sopenharmony_ci /* check master list to see if another node has started mastering it */ 8138c2ecf20Sopenharmony_ci spin_lock(&dlm->master_lock); 8148c2ecf20Sopenharmony_ci 8158c2ecf20Sopenharmony_ci /* if we found a block, wait for lock to be mastered by another node */ 8168c2ecf20Sopenharmony_ci blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen); 8178c2ecf20Sopenharmony_ci if (blocked) { 8188c2ecf20Sopenharmony_ci int mig; 8198c2ecf20Sopenharmony_ci if (mle->type == DLM_MLE_MASTER) { 8208c2ecf20Sopenharmony_ci mlog(ML_ERROR, "master entry for nonexistent lock!\n"); 8218c2ecf20Sopenharmony_ci BUG(); 8228c2ecf20Sopenharmony_ci } 8238c2ecf20Sopenharmony_ci mig = (mle->type == DLM_MLE_MIGRATION); 8248c2ecf20Sopenharmony_ci /* if there is a migration in progress, let the migration 8258c2ecf20Sopenharmony_ci * finish before continuing. we can wait for the absence 8268c2ecf20Sopenharmony_ci * of the MIGRATION mle: either the migrate finished or 8278c2ecf20Sopenharmony_ci * one of the nodes died and the mle was cleaned up. 8288c2ecf20Sopenharmony_ci * if there is a BLOCK here, but it already has a master 8298c2ecf20Sopenharmony_ci * set, we are too late. the master does not have a ref 8308c2ecf20Sopenharmony_ci * for us in the refmap. detach the mle and drop it. 8318c2ecf20Sopenharmony_ci * either way, go back to the top and start over. */ 8328c2ecf20Sopenharmony_ci if (mig || mle->master != O2NM_MAX_NODES) { 8338c2ecf20Sopenharmony_ci BUG_ON(mig && mle->master == dlm->node_num); 8348c2ecf20Sopenharmony_ci /* we arrived too late. the master does not 8358c2ecf20Sopenharmony_ci * have a ref for us. retry. */ 8368c2ecf20Sopenharmony_ci mlog(0, "%s:%.*s: late on %s\n", 8378c2ecf20Sopenharmony_ci dlm->name, namelen, lockid, 8388c2ecf20Sopenharmony_ci mig ? "MIGRATION" : "BLOCK"); 8398c2ecf20Sopenharmony_ci spin_unlock(&dlm->master_lock); 8408c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 8418c2ecf20Sopenharmony_ci 8428c2ecf20Sopenharmony_ci /* master is known, detach */ 8438c2ecf20Sopenharmony_ci if (!mig) 8448c2ecf20Sopenharmony_ci dlm_mle_detach_hb_events(dlm, mle); 8458c2ecf20Sopenharmony_ci dlm_put_mle(mle); 8468c2ecf20Sopenharmony_ci mle = NULL; 8478c2ecf20Sopenharmony_ci /* this is lame, but we can't wait on either 8488c2ecf20Sopenharmony_ci * the mle or lockres waitqueue here */ 8498c2ecf20Sopenharmony_ci if (mig) 8508c2ecf20Sopenharmony_ci msleep(100); 8518c2ecf20Sopenharmony_ci goto lookup; 8528c2ecf20Sopenharmony_ci } 8538c2ecf20Sopenharmony_ci } else { 8548c2ecf20Sopenharmony_ci /* go ahead and try to master lock on this node */ 8558c2ecf20Sopenharmony_ci mle = alloc_mle; 8568c2ecf20Sopenharmony_ci /* make sure this does not get freed below */ 8578c2ecf20Sopenharmony_ci alloc_mle = NULL; 8588c2ecf20Sopenharmony_ci dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0); 8598c2ecf20Sopenharmony_ci set_bit(dlm->node_num, mle->maybe_map); 8608c2ecf20Sopenharmony_ci __dlm_insert_mle(dlm, mle); 8618c2ecf20Sopenharmony_ci 8628c2ecf20Sopenharmony_ci /* still holding the dlm spinlock, check the recovery map 8638c2ecf20Sopenharmony_ci * to see if there are any nodes that still need to be 8648c2ecf20Sopenharmony_ci * considered. these will not appear in the mle nodemap 8658c2ecf20Sopenharmony_ci * but they might own this lockres. wait on them. */ 8668c2ecf20Sopenharmony_ci bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); 8678c2ecf20Sopenharmony_ci if (bit < O2NM_MAX_NODES) { 8688c2ecf20Sopenharmony_ci mlog(0, "%s: res %.*s, At least one node (%d) " 8698c2ecf20Sopenharmony_ci "to recover before lock mastery can begin\n", 8708c2ecf20Sopenharmony_ci dlm->name, namelen, (char *)lockid, bit); 8718c2ecf20Sopenharmony_ci wait_on_recovery = 1; 8728c2ecf20Sopenharmony_ci } 8738c2ecf20Sopenharmony_ci } 8748c2ecf20Sopenharmony_ci 8758c2ecf20Sopenharmony_ci /* at this point there is either a DLM_MLE_BLOCK or a 8768c2ecf20Sopenharmony_ci * DLM_MLE_MASTER on the master list, so it's safe to add the 8778c2ecf20Sopenharmony_ci * lockres to the hashtable. anyone who finds the lock will 8788c2ecf20Sopenharmony_ci * still have to wait on the IN_PROGRESS. */ 8798c2ecf20Sopenharmony_ci 8808c2ecf20Sopenharmony_ci /* finally add the lockres to its hash bucket */ 8818c2ecf20Sopenharmony_ci __dlm_insert_lockres(dlm, res); 8828c2ecf20Sopenharmony_ci 8838c2ecf20Sopenharmony_ci /* since this lockres is new it doesn't not require the spinlock */ 8848c2ecf20Sopenharmony_ci __dlm_lockres_grab_inflight_ref(dlm, res); 8858c2ecf20Sopenharmony_ci 8868c2ecf20Sopenharmony_ci /* get an extra ref on the mle in case this is a BLOCK 8878c2ecf20Sopenharmony_ci * if so, the creator of the BLOCK may try to put the last 8888c2ecf20Sopenharmony_ci * ref at this time in the assert master handler, so we 8898c2ecf20Sopenharmony_ci * need an extra one to keep from a bad ptr deref. */ 8908c2ecf20Sopenharmony_ci dlm_get_mle_inuse(mle); 8918c2ecf20Sopenharmony_ci spin_unlock(&dlm->master_lock); 8928c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 8938c2ecf20Sopenharmony_ci 8948c2ecf20Sopenharmony_ciredo_request: 8958c2ecf20Sopenharmony_ci while (wait_on_recovery) { 8968c2ecf20Sopenharmony_ci /* any cluster changes that occurred after dropping the 8978c2ecf20Sopenharmony_ci * dlm spinlock would be detectable be a change on the mle, 8988c2ecf20Sopenharmony_ci * so we only need to clear out the recovery map once. */ 8998c2ecf20Sopenharmony_ci if (dlm_is_recovery_lock(lockid, namelen)) { 9008c2ecf20Sopenharmony_ci mlog(0, "%s: Recovery map is not empty, but must " 9018c2ecf20Sopenharmony_ci "master $RECOVERY lock now\n", dlm->name); 9028c2ecf20Sopenharmony_ci if (!dlm_pre_master_reco_lockres(dlm, res)) 9038c2ecf20Sopenharmony_ci wait_on_recovery = 0; 9048c2ecf20Sopenharmony_ci else { 9058c2ecf20Sopenharmony_ci mlog(0, "%s: waiting 500ms for heartbeat state " 9068c2ecf20Sopenharmony_ci "change\n", dlm->name); 9078c2ecf20Sopenharmony_ci msleep(500); 9088c2ecf20Sopenharmony_ci } 9098c2ecf20Sopenharmony_ci continue; 9108c2ecf20Sopenharmony_ci } 9118c2ecf20Sopenharmony_ci 9128c2ecf20Sopenharmony_ci dlm_kick_recovery_thread(dlm); 9138c2ecf20Sopenharmony_ci msleep(1000); 9148c2ecf20Sopenharmony_ci dlm_wait_for_recovery(dlm); 9158c2ecf20Sopenharmony_ci 9168c2ecf20Sopenharmony_ci spin_lock(&dlm->spinlock); 9178c2ecf20Sopenharmony_ci bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); 9188c2ecf20Sopenharmony_ci if (bit < O2NM_MAX_NODES) { 9198c2ecf20Sopenharmony_ci mlog(0, "%s: res %.*s, At least one node (%d) " 9208c2ecf20Sopenharmony_ci "to recover before lock mastery can begin\n", 9218c2ecf20Sopenharmony_ci dlm->name, namelen, (char *)lockid, bit); 9228c2ecf20Sopenharmony_ci wait_on_recovery = 1; 9238c2ecf20Sopenharmony_ci } else 9248c2ecf20Sopenharmony_ci wait_on_recovery = 0; 9258c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 9268c2ecf20Sopenharmony_ci 9278c2ecf20Sopenharmony_ci if (wait_on_recovery) 9288c2ecf20Sopenharmony_ci dlm_wait_for_node_recovery(dlm, bit, 10000); 9298c2ecf20Sopenharmony_ci } 9308c2ecf20Sopenharmony_ci 9318c2ecf20Sopenharmony_ci /* must wait for lock to be mastered elsewhere */ 9328c2ecf20Sopenharmony_ci if (blocked) 9338c2ecf20Sopenharmony_ci goto wait; 9348c2ecf20Sopenharmony_ci 9358c2ecf20Sopenharmony_ci ret = -EINVAL; 9368c2ecf20Sopenharmony_ci dlm_node_iter_init(mle->vote_map, &iter); 9378c2ecf20Sopenharmony_ci while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { 9388c2ecf20Sopenharmony_ci ret = dlm_do_master_request(res, mle, nodenum); 9398c2ecf20Sopenharmony_ci if (ret < 0) 9408c2ecf20Sopenharmony_ci mlog_errno(ret); 9418c2ecf20Sopenharmony_ci if (mle->master != O2NM_MAX_NODES) { 9428c2ecf20Sopenharmony_ci /* found a master ! */ 9438c2ecf20Sopenharmony_ci if (mle->master <= nodenum) 9448c2ecf20Sopenharmony_ci break; 9458c2ecf20Sopenharmony_ci /* if our master request has not reached the master 9468c2ecf20Sopenharmony_ci * yet, keep going until it does. this is how the 9478c2ecf20Sopenharmony_ci * master will know that asserts are needed back to 9488c2ecf20Sopenharmony_ci * the lower nodes. */ 9498c2ecf20Sopenharmony_ci mlog(0, "%s: res %.*s, Requests only up to %u but " 9508c2ecf20Sopenharmony_ci "master is %u, keep going\n", dlm->name, namelen, 9518c2ecf20Sopenharmony_ci lockid, nodenum, mle->master); 9528c2ecf20Sopenharmony_ci } 9538c2ecf20Sopenharmony_ci } 9548c2ecf20Sopenharmony_ci 9558c2ecf20Sopenharmony_ciwait: 9568c2ecf20Sopenharmony_ci /* keep going until the response map includes all nodes */ 9578c2ecf20Sopenharmony_ci ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); 9588c2ecf20Sopenharmony_ci if (ret < 0) { 9598c2ecf20Sopenharmony_ci wait_on_recovery = 1; 9608c2ecf20Sopenharmony_ci mlog(0, "%s: res %.*s, Node map changed, redo the master " 9618c2ecf20Sopenharmony_ci "request now, blocked=%d\n", dlm->name, res->lockname.len, 9628c2ecf20Sopenharmony_ci res->lockname.name, blocked); 9638c2ecf20Sopenharmony_ci if (++tries > 20) { 9648c2ecf20Sopenharmony_ci mlog(ML_ERROR, "%s: res %.*s, Spinning on " 9658c2ecf20Sopenharmony_ci "dlm_wait_for_lock_mastery, blocked = %d\n", 9668c2ecf20Sopenharmony_ci dlm->name, res->lockname.len, 9678c2ecf20Sopenharmony_ci res->lockname.name, blocked); 9688c2ecf20Sopenharmony_ci dlm_print_one_lock_resource(res); 9698c2ecf20Sopenharmony_ci dlm_print_one_mle(mle); 9708c2ecf20Sopenharmony_ci tries = 0; 9718c2ecf20Sopenharmony_ci } 9728c2ecf20Sopenharmony_ci goto redo_request; 9738c2ecf20Sopenharmony_ci } 9748c2ecf20Sopenharmony_ci 9758c2ecf20Sopenharmony_ci mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len, 9768c2ecf20Sopenharmony_ci res->lockname.name, res->owner); 9778c2ecf20Sopenharmony_ci /* make sure we never continue without this */ 9788c2ecf20Sopenharmony_ci BUG_ON(res->owner == O2NM_MAX_NODES); 9798c2ecf20Sopenharmony_ci 9808c2ecf20Sopenharmony_ci /* master is known, detach if not already detached */ 9818c2ecf20Sopenharmony_ci dlm_mle_detach_hb_events(dlm, mle); 9828c2ecf20Sopenharmony_ci dlm_put_mle(mle); 9838c2ecf20Sopenharmony_ci /* put the extra ref */ 9848c2ecf20Sopenharmony_ci dlm_put_mle_inuse(mle); 9858c2ecf20Sopenharmony_ci 9868c2ecf20Sopenharmony_ciwake_waiters: 9878c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 9888c2ecf20Sopenharmony_ci res->state &= ~DLM_LOCK_RES_IN_PROGRESS; 9898c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 9908c2ecf20Sopenharmony_ci wake_up(&res->wq); 9918c2ecf20Sopenharmony_ci 9928c2ecf20Sopenharmony_cileave: 9938c2ecf20Sopenharmony_ci /* need to free the unused mle */ 9948c2ecf20Sopenharmony_ci if (alloc_mle) 9958c2ecf20Sopenharmony_ci kmem_cache_free(dlm_mle_cache, alloc_mle); 9968c2ecf20Sopenharmony_ci 9978c2ecf20Sopenharmony_ci return res; 9988c2ecf20Sopenharmony_ci} 9998c2ecf20Sopenharmony_ci 10008c2ecf20Sopenharmony_ci 10018c2ecf20Sopenharmony_ci#define DLM_MASTERY_TIMEOUT_MS 5000 10028c2ecf20Sopenharmony_ci 10038c2ecf20Sopenharmony_cistatic int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, 10048c2ecf20Sopenharmony_ci struct dlm_lock_resource *res, 10058c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle, 10068c2ecf20Sopenharmony_ci int *blocked) 10078c2ecf20Sopenharmony_ci{ 10088c2ecf20Sopenharmony_ci u8 m; 10098c2ecf20Sopenharmony_ci int ret, bit; 10108c2ecf20Sopenharmony_ci int map_changed, voting_done; 10118c2ecf20Sopenharmony_ci int assert, sleep; 10128c2ecf20Sopenharmony_ci 10138c2ecf20Sopenharmony_cirecheck: 10148c2ecf20Sopenharmony_ci ret = 0; 10158c2ecf20Sopenharmony_ci assert = 0; 10168c2ecf20Sopenharmony_ci 10178c2ecf20Sopenharmony_ci /* check if another node has already become the owner */ 10188c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 10198c2ecf20Sopenharmony_ci if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { 10208c2ecf20Sopenharmony_ci mlog(0, "%s:%.*s: owner is suddenly %u\n", dlm->name, 10218c2ecf20Sopenharmony_ci res->lockname.len, res->lockname.name, res->owner); 10228c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 10238c2ecf20Sopenharmony_ci /* this will cause the master to re-assert across 10248c2ecf20Sopenharmony_ci * the whole cluster, freeing up mles */ 10258c2ecf20Sopenharmony_ci if (res->owner != dlm->node_num) { 10268c2ecf20Sopenharmony_ci ret = dlm_do_master_request(res, mle, res->owner); 10278c2ecf20Sopenharmony_ci if (ret < 0) { 10288c2ecf20Sopenharmony_ci /* give recovery a chance to run */ 10298c2ecf20Sopenharmony_ci mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); 10308c2ecf20Sopenharmony_ci msleep(500); 10318c2ecf20Sopenharmony_ci goto recheck; 10328c2ecf20Sopenharmony_ci } 10338c2ecf20Sopenharmony_ci } 10348c2ecf20Sopenharmony_ci ret = 0; 10358c2ecf20Sopenharmony_ci goto leave; 10368c2ecf20Sopenharmony_ci } 10378c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 10388c2ecf20Sopenharmony_ci 10398c2ecf20Sopenharmony_ci spin_lock(&mle->spinlock); 10408c2ecf20Sopenharmony_ci m = mle->master; 10418c2ecf20Sopenharmony_ci map_changed = (memcmp(mle->vote_map, mle->node_map, 10428c2ecf20Sopenharmony_ci sizeof(mle->vote_map)) != 0); 10438c2ecf20Sopenharmony_ci voting_done = (memcmp(mle->vote_map, mle->response_map, 10448c2ecf20Sopenharmony_ci sizeof(mle->vote_map)) == 0); 10458c2ecf20Sopenharmony_ci 10468c2ecf20Sopenharmony_ci /* restart if we hit any errors */ 10478c2ecf20Sopenharmony_ci if (map_changed) { 10488c2ecf20Sopenharmony_ci int b; 10498c2ecf20Sopenharmony_ci mlog(0, "%s: %.*s: node map changed, restarting\n", 10508c2ecf20Sopenharmony_ci dlm->name, res->lockname.len, res->lockname.name); 10518c2ecf20Sopenharmony_ci ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked); 10528c2ecf20Sopenharmony_ci b = (mle->type == DLM_MLE_BLOCK); 10538c2ecf20Sopenharmony_ci if ((*blocked && !b) || (!*blocked && b)) { 10548c2ecf20Sopenharmony_ci mlog(0, "%s:%.*s: status change: old=%d new=%d\n", 10558c2ecf20Sopenharmony_ci dlm->name, res->lockname.len, res->lockname.name, 10568c2ecf20Sopenharmony_ci *blocked, b); 10578c2ecf20Sopenharmony_ci *blocked = b; 10588c2ecf20Sopenharmony_ci } 10598c2ecf20Sopenharmony_ci spin_unlock(&mle->spinlock); 10608c2ecf20Sopenharmony_ci if (ret < 0) { 10618c2ecf20Sopenharmony_ci mlog_errno(ret); 10628c2ecf20Sopenharmony_ci goto leave; 10638c2ecf20Sopenharmony_ci } 10648c2ecf20Sopenharmony_ci mlog(0, "%s:%.*s: restart lock mastery succeeded, " 10658c2ecf20Sopenharmony_ci "rechecking now\n", dlm->name, res->lockname.len, 10668c2ecf20Sopenharmony_ci res->lockname.name); 10678c2ecf20Sopenharmony_ci goto recheck; 10688c2ecf20Sopenharmony_ci } else { 10698c2ecf20Sopenharmony_ci if (!voting_done) { 10708c2ecf20Sopenharmony_ci mlog(0, "map not changed and voting not done " 10718c2ecf20Sopenharmony_ci "for %s:%.*s\n", dlm->name, res->lockname.len, 10728c2ecf20Sopenharmony_ci res->lockname.name); 10738c2ecf20Sopenharmony_ci } 10748c2ecf20Sopenharmony_ci } 10758c2ecf20Sopenharmony_ci 10768c2ecf20Sopenharmony_ci if (m != O2NM_MAX_NODES) { 10778c2ecf20Sopenharmony_ci /* another node has done an assert! 10788c2ecf20Sopenharmony_ci * all done! */ 10798c2ecf20Sopenharmony_ci sleep = 0; 10808c2ecf20Sopenharmony_ci } else { 10818c2ecf20Sopenharmony_ci sleep = 1; 10828c2ecf20Sopenharmony_ci /* have all nodes responded? */ 10838c2ecf20Sopenharmony_ci if (voting_done && !*blocked) { 10848c2ecf20Sopenharmony_ci bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); 10858c2ecf20Sopenharmony_ci if (dlm->node_num <= bit) { 10868c2ecf20Sopenharmony_ci /* my node number is lowest. 10878c2ecf20Sopenharmony_ci * now tell other nodes that I am 10888c2ecf20Sopenharmony_ci * mastering this. */ 10898c2ecf20Sopenharmony_ci mle->master = dlm->node_num; 10908c2ecf20Sopenharmony_ci /* ref was grabbed in get_lock_resource 10918c2ecf20Sopenharmony_ci * will be dropped in dlmlock_master */ 10928c2ecf20Sopenharmony_ci assert = 1; 10938c2ecf20Sopenharmony_ci sleep = 0; 10948c2ecf20Sopenharmony_ci } 10958c2ecf20Sopenharmony_ci /* if voting is done, but we have not received 10968c2ecf20Sopenharmony_ci * an assert master yet, we must sleep */ 10978c2ecf20Sopenharmony_ci } 10988c2ecf20Sopenharmony_ci } 10998c2ecf20Sopenharmony_ci 11008c2ecf20Sopenharmony_ci spin_unlock(&mle->spinlock); 11018c2ecf20Sopenharmony_ci 11028c2ecf20Sopenharmony_ci /* sleep if we haven't finished voting yet */ 11038c2ecf20Sopenharmony_ci if (sleep) { 11048c2ecf20Sopenharmony_ci unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS); 11058c2ecf20Sopenharmony_ci atomic_set(&mle->woken, 0); 11068c2ecf20Sopenharmony_ci (void)wait_event_timeout(mle->wq, 11078c2ecf20Sopenharmony_ci (atomic_read(&mle->woken) == 1), 11088c2ecf20Sopenharmony_ci timeo); 11098c2ecf20Sopenharmony_ci if (res->owner == O2NM_MAX_NODES) { 11108c2ecf20Sopenharmony_ci mlog(0, "%s:%.*s: waiting again\n", dlm->name, 11118c2ecf20Sopenharmony_ci res->lockname.len, res->lockname.name); 11128c2ecf20Sopenharmony_ci goto recheck; 11138c2ecf20Sopenharmony_ci } 11148c2ecf20Sopenharmony_ci mlog(0, "done waiting, master is %u\n", res->owner); 11158c2ecf20Sopenharmony_ci ret = 0; 11168c2ecf20Sopenharmony_ci goto leave; 11178c2ecf20Sopenharmony_ci } 11188c2ecf20Sopenharmony_ci 11198c2ecf20Sopenharmony_ci ret = 0; /* done */ 11208c2ecf20Sopenharmony_ci if (assert) { 11218c2ecf20Sopenharmony_ci m = dlm->node_num; 11228c2ecf20Sopenharmony_ci mlog(0, "about to master %.*s here, this=%u\n", 11238c2ecf20Sopenharmony_ci res->lockname.len, res->lockname.name, m); 11248c2ecf20Sopenharmony_ci ret = dlm_do_assert_master(dlm, res, mle->vote_map, 0); 11258c2ecf20Sopenharmony_ci if (ret) { 11268c2ecf20Sopenharmony_ci /* This is a failure in the network path, 11278c2ecf20Sopenharmony_ci * not in the response to the assert_master 11288c2ecf20Sopenharmony_ci * (any nonzero response is a BUG on this node). 11298c2ecf20Sopenharmony_ci * Most likely a socket just got disconnected 11308c2ecf20Sopenharmony_ci * due to node death. */ 11318c2ecf20Sopenharmony_ci mlog_errno(ret); 11328c2ecf20Sopenharmony_ci } 11338c2ecf20Sopenharmony_ci /* no longer need to restart lock mastery. 11348c2ecf20Sopenharmony_ci * all living nodes have been contacted. */ 11358c2ecf20Sopenharmony_ci ret = 0; 11368c2ecf20Sopenharmony_ci } 11378c2ecf20Sopenharmony_ci 11388c2ecf20Sopenharmony_ci /* set the lockres owner */ 11398c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 11408c2ecf20Sopenharmony_ci /* mastery reference obtained either during 11418c2ecf20Sopenharmony_ci * assert_master_handler or in get_lock_resource */ 11428c2ecf20Sopenharmony_ci dlm_change_lockres_owner(dlm, res, m); 11438c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 11448c2ecf20Sopenharmony_ci 11458c2ecf20Sopenharmony_cileave: 11468c2ecf20Sopenharmony_ci return ret; 11478c2ecf20Sopenharmony_ci} 11488c2ecf20Sopenharmony_ci 11498c2ecf20Sopenharmony_cistruct dlm_bitmap_diff_iter 11508c2ecf20Sopenharmony_ci{ 11518c2ecf20Sopenharmony_ci int curnode; 11528c2ecf20Sopenharmony_ci unsigned long *orig_bm; 11538c2ecf20Sopenharmony_ci unsigned long *cur_bm; 11548c2ecf20Sopenharmony_ci unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; 11558c2ecf20Sopenharmony_ci}; 11568c2ecf20Sopenharmony_ci 11578c2ecf20Sopenharmony_cienum dlm_node_state_change 11588c2ecf20Sopenharmony_ci{ 11598c2ecf20Sopenharmony_ci NODE_DOWN = -1, 11608c2ecf20Sopenharmony_ci NODE_NO_CHANGE = 0, 11618c2ecf20Sopenharmony_ci NODE_UP 11628c2ecf20Sopenharmony_ci}; 11638c2ecf20Sopenharmony_ci 11648c2ecf20Sopenharmony_cistatic void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter, 11658c2ecf20Sopenharmony_ci unsigned long *orig_bm, 11668c2ecf20Sopenharmony_ci unsigned long *cur_bm) 11678c2ecf20Sopenharmony_ci{ 11688c2ecf20Sopenharmony_ci unsigned long p1, p2; 11698c2ecf20Sopenharmony_ci int i; 11708c2ecf20Sopenharmony_ci 11718c2ecf20Sopenharmony_ci iter->curnode = -1; 11728c2ecf20Sopenharmony_ci iter->orig_bm = orig_bm; 11738c2ecf20Sopenharmony_ci iter->cur_bm = cur_bm; 11748c2ecf20Sopenharmony_ci 11758c2ecf20Sopenharmony_ci for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) { 11768c2ecf20Sopenharmony_ci p1 = *(iter->orig_bm + i); 11778c2ecf20Sopenharmony_ci p2 = *(iter->cur_bm + i); 11788c2ecf20Sopenharmony_ci iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1); 11798c2ecf20Sopenharmony_ci } 11808c2ecf20Sopenharmony_ci} 11818c2ecf20Sopenharmony_ci 11828c2ecf20Sopenharmony_cistatic int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter, 11838c2ecf20Sopenharmony_ci enum dlm_node_state_change *state) 11848c2ecf20Sopenharmony_ci{ 11858c2ecf20Sopenharmony_ci int bit; 11868c2ecf20Sopenharmony_ci 11878c2ecf20Sopenharmony_ci if (iter->curnode >= O2NM_MAX_NODES) 11888c2ecf20Sopenharmony_ci return -ENOENT; 11898c2ecf20Sopenharmony_ci 11908c2ecf20Sopenharmony_ci bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES, 11918c2ecf20Sopenharmony_ci iter->curnode+1); 11928c2ecf20Sopenharmony_ci if (bit >= O2NM_MAX_NODES) { 11938c2ecf20Sopenharmony_ci iter->curnode = O2NM_MAX_NODES; 11948c2ecf20Sopenharmony_ci return -ENOENT; 11958c2ecf20Sopenharmony_ci } 11968c2ecf20Sopenharmony_ci 11978c2ecf20Sopenharmony_ci /* if it was there in the original then this node died */ 11988c2ecf20Sopenharmony_ci if (test_bit(bit, iter->orig_bm)) 11998c2ecf20Sopenharmony_ci *state = NODE_DOWN; 12008c2ecf20Sopenharmony_ci else 12018c2ecf20Sopenharmony_ci *state = NODE_UP; 12028c2ecf20Sopenharmony_ci 12038c2ecf20Sopenharmony_ci iter->curnode = bit; 12048c2ecf20Sopenharmony_ci return bit; 12058c2ecf20Sopenharmony_ci} 12068c2ecf20Sopenharmony_ci 12078c2ecf20Sopenharmony_ci 12088c2ecf20Sopenharmony_cistatic int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, 12098c2ecf20Sopenharmony_ci struct dlm_lock_resource *res, 12108c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle, 12118c2ecf20Sopenharmony_ci int blocked) 12128c2ecf20Sopenharmony_ci{ 12138c2ecf20Sopenharmony_ci struct dlm_bitmap_diff_iter bdi; 12148c2ecf20Sopenharmony_ci enum dlm_node_state_change sc; 12158c2ecf20Sopenharmony_ci int node; 12168c2ecf20Sopenharmony_ci int ret = 0; 12178c2ecf20Sopenharmony_ci 12188c2ecf20Sopenharmony_ci mlog(0, "something happened such that the " 12198c2ecf20Sopenharmony_ci "master process may need to be restarted!\n"); 12208c2ecf20Sopenharmony_ci 12218c2ecf20Sopenharmony_ci assert_spin_locked(&mle->spinlock); 12228c2ecf20Sopenharmony_ci 12238c2ecf20Sopenharmony_ci dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map); 12248c2ecf20Sopenharmony_ci node = dlm_bitmap_diff_iter_next(&bdi, &sc); 12258c2ecf20Sopenharmony_ci while (node >= 0) { 12268c2ecf20Sopenharmony_ci if (sc == NODE_UP) { 12278c2ecf20Sopenharmony_ci /* a node came up. clear any old vote from 12288c2ecf20Sopenharmony_ci * the response map and set it in the vote map 12298c2ecf20Sopenharmony_ci * then restart the mastery. */ 12308c2ecf20Sopenharmony_ci mlog(ML_NOTICE, "node %d up while restarting\n", node); 12318c2ecf20Sopenharmony_ci 12328c2ecf20Sopenharmony_ci /* redo the master request, but only for the new node */ 12338c2ecf20Sopenharmony_ci mlog(0, "sending request to new node\n"); 12348c2ecf20Sopenharmony_ci clear_bit(node, mle->response_map); 12358c2ecf20Sopenharmony_ci set_bit(node, mle->vote_map); 12368c2ecf20Sopenharmony_ci } else { 12378c2ecf20Sopenharmony_ci mlog(ML_ERROR, "node down! %d\n", node); 12388c2ecf20Sopenharmony_ci if (blocked) { 12398c2ecf20Sopenharmony_ci int lowest = find_next_bit(mle->maybe_map, 12408c2ecf20Sopenharmony_ci O2NM_MAX_NODES, 0); 12418c2ecf20Sopenharmony_ci 12428c2ecf20Sopenharmony_ci /* act like it was never there */ 12438c2ecf20Sopenharmony_ci clear_bit(node, mle->maybe_map); 12448c2ecf20Sopenharmony_ci 12458c2ecf20Sopenharmony_ci if (node == lowest) { 12468c2ecf20Sopenharmony_ci mlog(0, "expected master %u died" 12478c2ecf20Sopenharmony_ci " while this node was blocked " 12488c2ecf20Sopenharmony_ci "waiting on it!\n", node); 12498c2ecf20Sopenharmony_ci lowest = find_next_bit(mle->maybe_map, 12508c2ecf20Sopenharmony_ci O2NM_MAX_NODES, 12518c2ecf20Sopenharmony_ci lowest+1); 12528c2ecf20Sopenharmony_ci if (lowest < O2NM_MAX_NODES) { 12538c2ecf20Sopenharmony_ci mlog(0, "%s:%.*s:still " 12548c2ecf20Sopenharmony_ci "blocked. waiting on %u " 12558c2ecf20Sopenharmony_ci "now\n", dlm->name, 12568c2ecf20Sopenharmony_ci res->lockname.len, 12578c2ecf20Sopenharmony_ci res->lockname.name, 12588c2ecf20Sopenharmony_ci lowest); 12598c2ecf20Sopenharmony_ci } else { 12608c2ecf20Sopenharmony_ci /* mle is an MLE_BLOCK, but 12618c2ecf20Sopenharmony_ci * there is now nothing left to 12628c2ecf20Sopenharmony_ci * block on. we need to return 12638c2ecf20Sopenharmony_ci * all the way back out and try 12648c2ecf20Sopenharmony_ci * again with an MLE_MASTER. 12658c2ecf20Sopenharmony_ci * dlm_do_local_recovery_cleanup 12668c2ecf20Sopenharmony_ci * has already run, so the mle 12678c2ecf20Sopenharmony_ci * refcount is ok */ 12688c2ecf20Sopenharmony_ci mlog(0, "%s:%.*s: no " 12698c2ecf20Sopenharmony_ci "longer blocking. try to " 12708c2ecf20Sopenharmony_ci "master this here\n", 12718c2ecf20Sopenharmony_ci dlm->name, 12728c2ecf20Sopenharmony_ci res->lockname.len, 12738c2ecf20Sopenharmony_ci res->lockname.name); 12748c2ecf20Sopenharmony_ci mle->type = DLM_MLE_MASTER; 12758c2ecf20Sopenharmony_ci mle->mleres = res; 12768c2ecf20Sopenharmony_ci } 12778c2ecf20Sopenharmony_ci } 12788c2ecf20Sopenharmony_ci } 12798c2ecf20Sopenharmony_ci 12808c2ecf20Sopenharmony_ci /* now blank out everything, as if we had never 12818c2ecf20Sopenharmony_ci * contacted anyone */ 12828c2ecf20Sopenharmony_ci memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); 12838c2ecf20Sopenharmony_ci memset(mle->response_map, 0, sizeof(mle->response_map)); 12848c2ecf20Sopenharmony_ci /* reset the vote_map to the current node_map */ 12858c2ecf20Sopenharmony_ci memcpy(mle->vote_map, mle->node_map, 12868c2ecf20Sopenharmony_ci sizeof(mle->node_map)); 12878c2ecf20Sopenharmony_ci /* put myself into the maybe map */ 12888c2ecf20Sopenharmony_ci if (mle->type != DLM_MLE_BLOCK) 12898c2ecf20Sopenharmony_ci set_bit(dlm->node_num, mle->maybe_map); 12908c2ecf20Sopenharmony_ci } 12918c2ecf20Sopenharmony_ci ret = -EAGAIN; 12928c2ecf20Sopenharmony_ci node = dlm_bitmap_diff_iter_next(&bdi, &sc); 12938c2ecf20Sopenharmony_ci } 12948c2ecf20Sopenharmony_ci return ret; 12958c2ecf20Sopenharmony_ci} 12968c2ecf20Sopenharmony_ci 12978c2ecf20Sopenharmony_ci 12988c2ecf20Sopenharmony_ci/* 12998c2ecf20Sopenharmony_ci * DLM_MASTER_REQUEST_MSG 13008c2ecf20Sopenharmony_ci * 13018c2ecf20Sopenharmony_ci * returns: 0 on success, 13028c2ecf20Sopenharmony_ci * -errno on a network error 13038c2ecf20Sopenharmony_ci * 13048c2ecf20Sopenharmony_ci * on error, the caller should assume the target node is "dead" 13058c2ecf20Sopenharmony_ci * 13068c2ecf20Sopenharmony_ci */ 13078c2ecf20Sopenharmony_ci 13088c2ecf20Sopenharmony_cistatic int dlm_do_master_request(struct dlm_lock_resource *res, 13098c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle, int to) 13108c2ecf20Sopenharmony_ci{ 13118c2ecf20Sopenharmony_ci struct dlm_ctxt *dlm = mle->dlm; 13128c2ecf20Sopenharmony_ci struct dlm_master_request request; 13138c2ecf20Sopenharmony_ci int ret, response=0, resend; 13148c2ecf20Sopenharmony_ci 13158c2ecf20Sopenharmony_ci memset(&request, 0, sizeof(request)); 13168c2ecf20Sopenharmony_ci request.node_idx = dlm->node_num; 13178c2ecf20Sopenharmony_ci 13188c2ecf20Sopenharmony_ci BUG_ON(mle->type == DLM_MLE_MIGRATION); 13198c2ecf20Sopenharmony_ci 13208c2ecf20Sopenharmony_ci request.namelen = (u8)mle->mnamelen; 13218c2ecf20Sopenharmony_ci memcpy(request.name, mle->mname, request.namelen); 13228c2ecf20Sopenharmony_ci 13238c2ecf20Sopenharmony_ciagain: 13248c2ecf20Sopenharmony_ci ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request, 13258c2ecf20Sopenharmony_ci sizeof(request), to, &response); 13268c2ecf20Sopenharmony_ci if (ret < 0) { 13278c2ecf20Sopenharmony_ci if (ret == -ESRCH) { 13288c2ecf20Sopenharmony_ci /* should never happen */ 13298c2ecf20Sopenharmony_ci mlog(ML_ERROR, "TCP stack not ready!\n"); 13308c2ecf20Sopenharmony_ci BUG(); 13318c2ecf20Sopenharmony_ci } else if (ret == -EINVAL) { 13328c2ecf20Sopenharmony_ci mlog(ML_ERROR, "bad args passed to o2net!\n"); 13338c2ecf20Sopenharmony_ci BUG(); 13348c2ecf20Sopenharmony_ci } else if (ret == -ENOMEM) { 13358c2ecf20Sopenharmony_ci mlog(ML_ERROR, "out of memory while trying to send " 13368c2ecf20Sopenharmony_ci "network message! retrying\n"); 13378c2ecf20Sopenharmony_ci /* this is totally crude */ 13388c2ecf20Sopenharmony_ci msleep(50); 13398c2ecf20Sopenharmony_ci goto again; 13408c2ecf20Sopenharmony_ci } else if (!dlm_is_host_down(ret)) { 13418c2ecf20Sopenharmony_ci /* not a network error. bad. */ 13428c2ecf20Sopenharmony_ci mlog_errno(ret); 13438c2ecf20Sopenharmony_ci mlog(ML_ERROR, "unhandled error!"); 13448c2ecf20Sopenharmony_ci BUG(); 13458c2ecf20Sopenharmony_ci } 13468c2ecf20Sopenharmony_ci /* all other errors should be network errors, 13478c2ecf20Sopenharmony_ci * and likely indicate node death */ 13488c2ecf20Sopenharmony_ci mlog(ML_ERROR, "link to %d went down!\n", to); 13498c2ecf20Sopenharmony_ci goto out; 13508c2ecf20Sopenharmony_ci } 13518c2ecf20Sopenharmony_ci 13528c2ecf20Sopenharmony_ci ret = 0; 13538c2ecf20Sopenharmony_ci resend = 0; 13548c2ecf20Sopenharmony_ci spin_lock(&mle->spinlock); 13558c2ecf20Sopenharmony_ci switch (response) { 13568c2ecf20Sopenharmony_ci case DLM_MASTER_RESP_YES: 13578c2ecf20Sopenharmony_ci set_bit(to, mle->response_map); 13588c2ecf20Sopenharmony_ci mlog(0, "node %u is the master, response=YES\n", to); 13598c2ecf20Sopenharmony_ci mlog(0, "%s:%.*s: master node %u now knows I have a " 13608c2ecf20Sopenharmony_ci "reference\n", dlm->name, res->lockname.len, 13618c2ecf20Sopenharmony_ci res->lockname.name, to); 13628c2ecf20Sopenharmony_ci mle->master = to; 13638c2ecf20Sopenharmony_ci break; 13648c2ecf20Sopenharmony_ci case DLM_MASTER_RESP_NO: 13658c2ecf20Sopenharmony_ci mlog(0, "node %u not master, response=NO\n", to); 13668c2ecf20Sopenharmony_ci set_bit(to, mle->response_map); 13678c2ecf20Sopenharmony_ci break; 13688c2ecf20Sopenharmony_ci case DLM_MASTER_RESP_MAYBE: 13698c2ecf20Sopenharmony_ci mlog(0, "node %u not master, response=MAYBE\n", to); 13708c2ecf20Sopenharmony_ci set_bit(to, mle->response_map); 13718c2ecf20Sopenharmony_ci set_bit(to, mle->maybe_map); 13728c2ecf20Sopenharmony_ci break; 13738c2ecf20Sopenharmony_ci case DLM_MASTER_RESP_ERROR: 13748c2ecf20Sopenharmony_ci mlog(0, "node %u hit an error, resending\n", to); 13758c2ecf20Sopenharmony_ci resend = 1; 13768c2ecf20Sopenharmony_ci response = 0; 13778c2ecf20Sopenharmony_ci break; 13788c2ecf20Sopenharmony_ci default: 13798c2ecf20Sopenharmony_ci mlog(ML_ERROR, "bad response! %u\n", response); 13808c2ecf20Sopenharmony_ci BUG(); 13818c2ecf20Sopenharmony_ci } 13828c2ecf20Sopenharmony_ci spin_unlock(&mle->spinlock); 13838c2ecf20Sopenharmony_ci if (resend) { 13848c2ecf20Sopenharmony_ci /* this is also totally crude */ 13858c2ecf20Sopenharmony_ci msleep(50); 13868c2ecf20Sopenharmony_ci goto again; 13878c2ecf20Sopenharmony_ci } 13888c2ecf20Sopenharmony_ci 13898c2ecf20Sopenharmony_ciout: 13908c2ecf20Sopenharmony_ci return ret; 13918c2ecf20Sopenharmony_ci} 13928c2ecf20Sopenharmony_ci 13938c2ecf20Sopenharmony_ci/* 13948c2ecf20Sopenharmony_ci * locks that can be taken here: 13958c2ecf20Sopenharmony_ci * dlm->spinlock 13968c2ecf20Sopenharmony_ci * res->spinlock 13978c2ecf20Sopenharmony_ci * mle->spinlock 13988c2ecf20Sopenharmony_ci * dlm->master_list 13998c2ecf20Sopenharmony_ci * 14008c2ecf20Sopenharmony_ci * if possible, TRIM THIS DOWN!!! 14018c2ecf20Sopenharmony_ci */ 14028c2ecf20Sopenharmony_ciint dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, 14038c2ecf20Sopenharmony_ci void **ret_data) 14048c2ecf20Sopenharmony_ci{ 14058c2ecf20Sopenharmony_ci u8 response = DLM_MASTER_RESP_MAYBE; 14068c2ecf20Sopenharmony_ci struct dlm_ctxt *dlm = data; 14078c2ecf20Sopenharmony_ci struct dlm_lock_resource *res = NULL; 14088c2ecf20Sopenharmony_ci struct dlm_master_request *request = (struct dlm_master_request *) msg->buf; 14098c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL; 14108c2ecf20Sopenharmony_ci char *name; 14118c2ecf20Sopenharmony_ci unsigned int namelen, hash; 14128c2ecf20Sopenharmony_ci int found, ret; 14138c2ecf20Sopenharmony_ci int set_maybe; 14148c2ecf20Sopenharmony_ci int dispatch_assert = 0; 14158c2ecf20Sopenharmony_ci int dispatched = 0; 14168c2ecf20Sopenharmony_ci 14178c2ecf20Sopenharmony_ci if (!dlm_grab(dlm)) 14188c2ecf20Sopenharmony_ci return DLM_MASTER_RESP_NO; 14198c2ecf20Sopenharmony_ci 14208c2ecf20Sopenharmony_ci if (!dlm_domain_fully_joined(dlm)) { 14218c2ecf20Sopenharmony_ci response = DLM_MASTER_RESP_NO; 14228c2ecf20Sopenharmony_ci goto send_response; 14238c2ecf20Sopenharmony_ci } 14248c2ecf20Sopenharmony_ci 14258c2ecf20Sopenharmony_ci name = request->name; 14268c2ecf20Sopenharmony_ci namelen = request->namelen; 14278c2ecf20Sopenharmony_ci hash = dlm_lockid_hash(name, namelen); 14288c2ecf20Sopenharmony_ci 14298c2ecf20Sopenharmony_ci if (namelen > DLM_LOCKID_NAME_MAX) { 14308c2ecf20Sopenharmony_ci response = DLM_IVBUFLEN; 14318c2ecf20Sopenharmony_ci goto send_response; 14328c2ecf20Sopenharmony_ci } 14338c2ecf20Sopenharmony_ci 14348c2ecf20Sopenharmony_ciway_up_top: 14358c2ecf20Sopenharmony_ci spin_lock(&dlm->spinlock); 14368c2ecf20Sopenharmony_ci res = __dlm_lookup_lockres(dlm, name, namelen, hash); 14378c2ecf20Sopenharmony_ci if (res) { 14388c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 14398c2ecf20Sopenharmony_ci 14408c2ecf20Sopenharmony_ci /* take care of the easy cases up front */ 14418c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 14428c2ecf20Sopenharmony_ci 14438c2ecf20Sopenharmony_ci /* 14448c2ecf20Sopenharmony_ci * Right after dlm spinlock was released, dlm_thread could have 14458c2ecf20Sopenharmony_ci * purged the lockres. Check if lockres got unhashed. If so 14468c2ecf20Sopenharmony_ci * start over. 14478c2ecf20Sopenharmony_ci */ 14488c2ecf20Sopenharmony_ci if (hlist_unhashed(&res->hash_node)) { 14498c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 14508c2ecf20Sopenharmony_ci dlm_lockres_put(res); 14518c2ecf20Sopenharmony_ci goto way_up_top; 14528c2ecf20Sopenharmony_ci } 14538c2ecf20Sopenharmony_ci 14548c2ecf20Sopenharmony_ci if (res->state & (DLM_LOCK_RES_RECOVERING| 14558c2ecf20Sopenharmony_ci DLM_LOCK_RES_MIGRATING)) { 14568c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 14578c2ecf20Sopenharmony_ci mlog(0, "returning DLM_MASTER_RESP_ERROR since res is " 14588c2ecf20Sopenharmony_ci "being recovered/migrated\n"); 14598c2ecf20Sopenharmony_ci response = DLM_MASTER_RESP_ERROR; 14608c2ecf20Sopenharmony_ci if (mle) 14618c2ecf20Sopenharmony_ci kmem_cache_free(dlm_mle_cache, mle); 14628c2ecf20Sopenharmony_ci goto send_response; 14638c2ecf20Sopenharmony_ci } 14648c2ecf20Sopenharmony_ci 14658c2ecf20Sopenharmony_ci if (res->owner == dlm->node_num) { 14668c2ecf20Sopenharmony_ci dlm_lockres_set_refmap_bit(dlm, res, request->node_idx); 14678c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 14688c2ecf20Sopenharmony_ci response = DLM_MASTER_RESP_YES; 14698c2ecf20Sopenharmony_ci if (mle) 14708c2ecf20Sopenharmony_ci kmem_cache_free(dlm_mle_cache, mle); 14718c2ecf20Sopenharmony_ci 14728c2ecf20Sopenharmony_ci /* this node is the owner. 14738c2ecf20Sopenharmony_ci * there is some extra work that needs to 14748c2ecf20Sopenharmony_ci * happen now. the requesting node has 14758c2ecf20Sopenharmony_ci * caused all nodes up to this one to 14768c2ecf20Sopenharmony_ci * create mles. this node now needs to 14778c2ecf20Sopenharmony_ci * go back and clean those up. */ 14788c2ecf20Sopenharmony_ci dispatch_assert = 1; 14798c2ecf20Sopenharmony_ci goto send_response; 14808c2ecf20Sopenharmony_ci } else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { 14818c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 14828c2ecf20Sopenharmony_ci // mlog(0, "node %u is the master\n", res->owner); 14838c2ecf20Sopenharmony_ci response = DLM_MASTER_RESP_NO; 14848c2ecf20Sopenharmony_ci if (mle) 14858c2ecf20Sopenharmony_ci kmem_cache_free(dlm_mle_cache, mle); 14868c2ecf20Sopenharmony_ci goto send_response; 14878c2ecf20Sopenharmony_ci } 14888c2ecf20Sopenharmony_ci 14898c2ecf20Sopenharmony_ci /* ok, there is no owner. either this node is 14908c2ecf20Sopenharmony_ci * being blocked, or it is actively trying to 14918c2ecf20Sopenharmony_ci * master this lock. */ 14928c2ecf20Sopenharmony_ci if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) { 14938c2ecf20Sopenharmony_ci mlog(ML_ERROR, "lock with no owner should be " 14948c2ecf20Sopenharmony_ci "in-progress!\n"); 14958c2ecf20Sopenharmony_ci BUG(); 14968c2ecf20Sopenharmony_ci } 14978c2ecf20Sopenharmony_ci 14988c2ecf20Sopenharmony_ci // mlog(0, "lockres is in progress...\n"); 14998c2ecf20Sopenharmony_ci spin_lock(&dlm->master_lock); 15008c2ecf20Sopenharmony_ci found = dlm_find_mle(dlm, &tmpmle, name, namelen); 15018c2ecf20Sopenharmony_ci if (!found) { 15028c2ecf20Sopenharmony_ci mlog(ML_ERROR, "no mle found for this lock!\n"); 15038c2ecf20Sopenharmony_ci BUG(); 15048c2ecf20Sopenharmony_ci } 15058c2ecf20Sopenharmony_ci set_maybe = 1; 15068c2ecf20Sopenharmony_ci spin_lock(&tmpmle->spinlock); 15078c2ecf20Sopenharmony_ci if (tmpmle->type == DLM_MLE_BLOCK) { 15088c2ecf20Sopenharmony_ci // mlog(0, "this node is waiting for " 15098c2ecf20Sopenharmony_ci // "lockres to be mastered\n"); 15108c2ecf20Sopenharmony_ci response = DLM_MASTER_RESP_NO; 15118c2ecf20Sopenharmony_ci } else if (tmpmle->type == DLM_MLE_MIGRATION) { 15128c2ecf20Sopenharmony_ci mlog(0, "node %u is master, but trying to migrate to " 15138c2ecf20Sopenharmony_ci "node %u.\n", tmpmle->master, tmpmle->new_master); 15148c2ecf20Sopenharmony_ci if (tmpmle->master == dlm->node_num) { 15158c2ecf20Sopenharmony_ci mlog(ML_ERROR, "no owner on lockres, but this " 15168c2ecf20Sopenharmony_ci "node is trying to migrate it to %u?!\n", 15178c2ecf20Sopenharmony_ci tmpmle->new_master); 15188c2ecf20Sopenharmony_ci BUG(); 15198c2ecf20Sopenharmony_ci } else { 15208c2ecf20Sopenharmony_ci /* the real master can respond on its own */ 15218c2ecf20Sopenharmony_ci response = DLM_MASTER_RESP_NO; 15228c2ecf20Sopenharmony_ci } 15238c2ecf20Sopenharmony_ci } else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) { 15248c2ecf20Sopenharmony_ci set_maybe = 0; 15258c2ecf20Sopenharmony_ci if (tmpmle->master == dlm->node_num) { 15268c2ecf20Sopenharmony_ci response = DLM_MASTER_RESP_YES; 15278c2ecf20Sopenharmony_ci /* this node will be the owner. 15288c2ecf20Sopenharmony_ci * go back and clean the mles on any 15298c2ecf20Sopenharmony_ci * other nodes */ 15308c2ecf20Sopenharmony_ci dispatch_assert = 1; 15318c2ecf20Sopenharmony_ci dlm_lockres_set_refmap_bit(dlm, res, 15328c2ecf20Sopenharmony_ci request->node_idx); 15338c2ecf20Sopenharmony_ci } else 15348c2ecf20Sopenharmony_ci response = DLM_MASTER_RESP_NO; 15358c2ecf20Sopenharmony_ci } else { 15368c2ecf20Sopenharmony_ci // mlog(0, "this node is attempting to " 15378c2ecf20Sopenharmony_ci // "master lockres\n"); 15388c2ecf20Sopenharmony_ci response = DLM_MASTER_RESP_MAYBE; 15398c2ecf20Sopenharmony_ci } 15408c2ecf20Sopenharmony_ci if (set_maybe) 15418c2ecf20Sopenharmony_ci set_bit(request->node_idx, tmpmle->maybe_map); 15428c2ecf20Sopenharmony_ci spin_unlock(&tmpmle->spinlock); 15438c2ecf20Sopenharmony_ci 15448c2ecf20Sopenharmony_ci spin_unlock(&dlm->master_lock); 15458c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 15468c2ecf20Sopenharmony_ci 15478c2ecf20Sopenharmony_ci /* keep the mle attached to heartbeat events */ 15488c2ecf20Sopenharmony_ci dlm_put_mle(tmpmle); 15498c2ecf20Sopenharmony_ci if (mle) 15508c2ecf20Sopenharmony_ci kmem_cache_free(dlm_mle_cache, mle); 15518c2ecf20Sopenharmony_ci goto send_response; 15528c2ecf20Sopenharmony_ci } 15538c2ecf20Sopenharmony_ci 15548c2ecf20Sopenharmony_ci /* 15558c2ecf20Sopenharmony_ci * lockres doesn't exist on this node 15568c2ecf20Sopenharmony_ci * if there is an MLE_BLOCK, return NO 15578c2ecf20Sopenharmony_ci * if there is an MLE_MASTER, return MAYBE 15588c2ecf20Sopenharmony_ci * otherwise, add an MLE_BLOCK, return NO 15598c2ecf20Sopenharmony_ci */ 15608c2ecf20Sopenharmony_ci spin_lock(&dlm->master_lock); 15618c2ecf20Sopenharmony_ci found = dlm_find_mle(dlm, &tmpmle, name, namelen); 15628c2ecf20Sopenharmony_ci if (!found) { 15638c2ecf20Sopenharmony_ci /* this lockid has never been seen on this node yet */ 15648c2ecf20Sopenharmony_ci // mlog(0, "no mle found\n"); 15658c2ecf20Sopenharmony_ci if (!mle) { 15668c2ecf20Sopenharmony_ci spin_unlock(&dlm->master_lock); 15678c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 15688c2ecf20Sopenharmony_ci 15698c2ecf20Sopenharmony_ci mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); 15708c2ecf20Sopenharmony_ci if (!mle) { 15718c2ecf20Sopenharmony_ci response = DLM_MASTER_RESP_ERROR; 15728c2ecf20Sopenharmony_ci mlog_errno(-ENOMEM); 15738c2ecf20Sopenharmony_ci goto send_response; 15748c2ecf20Sopenharmony_ci } 15758c2ecf20Sopenharmony_ci goto way_up_top; 15768c2ecf20Sopenharmony_ci } 15778c2ecf20Sopenharmony_ci 15788c2ecf20Sopenharmony_ci // mlog(0, "this is second time thru, already allocated, " 15798c2ecf20Sopenharmony_ci // "add the block.\n"); 15808c2ecf20Sopenharmony_ci dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen); 15818c2ecf20Sopenharmony_ci set_bit(request->node_idx, mle->maybe_map); 15828c2ecf20Sopenharmony_ci __dlm_insert_mle(dlm, mle); 15838c2ecf20Sopenharmony_ci response = DLM_MASTER_RESP_NO; 15848c2ecf20Sopenharmony_ci } else { 15858c2ecf20Sopenharmony_ci spin_lock(&tmpmle->spinlock); 15868c2ecf20Sopenharmony_ci if (tmpmle->master == dlm->node_num) { 15878c2ecf20Sopenharmony_ci mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n"); 15888c2ecf20Sopenharmony_ci BUG(); 15898c2ecf20Sopenharmony_ci } 15908c2ecf20Sopenharmony_ci if (tmpmle->type == DLM_MLE_BLOCK) 15918c2ecf20Sopenharmony_ci response = DLM_MASTER_RESP_NO; 15928c2ecf20Sopenharmony_ci else if (tmpmle->type == DLM_MLE_MIGRATION) { 15938c2ecf20Sopenharmony_ci mlog(0, "migration mle was found (%u->%u)\n", 15948c2ecf20Sopenharmony_ci tmpmle->master, tmpmle->new_master); 15958c2ecf20Sopenharmony_ci /* real master can respond on its own */ 15968c2ecf20Sopenharmony_ci response = DLM_MASTER_RESP_NO; 15978c2ecf20Sopenharmony_ci } else 15988c2ecf20Sopenharmony_ci response = DLM_MASTER_RESP_MAYBE; 15998c2ecf20Sopenharmony_ci set_bit(request->node_idx, tmpmle->maybe_map); 16008c2ecf20Sopenharmony_ci spin_unlock(&tmpmle->spinlock); 16018c2ecf20Sopenharmony_ci } 16028c2ecf20Sopenharmony_ci spin_unlock(&dlm->master_lock); 16038c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 16048c2ecf20Sopenharmony_ci 16058c2ecf20Sopenharmony_ci if (found) { 16068c2ecf20Sopenharmony_ci /* keep the mle attached to heartbeat events */ 16078c2ecf20Sopenharmony_ci dlm_put_mle(tmpmle); 16088c2ecf20Sopenharmony_ci } 16098c2ecf20Sopenharmony_cisend_response: 16108c2ecf20Sopenharmony_ci /* 16118c2ecf20Sopenharmony_ci * __dlm_lookup_lockres() grabbed a reference to this lockres. 16128c2ecf20Sopenharmony_ci * The reference is released by dlm_assert_master_worker() under 16138c2ecf20Sopenharmony_ci * the call to dlm_dispatch_assert_master(). If 16148c2ecf20Sopenharmony_ci * dlm_assert_master_worker() isn't called, we drop it here. 16158c2ecf20Sopenharmony_ci */ 16168c2ecf20Sopenharmony_ci if (dispatch_assert) { 16178c2ecf20Sopenharmony_ci mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", 16188c2ecf20Sopenharmony_ci dlm->node_num, res->lockname.len, res->lockname.name); 16198c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 16208c2ecf20Sopenharmony_ci ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx, 16218c2ecf20Sopenharmony_ci DLM_ASSERT_MASTER_MLE_CLEANUP); 16228c2ecf20Sopenharmony_ci if (ret < 0) { 16238c2ecf20Sopenharmony_ci mlog(ML_ERROR, "failed to dispatch assert master work\n"); 16248c2ecf20Sopenharmony_ci response = DLM_MASTER_RESP_ERROR; 16258c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 16268c2ecf20Sopenharmony_ci dlm_lockres_put(res); 16278c2ecf20Sopenharmony_ci } else { 16288c2ecf20Sopenharmony_ci dispatched = 1; 16298c2ecf20Sopenharmony_ci __dlm_lockres_grab_inflight_worker(dlm, res); 16308c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 16318c2ecf20Sopenharmony_ci } 16328c2ecf20Sopenharmony_ci } else { 16338c2ecf20Sopenharmony_ci if (res) 16348c2ecf20Sopenharmony_ci dlm_lockres_put(res); 16358c2ecf20Sopenharmony_ci } 16368c2ecf20Sopenharmony_ci 16378c2ecf20Sopenharmony_ci if (!dispatched) 16388c2ecf20Sopenharmony_ci dlm_put(dlm); 16398c2ecf20Sopenharmony_ci return response; 16408c2ecf20Sopenharmony_ci} 16418c2ecf20Sopenharmony_ci 16428c2ecf20Sopenharmony_ci/* 16438c2ecf20Sopenharmony_ci * DLM_ASSERT_MASTER_MSG 16448c2ecf20Sopenharmony_ci */ 16458c2ecf20Sopenharmony_ci 16468c2ecf20Sopenharmony_ci 16478c2ecf20Sopenharmony_ci/* 16488c2ecf20Sopenharmony_ci * NOTE: this can be used for debugging 16498c2ecf20Sopenharmony_ci * can periodically run all locks owned by this node 16508c2ecf20Sopenharmony_ci * and re-assert across the cluster... 16518c2ecf20Sopenharmony_ci */ 16528c2ecf20Sopenharmony_cistatic int dlm_do_assert_master(struct dlm_ctxt *dlm, 16538c2ecf20Sopenharmony_ci struct dlm_lock_resource *res, 16548c2ecf20Sopenharmony_ci void *nodemap, u32 flags) 16558c2ecf20Sopenharmony_ci{ 16568c2ecf20Sopenharmony_ci struct dlm_assert_master assert; 16578c2ecf20Sopenharmony_ci int to, tmpret; 16588c2ecf20Sopenharmony_ci struct dlm_node_iter iter; 16598c2ecf20Sopenharmony_ci int ret = 0; 16608c2ecf20Sopenharmony_ci int reassert; 16618c2ecf20Sopenharmony_ci const char *lockname = res->lockname.name; 16628c2ecf20Sopenharmony_ci unsigned int namelen = res->lockname.len; 16638c2ecf20Sopenharmony_ci 16648c2ecf20Sopenharmony_ci BUG_ON(namelen > O2NM_MAX_NAME_LEN); 16658c2ecf20Sopenharmony_ci 16668c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 16678c2ecf20Sopenharmony_ci res->state |= DLM_LOCK_RES_SETREF_INPROG; 16688c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 16698c2ecf20Sopenharmony_ci 16708c2ecf20Sopenharmony_ciagain: 16718c2ecf20Sopenharmony_ci reassert = 0; 16728c2ecf20Sopenharmony_ci 16738c2ecf20Sopenharmony_ci /* note that if this nodemap is empty, it returns 0 */ 16748c2ecf20Sopenharmony_ci dlm_node_iter_init(nodemap, &iter); 16758c2ecf20Sopenharmony_ci while ((to = dlm_node_iter_next(&iter)) >= 0) { 16768c2ecf20Sopenharmony_ci int r = 0; 16778c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle = NULL; 16788c2ecf20Sopenharmony_ci 16798c2ecf20Sopenharmony_ci mlog(0, "sending assert master to %d (%.*s)\n", to, 16808c2ecf20Sopenharmony_ci namelen, lockname); 16818c2ecf20Sopenharmony_ci memset(&assert, 0, sizeof(assert)); 16828c2ecf20Sopenharmony_ci assert.node_idx = dlm->node_num; 16838c2ecf20Sopenharmony_ci assert.namelen = namelen; 16848c2ecf20Sopenharmony_ci memcpy(assert.name, lockname, namelen); 16858c2ecf20Sopenharmony_ci assert.flags = cpu_to_be32(flags); 16868c2ecf20Sopenharmony_ci 16878c2ecf20Sopenharmony_ci tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, 16888c2ecf20Sopenharmony_ci &assert, sizeof(assert), to, &r); 16898c2ecf20Sopenharmony_ci if (tmpret < 0) { 16908c2ecf20Sopenharmony_ci mlog(ML_ERROR, "Error %d when sending message %u (key " 16918c2ecf20Sopenharmony_ci "0x%x) to node %u\n", tmpret, 16928c2ecf20Sopenharmony_ci DLM_ASSERT_MASTER_MSG, dlm->key, to); 16938c2ecf20Sopenharmony_ci if (!dlm_is_host_down(tmpret)) { 16948c2ecf20Sopenharmony_ci mlog(ML_ERROR, "unhandled error=%d!\n", tmpret); 16958c2ecf20Sopenharmony_ci BUG(); 16968c2ecf20Sopenharmony_ci } 16978c2ecf20Sopenharmony_ci /* a node died. finish out the rest of the nodes. */ 16988c2ecf20Sopenharmony_ci mlog(0, "link to %d went down!\n", to); 16998c2ecf20Sopenharmony_ci /* any nonzero status return will do */ 17008c2ecf20Sopenharmony_ci ret = tmpret; 17018c2ecf20Sopenharmony_ci r = 0; 17028c2ecf20Sopenharmony_ci } else if (r < 0) { 17038c2ecf20Sopenharmony_ci /* ok, something horribly messed. kill thyself. */ 17048c2ecf20Sopenharmony_ci mlog(ML_ERROR,"during assert master of %.*s to %u, " 17058c2ecf20Sopenharmony_ci "got %d.\n", namelen, lockname, to, r); 17068c2ecf20Sopenharmony_ci spin_lock(&dlm->spinlock); 17078c2ecf20Sopenharmony_ci spin_lock(&dlm->master_lock); 17088c2ecf20Sopenharmony_ci if (dlm_find_mle(dlm, &mle, (char *)lockname, 17098c2ecf20Sopenharmony_ci namelen)) { 17108c2ecf20Sopenharmony_ci dlm_print_one_mle(mle); 17118c2ecf20Sopenharmony_ci __dlm_put_mle(mle); 17128c2ecf20Sopenharmony_ci } 17138c2ecf20Sopenharmony_ci spin_unlock(&dlm->master_lock); 17148c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 17158c2ecf20Sopenharmony_ci BUG(); 17168c2ecf20Sopenharmony_ci } 17178c2ecf20Sopenharmony_ci 17188c2ecf20Sopenharmony_ci if (r & DLM_ASSERT_RESPONSE_REASSERT && 17198c2ecf20Sopenharmony_ci !(r & DLM_ASSERT_RESPONSE_MASTERY_REF)) { 17208c2ecf20Sopenharmony_ci mlog(ML_ERROR, "%.*s: very strange, " 17218c2ecf20Sopenharmony_ci "master MLE but no lockres on %u\n", 17228c2ecf20Sopenharmony_ci namelen, lockname, to); 17238c2ecf20Sopenharmony_ci } 17248c2ecf20Sopenharmony_ci 17258c2ecf20Sopenharmony_ci if (r & DLM_ASSERT_RESPONSE_REASSERT) { 17268c2ecf20Sopenharmony_ci mlog(0, "%.*s: node %u create mles on other " 17278c2ecf20Sopenharmony_ci "nodes and requests a re-assert\n", 17288c2ecf20Sopenharmony_ci namelen, lockname, to); 17298c2ecf20Sopenharmony_ci reassert = 1; 17308c2ecf20Sopenharmony_ci } 17318c2ecf20Sopenharmony_ci if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) { 17328c2ecf20Sopenharmony_ci mlog(0, "%.*s: node %u has a reference to this " 17338c2ecf20Sopenharmony_ci "lockres, set the bit in the refmap\n", 17348c2ecf20Sopenharmony_ci namelen, lockname, to); 17358c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 17368c2ecf20Sopenharmony_ci dlm_lockres_set_refmap_bit(dlm, res, to); 17378c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 17388c2ecf20Sopenharmony_ci } 17398c2ecf20Sopenharmony_ci } 17408c2ecf20Sopenharmony_ci 17418c2ecf20Sopenharmony_ci if (reassert) 17428c2ecf20Sopenharmony_ci goto again; 17438c2ecf20Sopenharmony_ci 17448c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 17458c2ecf20Sopenharmony_ci res->state &= ~DLM_LOCK_RES_SETREF_INPROG; 17468c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 17478c2ecf20Sopenharmony_ci wake_up(&res->wq); 17488c2ecf20Sopenharmony_ci 17498c2ecf20Sopenharmony_ci return ret; 17508c2ecf20Sopenharmony_ci} 17518c2ecf20Sopenharmony_ci 17528c2ecf20Sopenharmony_ci/* 17538c2ecf20Sopenharmony_ci * locks that can be taken here: 17548c2ecf20Sopenharmony_ci * dlm->spinlock 17558c2ecf20Sopenharmony_ci * res->spinlock 17568c2ecf20Sopenharmony_ci * mle->spinlock 17578c2ecf20Sopenharmony_ci * dlm->master_list 17588c2ecf20Sopenharmony_ci * 17598c2ecf20Sopenharmony_ci * if possible, TRIM THIS DOWN!!! 17608c2ecf20Sopenharmony_ci */ 17618c2ecf20Sopenharmony_ciint dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, 17628c2ecf20Sopenharmony_ci void **ret_data) 17638c2ecf20Sopenharmony_ci{ 17648c2ecf20Sopenharmony_ci struct dlm_ctxt *dlm = data; 17658c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle = NULL; 17668c2ecf20Sopenharmony_ci struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf; 17678c2ecf20Sopenharmony_ci struct dlm_lock_resource *res = NULL; 17688c2ecf20Sopenharmony_ci char *name; 17698c2ecf20Sopenharmony_ci unsigned int namelen, hash; 17708c2ecf20Sopenharmony_ci u32 flags; 17718c2ecf20Sopenharmony_ci int master_request = 0, have_lockres_ref = 0; 17728c2ecf20Sopenharmony_ci int ret = 0; 17738c2ecf20Sopenharmony_ci 17748c2ecf20Sopenharmony_ci if (!dlm_grab(dlm)) 17758c2ecf20Sopenharmony_ci return 0; 17768c2ecf20Sopenharmony_ci 17778c2ecf20Sopenharmony_ci name = assert->name; 17788c2ecf20Sopenharmony_ci namelen = assert->namelen; 17798c2ecf20Sopenharmony_ci hash = dlm_lockid_hash(name, namelen); 17808c2ecf20Sopenharmony_ci flags = be32_to_cpu(assert->flags); 17818c2ecf20Sopenharmony_ci 17828c2ecf20Sopenharmony_ci if (namelen > DLM_LOCKID_NAME_MAX) { 17838c2ecf20Sopenharmony_ci mlog(ML_ERROR, "Invalid name length!"); 17848c2ecf20Sopenharmony_ci goto done; 17858c2ecf20Sopenharmony_ci } 17868c2ecf20Sopenharmony_ci 17878c2ecf20Sopenharmony_ci spin_lock(&dlm->spinlock); 17888c2ecf20Sopenharmony_ci 17898c2ecf20Sopenharmony_ci if (flags) 17908c2ecf20Sopenharmony_ci mlog(0, "assert_master with flags: %u\n", flags); 17918c2ecf20Sopenharmony_ci 17928c2ecf20Sopenharmony_ci /* find the MLE */ 17938c2ecf20Sopenharmony_ci spin_lock(&dlm->master_lock); 17948c2ecf20Sopenharmony_ci if (!dlm_find_mle(dlm, &mle, name, namelen)) { 17958c2ecf20Sopenharmony_ci /* not an error, could be master just re-asserting */ 17968c2ecf20Sopenharmony_ci mlog(0, "just got an assert_master from %u, but no " 17978c2ecf20Sopenharmony_ci "MLE for it! (%.*s)\n", assert->node_idx, 17988c2ecf20Sopenharmony_ci namelen, name); 17998c2ecf20Sopenharmony_ci } else { 18008c2ecf20Sopenharmony_ci int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0); 18018c2ecf20Sopenharmony_ci if (bit >= O2NM_MAX_NODES) { 18028c2ecf20Sopenharmony_ci /* not necessarily an error, though less likely. 18038c2ecf20Sopenharmony_ci * could be master just re-asserting. */ 18048c2ecf20Sopenharmony_ci mlog(0, "no bits set in the maybe_map, but %u " 18058c2ecf20Sopenharmony_ci "is asserting! (%.*s)\n", assert->node_idx, 18068c2ecf20Sopenharmony_ci namelen, name); 18078c2ecf20Sopenharmony_ci } else if (bit != assert->node_idx) { 18088c2ecf20Sopenharmony_ci if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { 18098c2ecf20Sopenharmony_ci mlog(0, "master %u was found, %u should " 18108c2ecf20Sopenharmony_ci "back off\n", assert->node_idx, bit); 18118c2ecf20Sopenharmony_ci } else { 18128c2ecf20Sopenharmony_ci /* with the fix for bug 569, a higher node 18138c2ecf20Sopenharmony_ci * number winning the mastery will respond 18148c2ecf20Sopenharmony_ci * YES to mastery requests, but this node 18158c2ecf20Sopenharmony_ci * had no way of knowing. let it pass. */ 18168c2ecf20Sopenharmony_ci mlog(0, "%u is the lowest node, " 18178c2ecf20Sopenharmony_ci "%u is asserting. (%.*s) %u must " 18188c2ecf20Sopenharmony_ci "have begun after %u won.\n", bit, 18198c2ecf20Sopenharmony_ci assert->node_idx, namelen, name, bit, 18208c2ecf20Sopenharmony_ci assert->node_idx); 18218c2ecf20Sopenharmony_ci } 18228c2ecf20Sopenharmony_ci } 18238c2ecf20Sopenharmony_ci if (mle->type == DLM_MLE_MIGRATION) { 18248c2ecf20Sopenharmony_ci if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { 18258c2ecf20Sopenharmony_ci mlog(0, "%s:%.*s: got cleanup assert" 18268c2ecf20Sopenharmony_ci " from %u for migration\n", 18278c2ecf20Sopenharmony_ci dlm->name, namelen, name, 18288c2ecf20Sopenharmony_ci assert->node_idx); 18298c2ecf20Sopenharmony_ci } else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) { 18308c2ecf20Sopenharmony_ci mlog(0, "%s:%.*s: got unrelated assert" 18318c2ecf20Sopenharmony_ci " from %u for migration, ignoring\n", 18328c2ecf20Sopenharmony_ci dlm->name, namelen, name, 18338c2ecf20Sopenharmony_ci assert->node_idx); 18348c2ecf20Sopenharmony_ci __dlm_put_mle(mle); 18358c2ecf20Sopenharmony_ci spin_unlock(&dlm->master_lock); 18368c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 18378c2ecf20Sopenharmony_ci goto done; 18388c2ecf20Sopenharmony_ci } 18398c2ecf20Sopenharmony_ci } 18408c2ecf20Sopenharmony_ci } 18418c2ecf20Sopenharmony_ci spin_unlock(&dlm->master_lock); 18428c2ecf20Sopenharmony_ci 18438c2ecf20Sopenharmony_ci /* ok everything checks out with the MLE 18448c2ecf20Sopenharmony_ci * now check to see if there is a lockres */ 18458c2ecf20Sopenharmony_ci res = __dlm_lookup_lockres(dlm, name, namelen, hash); 18468c2ecf20Sopenharmony_ci if (res) { 18478c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 18488c2ecf20Sopenharmony_ci if (res->state & DLM_LOCK_RES_RECOVERING) { 18498c2ecf20Sopenharmony_ci mlog(ML_ERROR, "%u asserting but %.*s is " 18508c2ecf20Sopenharmony_ci "RECOVERING!\n", assert->node_idx, namelen, name); 18518c2ecf20Sopenharmony_ci goto kill; 18528c2ecf20Sopenharmony_ci } 18538c2ecf20Sopenharmony_ci if (!mle) { 18548c2ecf20Sopenharmony_ci if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN && 18558c2ecf20Sopenharmony_ci res->owner != assert->node_idx) { 18568c2ecf20Sopenharmony_ci mlog(ML_ERROR, "DIE! Mastery assert from %u, " 18578c2ecf20Sopenharmony_ci "but current owner is %u! (%.*s)\n", 18588c2ecf20Sopenharmony_ci assert->node_idx, res->owner, namelen, 18598c2ecf20Sopenharmony_ci name); 18608c2ecf20Sopenharmony_ci __dlm_print_one_lock_resource(res); 18618c2ecf20Sopenharmony_ci BUG(); 18628c2ecf20Sopenharmony_ci } 18638c2ecf20Sopenharmony_ci } else if (mle->type != DLM_MLE_MIGRATION) { 18648c2ecf20Sopenharmony_ci if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { 18658c2ecf20Sopenharmony_ci /* owner is just re-asserting */ 18668c2ecf20Sopenharmony_ci if (res->owner == assert->node_idx) { 18678c2ecf20Sopenharmony_ci mlog(0, "owner %u re-asserting on " 18688c2ecf20Sopenharmony_ci "lock %.*s\n", assert->node_idx, 18698c2ecf20Sopenharmony_ci namelen, name); 18708c2ecf20Sopenharmony_ci goto ok; 18718c2ecf20Sopenharmony_ci } 18728c2ecf20Sopenharmony_ci mlog(ML_ERROR, "got assert_master from " 18738c2ecf20Sopenharmony_ci "node %u, but %u is the owner! " 18748c2ecf20Sopenharmony_ci "(%.*s)\n", assert->node_idx, 18758c2ecf20Sopenharmony_ci res->owner, namelen, name); 18768c2ecf20Sopenharmony_ci goto kill; 18778c2ecf20Sopenharmony_ci } 18788c2ecf20Sopenharmony_ci if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) { 18798c2ecf20Sopenharmony_ci mlog(ML_ERROR, "got assert from %u, but lock " 18808c2ecf20Sopenharmony_ci "with no owner should be " 18818c2ecf20Sopenharmony_ci "in-progress! (%.*s)\n", 18828c2ecf20Sopenharmony_ci assert->node_idx, 18838c2ecf20Sopenharmony_ci namelen, name); 18848c2ecf20Sopenharmony_ci goto kill; 18858c2ecf20Sopenharmony_ci } 18868c2ecf20Sopenharmony_ci } else /* mle->type == DLM_MLE_MIGRATION */ { 18878c2ecf20Sopenharmony_ci /* should only be getting an assert from new master */ 18888c2ecf20Sopenharmony_ci if (assert->node_idx != mle->new_master) { 18898c2ecf20Sopenharmony_ci mlog(ML_ERROR, "got assert from %u, but " 18908c2ecf20Sopenharmony_ci "new master is %u, and old master " 18918c2ecf20Sopenharmony_ci "was %u (%.*s)\n", 18928c2ecf20Sopenharmony_ci assert->node_idx, mle->new_master, 18938c2ecf20Sopenharmony_ci mle->master, namelen, name); 18948c2ecf20Sopenharmony_ci goto kill; 18958c2ecf20Sopenharmony_ci } 18968c2ecf20Sopenharmony_ci 18978c2ecf20Sopenharmony_ci } 18988c2ecf20Sopenharmony_ciok: 18998c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 19008c2ecf20Sopenharmony_ci } 19018c2ecf20Sopenharmony_ci 19028c2ecf20Sopenharmony_ci // mlog(0, "woo! got an assert_master from node %u!\n", 19038c2ecf20Sopenharmony_ci // assert->node_idx); 19048c2ecf20Sopenharmony_ci if (mle) { 19058c2ecf20Sopenharmony_ci int extra_ref = 0; 19068c2ecf20Sopenharmony_ci int nn = -1; 19078c2ecf20Sopenharmony_ci int rr, err = 0; 19088c2ecf20Sopenharmony_ci 19098c2ecf20Sopenharmony_ci spin_lock(&mle->spinlock); 19108c2ecf20Sopenharmony_ci if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION) 19118c2ecf20Sopenharmony_ci extra_ref = 1; 19128c2ecf20Sopenharmony_ci else { 19138c2ecf20Sopenharmony_ci /* MASTER mle: if any bits set in the response map 19148c2ecf20Sopenharmony_ci * then the calling node needs to re-assert to clear 19158c2ecf20Sopenharmony_ci * up nodes that this node contacted */ 19168c2ecf20Sopenharmony_ci while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES, 19178c2ecf20Sopenharmony_ci nn+1)) < O2NM_MAX_NODES) { 19188c2ecf20Sopenharmony_ci if (nn != dlm->node_num && nn != assert->node_idx) { 19198c2ecf20Sopenharmony_ci master_request = 1; 19208c2ecf20Sopenharmony_ci break; 19218c2ecf20Sopenharmony_ci } 19228c2ecf20Sopenharmony_ci } 19238c2ecf20Sopenharmony_ci } 19248c2ecf20Sopenharmony_ci mle->master = assert->node_idx; 19258c2ecf20Sopenharmony_ci atomic_set(&mle->woken, 1); 19268c2ecf20Sopenharmony_ci wake_up(&mle->wq); 19278c2ecf20Sopenharmony_ci spin_unlock(&mle->spinlock); 19288c2ecf20Sopenharmony_ci 19298c2ecf20Sopenharmony_ci if (res) { 19308c2ecf20Sopenharmony_ci int wake = 0; 19318c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 19328c2ecf20Sopenharmony_ci if (mle->type == DLM_MLE_MIGRATION) { 19338c2ecf20Sopenharmony_ci mlog(0, "finishing off migration of lockres %.*s, " 19348c2ecf20Sopenharmony_ci "from %u to %u\n", 19358c2ecf20Sopenharmony_ci res->lockname.len, res->lockname.name, 19368c2ecf20Sopenharmony_ci dlm->node_num, mle->new_master); 19378c2ecf20Sopenharmony_ci res->state &= ~DLM_LOCK_RES_MIGRATING; 19388c2ecf20Sopenharmony_ci wake = 1; 19398c2ecf20Sopenharmony_ci dlm_change_lockres_owner(dlm, res, mle->new_master); 19408c2ecf20Sopenharmony_ci BUG_ON(res->state & DLM_LOCK_RES_DIRTY); 19418c2ecf20Sopenharmony_ci } else { 19428c2ecf20Sopenharmony_ci dlm_change_lockres_owner(dlm, res, mle->master); 19438c2ecf20Sopenharmony_ci } 19448c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 19458c2ecf20Sopenharmony_ci have_lockres_ref = 1; 19468c2ecf20Sopenharmony_ci if (wake) 19478c2ecf20Sopenharmony_ci wake_up(&res->wq); 19488c2ecf20Sopenharmony_ci } 19498c2ecf20Sopenharmony_ci 19508c2ecf20Sopenharmony_ci /* master is known, detach if not already detached. 19518c2ecf20Sopenharmony_ci * ensures that only one assert_master call will happen 19528c2ecf20Sopenharmony_ci * on this mle. */ 19538c2ecf20Sopenharmony_ci spin_lock(&dlm->master_lock); 19548c2ecf20Sopenharmony_ci 19558c2ecf20Sopenharmony_ci rr = kref_read(&mle->mle_refs); 19568c2ecf20Sopenharmony_ci if (mle->inuse > 0) { 19578c2ecf20Sopenharmony_ci if (extra_ref && rr < 3) 19588c2ecf20Sopenharmony_ci err = 1; 19598c2ecf20Sopenharmony_ci else if (!extra_ref && rr < 2) 19608c2ecf20Sopenharmony_ci err = 1; 19618c2ecf20Sopenharmony_ci } else { 19628c2ecf20Sopenharmony_ci if (extra_ref && rr < 2) 19638c2ecf20Sopenharmony_ci err = 1; 19648c2ecf20Sopenharmony_ci else if (!extra_ref && rr < 1) 19658c2ecf20Sopenharmony_ci err = 1; 19668c2ecf20Sopenharmony_ci } 19678c2ecf20Sopenharmony_ci if (err) { 19688c2ecf20Sopenharmony_ci mlog(ML_ERROR, "%s:%.*s: got assert master from %u " 19698c2ecf20Sopenharmony_ci "that will mess up this node, refs=%d, extra=%d, " 19708c2ecf20Sopenharmony_ci "inuse=%d\n", dlm->name, namelen, name, 19718c2ecf20Sopenharmony_ci assert->node_idx, rr, extra_ref, mle->inuse); 19728c2ecf20Sopenharmony_ci dlm_print_one_mle(mle); 19738c2ecf20Sopenharmony_ci } 19748c2ecf20Sopenharmony_ci __dlm_unlink_mle(dlm, mle); 19758c2ecf20Sopenharmony_ci __dlm_mle_detach_hb_events(dlm, mle); 19768c2ecf20Sopenharmony_ci __dlm_put_mle(mle); 19778c2ecf20Sopenharmony_ci if (extra_ref) { 19788c2ecf20Sopenharmony_ci /* the assert master message now balances the extra 19798c2ecf20Sopenharmony_ci * ref given by the master / migration request message. 19808c2ecf20Sopenharmony_ci * if this is the last put, it will be removed 19818c2ecf20Sopenharmony_ci * from the list. */ 19828c2ecf20Sopenharmony_ci __dlm_put_mle(mle); 19838c2ecf20Sopenharmony_ci } 19848c2ecf20Sopenharmony_ci spin_unlock(&dlm->master_lock); 19858c2ecf20Sopenharmony_ci } else if (res) { 19868c2ecf20Sopenharmony_ci if (res->owner != assert->node_idx) { 19878c2ecf20Sopenharmony_ci mlog(0, "assert_master from %u, but current " 19888c2ecf20Sopenharmony_ci "owner is %u (%.*s), no mle\n", assert->node_idx, 19898c2ecf20Sopenharmony_ci res->owner, namelen, name); 19908c2ecf20Sopenharmony_ci } 19918c2ecf20Sopenharmony_ci } 19928c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 19938c2ecf20Sopenharmony_ci 19948c2ecf20Sopenharmony_cidone: 19958c2ecf20Sopenharmony_ci ret = 0; 19968c2ecf20Sopenharmony_ci if (res) { 19978c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 19988c2ecf20Sopenharmony_ci res->state |= DLM_LOCK_RES_SETREF_INPROG; 19998c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 20008c2ecf20Sopenharmony_ci *ret_data = (void *)res; 20018c2ecf20Sopenharmony_ci } 20028c2ecf20Sopenharmony_ci dlm_put(dlm); 20038c2ecf20Sopenharmony_ci if (master_request) { 20048c2ecf20Sopenharmony_ci mlog(0, "need to tell master to reassert\n"); 20058c2ecf20Sopenharmony_ci /* positive. negative would shoot down the node. */ 20068c2ecf20Sopenharmony_ci ret |= DLM_ASSERT_RESPONSE_REASSERT; 20078c2ecf20Sopenharmony_ci if (!have_lockres_ref) { 20088c2ecf20Sopenharmony_ci mlog(ML_ERROR, "strange, got assert from %u, MASTER " 20098c2ecf20Sopenharmony_ci "mle present here for %s:%.*s, but no lockres!\n", 20108c2ecf20Sopenharmony_ci assert->node_idx, dlm->name, namelen, name); 20118c2ecf20Sopenharmony_ci } 20128c2ecf20Sopenharmony_ci } 20138c2ecf20Sopenharmony_ci if (have_lockres_ref) { 20148c2ecf20Sopenharmony_ci /* let the master know we have a reference to the lockres */ 20158c2ecf20Sopenharmony_ci ret |= DLM_ASSERT_RESPONSE_MASTERY_REF; 20168c2ecf20Sopenharmony_ci mlog(0, "%s:%.*s: got assert from %u, need a ref\n", 20178c2ecf20Sopenharmony_ci dlm->name, namelen, name, assert->node_idx); 20188c2ecf20Sopenharmony_ci } 20198c2ecf20Sopenharmony_ci return ret; 20208c2ecf20Sopenharmony_ci 20218c2ecf20Sopenharmony_cikill: 20228c2ecf20Sopenharmony_ci /* kill the caller! */ 20238c2ecf20Sopenharmony_ci mlog(ML_ERROR, "Bad message received from another node. Dumping state " 20248c2ecf20Sopenharmony_ci "and killing the other node now! This node is OK and can continue.\n"); 20258c2ecf20Sopenharmony_ci __dlm_print_one_lock_resource(res); 20268c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 20278c2ecf20Sopenharmony_ci spin_lock(&dlm->master_lock); 20288c2ecf20Sopenharmony_ci if (mle) 20298c2ecf20Sopenharmony_ci __dlm_put_mle(mle); 20308c2ecf20Sopenharmony_ci spin_unlock(&dlm->master_lock); 20318c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 20328c2ecf20Sopenharmony_ci *ret_data = (void *)res; 20338c2ecf20Sopenharmony_ci dlm_put(dlm); 20348c2ecf20Sopenharmony_ci return -EINVAL; 20358c2ecf20Sopenharmony_ci} 20368c2ecf20Sopenharmony_ci 20378c2ecf20Sopenharmony_civoid dlm_assert_master_post_handler(int status, void *data, void *ret_data) 20388c2ecf20Sopenharmony_ci{ 20398c2ecf20Sopenharmony_ci struct dlm_lock_resource *res = (struct dlm_lock_resource *)ret_data; 20408c2ecf20Sopenharmony_ci 20418c2ecf20Sopenharmony_ci if (ret_data) { 20428c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 20438c2ecf20Sopenharmony_ci res->state &= ~DLM_LOCK_RES_SETREF_INPROG; 20448c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 20458c2ecf20Sopenharmony_ci wake_up(&res->wq); 20468c2ecf20Sopenharmony_ci dlm_lockres_put(res); 20478c2ecf20Sopenharmony_ci } 20488c2ecf20Sopenharmony_ci return; 20498c2ecf20Sopenharmony_ci} 20508c2ecf20Sopenharmony_ci 20518c2ecf20Sopenharmony_ciint dlm_dispatch_assert_master(struct dlm_ctxt *dlm, 20528c2ecf20Sopenharmony_ci struct dlm_lock_resource *res, 20538c2ecf20Sopenharmony_ci int ignore_higher, u8 request_from, u32 flags) 20548c2ecf20Sopenharmony_ci{ 20558c2ecf20Sopenharmony_ci struct dlm_work_item *item; 20568c2ecf20Sopenharmony_ci item = kzalloc(sizeof(*item), GFP_ATOMIC); 20578c2ecf20Sopenharmony_ci if (!item) 20588c2ecf20Sopenharmony_ci return -ENOMEM; 20598c2ecf20Sopenharmony_ci 20608c2ecf20Sopenharmony_ci 20618c2ecf20Sopenharmony_ci /* queue up work for dlm_assert_master_worker */ 20628c2ecf20Sopenharmony_ci dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL); 20638c2ecf20Sopenharmony_ci item->u.am.lockres = res; /* already have a ref */ 20648c2ecf20Sopenharmony_ci /* can optionally ignore node numbers higher than this node */ 20658c2ecf20Sopenharmony_ci item->u.am.ignore_higher = ignore_higher; 20668c2ecf20Sopenharmony_ci item->u.am.request_from = request_from; 20678c2ecf20Sopenharmony_ci item->u.am.flags = flags; 20688c2ecf20Sopenharmony_ci 20698c2ecf20Sopenharmony_ci if (ignore_higher) 20708c2ecf20Sopenharmony_ci mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len, 20718c2ecf20Sopenharmony_ci res->lockname.name); 20728c2ecf20Sopenharmony_ci 20738c2ecf20Sopenharmony_ci spin_lock(&dlm->work_lock); 20748c2ecf20Sopenharmony_ci list_add_tail(&item->list, &dlm->work_list); 20758c2ecf20Sopenharmony_ci spin_unlock(&dlm->work_lock); 20768c2ecf20Sopenharmony_ci 20778c2ecf20Sopenharmony_ci queue_work(dlm->dlm_worker, &dlm->dispatched_work); 20788c2ecf20Sopenharmony_ci return 0; 20798c2ecf20Sopenharmony_ci} 20808c2ecf20Sopenharmony_ci 20818c2ecf20Sopenharmony_cistatic void dlm_assert_master_worker(struct dlm_work_item *item, void *data) 20828c2ecf20Sopenharmony_ci{ 20838c2ecf20Sopenharmony_ci struct dlm_ctxt *dlm = data; 20848c2ecf20Sopenharmony_ci int ret = 0; 20858c2ecf20Sopenharmony_ci struct dlm_lock_resource *res; 20868c2ecf20Sopenharmony_ci unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)]; 20878c2ecf20Sopenharmony_ci int ignore_higher; 20888c2ecf20Sopenharmony_ci int bit; 20898c2ecf20Sopenharmony_ci u8 request_from; 20908c2ecf20Sopenharmony_ci u32 flags; 20918c2ecf20Sopenharmony_ci 20928c2ecf20Sopenharmony_ci dlm = item->dlm; 20938c2ecf20Sopenharmony_ci res = item->u.am.lockres; 20948c2ecf20Sopenharmony_ci ignore_higher = item->u.am.ignore_higher; 20958c2ecf20Sopenharmony_ci request_from = item->u.am.request_from; 20968c2ecf20Sopenharmony_ci flags = item->u.am.flags; 20978c2ecf20Sopenharmony_ci 20988c2ecf20Sopenharmony_ci spin_lock(&dlm->spinlock); 20998c2ecf20Sopenharmony_ci memcpy(nodemap, dlm->domain_map, sizeof(nodemap)); 21008c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 21018c2ecf20Sopenharmony_ci 21028c2ecf20Sopenharmony_ci clear_bit(dlm->node_num, nodemap); 21038c2ecf20Sopenharmony_ci if (ignore_higher) { 21048c2ecf20Sopenharmony_ci /* if is this just to clear up mles for nodes below 21058c2ecf20Sopenharmony_ci * this node, do not send the message to the original 21068c2ecf20Sopenharmony_ci * caller or any node number higher than this */ 21078c2ecf20Sopenharmony_ci clear_bit(request_from, nodemap); 21088c2ecf20Sopenharmony_ci bit = dlm->node_num; 21098c2ecf20Sopenharmony_ci while (1) { 21108c2ecf20Sopenharmony_ci bit = find_next_bit(nodemap, O2NM_MAX_NODES, 21118c2ecf20Sopenharmony_ci bit+1); 21128c2ecf20Sopenharmony_ci if (bit >= O2NM_MAX_NODES) 21138c2ecf20Sopenharmony_ci break; 21148c2ecf20Sopenharmony_ci clear_bit(bit, nodemap); 21158c2ecf20Sopenharmony_ci } 21168c2ecf20Sopenharmony_ci } 21178c2ecf20Sopenharmony_ci 21188c2ecf20Sopenharmony_ci /* 21198c2ecf20Sopenharmony_ci * If we're migrating this lock to someone else, we are no 21208c2ecf20Sopenharmony_ci * longer allowed to assert out own mastery. OTOH, we need to 21218c2ecf20Sopenharmony_ci * prevent migration from starting while we're still asserting 21228c2ecf20Sopenharmony_ci * our dominance. The reserved ast delays migration. 21238c2ecf20Sopenharmony_ci */ 21248c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 21258c2ecf20Sopenharmony_ci if (res->state & DLM_LOCK_RES_MIGRATING) { 21268c2ecf20Sopenharmony_ci mlog(0, "Someone asked us to assert mastery, but we're " 21278c2ecf20Sopenharmony_ci "in the middle of migration. Skipping assert, " 21288c2ecf20Sopenharmony_ci "the new master will handle that.\n"); 21298c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 21308c2ecf20Sopenharmony_ci goto put; 21318c2ecf20Sopenharmony_ci } else 21328c2ecf20Sopenharmony_ci __dlm_lockres_reserve_ast(res); 21338c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 21348c2ecf20Sopenharmony_ci 21358c2ecf20Sopenharmony_ci /* this call now finishes out the nodemap 21368c2ecf20Sopenharmony_ci * even if one or more nodes die */ 21378c2ecf20Sopenharmony_ci mlog(0, "worker about to master %.*s here, this=%u\n", 21388c2ecf20Sopenharmony_ci res->lockname.len, res->lockname.name, dlm->node_num); 21398c2ecf20Sopenharmony_ci ret = dlm_do_assert_master(dlm, res, nodemap, flags); 21408c2ecf20Sopenharmony_ci if (ret < 0) { 21418c2ecf20Sopenharmony_ci /* no need to restart, we are done */ 21428c2ecf20Sopenharmony_ci if (!dlm_is_host_down(ret)) 21438c2ecf20Sopenharmony_ci mlog_errno(ret); 21448c2ecf20Sopenharmony_ci } 21458c2ecf20Sopenharmony_ci 21468c2ecf20Sopenharmony_ci /* Ok, we've asserted ourselves. Let's let migration start. */ 21478c2ecf20Sopenharmony_ci dlm_lockres_release_ast(dlm, res); 21488c2ecf20Sopenharmony_ci 21498c2ecf20Sopenharmony_ciput: 21508c2ecf20Sopenharmony_ci dlm_lockres_drop_inflight_worker(dlm, res); 21518c2ecf20Sopenharmony_ci 21528c2ecf20Sopenharmony_ci dlm_lockres_put(res); 21538c2ecf20Sopenharmony_ci 21548c2ecf20Sopenharmony_ci mlog(0, "finished with dlm_assert_master_worker\n"); 21558c2ecf20Sopenharmony_ci} 21568c2ecf20Sopenharmony_ci 21578c2ecf20Sopenharmony_ci/* SPECIAL CASE for the $RECOVERY lock used by the recovery thread. 21588c2ecf20Sopenharmony_ci * We cannot wait for node recovery to complete to begin mastering this 21598c2ecf20Sopenharmony_ci * lockres because this lockres is used to kick off recovery! ;-) 21608c2ecf20Sopenharmony_ci * So, do a pre-check on all living nodes to see if any of those nodes 21618c2ecf20Sopenharmony_ci * think that $RECOVERY is currently mastered by a dead node. If so, 21628c2ecf20Sopenharmony_ci * we wait a short time to allow that node to get notified by its own 21638c2ecf20Sopenharmony_ci * heartbeat stack, then check again. All $RECOVERY lock resources 21648c2ecf20Sopenharmony_ci * mastered by dead nodes are purged when the heartbeat callback is 21658c2ecf20Sopenharmony_ci * fired, so we can know for sure that it is safe to continue once 21668c2ecf20Sopenharmony_ci * the node returns a live node or no node. */ 21678c2ecf20Sopenharmony_cistatic int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, 21688c2ecf20Sopenharmony_ci struct dlm_lock_resource *res) 21698c2ecf20Sopenharmony_ci{ 21708c2ecf20Sopenharmony_ci struct dlm_node_iter iter; 21718c2ecf20Sopenharmony_ci int nodenum; 21728c2ecf20Sopenharmony_ci int ret = 0; 21738c2ecf20Sopenharmony_ci u8 master = DLM_LOCK_RES_OWNER_UNKNOWN; 21748c2ecf20Sopenharmony_ci 21758c2ecf20Sopenharmony_ci spin_lock(&dlm->spinlock); 21768c2ecf20Sopenharmony_ci dlm_node_iter_init(dlm->domain_map, &iter); 21778c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 21788c2ecf20Sopenharmony_ci 21798c2ecf20Sopenharmony_ci while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { 21808c2ecf20Sopenharmony_ci /* do not send to self */ 21818c2ecf20Sopenharmony_ci if (nodenum == dlm->node_num) 21828c2ecf20Sopenharmony_ci continue; 21838c2ecf20Sopenharmony_ci ret = dlm_do_master_requery(dlm, res, nodenum, &master); 21848c2ecf20Sopenharmony_ci if (ret < 0) { 21858c2ecf20Sopenharmony_ci mlog_errno(ret); 21868c2ecf20Sopenharmony_ci if (!dlm_is_host_down(ret)) 21878c2ecf20Sopenharmony_ci BUG(); 21888c2ecf20Sopenharmony_ci /* host is down, so answer for that node would be 21898c2ecf20Sopenharmony_ci * DLM_LOCK_RES_OWNER_UNKNOWN. continue. */ 21908c2ecf20Sopenharmony_ci ret = 0; 21918c2ecf20Sopenharmony_ci } 21928c2ecf20Sopenharmony_ci 21938c2ecf20Sopenharmony_ci if (master != DLM_LOCK_RES_OWNER_UNKNOWN) { 21948c2ecf20Sopenharmony_ci /* check to see if this master is in the recovery map */ 21958c2ecf20Sopenharmony_ci spin_lock(&dlm->spinlock); 21968c2ecf20Sopenharmony_ci if (test_bit(master, dlm->recovery_map)) { 21978c2ecf20Sopenharmony_ci mlog(ML_NOTICE, "%s: node %u has not seen " 21988c2ecf20Sopenharmony_ci "node %u go down yet, and thinks the " 21998c2ecf20Sopenharmony_ci "dead node is mastering the recovery " 22008c2ecf20Sopenharmony_ci "lock. must wait.\n", dlm->name, 22018c2ecf20Sopenharmony_ci nodenum, master); 22028c2ecf20Sopenharmony_ci ret = -EAGAIN; 22038c2ecf20Sopenharmony_ci } 22048c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 22058c2ecf20Sopenharmony_ci mlog(0, "%s: reco lock master is %u\n", dlm->name, 22068c2ecf20Sopenharmony_ci master); 22078c2ecf20Sopenharmony_ci break; 22088c2ecf20Sopenharmony_ci } 22098c2ecf20Sopenharmony_ci } 22108c2ecf20Sopenharmony_ci return ret; 22118c2ecf20Sopenharmony_ci} 22128c2ecf20Sopenharmony_ci 22138c2ecf20Sopenharmony_ci/* 22148c2ecf20Sopenharmony_ci * DLM_DEREF_LOCKRES_MSG 22158c2ecf20Sopenharmony_ci */ 22168c2ecf20Sopenharmony_ci 22178c2ecf20Sopenharmony_ciint dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 22188c2ecf20Sopenharmony_ci{ 22198c2ecf20Sopenharmony_ci struct dlm_deref_lockres deref; 22208c2ecf20Sopenharmony_ci int ret = 0, r; 22218c2ecf20Sopenharmony_ci const char *lockname; 22228c2ecf20Sopenharmony_ci unsigned int namelen; 22238c2ecf20Sopenharmony_ci 22248c2ecf20Sopenharmony_ci lockname = res->lockname.name; 22258c2ecf20Sopenharmony_ci namelen = res->lockname.len; 22268c2ecf20Sopenharmony_ci BUG_ON(namelen > O2NM_MAX_NAME_LEN); 22278c2ecf20Sopenharmony_ci 22288c2ecf20Sopenharmony_ci memset(&deref, 0, sizeof(deref)); 22298c2ecf20Sopenharmony_ci deref.node_idx = dlm->node_num; 22308c2ecf20Sopenharmony_ci deref.namelen = namelen; 22318c2ecf20Sopenharmony_ci memcpy(deref.name, lockname, namelen); 22328c2ecf20Sopenharmony_ci 22338c2ecf20Sopenharmony_ci ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, 22348c2ecf20Sopenharmony_ci &deref, sizeof(deref), res->owner, &r); 22358c2ecf20Sopenharmony_ci if (ret < 0) 22368c2ecf20Sopenharmony_ci mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n", 22378c2ecf20Sopenharmony_ci dlm->name, namelen, lockname, ret, res->owner); 22388c2ecf20Sopenharmony_ci else if (r < 0) { 22398c2ecf20Sopenharmony_ci /* BAD. other node says I did not have a ref. */ 22408c2ecf20Sopenharmony_ci mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n", 22418c2ecf20Sopenharmony_ci dlm->name, namelen, lockname, res->owner, r); 22428c2ecf20Sopenharmony_ci dlm_print_one_lock_resource(res); 22438c2ecf20Sopenharmony_ci if (r == -ENOMEM) 22448c2ecf20Sopenharmony_ci BUG(); 22458c2ecf20Sopenharmony_ci } else 22468c2ecf20Sopenharmony_ci ret = r; 22478c2ecf20Sopenharmony_ci 22488c2ecf20Sopenharmony_ci return ret; 22498c2ecf20Sopenharmony_ci} 22508c2ecf20Sopenharmony_ci 22518c2ecf20Sopenharmony_ciint dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, 22528c2ecf20Sopenharmony_ci void **ret_data) 22538c2ecf20Sopenharmony_ci{ 22548c2ecf20Sopenharmony_ci struct dlm_ctxt *dlm = data; 22558c2ecf20Sopenharmony_ci struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf; 22568c2ecf20Sopenharmony_ci struct dlm_lock_resource *res = NULL; 22578c2ecf20Sopenharmony_ci char *name; 22588c2ecf20Sopenharmony_ci unsigned int namelen; 22598c2ecf20Sopenharmony_ci int ret = -EINVAL; 22608c2ecf20Sopenharmony_ci u8 node; 22618c2ecf20Sopenharmony_ci unsigned int hash; 22628c2ecf20Sopenharmony_ci struct dlm_work_item *item; 22638c2ecf20Sopenharmony_ci int cleared = 0; 22648c2ecf20Sopenharmony_ci int dispatch = 0; 22658c2ecf20Sopenharmony_ci 22668c2ecf20Sopenharmony_ci if (!dlm_grab(dlm)) 22678c2ecf20Sopenharmony_ci return 0; 22688c2ecf20Sopenharmony_ci 22698c2ecf20Sopenharmony_ci name = deref->name; 22708c2ecf20Sopenharmony_ci namelen = deref->namelen; 22718c2ecf20Sopenharmony_ci node = deref->node_idx; 22728c2ecf20Sopenharmony_ci 22738c2ecf20Sopenharmony_ci if (namelen > DLM_LOCKID_NAME_MAX) { 22748c2ecf20Sopenharmony_ci mlog(ML_ERROR, "Invalid name length!"); 22758c2ecf20Sopenharmony_ci goto done; 22768c2ecf20Sopenharmony_ci } 22778c2ecf20Sopenharmony_ci if (deref->node_idx >= O2NM_MAX_NODES) { 22788c2ecf20Sopenharmony_ci mlog(ML_ERROR, "Invalid node number: %u\n", node); 22798c2ecf20Sopenharmony_ci goto done; 22808c2ecf20Sopenharmony_ci } 22818c2ecf20Sopenharmony_ci 22828c2ecf20Sopenharmony_ci hash = dlm_lockid_hash(name, namelen); 22838c2ecf20Sopenharmony_ci 22848c2ecf20Sopenharmony_ci spin_lock(&dlm->spinlock); 22858c2ecf20Sopenharmony_ci res = __dlm_lookup_lockres_full(dlm, name, namelen, hash); 22868c2ecf20Sopenharmony_ci if (!res) { 22878c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 22888c2ecf20Sopenharmony_ci mlog(ML_ERROR, "%s:%.*s: bad lockres name\n", 22898c2ecf20Sopenharmony_ci dlm->name, namelen, name); 22908c2ecf20Sopenharmony_ci goto done; 22918c2ecf20Sopenharmony_ci } 22928c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 22938c2ecf20Sopenharmony_ci 22948c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 22958c2ecf20Sopenharmony_ci if (res->state & DLM_LOCK_RES_SETREF_INPROG) 22968c2ecf20Sopenharmony_ci dispatch = 1; 22978c2ecf20Sopenharmony_ci else { 22988c2ecf20Sopenharmony_ci BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); 22998c2ecf20Sopenharmony_ci if (test_bit(node, res->refmap)) { 23008c2ecf20Sopenharmony_ci dlm_lockres_clear_refmap_bit(dlm, res, node); 23018c2ecf20Sopenharmony_ci cleared = 1; 23028c2ecf20Sopenharmony_ci } 23038c2ecf20Sopenharmony_ci } 23048c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 23058c2ecf20Sopenharmony_ci 23068c2ecf20Sopenharmony_ci if (!dispatch) { 23078c2ecf20Sopenharmony_ci if (cleared) 23088c2ecf20Sopenharmony_ci dlm_lockres_calc_usage(dlm, res); 23098c2ecf20Sopenharmony_ci else { 23108c2ecf20Sopenharmony_ci mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " 23118c2ecf20Sopenharmony_ci "but it is already dropped!\n", dlm->name, 23128c2ecf20Sopenharmony_ci res->lockname.len, res->lockname.name, node); 23138c2ecf20Sopenharmony_ci dlm_print_one_lock_resource(res); 23148c2ecf20Sopenharmony_ci } 23158c2ecf20Sopenharmony_ci ret = DLM_DEREF_RESPONSE_DONE; 23168c2ecf20Sopenharmony_ci goto done; 23178c2ecf20Sopenharmony_ci } 23188c2ecf20Sopenharmony_ci 23198c2ecf20Sopenharmony_ci item = kzalloc(sizeof(*item), GFP_NOFS); 23208c2ecf20Sopenharmony_ci if (!item) { 23218c2ecf20Sopenharmony_ci ret = -ENOMEM; 23228c2ecf20Sopenharmony_ci mlog_errno(ret); 23238c2ecf20Sopenharmony_ci goto done; 23248c2ecf20Sopenharmony_ci } 23258c2ecf20Sopenharmony_ci 23268c2ecf20Sopenharmony_ci dlm_init_work_item(dlm, item, dlm_deref_lockres_worker, NULL); 23278c2ecf20Sopenharmony_ci item->u.dl.deref_res = res; 23288c2ecf20Sopenharmony_ci item->u.dl.deref_node = node; 23298c2ecf20Sopenharmony_ci 23308c2ecf20Sopenharmony_ci spin_lock(&dlm->work_lock); 23318c2ecf20Sopenharmony_ci list_add_tail(&item->list, &dlm->work_list); 23328c2ecf20Sopenharmony_ci spin_unlock(&dlm->work_lock); 23338c2ecf20Sopenharmony_ci 23348c2ecf20Sopenharmony_ci queue_work(dlm->dlm_worker, &dlm->dispatched_work); 23358c2ecf20Sopenharmony_ci return DLM_DEREF_RESPONSE_INPROG; 23368c2ecf20Sopenharmony_ci 23378c2ecf20Sopenharmony_cidone: 23388c2ecf20Sopenharmony_ci if (res) 23398c2ecf20Sopenharmony_ci dlm_lockres_put(res); 23408c2ecf20Sopenharmony_ci dlm_put(dlm); 23418c2ecf20Sopenharmony_ci 23428c2ecf20Sopenharmony_ci return ret; 23438c2ecf20Sopenharmony_ci} 23448c2ecf20Sopenharmony_ci 23458c2ecf20Sopenharmony_ciint dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data, 23468c2ecf20Sopenharmony_ci void **ret_data) 23478c2ecf20Sopenharmony_ci{ 23488c2ecf20Sopenharmony_ci struct dlm_ctxt *dlm = data; 23498c2ecf20Sopenharmony_ci struct dlm_deref_lockres_done *deref 23508c2ecf20Sopenharmony_ci = (struct dlm_deref_lockres_done *)msg->buf; 23518c2ecf20Sopenharmony_ci struct dlm_lock_resource *res = NULL; 23528c2ecf20Sopenharmony_ci char *name; 23538c2ecf20Sopenharmony_ci unsigned int namelen; 23548c2ecf20Sopenharmony_ci int ret = -EINVAL; 23558c2ecf20Sopenharmony_ci u8 node; 23568c2ecf20Sopenharmony_ci unsigned int hash; 23578c2ecf20Sopenharmony_ci 23588c2ecf20Sopenharmony_ci if (!dlm_grab(dlm)) 23598c2ecf20Sopenharmony_ci return 0; 23608c2ecf20Sopenharmony_ci 23618c2ecf20Sopenharmony_ci name = deref->name; 23628c2ecf20Sopenharmony_ci namelen = deref->namelen; 23638c2ecf20Sopenharmony_ci node = deref->node_idx; 23648c2ecf20Sopenharmony_ci 23658c2ecf20Sopenharmony_ci if (namelen > DLM_LOCKID_NAME_MAX) { 23668c2ecf20Sopenharmony_ci mlog(ML_ERROR, "Invalid name length!"); 23678c2ecf20Sopenharmony_ci goto done; 23688c2ecf20Sopenharmony_ci } 23698c2ecf20Sopenharmony_ci if (deref->node_idx >= O2NM_MAX_NODES) { 23708c2ecf20Sopenharmony_ci mlog(ML_ERROR, "Invalid node number: %u\n", node); 23718c2ecf20Sopenharmony_ci goto done; 23728c2ecf20Sopenharmony_ci } 23738c2ecf20Sopenharmony_ci 23748c2ecf20Sopenharmony_ci hash = dlm_lockid_hash(name, namelen); 23758c2ecf20Sopenharmony_ci 23768c2ecf20Sopenharmony_ci spin_lock(&dlm->spinlock); 23778c2ecf20Sopenharmony_ci res = __dlm_lookup_lockres_full(dlm, name, namelen, hash); 23788c2ecf20Sopenharmony_ci if (!res) { 23798c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 23808c2ecf20Sopenharmony_ci mlog(ML_ERROR, "%s:%.*s: bad lockres name\n", 23818c2ecf20Sopenharmony_ci dlm->name, namelen, name); 23828c2ecf20Sopenharmony_ci goto done; 23838c2ecf20Sopenharmony_ci } 23848c2ecf20Sopenharmony_ci 23858c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 23868c2ecf20Sopenharmony_ci if (!(res->state & DLM_LOCK_RES_DROPPING_REF)) { 23878c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 23888c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 23898c2ecf20Sopenharmony_ci mlog(ML_NOTICE, "%s:%.*s: node %u sends deref done " 23908c2ecf20Sopenharmony_ci "but it is already derefed!\n", dlm->name, 23918c2ecf20Sopenharmony_ci res->lockname.len, res->lockname.name, node); 23928c2ecf20Sopenharmony_ci ret = 0; 23938c2ecf20Sopenharmony_ci goto done; 23948c2ecf20Sopenharmony_ci } 23958c2ecf20Sopenharmony_ci 23968c2ecf20Sopenharmony_ci __dlm_do_purge_lockres(dlm, res); 23978c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 23988c2ecf20Sopenharmony_ci wake_up(&res->wq); 23998c2ecf20Sopenharmony_ci 24008c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 24018c2ecf20Sopenharmony_ci 24028c2ecf20Sopenharmony_ci ret = 0; 24038c2ecf20Sopenharmony_cidone: 24048c2ecf20Sopenharmony_ci if (res) 24058c2ecf20Sopenharmony_ci dlm_lockres_put(res); 24068c2ecf20Sopenharmony_ci dlm_put(dlm); 24078c2ecf20Sopenharmony_ci return ret; 24088c2ecf20Sopenharmony_ci} 24098c2ecf20Sopenharmony_ci 24108c2ecf20Sopenharmony_cistatic void dlm_drop_lockres_ref_done(struct dlm_ctxt *dlm, 24118c2ecf20Sopenharmony_ci struct dlm_lock_resource *res, u8 node) 24128c2ecf20Sopenharmony_ci{ 24138c2ecf20Sopenharmony_ci struct dlm_deref_lockres_done deref; 24148c2ecf20Sopenharmony_ci int ret = 0, r; 24158c2ecf20Sopenharmony_ci const char *lockname; 24168c2ecf20Sopenharmony_ci unsigned int namelen; 24178c2ecf20Sopenharmony_ci 24188c2ecf20Sopenharmony_ci lockname = res->lockname.name; 24198c2ecf20Sopenharmony_ci namelen = res->lockname.len; 24208c2ecf20Sopenharmony_ci BUG_ON(namelen > O2NM_MAX_NAME_LEN); 24218c2ecf20Sopenharmony_ci 24228c2ecf20Sopenharmony_ci memset(&deref, 0, sizeof(deref)); 24238c2ecf20Sopenharmony_ci deref.node_idx = dlm->node_num; 24248c2ecf20Sopenharmony_ci deref.namelen = namelen; 24258c2ecf20Sopenharmony_ci memcpy(deref.name, lockname, namelen); 24268c2ecf20Sopenharmony_ci 24278c2ecf20Sopenharmony_ci ret = o2net_send_message(DLM_DEREF_LOCKRES_DONE, dlm->key, 24288c2ecf20Sopenharmony_ci &deref, sizeof(deref), node, &r); 24298c2ecf20Sopenharmony_ci if (ret < 0) { 24308c2ecf20Sopenharmony_ci mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF DONE " 24318c2ecf20Sopenharmony_ci " to node %u\n", dlm->name, namelen, 24328c2ecf20Sopenharmony_ci lockname, ret, node); 24338c2ecf20Sopenharmony_ci } else if (r < 0) { 24348c2ecf20Sopenharmony_ci /* ignore the error */ 24358c2ecf20Sopenharmony_ci mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n", 24368c2ecf20Sopenharmony_ci dlm->name, namelen, lockname, node, r); 24378c2ecf20Sopenharmony_ci dlm_print_one_lock_resource(res); 24388c2ecf20Sopenharmony_ci } 24398c2ecf20Sopenharmony_ci} 24408c2ecf20Sopenharmony_ci 24418c2ecf20Sopenharmony_cistatic void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) 24428c2ecf20Sopenharmony_ci{ 24438c2ecf20Sopenharmony_ci struct dlm_ctxt *dlm; 24448c2ecf20Sopenharmony_ci struct dlm_lock_resource *res; 24458c2ecf20Sopenharmony_ci u8 node; 24468c2ecf20Sopenharmony_ci u8 cleared = 0; 24478c2ecf20Sopenharmony_ci 24488c2ecf20Sopenharmony_ci dlm = item->dlm; 24498c2ecf20Sopenharmony_ci res = item->u.dl.deref_res; 24508c2ecf20Sopenharmony_ci node = item->u.dl.deref_node; 24518c2ecf20Sopenharmony_ci 24528c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 24538c2ecf20Sopenharmony_ci BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); 24548c2ecf20Sopenharmony_ci __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); 24558c2ecf20Sopenharmony_ci if (test_bit(node, res->refmap)) { 24568c2ecf20Sopenharmony_ci dlm_lockres_clear_refmap_bit(dlm, res, node); 24578c2ecf20Sopenharmony_ci cleared = 1; 24588c2ecf20Sopenharmony_ci } 24598c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 24608c2ecf20Sopenharmony_ci 24618c2ecf20Sopenharmony_ci dlm_drop_lockres_ref_done(dlm, res, node); 24628c2ecf20Sopenharmony_ci 24638c2ecf20Sopenharmony_ci if (cleared) { 24648c2ecf20Sopenharmony_ci mlog(0, "%s:%.*s node %u ref dropped in dispatch\n", 24658c2ecf20Sopenharmony_ci dlm->name, res->lockname.len, res->lockname.name, node); 24668c2ecf20Sopenharmony_ci dlm_lockres_calc_usage(dlm, res); 24678c2ecf20Sopenharmony_ci } else { 24688c2ecf20Sopenharmony_ci mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " 24698c2ecf20Sopenharmony_ci "but it is already dropped!\n", dlm->name, 24708c2ecf20Sopenharmony_ci res->lockname.len, res->lockname.name, node); 24718c2ecf20Sopenharmony_ci dlm_print_one_lock_resource(res); 24728c2ecf20Sopenharmony_ci } 24738c2ecf20Sopenharmony_ci 24748c2ecf20Sopenharmony_ci dlm_lockres_put(res); 24758c2ecf20Sopenharmony_ci} 24768c2ecf20Sopenharmony_ci 24778c2ecf20Sopenharmony_ci/* 24788c2ecf20Sopenharmony_ci * A migratable resource is one that is : 24798c2ecf20Sopenharmony_ci * 1. locally mastered, and, 24808c2ecf20Sopenharmony_ci * 2. zero local locks, and, 24818c2ecf20Sopenharmony_ci * 3. one or more non-local locks, or, one or more references 24828c2ecf20Sopenharmony_ci * Returns 1 if yes, 0 if not. 24838c2ecf20Sopenharmony_ci */ 24848c2ecf20Sopenharmony_cistatic int dlm_is_lockres_migratable(struct dlm_ctxt *dlm, 24858c2ecf20Sopenharmony_ci struct dlm_lock_resource *res) 24868c2ecf20Sopenharmony_ci{ 24878c2ecf20Sopenharmony_ci enum dlm_lockres_list idx; 24888c2ecf20Sopenharmony_ci int nonlocal = 0, node_ref; 24898c2ecf20Sopenharmony_ci struct list_head *queue; 24908c2ecf20Sopenharmony_ci struct dlm_lock *lock; 24918c2ecf20Sopenharmony_ci u64 cookie; 24928c2ecf20Sopenharmony_ci 24938c2ecf20Sopenharmony_ci assert_spin_locked(&res->spinlock); 24948c2ecf20Sopenharmony_ci 24958c2ecf20Sopenharmony_ci /* delay migration when the lockres is in MIGRATING state */ 24968c2ecf20Sopenharmony_ci if (res->state & DLM_LOCK_RES_MIGRATING) 24978c2ecf20Sopenharmony_ci return 0; 24988c2ecf20Sopenharmony_ci 24998c2ecf20Sopenharmony_ci /* delay migration when the lockres is in RECOCERING state */ 25008c2ecf20Sopenharmony_ci if (res->state & (DLM_LOCK_RES_RECOVERING| 25018c2ecf20Sopenharmony_ci DLM_LOCK_RES_RECOVERY_WAITING)) 25028c2ecf20Sopenharmony_ci return 0; 25038c2ecf20Sopenharmony_ci 25048c2ecf20Sopenharmony_ci if (res->owner != dlm->node_num) 25058c2ecf20Sopenharmony_ci return 0; 25068c2ecf20Sopenharmony_ci 25078c2ecf20Sopenharmony_ci for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) { 25088c2ecf20Sopenharmony_ci queue = dlm_list_idx_to_ptr(res, idx); 25098c2ecf20Sopenharmony_ci list_for_each_entry(lock, queue, list) { 25108c2ecf20Sopenharmony_ci if (lock->ml.node != dlm->node_num) { 25118c2ecf20Sopenharmony_ci nonlocal++; 25128c2ecf20Sopenharmony_ci continue; 25138c2ecf20Sopenharmony_ci } 25148c2ecf20Sopenharmony_ci cookie = be64_to_cpu(lock->ml.cookie); 25158c2ecf20Sopenharmony_ci mlog(0, "%s: Not migratable res %.*s, lock %u:%llu on " 25168c2ecf20Sopenharmony_ci "%s list\n", dlm->name, res->lockname.len, 25178c2ecf20Sopenharmony_ci res->lockname.name, 25188c2ecf20Sopenharmony_ci dlm_get_lock_cookie_node(cookie), 25198c2ecf20Sopenharmony_ci dlm_get_lock_cookie_seq(cookie), 25208c2ecf20Sopenharmony_ci dlm_list_in_text(idx)); 25218c2ecf20Sopenharmony_ci return 0; 25228c2ecf20Sopenharmony_ci } 25238c2ecf20Sopenharmony_ci } 25248c2ecf20Sopenharmony_ci 25258c2ecf20Sopenharmony_ci if (!nonlocal) { 25268c2ecf20Sopenharmony_ci node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); 25278c2ecf20Sopenharmony_ci if (node_ref >= O2NM_MAX_NODES) 25288c2ecf20Sopenharmony_ci return 0; 25298c2ecf20Sopenharmony_ci } 25308c2ecf20Sopenharmony_ci 25318c2ecf20Sopenharmony_ci mlog(0, "%s: res %.*s, Migratable\n", dlm->name, res->lockname.len, 25328c2ecf20Sopenharmony_ci res->lockname.name); 25338c2ecf20Sopenharmony_ci 25348c2ecf20Sopenharmony_ci return 1; 25358c2ecf20Sopenharmony_ci} 25368c2ecf20Sopenharmony_ci 25378c2ecf20Sopenharmony_ci/* 25388c2ecf20Sopenharmony_ci * DLM_MIGRATE_LOCKRES 25398c2ecf20Sopenharmony_ci */ 25408c2ecf20Sopenharmony_ci 25418c2ecf20Sopenharmony_ci 25428c2ecf20Sopenharmony_cistatic int dlm_migrate_lockres(struct dlm_ctxt *dlm, 25438c2ecf20Sopenharmony_ci struct dlm_lock_resource *res, u8 target) 25448c2ecf20Sopenharmony_ci{ 25458c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle = NULL; 25468c2ecf20Sopenharmony_ci struct dlm_master_list_entry *oldmle = NULL; 25478c2ecf20Sopenharmony_ci struct dlm_migratable_lockres *mres = NULL; 25488c2ecf20Sopenharmony_ci int ret = 0; 25498c2ecf20Sopenharmony_ci const char *name; 25508c2ecf20Sopenharmony_ci unsigned int namelen; 25518c2ecf20Sopenharmony_ci int mle_added = 0; 25528c2ecf20Sopenharmony_ci int wake = 0; 25538c2ecf20Sopenharmony_ci 25548c2ecf20Sopenharmony_ci if (!dlm_grab(dlm)) 25558c2ecf20Sopenharmony_ci return -EINVAL; 25568c2ecf20Sopenharmony_ci 25578c2ecf20Sopenharmony_ci name = res->lockname.name; 25588c2ecf20Sopenharmony_ci namelen = res->lockname.len; 25598c2ecf20Sopenharmony_ci 25608c2ecf20Sopenharmony_ci mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name, 25618c2ecf20Sopenharmony_ci target); 25628c2ecf20Sopenharmony_ci 25638c2ecf20Sopenharmony_ci /* preallocate up front. if this fails, abort */ 25648c2ecf20Sopenharmony_ci ret = -ENOMEM; 25658c2ecf20Sopenharmony_ci mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS); 25668c2ecf20Sopenharmony_ci if (!mres) { 25678c2ecf20Sopenharmony_ci mlog_errno(ret); 25688c2ecf20Sopenharmony_ci goto leave; 25698c2ecf20Sopenharmony_ci } 25708c2ecf20Sopenharmony_ci 25718c2ecf20Sopenharmony_ci mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); 25728c2ecf20Sopenharmony_ci if (!mle) { 25738c2ecf20Sopenharmony_ci mlog_errno(ret); 25748c2ecf20Sopenharmony_ci goto leave; 25758c2ecf20Sopenharmony_ci } 25768c2ecf20Sopenharmony_ci ret = 0; 25778c2ecf20Sopenharmony_ci 25788c2ecf20Sopenharmony_ci /* 25798c2ecf20Sopenharmony_ci * clear any existing master requests and 25808c2ecf20Sopenharmony_ci * add the migration mle to the list 25818c2ecf20Sopenharmony_ci */ 25828c2ecf20Sopenharmony_ci spin_lock(&dlm->spinlock); 25838c2ecf20Sopenharmony_ci spin_lock(&dlm->master_lock); 25848c2ecf20Sopenharmony_ci ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, 25858c2ecf20Sopenharmony_ci namelen, target, dlm->node_num); 25868c2ecf20Sopenharmony_ci /* get an extra reference on the mle. 25878c2ecf20Sopenharmony_ci * otherwise the assert_master from the new 25888c2ecf20Sopenharmony_ci * master will destroy this. 25898c2ecf20Sopenharmony_ci */ 25908c2ecf20Sopenharmony_ci if (ret != -EEXIST) 25918c2ecf20Sopenharmony_ci dlm_get_mle_inuse(mle); 25928c2ecf20Sopenharmony_ci 25938c2ecf20Sopenharmony_ci spin_unlock(&dlm->master_lock); 25948c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 25958c2ecf20Sopenharmony_ci 25968c2ecf20Sopenharmony_ci if (ret == -EEXIST) { 25978c2ecf20Sopenharmony_ci mlog(0, "another process is already migrating it\n"); 25988c2ecf20Sopenharmony_ci goto fail; 25998c2ecf20Sopenharmony_ci } 26008c2ecf20Sopenharmony_ci mle_added = 1; 26018c2ecf20Sopenharmony_ci 26028c2ecf20Sopenharmony_ci /* 26038c2ecf20Sopenharmony_ci * set the MIGRATING flag and flush asts 26048c2ecf20Sopenharmony_ci * if we fail after this we need to re-dirty the lockres 26058c2ecf20Sopenharmony_ci */ 26068c2ecf20Sopenharmony_ci if (dlm_mark_lockres_migrating(dlm, res, target) < 0) { 26078c2ecf20Sopenharmony_ci mlog(ML_ERROR, "tried to migrate %.*s to %u, but " 26088c2ecf20Sopenharmony_ci "the target went down.\n", res->lockname.len, 26098c2ecf20Sopenharmony_ci res->lockname.name, target); 26108c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 26118c2ecf20Sopenharmony_ci res->state &= ~DLM_LOCK_RES_MIGRATING; 26128c2ecf20Sopenharmony_ci wake = 1; 26138c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 26148c2ecf20Sopenharmony_ci ret = -EINVAL; 26158c2ecf20Sopenharmony_ci } 26168c2ecf20Sopenharmony_ci 26178c2ecf20Sopenharmony_cifail: 26188c2ecf20Sopenharmony_ci if (ret != -EEXIST && oldmle) { 26198c2ecf20Sopenharmony_ci /* master is known, detach if not already detached */ 26208c2ecf20Sopenharmony_ci dlm_mle_detach_hb_events(dlm, oldmle); 26218c2ecf20Sopenharmony_ci dlm_put_mle(oldmle); 26228c2ecf20Sopenharmony_ci } 26238c2ecf20Sopenharmony_ci 26248c2ecf20Sopenharmony_ci if (ret < 0) { 26258c2ecf20Sopenharmony_ci if (mle_added) { 26268c2ecf20Sopenharmony_ci dlm_mle_detach_hb_events(dlm, mle); 26278c2ecf20Sopenharmony_ci dlm_put_mle(mle); 26288c2ecf20Sopenharmony_ci dlm_put_mle_inuse(mle); 26298c2ecf20Sopenharmony_ci } else if (mle) { 26308c2ecf20Sopenharmony_ci kmem_cache_free(dlm_mle_cache, mle); 26318c2ecf20Sopenharmony_ci mle = NULL; 26328c2ecf20Sopenharmony_ci } 26338c2ecf20Sopenharmony_ci goto leave; 26348c2ecf20Sopenharmony_ci } 26358c2ecf20Sopenharmony_ci 26368c2ecf20Sopenharmony_ci /* 26378c2ecf20Sopenharmony_ci * at this point, we have a migration target, an mle 26388c2ecf20Sopenharmony_ci * in the master list, and the MIGRATING flag set on 26398c2ecf20Sopenharmony_ci * the lockres 26408c2ecf20Sopenharmony_ci */ 26418c2ecf20Sopenharmony_ci 26428c2ecf20Sopenharmony_ci /* now that remote nodes are spinning on the MIGRATING flag, 26438c2ecf20Sopenharmony_ci * ensure that all assert_master work is flushed. */ 26448c2ecf20Sopenharmony_ci flush_workqueue(dlm->dlm_worker); 26458c2ecf20Sopenharmony_ci 26468c2ecf20Sopenharmony_ci /* notify new node and send all lock state */ 26478c2ecf20Sopenharmony_ci /* call send_one_lockres with migration flag. 26488c2ecf20Sopenharmony_ci * this serves as notice to the target node that a 26498c2ecf20Sopenharmony_ci * migration is starting. */ 26508c2ecf20Sopenharmony_ci ret = dlm_send_one_lockres(dlm, res, mres, target, 26518c2ecf20Sopenharmony_ci DLM_MRES_MIGRATION); 26528c2ecf20Sopenharmony_ci 26538c2ecf20Sopenharmony_ci if (ret < 0) { 26548c2ecf20Sopenharmony_ci mlog(0, "migration to node %u failed with %d\n", 26558c2ecf20Sopenharmony_ci target, ret); 26568c2ecf20Sopenharmony_ci /* migration failed, detach and clean up mle */ 26578c2ecf20Sopenharmony_ci dlm_mle_detach_hb_events(dlm, mle); 26588c2ecf20Sopenharmony_ci dlm_put_mle(mle); 26598c2ecf20Sopenharmony_ci dlm_put_mle_inuse(mle); 26608c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 26618c2ecf20Sopenharmony_ci res->state &= ~DLM_LOCK_RES_MIGRATING; 26628c2ecf20Sopenharmony_ci wake = 1; 26638c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 26648c2ecf20Sopenharmony_ci if (dlm_is_host_down(ret)) 26658c2ecf20Sopenharmony_ci dlm_wait_for_node_death(dlm, target, 26668c2ecf20Sopenharmony_ci DLM_NODE_DEATH_WAIT_MAX); 26678c2ecf20Sopenharmony_ci goto leave; 26688c2ecf20Sopenharmony_ci } 26698c2ecf20Sopenharmony_ci 26708c2ecf20Sopenharmony_ci /* at this point, the target sends a message to all nodes, 26718c2ecf20Sopenharmony_ci * (using dlm_do_migrate_request). this node is skipped since 26728c2ecf20Sopenharmony_ci * we had to put an mle in the list to begin the process. this 26738c2ecf20Sopenharmony_ci * node now waits for target to do an assert master. this node 26748c2ecf20Sopenharmony_ci * will be the last one notified, ensuring that the migration 26758c2ecf20Sopenharmony_ci * is complete everywhere. if the target dies while this is 26768c2ecf20Sopenharmony_ci * going on, some nodes could potentially see the target as the 26778c2ecf20Sopenharmony_ci * master, so it is important that my recovery finds the migration 26788c2ecf20Sopenharmony_ci * mle and sets the master to UNKNOWN. */ 26798c2ecf20Sopenharmony_ci 26808c2ecf20Sopenharmony_ci 26818c2ecf20Sopenharmony_ci /* wait for new node to assert master */ 26828c2ecf20Sopenharmony_ci while (1) { 26838c2ecf20Sopenharmony_ci ret = wait_event_interruptible_timeout(mle->wq, 26848c2ecf20Sopenharmony_ci (atomic_read(&mle->woken) == 1), 26858c2ecf20Sopenharmony_ci msecs_to_jiffies(5000)); 26868c2ecf20Sopenharmony_ci 26878c2ecf20Sopenharmony_ci if (ret >= 0) { 26888c2ecf20Sopenharmony_ci if (atomic_read(&mle->woken) == 1 || 26898c2ecf20Sopenharmony_ci res->owner == target) 26908c2ecf20Sopenharmony_ci break; 26918c2ecf20Sopenharmony_ci 26928c2ecf20Sopenharmony_ci mlog(0, "%s:%.*s: timed out during migration\n", 26938c2ecf20Sopenharmony_ci dlm->name, res->lockname.len, res->lockname.name); 26948c2ecf20Sopenharmony_ci /* avoid hang during shutdown when migrating lockres 26958c2ecf20Sopenharmony_ci * to a node which also goes down */ 26968c2ecf20Sopenharmony_ci if (dlm_is_node_dead(dlm, target)) { 26978c2ecf20Sopenharmony_ci mlog(0, "%s:%.*s: expected migration " 26988c2ecf20Sopenharmony_ci "target %u is no longer up, restarting\n", 26998c2ecf20Sopenharmony_ci dlm->name, res->lockname.len, 27008c2ecf20Sopenharmony_ci res->lockname.name, target); 27018c2ecf20Sopenharmony_ci ret = -EINVAL; 27028c2ecf20Sopenharmony_ci /* migration failed, detach and clean up mle */ 27038c2ecf20Sopenharmony_ci dlm_mle_detach_hb_events(dlm, mle); 27048c2ecf20Sopenharmony_ci dlm_put_mle(mle); 27058c2ecf20Sopenharmony_ci dlm_put_mle_inuse(mle); 27068c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 27078c2ecf20Sopenharmony_ci res->state &= ~DLM_LOCK_RES_MIGRATING; 27088c2ecf20Sopenharmony_ci wake = 1; 27098c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 27108c2ecf20Sopenharmony_ci goto leave; 27118c2ecf20Sopenharmony_ci } 27128c2ecf20Sopenharmony_ci } else 27138c2ecf20Sopenharmony_ci mlog(0, "%s:%.*s: caught signal during migration\n", 27148c2ecf20Sopenharmony_ci dlm->name, res->lockname.len, res->lockname.name); 27158c2ecf20Sopenharmony_ci } 27168c2ecf20Sopenharmony_ci 27178c2ecf20Sopenharmony_ci /* all done, set the owner, clear the flag */ 27188c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 27198c2ecf20Sopenharmony_ci dlm_set_lockres_owner(dlm, res, target); 27208c2ecf20Sopenharmony_ci res->state &= ~DLM_LOCK_RES_MIGRATING; 27218c2ecf20Sopenharmony_ci dlm_remove_nonlocal_locks(dlm, res); 27228c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 27238c2ecf20Sopenharmony_ci wake_up(&res->wq); 27248c2ecf20Sopenharmony_ci 27258c2ecf20Sopenharmony_ci /* master is known, detach if not already detached */ 27268c2ecf20Sopenharmony_ci dlm_mle_detach_hb_events(dlm, mle); 27278c2ecf20Sopenharmony_ci dlm_put_mle_inuse(mle); 27288c2ecf20Sopenharmony_ci ret = 0; 27298c2ecf20Sopenharmony_ci 27308c2ecf20Sopenharmony_ci dlm_lockres_calc_usage(dlm, res); 27318c2ecf20Sopenharmony_ci 27328c2ecf20Sopenharmony_cileave: 27338c2ecf20Sopenharmony_ci /* re-dirty the lockres if we failed */ 27348c2ecf20Sopenharmony_ci if (ret < 0) 27358c2ecf20Sopenharmony_ci dlm_kick_thread(dlm, res); 27368c2ecf20Sopenharmony_ci 27378c2ecf20Sopenharmony_ci /* wake up waiters if the MIGRATING flag got set 27388c2ecf20Sopenharmony_ci * but migration failed */ 27398c2ecf20Sopenharmony_ci if (wake) 27408c2ecf20Sopenharmony_ci wake_up(&res->wq); 27418c2ecf20Sopenharmony_ci 27428c2ecf20Sopenharmony_ci if (mres) 27438c2ecf20Sopenharmony_ci free_page((unsigned long)mres); 27448c2ecf20Sopenharmony_ci 27458c2ecf20Sopenharmony_ci dlm_put(dlm); 27468c2ecf20Sopenharmony_ci 27478c2ecf20Sopenharmony_ci mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen, 27488c2ecf20Sopenharmony_ci name, target, ret); 27498c2ecf20Sopenharmony_ci return ret; 27508c2ecf20Sopenharmony_ci} 27518c2ecf20Sopenharmony_ci 27528c2ecf20Sopenharmony_ci/* 27538c2ecf20Sopenharmony_ci * Should be called only after beginning the domain leave process. 27548c2ecf20Sopenharmony_ci * There should not be any remaining locks on nonlocal lock resources, 27558c2ecf20Sopenharmony_ci * and there should be no local locks left on locally mastered resources. 27568c2ecf20Sopenharmony_ci * 27578c2ecf20Sopenharmony_ci * Called with the dlm spinlock held, may drop it to do migration, but 27588c2ecf20Sopenharmony_ci * will re-acquire before exit. 27598c2ecf20Sopenharmony_ci * 27608c2ecf20Sopenharmony_ci * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped 27618c2ecf20Sopenharmony_ci */ 27628c2ecf20Sopenharmony_ciint dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) 27638c2ecf20Sopenharmony_ci __must_hold(&dlm->spinlock) 27648c2ecf20Sopenharmony_ci{ 27658c2ecf20Sopenharmony_ci int ret; 27668c2ecf20Sopenharmony_ci int lock_dropped = 0; 27678c2ecf20Sopenharmony_ci u8 target = O2NM_MAX_NODES; 27688c2ecf20Sopenharmony_ci 27698c2ecf20Sopenharmony_ci assert_spin_locked(&dlm->spinlock); 27708c2ecf20Sopenharmony_ci 27718c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 27728c2ecf20Sopenharmony_ci if (dlm_is_lockres_migratable(dlm, res)) 27738c2ecf20Sopenharmony_ci target = dlm_pick_migration_target(dlm, res); 27748c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 27758c2ecf20Sopenharmony_ci 27768c2ecf20Sopenharmony_ci if (target == O2NM_MAX_NODES) 27778c2ecf20Sopenharmony_ci goto leave; 27788c2ecf20Sopenharmony_ci 27798c2ecf20Sopenharmony_ci /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */ 27808c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 27818c2ecf20Sopenharmony_ci lock_dropped = 1; 27828c2ecf20Sopenharmony_ci ret = dlm_migrate_lockres(dlm, res, target); 27838c2ecf20Sopenharmony_ci if (ret) 27848c2ecf20Sopenharmony_ci mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n", 27858c2ecf20Sopenharmony_ci dlm->name, res->lockname.len, res->lockname.name, 27868c2ecf20Sopenharmony_ci target, ret); 27878c2ecf20Sopenharmony_ci spin_lock(&dlm->spinlock); 27888c2ecf20Sopenharmony_cileave: 27898c2ecf20Sopenharmony_ci return lock_dropped; 27908c2ecf20Sopenharmony_ci} 27918c2ecf20Sopenharmony_ci 27928c2ecf20Sopenharmony_ciint dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock) 27938c2ecf20Sopenharmony_ci{ 27948c2ecf20Sopenharmony_ci int ret; 27958c2ecf20Sopenharmony_ci spin_lock(&dlm->ast_lock); 27968c2ecf20Sopenharmony_ci spin_lock(&lock->spinlock); 27978c2ecf20Sopenharmony_ci ret = (list_empty(&lock->bast_list) && !lock->bast_pending); 27988c2ecf20Sopenharmony_ci spin_unlock(&lock->spinlock); 27998c2ecf20Sopenharmony_ci spin_unlock(&dlm->ast_lock); 28008c2ecf20Sopenharmony_ci return ret; 28018c2ecf20Sopenharmony_ci} 28028c2ecf20Sopenharmony_ci 28038c2ecf20Sopenharmony_cistatic int dlm_migration_can_proceed(struct dlm_ctxt *dlm, 28048c2ecf20Sopenharmony_ci struct dlm_lock_resource *res, 28058c2ecf20Sopenharmony_ci u8 mig_target) 28068c2ecf20Sopenharmony_ci{ 28078c2ecf20Sopenharmony_ci int can_proceed; 28088c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 28098c2ecf20Sopenharmony_ci can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING); 28108c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 28118c2ecf20Sopenharmony_ci 28128c2ecf20Sopenharmony_ci /* target has died, so make the caller break out of the 28138c2ecf20Sopenharmony_ci * wait_event, but caller must recheck the domain_map */ 28148c2ecf20Sopenharmony_ci spin_lock(&dlm->spinlock); 28158c2ecf20Sopenharmony_ci if (!test_bit(mig_target, dlm->domain_map)) 28168c2ecf20Sopenharmony_ci can_proceed = 1; 28178c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 28188c2ecf20Sopenharmony_ci return can_proceed; 28198c2ecf20Sopenharmony_ci} 28208c2ecf20Sopenharmony_ci 28218c2ecf20Sopenharmony_cistatic int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, 28228c2ecf20Sopenharmony_ci struct dlm_lock_resource *res) 28238c2ecf20Sopenharmony_ci{ 28248c2ecf20Sopenharmony_ci int ret; 28258c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 28268c2ecf20Sopenharmony_ci ret = !!(res->state & DLM_LOCK_RES_DIRTY); 28278c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 28288c2ecf20Sopenharmony_ci return ret; 28298c2ecf20Sopenharmony_ci} 28308c2ecf20Sopenharmony_ci 28318c2ecf20Sopenharmony_ci 28328c2ecf20Sopenharmony_cistatic int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, 28338c2ecf20Sopenharmony_ci struct dlm_lock_resource *res, 28348c2ecf20Sopenharmony_ci u8 target) 28358c2ecf20Sopenharmony_ci{ 28368c2ecf20Sopenharmony_ci int ret = 0; 28378c2ecf20Sopenharmony_ci 28388c2ecf20Sopenharmony_ci mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n", 28398c2ecf20Sopenharmony_ci res->lockname.len, res->lockname.name, dlm->node_num, 28408c2ecf20Sopenharmony_ci target); 28418c2ecf20Sopenharmony_ci /* need to set MIGRATING flag on lockres. this is done by 28428c2ecf20Sopenharmony_ci * ensuring that all asts have been flushed for this lockres. */ 28438c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 28448c2ecf20Sopenharmony_ci BUG_ON(res->migration_pending); 28458c2ecf20Sopenharmony_ci res->migration_pending = 1; 28468c2ecf20Sopenharmony_ci /* strategy is to reserve an extra ast then release 28478c2ecf20Sopenharmony_ci * it below, letting the release do all of the work */ 28488c2ecf20Sopenharmony_ci __dlm_lockres_reserve_ast(res); 28498c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 28508c2ecf20Sopenharmony_ci 28518c2ecf20Sopenharmony_ci /* now flush all the pending asts */ 28528c2ecf20Sopenharmony_ci dlm_kick_thread(dlm, res); 28538c2ecf20Sopenharmony_ci /* before waiting on DIRTY, block processes which may 28548c2ecf20Sopenharmony_ci * try to dirty the lockres before MIGRATING is set */ 28558c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 28568c2ecf20Sopenharmony_ci BUG_ON(res->state & DLM_LOCK_RES_BLOCK_DIRTY); 28578c2ecf20Sopenharmony_ci res->state |= DLM_LOCK_RES_BLOCK_DIRTY; 28588c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 28598c2ecf20Sopenharmony_ci /* now wait on any pending asts and the DIRTY state */ 28608c2ecf20Sopenharmony_ci wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); 28618c2ecf20Sopenharmony_ci dlm_lockres_release_ast(dlm, res); 28628c2ecf20Sopenharmony_ci 28638c2ecf20Sopenharmony_ci mlog(0, "about to wait on migration_wq, dirty=%s\n", 28648c2ecf20Sopenharmony_ci res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no"); 28658c2ecf20Sopenharmony_ci /* if the extra ref we just put was the final one, this 28668c2ecf20Sopenharmony_ci * will pass thru immediately. otherwise, we need to wait 28678c2ecf20Sopenharmony_ci * for the last ast to finish. */ 28688c2ecf20Sopenharmony_ciagain: 28698c2ecf20Sopenharmony_ci ret = wait_event_interruptible_timeout(dlm->migration_wq, 28708c2ecf20Sopenharmony_ci dlm_migration_can_proceed(dlm, res, target), 28718c2ecf20Sopenharmony_ci msecs_to_jiffies(1000)); 28728c2ecf20Sopenharmony_ci if (ret < 0) { 28738c2ecf20Sopenharmony_ci mlog(0, "woken again: migrating? %s, dead? %s\n", 28748c2ecf20Sopenharmony_ci res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", 28758c2ecf20Sopenharmony_ci test_bit(target, dlm->domain_map) ? "no":"yes"); 28768c2ecf20Sopenharmony_ci } else { 28778c2ecf20Sopenharmony_ci mlog(0, "all is well: migrating? %s, dead? %s\n", 28788c2ecf20Sopenharmony_ci res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", 28798c2ecf20Sopenharmony_ci test_bit(target, dlm->domain_map) ? "no":"yes"); 28808c2ecf20Sopenharmony_ci } 28818c2ecf20Sopenharmony_ci if (!dlm_migration_can_proceed(dlm, res, target)) { 28828c2ecf20Sopenharmony_ci mlog(0, "trying again...\n"); 28838c2ecf20Sopenharmony_ci goto again; 28848c2ecf20Sopenharmony_ci } 28858c2ecf20Sopenharmony_ci 28868c2ecf20Sopenharmony_ci ret = 0; 28878c2ecf20Sopenharmony_ci /* did the target go down or die? */ 28888c2ecf20Sopenharmony_ci spin_lock(&dlm->spinlock); 28898c2ecf20Sopenharmony_ci if (!test_bit(target, dlm->domain_map)) { 28908c2ecf20Sopenharmony_ci mlog(ML_ERROR, "aha. migration target %u just went down\n", 28918c2ecf20Sopenharmony_ci target); 28928c2ecf20Sopenharmony_ci ret = -EHOSTDOWN; 28938c2ecf20Sopenharmony_ci } 28948c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 28958c2ecf20Sopenharmony_ci 28968c2ecf20Sopenharmony_ci /* 28978c2ecf20Sopenharmony_ci * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for 28988c2ecf20Sopenharmony_ci * another try; otherwise, we are sure the MIGRATING state is there, 28998c2ecf20Sopenharmony_ci * drop the unneeded state which blocked threads trying to DIRTY 29008c2ecf20Sopenharmony_ci */ 29018c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 29028c2ecf20Sopenharmony_ci BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY)); 29038c2ecf20Sopenharmony_ci res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY; 29048c2ecf20Sopenharmony_ci if (!ret) 29058c2ecf20Sopenharmony_ci BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING)); 29068c2ecf20Sopenharmony_ci else 29078c2ecf20Sopenharmony_ci res->migration_pending = 0; 29088c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 29098c2ecf20Sopenharmony_ci 29108c2ecf20Sopenharmony_ci /* 29118c2ecf20Sopenharmony_ci * at this point: 29128c2ecf20Sopenharmony_ci * 29138c2ecf20Sopenharmony_ci * o the DLM_LOCK_RES_MIGRATING flag is set if target not down 29148c2ecf20Sopenharmony_ci * o there are no pending asts on this lockres 29158c2ecf20Sopenharmony_ci * o all processes trying to reserve an ast on this 29168c2ecf20Sopenharmony_ci * lockres must wait for the MIGRATING flag to clear 29178c2ecf20Sopenharmony_ci */ 29188c2ecf20Sopenharmony_ci return ret; 29198c2ecf20Sopenharmony_ci} 29208c2ecf20Sopenharmony_ci 29218c2ecf20Sopenharmony_ci/* last step in the migration process. 29228c2ecf20Sopenharmony_ci * original master calls this to free all of the dlm_lock 29238c2ecf20Sopenharmony_ci * structures that used to be for other nodes. */ 29248c2ecf20Sopenharmony_cistatic void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, 29258c2ecf20Sopenharmony_ci struct dlm_lock_resource *res) 29268c2ecf20Sopenharmony_ci{ 29278c2ecf20Sopenharmony_ci struct list_head *queue = &res->granted; 29288c2ecf20Sopenharmony_ci int i, bit; 29298c2ecf20Sopenharmony_ci struct dlm_lock *lock, *next; 29308c2ecf20Sopenharmony_ci 29318c2ecf20Sopenharmony_ci assert_spin_locked(&res->spinlock); 29328c2ecf20Sopenharmony_ci 29338c2ecf20Sopenharmony_ci BUG_ON(res->owner == dlm->node_num); 29348c2ecf20Sopenharmony_ci 29358c2ecf20Sopenharmony_ci for (i=0; i<3; i++) { 29368c2ecf20Sopenharmony_ci list_for_each_entry_safe(lock, next, queue, list) { 29378c2ecf20Sopenharmony_ci if (lock->ml.node != dlm->node_num) { 29388c2ecf20Sopenharmony_ci mlog(0, "putting lock for node %u\n", 29398c2ecf20Sopenharmony_ci lock->ml.node); 29408c2ecf20Sopenharmony_ci /* be extra careful */ 29418c2ecf20Sopenharmony_ci BUG_ON(!list_empty(&lock->ast_list)); 29428c2ecf20Sopenharmony_ci BUG_ON(!list_empty(&lock->bast_list)); 29438c2ecf20Sopenharmony_ci BUG_ON(lock->ast_pending); 29448c2ecf20Sopenharmony_ci BUG_ON(lock->bast_pending); 29458c2ecf20Sopenharmony_ci dlm_lockres_clear_refmap_bit(dlm, res, 29468c2ecf20Sopenharmony_ci lock->ml.node); 29478c2ecf20Sopenharmony_ci list_del_init(&lock->list); 29488c2ecf20Sopenharmony_ci dlm_lock_put(lock); 29498c2ecf20Sopenharmony_ci /* In a normal unlock, we would have added a 29508c2ecf20Sopenharmony_ci * DLM_UNLOCK_FREE_LOCK action. Force it. */ 29518c2ecf20Sopenharmony_ci dlm_lock_put(lock); 29528c2ecf20Sopenharmony_ci } 29538c2ecf20Sopenharmony_ci } 29548c2ecf20Sopenharmony_ci queue++; 29558c2ecf20Sopenharmony_ci } 29568c2ecf20Sopenharmony_ci bit = 0; 29578c2ecf20Sopenharmony_ci while (1) { 29588c2ecf20Sopenharmony_ci bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); 29598c2ecf20Sopenharmony_ci if (bit >= O2NM_MAX_NODES) 29608c2ecf20Sopenharmony_ci break; 29618c2ecf20Sopenharmony_ci /* do not clear the local node reference, if there is a 29628c2ecf20Sopenharmony_ci * process holding this, let it drop the ref itself */ 29638c2ecf20Sopenharmony_ci if (bit != dlm->node_num) { 29648c2ecf20Sopenharmony_ci mlog(0, "%s:%.*s: node %u had a ref to this " 29658c2ecf20Sopenharmony_ci "migrating lockres, clearing\n", dlm->name, 29668c2ecf20Sopenharmony_ci res->lockname.len, res->lockname.name, bit); 29678c2ecf20Sopenharmony_ci dlm_lockres_clear_refmap_bit(dlm, res, bit); 29688c2ecf20Sopenharmony_ci } 29698c2ecf20Sopenharmony_ci bit++; 29708c2ecf20Sopenharmony_ci } 29718c2ecf20Sopenharmony_ci} 29728c2ecf20Sopenharmony_ci 29738c2ecf20Sopenharmony_ci/* 29748c2ecf20Sopenharmony_ci * Pick a node to migrate the lock resource to. This function selects a 29758c2ecf20Sopenharmony_ci * potential target based first on the locks and then on refmap. It skips 29768c2ecf20Sopenharmony_ci * nodes that are in the process of exiting the domain. 29778c2ecf20Sopenharmony_ci */ 29788c2ecf20Sopenharmony_cistatic u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, 29798c2ecf20Sopenharmony_ci struct dlm_lock_resource *res) 29808c2ecf20Sopenharmony_ci{ 29818c2ecf20Sopenharmony_ci enum dlm_lockres_list idx; 29828c2ecf20Sopenharmony_ci struct list_head *queue = &res->granted; 29838c2ecf20Sopenharmony_ci struct dlm_lock *lock; 29848c2ecf20Sopenharmony_ci int noderef; 29858c2ecf20Sopenharmony_ci u8 nodenum = O2NM_MAX_NODES; 29868c2ecf20Sopenharmony_ci 29878c2ecf20Sopenharmony_ci assert_spin_locked(&dlm->spinlock); 29888c2ecf20Sopenharmony_ci assert_spin_locked(&res->spinlock); 29898c2ecf20Sopenharmony_ci 29908c2ecf20Sopenharmony_ci /* Go through all the locks */ 29918c2ecf20Sopenharmony_ci for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) { 29928c2ecf20Sopenharmony_ci queue = dlm_list_idx_to_ptr(res, idx); 29938c2ecf20Sopenharmony_ci list_for_each_entry(lock, queue, list) { 29948c2ecf20Sopenharmony_ci if (lock->ml.node == dlm->node_num) 29958c2ecf20Sopenharmony_ci continue; 29968c2ecf20Sopenharmony_ci if (test_bit(lock->ml.node, dlm->exit_domain_map)) 29978c2ecf20Sopenharmony_ci continue; 29988c2ecf20Sopenharmony_ci nodenum = lock->ml.node; 29998c2ecf20Sopenharmony_ci goto bail; 30008c2ecf20Sopenharmony_ci } 30018c2ecf20Sopenharmony_ci } 30028c2ecf20Sopenharmony_ci 30038c2ecf20Sopenharmony_ci /* Go thru the refmap */ 30048c2ecf20Sopenharmony_ci noderef = -1; 30058c2ecf20Sopenharmony_ci while (1) { 30068c2ecf20Sopenharmony_ci noderef = find_next_bit(res->refmap, O2NM_MAX_NODES, 30078c2ecf20Sopenharmony_ci noderef + 1); 30088c2ecf20Sopenharmony_ci if (noderef >= O2NM_MAX_NODES) 30098c2ecf20Sopenharmony_ci break; 30108c2ecf20Sopenharmony_ci if (noderef == dlm->node_num) 30118c2ecf20Sopenharmony_ci continue; 30128c2ecf20Sopenharmony_ci if (test_bit(noderef, dlm->exit_domain_map)) 30138c2ecf20Sopenharmony_ci continue; 30148c2ecf20Sopenharmony_ci nodenum = noderef; 30158c2ecf20Sopenharmony_ci goto bail; 30168c2ecf20Sopenharmony_ci } 30178c2ecf20Sopenharmony_ci 30188c2ecf20Sopenharmony_cibail: 30198c2ecf20Sopenharmony_ci return nodenum; 30208c2ecf20Sopenharmony_ci} 30218c2ecf20Sopenharmony_ci 30228c2ecf20Sopenharmony_ci/* this is called by the new master once all lockres 30238c2ecf20Sopenharmony_ci * data has been received */ 30248c2ecf20Sopenharmony_cistatic int dlm_do_migrate_request(struct dlm_ctxt *dlm, 30258c2ecf20Sopenharmony_ci struct dlm_lock_resource *res, 30268c2ecf20Sopenharmony_ci u8 master, u8 new_master, 30278c2ecf20Sopenharmony_ci struct dlm_node_iter *iter) 30288c2ecf20Sopenharmony_ci{ 30298c2ecf20Sopenharmony_ci struct dlm_migrate_request migrate; 30308c2ecf20Sopenharmony_ci int ret, skip, status = 0; 30318c2ecf20Sopenharmony_ci int nodenum; 30328c2ecf20Sopenharmony_ci 30338c2ecf20Sopenharmony_ci memset(&migrate, 0, sizeof(migrate)); 30348c2ecf20Sopenharmony_ci migrate.namelen = res->lockname.len; 30358c2ecf20Sopenharmony_ci memcpy(migrate.name, res->lockname.name, migrate.namelen); 30368c2ecf20Sopenharmony_ci migrate.new_master = new_master; 30378c2ecf20Sopenharmony_ci migrate.master = master; 30388c2ecf20Sopenharmony_ci 30398c2ecf20Sopenharmony_ci ret = 0; 30408c2ecf20Sopenharmony_ci 30418c2ecf20Sopenharmony_ci /* send message to all nodes, except the master and myself */ 30428c2ecf20Sopenharmony_ci while ((nodenum = dlm_node_iter_next(iter)) >= 0) { 30438c2ecf20Sopenharmony_ci if (nodenum == master || 30448c2ecf20Sopenharmony_ci nodenum == new_master) 30458c2ecf20Sopenharmony_ci continue; 30468c2ecf20Sopenharmony_ci 30478c2ecf20Sopenharmony_ci /* We could race exit domain. If exited, skip. */ 30488c2ecf20Sopenharmony_ci spin_lock(&dlm->spinlock); 30498c2ecf20Sopenharmony_ci skip = (!test_bit(nodenum, dlm->domain_map)); 30508c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 30518c2ecf20Sopenharmony_ci if (skip) { 30528c2ecf20Sopenharmony_ci clear_bit(nodenum, iter->node_map); 30538c2ecf20Sopenharmony_ci continue; 30548c2ecf20Sopenharmony_ci } 30558c2ecf20Sopenharmony_ci 30568c2ecf20Sopenharmony_ci ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key, 30578c2ecf20Sopenharmony_ci &migrate, sizeof(migrate), nodenum, 30588c2ecf20Sopenharmony_ci &status); 30598c2ecf20Sopenharmony_ci if (ret < 0) { 30608c2ecf20Sopenharmony_ci mlog(ML_ERROR, "%s: res %.*s, Error %d send " 30618c2ecf20Sopenharmony_ci "MIGRATE_REQUEST to node %u\n", dlm->name, 30628c2ecf20Sopenharmony_ci migrate.namelen, migrate.name, ret, nodenum); 30638c2ecf20Sopenharmony_ci if (!dlm_is_host_down(ret)) { 30648c2ecf20Sopenharmony_ci mlog(ML_ERROR, "unhandled error=%d!\n", ret); 30658c2ecf20Sopenharmony_ci BUG(); 30668c2ecf20Sopenharmony_ci } 30678c2ecf20Sopenharmony_ci clear_bit(nodenum, iter->node_map); 30688c2ecf20Sopenharmony_ci ret = 0; 30698c2ecf20Sopenharmony_ci } else if (status < 0) { 30708c2ecf20Sopenharmony_ci mlog(0, "migrate request (node %u) returned %d!\n", 30718c2ecf20Sopenharmony_ci nodenum, status); 30728c2ecf20Sopenharmony_ci ret = status; 30738c2ecf20Sopenharmony_ci } else if (status == DLM_MIGRATE_RESPONSE_MASTERY_REF) { 30748c2ecf20Sopenharmony_ci /* during the migration request we short-circuited 30758c2ecf20Sopenharmony_ci * the mastery of the lockres. make sure we have 30768c2ecf20Sopenharmony_ci * a mastery ref for nodenum */ 30778c2ecf20Sopenharmony_ci mlog(0, "%s:%.*s: need ref for node %u\n", 30788c2ecf20Sopenharmony_ci dlm->name, res->lockname.len, res->lockname.name, 30798c2ecf20Sopenharmony_ci nodenum); 30808c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 30818c2ecf20Sopenharmony_ci dlm_lockres_set_refmap_bit(dlm, res, nodenum); 30828c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 30838c2ecf20Sopenharmony_ci } 30848c2ecf20Sopenharmony_ci } 30858c2ecf20Sopenharmony_ci 30868c2ecf20Sopenharmony_ci if (ret < 0) 30878c2ecf20Sopenharmony_ci mlog_errno(ret); 30888c2ecf20Sopenharmony_ci 30898c2ecf20Sopenharmony_ci mlog(0, "returning ret=%d\n", ret); 30908c2ecf20Sopenharmony_ci return ret; 30918c2ecf20Sopenharmony_ci} 30928c2ecf20Sopenharmony_ci 30938c2ecf20Sopenharmony_ci 30948c2ecf20Sopenharmony_ci/* if there is an existing mle for this lockres, we now know who the master is. 30958c2ecf20Sopenharmony_ci * (the one who sent us *this* message) we can clear it up right away. 30968c2ecf20Sopenharmony_ci * since the process that put the mle on the list still has a reference to it, 30978c2ecf20Sopenharmony_ci * we can unhash it now, set the master and wake the process. as a result, 30988c2ecf20Sopenharmony_ci * we will have no mle in the list to start with. now we can add an mle for 30998c2ecf20Sopenharmony_ci * the migration and this should be the only one found for those scanning the 31008c2ecf20Sopenharmony_ci * list. */ 31018c2ecf20Sopenharmony_ciint dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, 31028c2ecf20Sopenharmony_ci void **ret_data) 31038c2ecf20Sopenharmony_ci{ 31048c2ecf20Sopenharmony_ci struct dlm_ctxt *dlm = data; 31058c2ecf20Sopenharmony_ci struct dlm_lock_resource *res = NULL; 31068c2ecf20Sopenharmony_ci struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf; 31078c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle = NULL, *oldmle = NULL; 31088c2ecf20Sopenharmony_ci const char *name; 31098c2ecf20Sopenharmony_ci unsigned int namelen, hash; 31108c2ecf20Sopenharmony_ci int ret = 0; 31118c2ecf20Sopenharmony_ci 31128c2ecf20Sopenharmony_ci if (!dlm_grab(dlm)) 31138c2ecf20Sopenharmony_ci return 0; 31148c2ecf20Sopenharmony_ci 31158c2ecf20Sopenharmony_ci name = migrate->name; 31168c2ecf20Sopenharmony_ci namelen = migrate->namelen; 31178c2ecf20Sopenharmony_ci hash = dlm_lockid_hash(name, namelen); 31188c2ecf20Sopenharmony_ci 31198c2ecf20Sopenharmony_ci /* preallocate.. if this fails, abort */ 31208c2ecf20Sopenharmony_ci mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); 31218c2ecf20Sopenharmony_ci 31228c2ecf20Sopenharmony_ci if (!mle) { 31238c2ecf20Sopenharmony_ci ret = -ENOMEM; 31248c2ecf20Sopenharmony_ci goto leave; 31258c2ecf20Sopenharmony_ci } 31268c2ecf20Sopenharmony_ci 31278c2ecf20Sopenharmony_ci /* check for pre-existing lock */ 31288c2ecf20Sopenharmony_ci spin_lock(&dlm->spinlock); 31298c2ecf20Sopenharmony_ci res = __dlm_lookup_lockres(dlm, name, namelen, hash); 31308c2ecf20Sopenharmony_ci if (res) { 31318c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 31328c2ecf20Sopenharmony_ci if (res->state & DLM_LOCK_RES_RECOVERING) { 31338c2ecf20Sopenharmony_ci /* if all is working ok, this can only mean that we got 31348c2ecf20Sopenharmony_ci * a migrate request from a node that we now see as 31358c2ecf20Sopenharmony_ci * dead. what can we do here? drop it to the floor? */ 31368c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 31378c2ecf20Sopenharmony_ci mlog(ML_ERROR, "Got a migrate request, but the " 31388c2ecf20Sopenharmony_ci "lockres is marked as recovering!"); 31398c2ecf20Sopenharmony_ci kmem_cache_free(dlm_mle_cache, mle); 31408c2ecf20Sopenharmony_ci ret = -EINVAL; /* need a better solution */ 31418c2ecf20Sopenharmony_ci goto unlock; 31428c2ecf20Sopenharmony_ci } 31438c2ecf20Sopenharmony_ci res->state |= DLM_LOCK_RES_MIGRATING; 31448c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 31458c2ecf20Sopenharmony_ci } 31468c2ecf20Sopenharmony_ci 31478c2ecf20Sopenharmony_ci spin_lock(&dlm->master_lock); 31488c2ecf20Sopenharmony_ci /* ignore status. only nonzero status would BUG. */ 31498c2ecf20Sopenharmony_ci ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, 31508c2ecf20Sopenharmony_ci name, namelen, 31518c2ecf20Sopenharmony_ci migrate->new_master, 31528c2ecf20Sopenharmony_ci migrate->master); 31538c2ecf20Sopenharmony_ci 31548c2ecf20Sopenharmony_ci if (ret < 0) 31558c2ecf20Sopenharmony_ci kmem_cache_free(dlm_mle_cache, mle); 31568c2ecf20Sopenharmony_ci 31578c2ecf20Sopenharmony_ci spin_unlock(&dlm->master_lock); 31588c2ecf20Sopenharmony_ciunlock: 31598c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 31608c2ecf20Sopenharmony_ci 31618c2ecf20Sopenharmony_ci if (oldmle) { 31628c2ecf20Sopenharmony_ci /* master is known, detach if not already detached */ 31638c2ecf20Sopenharmony_ci dlm_mle_detach_hb_events(dlm, oldmle); 31648c2ecf20Sopenharmony_ci dlm_put_mle(oldmle); 31658c2ecf20Sopenharmony_ci } 31668c2ecf20Sopenharmony_ci 31678c2ecf20Sopenharmony_ci if (res) 31688c2ecf20Sopenharmony_ci dlm_lockres_put(res); 31698c2ecf20Sopenharmony_cileave: 31708c2ecf20Sopenharmony_ci dlm_put(dlm); 31718c2ecf20Sopenharmony_ci return ret; 31728c2ecf20Sopenharmony_ci} 31738c2ecf20Sopenharmony_ci 31748c2ecf20Sopenharmony_ci/* must be holding dlm->spinlock and dlm->master_lock 31758c2ecf20Sopenharmony_ci * when adding a migration mle, we can clear any other mles 31768c2ecf20Sopenharmony_ci * in the master list because we know with certainty that 31778c2ecf20Sopenharmony_ci * the master is "master". so we remove any old mle from 31788c2ecf20Sopenharmony_ci * the list after setting it's master field, and then add 31798c2ecf20Sopenharmony_ci * the new migration mle. this way we can hold with the rule 31808c2ecf20Sopenharmony_ci * of having only one mle for a given lock name at all times. */ 31818c2ecf20Sopenharmony_cistatic int dlm_add_migration_mle(struct dlm_ctxt *dlm, 31828c2ecf20Sopenharmony_ci struct dlm_lock_resource *res, 31838c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle, 31848c2ecf20Sopenharmony_ci struct dlm_master_list_entry **oldmle, 31858c2ecf20Sopenharmony_ci const char *name, unsigned int namelen, 31868c2ecf20Sopenharmony_ci u8 new_master, u8 master) 31878c2ecf20Sopenharmony_ci{ 31888c2ecf20Sopenharmony_ci int found; 31898c2ecf20Sopenharmony_ci int ret = 0; 31908c2ecf20Sopenharmony_ci 31918c2ecf20Sopenharmony_ci *oldmle = NULL; 31928c2ecf20Sopenharmony_ci 31938c2ecf20Sopenharmony_ci assert_spin_locked(&dlm->spinlock); 31948c2ecf20Sopenharmony_ci assert_spin_locked(&dlm->master_lock); 31958c2ecf20Sopenharmony_ci 31968c2ecf20Sopenharmony_ci /* caller is responsible for any ref taken here on oldmle */ 31978c2ecf20Sopenharmony_ci found = dlm_find_mle(dlm, oldmle, (char *)name, namelen); 31988c2ecf20Sopenharmony_ci if (found) { 31998c2ecf20Sopenharmony_ci struct dlm_master_list_entry *tmp = *oldmle; 32008c2ecf20Sopenharmony_ci spin_lock(&tmp->spinlock); 32018c2ecf20Sopenharmony_ci if (tmp->type == DLM_MLE_MIGRATION) { 32028c2ecf20Sopenharmony_ci if (master == dlm->node_num) { 32038c2ecf20Sopenharmony_ci /* ah another process raced me to it */ 32048c2ecf20Sopenharmony_ci mlog(0, "tried to migrate %.*s, but some " 32058c2ecf20Sopenharmony_ci "process beat me to it\n", 32068c2ecf20Sopenharmony_ci namelen, name); 32078c2ecf20Sopenharmony_ci spin_unlock(&tmp->spinlock); 32088c2ecf20Sopenharmony_ci return -EEXIST; 32098c2ecf20Sopenharmony_ci } else { 32108c2ecf20Sopenharmony_ci /* bad. 2 NODES are trying to migrate! */ 32118c2ecf20Sopenharmony_ci mlog(ML_ERROR, "migration error mle: " 32128c2ecf20Sopenharmony_ci "master=%u new_master=%u // request: " 32138c2ecf20Sopenharmony_ci "master=%u new_master=%u // " 32148c2ecf20Sopenharmony_ci "lockres=%.*s\n", 32158c2ecf20Sopenharmony_ci tmp->master, tmp->new_master, 32168c2ecf20Sopenharmony_ci master, new_master, 32178c2ecf20Sopenharmony_ci namelen, name); 32188c2ecf20Sopenharmony_ci BUG(); 32198c2ecf20Sopenharmony_ci } 32208c2ecf20Sopenharmony_ci } else { 32218c2ecf20Sopenharmony_ci /* this is essentially what assert_master does */ 32228c2ecf20Sopenharmony_ci tmp->master = master; 32238c2ecf20Sopenharmony_ci atomic_set(&tmp->woken, 1); 32248c2ecf20Sopenharmony_ci wake_up(&tmp->wq); 32258c2ecf20Sopenharmony_ci /* remove it so that only one mle will be found */ 32268c2ecf20Sopenharmony_ci __dlm_unlink_mle(dlm, tmp); 32278c2ecf20Sopenharmony_ci __dlm_mle_detach_hb_events(dlm, tmp); 32288c2ecf20Sopenharmony_ci if (tmp->type == DLM_MLE_MASTER) { 32298c2ecf20Sopenharmony_ci ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; 32308c2ecf20Sopenharmony_ci mlog(0, "%s:%.*s: master=%u, newmaster=%u, " 32318c2ecf20Sopenharmony_ci "telling master to get ref " 32328c2ecf20Sopenharmony_ci "for cleared out mle during " 32338c2ecf20Sopenharmony_ci "migration\n", dlm->name, 32348c2ecf20Sopenharmony_ci namelen, name, master, 32358c2ecf20Sopenharmony_ci new_master); 32368c2ecf20Sopenharmony_ci } 32378c2ecf20Sopenharmony_ci } 32388c2ecf20Sopenharmony_ci spin_unlock(&tmp->spinlock); 32398c2ecf20Sopenharmony_ci } 32408c2ecf20Sopenharmony_ci 32418c2ecf20Sopenharmony_ci /* now add a migration mle to the tail of the list */ 32428c2ecf20Sopenharmony_ci dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen); 32438c2ecf20Sopenharmony_ci mle->new_master = new_master; 32448c2ecf20Sopenharmony_ci /* the new master will be sending an assert master for this. 32458c2ecf20Sopenharmony_ci * at that point we will get the refmap reference */ 32468c2ecf20Sopenharmony_ci mle->master = master; 32478c2ecf20Sopenharmony_ci /* do this for consistency with other mle types */ 32488c2ecf20Sopenharmony_ci set_bit(new_master, mle->maybe_map); 32498c2ecf20Sopenharmony_ci __dlm_insert_mle(dlm, mle); 32508c2ecf20Sopenharmony_ci 32518c2ecf20Sopenharmony_ci return ret; 32528c2ecf20Sopenharmony_ci} 32538c2ecf20Sopenharmony_ci 32548c2ecf20Sopenharmony_ci/* 32558c2ecf20Sopenharmony_ci * Sets the owner of the lockres, associated to the mle, to UNKNOWN 32568c2ecf20Sopenharmony_ci */ 32578c2ecf20Sopenharmony_cistatic struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm, 32588c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle) 32598c2ecf20Sopenharmony_ci{ 32608c2ecf20Sopenharmony_ci struct dlm_lock_resource *res; 32618c2ecf20Sopenharmony_ci 32628c2ecf20Sopenharmony_ci /* Find the lockres associated to the mle and set its owner to UNK */ 32638c2ecf20Sopenharmony_ci res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen, 32648c2ecf20Sopenharmony_ci mle->mnamehash); 32658c2ecf20Sopenharmony_ci if (res) { 32668c2ecf20Sopenharmony_ci spin_unlock(&dlm->master_lock); 32678c2ecf20Sopenharmony_ci 32688c2ecf20Sopenharmony_ci /* move lockres onto recovery list */ 32698c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 32708c2ecf20Sopenharmony_ci dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); 32718c2ecf20Sopenharmony_ci dlm_move_lockres_to_recovery_list(dlm, res); 32728c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 32738c2ecf20Sopenharmony_ci dlm_lockres_put(res); 32748c2ecf20Sopenharmony_ci 32758c2ecf20Sopenharmony_ci /* about to get rid of mle, detach from heartbeat */ 32768c2ecf20Sopenharmony_ci __dlm_mle_detach_hb_events(dlm, mle); 32778c2ecf20Sopenharmony_ci 32788c2ecf20Sopenharmony_ci /* dump the mle */ 32798c2ecf20Sopenharmony_ci spin_lock(&dlm->master_lock); 32808c2ecf20Sopenharmony_ci __dlm_put_mle(mle); 32818c2ecf20Sopenharmony_ci spin_unlock(&dlm->master_lock); 32828c2ecf20Sopenharmony_ci } 32838c2ecf20Sopenharmony_ci 32848c2ecf20Sopenharmony_ci return res; 32858c2ecf20Sopenharmony_ci} 32868c2ecf20Sopenharmony_ci 32878c2ecf20Sopenharmony_cistatic void dlm_clean_migration_mle(struct dlm_ctxt *dlm, 32888c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle) 32898c2ecf20Sopenharmony_ci{ 32908c2ecf20Sopenharmony_ci __dlm_mle_detach_hb_events(dlm, mle); 32918c2ecf20Sopenharmony_ci 32928c2ecf20Sopenharmony_ci spin_lock(&mle->spinlock); 32938c2ecf20Sopenharmony_ci __dlm_unlink_mle(dlm, mle); 32948c2ecf20Sopenharmony_ci atomic_set(&mle->woken, 1); 32958c2ecf20Sopenharmony_ci spin_unlock(&mle->spinlock); 32968c2ecf20Sopenharmony_ci 32978c2ecf20Sopenharmony_ci wake_up(&mle->wq); 32988c2ecf20Sopenharmony_ci} 32998c2ecf20Sopenharmony_ci 33008c2ecf20Sopenharmony_cistatic void dlm_clean_block_mle(struct dlm_ctxt *dlm, 33018c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle, u8 dead_node) 33028c2ecf20Sopenharmony_ci{ 33038c2ecf20Sopenharmony_ci int bit; 33048c2ecf20Sopenharmony_ci 33058c2ecf20Sopenharmony_ci BUG_ON(mle->type != DLM_MLE_BLOCK); 33068c2ecf20Sopenharmony_ci 33078c2ecf20Sopenharmony_ci spin_lock(&mle->spinlock); 33088c2ecf20Sopenharmony_ci bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); 33098c2ecf20Sopenharmony_ci if (bit != dead_node) { 33108c2ecf20Sopenharmony_ci mlog(0, "mle found, but dead node %u would not have been " 33118c2ecf20Sopenharmony_ci "master\n", dead_node); 33128c2ecf20Sopenharmony_ci spin_unlock(&mle->spinlock); 33138c2ecf20Sopenharmony_ci } else { 33148c2ecf20Sopenharmony_ci /* Must drop the refcount by one since the assert_master will 33158c2ecf20Sopenharmony_ci * never arrive. This may result in the mle being unlinked and 33168c2ecf20Sopenharmony_ci * freed, but there may still be a process waiting in the 33178c2ecf20Sopenharmony_ci * dlmlock path which is fine. */ 33188c2ecf20Sopenharmony_ci mlog(0, "node %u was expected master\n", dead_node); 33198c2ecf20Sopenharmony_ci atomic_set(&mle->woken, 1); 33208c2ecf20Sopenharmony_ci spin_unlock(&mle->spinlock); 33218c2ecf20Sopenharmony_ci wake_up(&mle->wq); 33228c2ecf20Sopenharmony_ci 33238c2ecf20Sopenharmony_ci /* Do not need events any longer, so detach from heartbeat */ 33248c2ecf20Sopenharmony_ci __dlm_mle_detach_hb_events(dlm, mle); 33258c2ecf20Sopenharmony_ci __dlm_put_mle(mle); 33268c2ecf20Sopenharmony_ci } 33278c2ecf20Sopenharmony_ci} 33288c2ecf20Sopenharmony_ci 33298c2ecf20Sopenharmony_civoid dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) 33308c2ecf20Sopenharmony_ci{ 33318c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle; 33328c2ecf20Sopenharmony_ci struct dlm_lock_resource *res; 33338c2ecf20Sopenharmony_ci struct hlist_head *bucket; 33348c2ecf20Sopenharmony_ci struct hlist_node *tmp; 33358c2ecf20Sopenharmony_ci unsigned int i; 33368c2ecf20Sopenharmony_ci 33378c2ecf20Sopenharmony_ci mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node); 33388c2ecf20Sopenharmony_citop: 33398c2ecf20Sopenharmony_ci assert_spin_locked(&dlm->spinlock); 33408c2ecf20Sopenharmony_ci 33418c2ecf20Sopenharmony_ci /* clean the master list */ 33428c2ecf20Sopenharmony_ci spin_lock(&dlm->master_lock); 33438c2ecf20Sopenharmony_ci for (i = 0; i < DLM_HASH_BUCKETS; i++) { 33448c2ecf20Sopenharmony_ci bucket = dlm_master_hash(dlm, i); 33458c2ecf20Sopenharmony_ci hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) { 33468c2ecf20Sopenharmony_ci BUG_ON(mle->type != DLM_MLE_BLOCK && 33478c2ecf20Sopenharmony_ci mle->type != DLM_MLE_MASTER && 33488c2ecf20Sopenharmony_ci mle->type != DLM_MLE_MIGRATION); 33498c2ecf20Sopenharmony_ci 33508c2ecf20Sopenharmony_ci /* MASTER mles are initiated locally. The waiting 33518c2ecf20Sopenharmony_ci * process will notice the node map change shortly. 33528c2ecf20Sopenharmony_ci * Let that happen as normal. */ 33538c2ecf20Sopenharmony_ci if (mle->type == DLM_MLE_MASTER) 33548c2ecf20Sopenharmony_ci continue; 33558c2ecf20Sopenharmony_ci 33568c2ecf20Sopenharmony_ci /* BLOCK mles are initiated by other nodes. Need to 33578c2ecf20Sopenharmony_ci * clean up if the dead node would have been the 33588c2ecf20Sopenharmony_ci * master. */ 33598c2ecf20Sopenharmony_ci if (mle->type == DLM_MLE_BLOCK) { 33608c2ecf20Sopenharmony_ci dlm_clean_block_mle(dlm, mle, dead_node); 33618c2ecf20Sopenharmony_ci continue; 33628c2ecf20Sopenharmony_ci } 33638c2ecf20Sopenharmony_ci 33648c2ecf20Sopenharmony_ci /* Everything else is a MIGRATION mle */ 33658c2ecf20Sopenharmony_ci 33668c2ecf20Sopenharmony_ci /* The rule for MIGRATION mles is that the master 33678c2ecf20Sopenharmony_ci * becomes UNKNOWN if *either* the original or the new 33688c2ecf20Sopenharmony_ci * master dies. All UNKNOWN lockres' are sent to 33698c2ecf20Sopenharmony_ci * whichever node becomes the recovery master. The new 33708c2ecf20Sopenharmony_ci * master is responsible for determining if there is 33718c2ecf20Sopenharmony_ci * still a master for this lockres, or if he needs to 33728c2ecf20Sopenharmony_ci * take over mastery. Either way, this node should 33738c2ecf20Sopenharmony_ci * expect another message to resolve this. */ 33748c2ecf20Sopenharmony_ci 33758c2ecf20Sopenharmony_ci if (mle->master != dead_node && 33768c2ecf20Sopenharmony_ci mle->new_master != dead_node) 33778c2ecf20Sopenharmony_ci continue; 33788c2ecf20Sopenharmony_ci 33798c2ecf20Sopenharmony_ci if (mle->new_master == dead_node && mle->inuse) { 33808c2ecf20Sopenharmony_ci mlog(ML_NOTICE, "%s: target %u died during " 33818c2ecf20Sopenharmony_ci "migration from %u, the MLE is " 33828c2ecf20Sopenharmony_ci "still keep used, ignore it!\n", 33838c2ecf20Sopenharmony_ci dlm->name, dead_node, 33848c2ecf20Sopenharmony_ci mle->master); 33858c2ecf20Sopenharmony_ci continue; 33868c2ecf20Sopenharmony_ci } 33878c2ecf20Sopenharmony_ci 33888c2ecf20Sopenharmony_ci /* If we have reached this point, this mle needs to be 33898c2ecf20Sopenharmony_ci * removed from the list and freed. */ 33908c2ecf20Sopenharmony_ci dlm_clean_migration_mle(dlm, mle); 33918c2ecf20Sopenharmony_ci 33928c2ecf20Sopenharmony_ci mlog(0, "%s: node %u died during migration from " 33938c2ecf20Sopenharmony_ci "%u to %u!\n", dlm->name, dead_node, mle->master, 33948c2ecf20Sopenharmony_ci mle->new_master); 33958c2ecf20Sopenharmony_ci 33968c2ecf20Sopenharmony_ci /* If we find a lockres associated with the mle, we've 33978c2ecf20Sopenharmony_ci * hit this rare case that messes up our lock ordering. 33988c2ecf20Sopenharmony_ci * If so, we need to drop the master lock so that we can 33998c2ecf20Sopenharmony_ci * take the lockres lock, meaning that we will have to 34008c2ecf20Sopenharmony_ci * restart from the head of list. */ 34018c2ecf20Sopenharmony_ci res = dlm_reset_mleres_owner(dlm, mle); 34028c2ecf20Sopenharmony_ci if (res) 34038c2ecf20Sopenharmony_ci /* restart */ 34048c2ecf20Sopenharmony_ci goto top; 34058c2ecf20Sopenharmony_ci 34068c2ecf20Sopenharmony_ci /* This may be the last reference */ 34078c2ecf20Sopenharmony_ci __dlm_put_mle(mle); 34088c2ecf20Sopenharmony_ci } 34098c2ecf20Sopenharmony_ci } 34108c2ecf20Sopenharmony_ci spin_unlock(&dlm->master_lock); 34118c2ecf20Sopenharmony_ci} 34128c2ecf20Sopenharmony_ci 34138c2ecf20Sopenharmony_ciint dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, 34148c2ecf20Sopenharmony_ci u8 old_master) 34158c2ecf20Sopenharmony_ci{ 34168c2ecf20Sopenharmony_ci struct dlm_node_iter iter; 34178c2ecf20Sopenharmony_ci int ret = 0; 34188c2ecf20Sopenharmony_ci 34198c2ecf20Sopenharmony_ci spin_lock(&dlm->spinlock); 34208c2ecf20Sopenharmony_ci dlm_node_iter_init(dlm->domain_map, &iter); 34218c2ecf20Sopenharmony_ci clear_bit(old_master, iter.node_map); 34228c2ecf20Sopenharmony_ci clear_bit(dlm->node_num, iter.node_map); 34238c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 34248c2ecf20Sopenharmony_ci 34258c2ecf20Sopenharmony_ci /* ownership of the lockres is changing. account for the 34268c2ecf20Sopenharmony_ci * mastery reference here since old_master will briefly have 34278c2ecf20Sopenharmony_ci * a reference after the migration completes */ 34288c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 34298c2ecf20Sopenharmony_ci dlm_lockres_set_refmap_bit(dlm, res, old_master); 34308c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 34318c2ecf20Sopenharmony_ci 34328c2ecf20Sopenharmony_ci mlog(0, "now time to do a migrate request to other nodes\n"); 34338c2ecf20Sopenharmony_ci ret = dlm_do_migrate_request(dlm, res, old_master, 34348c2ecf20Sopenharmony_ci dlm->node_num, &iter); 34358c2ecf20Sopenharmony_ci if (ret < 0) { 34368c2ecf20Sopenharmony_ci mlog_errno(ret); 34378c2ecf20Sopenharmony_ci goto leave; 34388c2ecf20Sopenharmony_ci } 34398c2ecf20Sopenharmony_ci 34408c2ecf20Sopenharmony_ci mlog(0, "doing assert master of %.*s to all except the original node\n", 34418c2ecf20Sopenharmony_ci res->lockname.len, res->lockname.name); 34428c2ecf20Sopenharmony_ci /* this call now finishes out the nodemap 34438c2ecf20Sopenharmony_ci * even if one or more nodes die */ 34448c2ecf20Sopenharmony_ci ret = dlm_do_assert_master(dlm, res, iter.node_map, 34458c2ecf20Sopenharmony_ci DLM_ASSERT_MASTER_FINISH_MIGRATION); 34468c2ecf20Sopenharmony_ci if (ret < 0) { 34478c2ecf20Sopenharmony_ci /* no longer need to retry. all living nodes contacted. */ 34488c2ecf20Sopenharmony_ci mlog_errno(ret); 34498c2ecf20Sopenharmony_ci ret = 0; 34508c2ecf20Sopenharmony_ci } 34518c2ecf20Sopenharmony_ci 34528c2ecf20Sopenharmony_ci memset(iter.node_map, 0, sizeof(iter.node_map)); 34538c2ecf20Sopenharmony_ci set_bit(old_master, iter.node_map); 34548c2ecf20Sopenharmony_ci mlog(0, "doing assert master of %.*s back to %u\n", 34558c2ecf20Sopenharmony_ci res->lockname.len, res->lockname.name, old_master); 34568c2ecf20Sopenharmony_ci ret = dlm_do_assert_master(dlm, res, iter.node_map, 34578c2ecf20Sopenharmony_ci DLM_ASSERT_MASTER_FINISH_MIGRATION); 34588c2ecf20Sopenharmony_ci if (ret < 0) { 34598c2ecf20Sopenharmony_ci mlog(0, "assert master to original master failed " 34608c2ecf20Sopenharmony_ci "with %d.\n", ret); 34618c2ecf20Sopenharmony_ci /* the only nonzero status here would be because of 34628c2ecf20Sopenharmony_ci * a dead original node. we're done. */ 34638c2ecf20Sopenharmony_ci ret = 0; 34648c2ecf20Sopenharmony_ci } 34658c2ecf20Sopenharmony_ci 34668c2ecf20Sopenharmony_ci /* all done, set the owner, clear the flag */ 34678c2ecf20Sopenharmony_ci spin_lock(&res->spinlock); 34688c2ecf20Sopenharmony_ci dlm_set_lockres_owner(dlm, res, dlm->node_num); 34698c2ecf20Sopenharmony_ci res->state &= ~DLM_LOCK_RES_MIGRATING; 34708c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 34718c2ecf20Sopenharmony_ci /* re-dirty it on the new master */ 34728c2ecf20Sopenharmony_ci dlm_kick_thread(dlm, res); 34738c2ecf20Sopenharmony_ci wake_up(&res->wq); 34748c2ecf20Sopenharmony_cileave: 34758c2ecf20Sopenharmony_ci return ret; 34768c2ecf20Sopenharmony_ci} 34778c2ecf20Sopenharmony_ci 34788c2ecf20Sopenharmony_ci/* 34798c2ecf20Sopenharmony_ci * LOCKRES AST REFCOUNT 34808c2ecf20Sopenharmony_ci * this is integral to migration 34818c2ecf20Sopenharmony_ci */ 34828c2ecf20Sopenharmony_ci 34838c2ecf20Sopenharmony_ci/* for future intent to call an ast, reserve one ahead of time. 34848c2ecf20Sopenharmony_ci * this should be called only after waiting on the lockres 34858c2ecf20Sopenharmony_ci * with dlm_wait_on_lockres, and while still holding the 34868c2ecf20Sopenharmony_ci * spinlock after the call. */ 34878c2ecf20Sopenharmony_civoid __dlm_lockres_reserve_ast(struct dlm_lock_resource *res) 34888c2ecf20Sopenharmony_ci{ 34898c2ecf20Sopenharmony_ci assert_spin_locked(&res->spinlock); 34908c2ecf20Sopenharmony_ci if (res->state & DLM_LOCK_RES_MIGRATING) { 34918c2ecf20Sopenharmony_ci __dlm_print_one_lock_resource(res); 34928c2ecf20Sopenharmony_ci } 34938c2ecf20Sopenharmony_ci BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); 34948c2ecf20Sopenharmony_ci 34958c2ecf20Sopenharmony_ci atomic_inc(&res->asts_reserved); 34968c2ecf20Sopenharmony_ci} 34978c2ecf20Sopenharmony_ci 34988c2ecf20Sopenharmony_ci/* 34998c2ecf20Sopenharmony_ci * used to drop the reserved ast, either because it went unused, 35008c2ecf20Sopenharmony_ci * or because the ast/bast was actually called. 35018c2ecf20Sopenharmony_ci * 35028c2ecf20Sopenharmony_ci * also, if there is a pending migration on this lockres, 35038c2ecf20Sopenharmony_ci * and this was the last pending ast on the lockres, 35048c2ecf20Sopenharmony_ci * atomically set the MIGRATING flag before we drop the lock. 35058c2ecf20Sopenharmony_ci * this is how we ensure that migration can proceed with no 35068c2ecf20Sopenharmony_ci * asts in progress. note that it is ok if the state of the 35078c2ecf20Sopenharmony_ci * queues is such that a lock should be granted in the future 35088c2ecf20Sopenharmony_ci * or that a bast should be fired, because the new master will 35098c2ecf20Sopenharmony_ci * shuffle the lists on this lockres as soon as it is migrated. 35108c2ecf20Sopenharmony_ci */ 35118c2ecf20Sopenharmony_civoid dlm_lockres_release_ast(struct dlm_ctxt *dlm, 35128c2ecf20Sopenharmony_ci struct dlm_lock_resource *res) 35138c2ecf20Sopenharmony_ci{ 35148c2ecf20Sopenharmony_ci if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock)) 35158c2ecf20Sopenharmony_ci return; 35168c2ecf20Sopenharmony_ci 35178c2ecf20Sopenharmony_ci if (!res->migration_pending) { 35188c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 35198c2ecf20Sopenharmony_ci return; 35208c2ecf20Sopenharmony_ci } 35218c2ecf20Sopenharmony_ci 35228c2ecf20Sopenharmony_ci BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); 35238c2ecf20Sopenharmony_ci res->migration_pending = 0; 35248c2ecf20Sopenharmony_ci res->state |= DLM_LOCK_RES_MIGRATING; 35258c2ecf20Sopenharmony_ci spin_unlock(&res->spinlock); 35268c2ecf20Sopenharmony_ci wake_up(&res->wq); 35278c2ecf20Sopenharmony_ci wake_up(&dlm->migration_wq); 35288c2ecf20Sopenharmony_ci} 35298c2ecf20Sopenharmony_ci 35308c2ecf20Sopenharmony_civoid dlm_force_free_mles(struct dlm_ctxt *dlm) 35318c2ecf20Sopenharmony_ci{ 35328c2ecf20Sopenharmony_ci int i; 35338c2ecf20Sopenharmony_ci struct hlist_head *bucket; 35348c2ecf20Sopenharmony_ci struct dlm_master_list_entry *mle; 35358c2ecf20Sopenharmony_ci struct hlist_node *tmp; 35368c2ecf20Sopenharmony_ci 35378c2ecf20Sopenharmony_ci /* 35388c2ecf20Sopenharmony_ci * We notified all other nodes that we are exiting the domain and 35398c2ecf20Sopenharmony_ci * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still 35408c2ecf20Sopenharmony_ci * around we force free them and wake any processes that are waiting 35418c2ecf20Sopenharmony_ci * on the mles 35428c2ecf20Sopenharmony_ci */ 35438c2ecf20Sopenharmony_ci spin_lock(&dlm->spinlock); 35448c2ecf20Sopenharmony_ci spin_lock(&dlm->master_lock); 35458c2ecf20Sopenharmony_ci 35468c2ecf20Sopenharmony_ci BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING); 35478c2ecf20Sopenharmony_ci BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES)); 35488c2ecf20Sopenharmony_ci 35498c2ecf20Sopenharmony_ci for (i = 0; i < DLM_HASH_BUCKETS; i++) { 35508c2ecf20Sopenharmony_ci bucket = dlm_master_hash(dlm, i); 35518c2ecf20Sopenharmony_ci hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) { 35528c2ecf20Sopenharmony_ci if (mle->type != DLM_MLE_BLOCK) { 35538c2ecf20Sopenharmony_ci mlog(ML_ERROR, "bad mle: %p\n", mle); 35548c2ecf20Sopenharmony_ci dlm_print_one_mle(mle); 35558c2ecf20Sopenharmony_ci } 35568c2ecf20Sopenharmony_ci atomic_set(&mle->woken, 1); 35578c2ecf20Sopenharmony_ci wake_up(&mle->wq); 35588c2ecf20Sopenharmony_ci 35598c2ecf20Sopenharmony_ci __dlm_unlink_mle(dlm, mle); 35608c2ecf20Sopenharmony_ci __dlm_mle_detach_hb_events(dlm, mle); 35618c2ecf20Sopenharmony_ci __dlm_put_mle(mle); 35628c2ecf20Sopenharmony_ci } 35638c2ecf20Sopenharmony_ci } 35648c2ecf20Sopenharmony_ci spin_unlock(&dlm->master_lock); 35658c2ecf20Sopenharmony_ci spin_unlock(&dlm->spinlock); 35668c2ecf20Sopenharmony_ci} 3567