162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * Copyright (C) 2015, SUSE 462306a36Sopenharmony_ci */ 562306a36Sopenharmony_ci 662306a36Sopenharmony_ci 762306a36Sopenharmony_ci#include <linux/module.h> 862306a36Sopenharmony_ci#include <linux/kthread.h> 962306a36Sopenharmony_ci#include <linux/dlm.h> 1062306a36Sopenharmony_ci#include <linux/sched.h> 1162306a36Sopenharmony_ci#include <linux/raid/md_p.h> 1262306a36Sopenharmony_ci#include "md.h" 1362306a36Sopenharmony_ci#include "md-bitmap.h" 1462306a36Sopenharmony_ci#include "md-cluster.h" 1562306a36Sopenharmony_ci 1662306a36Sopenharmony_ci#define LVB_SIZE 64 1762306a36Sopenharmony_ci#define NEW_DEV_TIMEOUT 5000 1862306a36Sopenharmony_ci 1962306a36Sopenharmony_cistruct dlm_lock_resource { 2062306a36Sopenharmony_ci dlm_lockspace_t *ls; 2162306a36Sopenharmony_ci struct dlm_lksb lksb; 2262306a36Sopenharmony_ci char *name; /* lock name. */ 2362306a36Sopenharmony_ci uint32_t flags; /* flags to pass to dlm_lock() */ 2462306a36Sopenharmony_ci wait_queue_head_t sync_locking; /* wait queue for synchronized locking */ 2562306a36Sopenharmony_ci bool sync_locking_done; 2662306a36Sopenharmony_ci void (*bast)(void *arg, int mode); /* blocking AST function pointer*/ 2762306a36Sopenharmony_ci struct mddev *mddev; /* pointing back to mddev. */ 2862306a36Sopenharmony_ci int mode; 2962306a36Sopenharmony_ci}; 3062306a36Sopenharmony_ci 3162306a36Sopenharmony_cistruct resync_info { 3262306a36Sopenharmony_ci __le64 lo; 3362306a36Sopenharmony_ci __le64 hi; 3462306a36Sopenharmony_ci}; 3562306a36Sopenharmony_ci 3662306a36Sopenharmony_ci/* md_cluster_info flags */ 3762306a36Sopenharmony_ci#define MD_CLUSTER_WAITING_FOR_NEWDISK 1 3862306a36Sopenharmony_ci#define MD_CLUSTER_SUSPEND_READ_BALANCING 2 3962306a36Sopenharmony_ci#define MD_CLUSTER_BEGIN_JOIN_CLUSTER 3 4062306a36Sopenharmony_ci 4162306a36Sopenharmony_ci/* Lock the send communication. This is done through 4262306a36Sopenharmony_ci * bit manipulation as opposed to a mutex in order to 4362306a36Sopenharmony_ci * accommodate lock and hold. See next comment. 4462306a36Sopenharmony_ci */ 4562306a36Sopenharmony_ci#define MD_CLUSTER_SEND_LOCK 4 4662306a36Sopenharmony_ci/* If cluster operations (such as adding a disk) must lock the 4762306a36Sopenharmony_ci * communication channel, so as to perform extra operations 4862306a36Sopenharmony_ci * (update metadata) and no other operation is allowed on the 4962306a36Sopenharmony_ci * MD. Token needs to be locked and held until the operation 5062306a36Sopenharmony_ci * completes witha md_update_sb(), which would eventually release 5162306a36Sopenharmony_ci * the lock. 5262306a36Sopenharmony_ci */ 5362306a36Sopenharmony_ci#define MD_CLUSTER_SEND_LOCKED_ALREADY 5 5462306a36Sopenharmony_ci/* We should receive message after node joined cluster and 5562306a36Sopenharmony_ci * set up all the related infos such as bitmap and personality */ 5662306a36Sopenharmony_ci#define MD_CLUSTER_ALREADY_IN_CLUSTER 6 5762306a36Sopenharmony_ci#define MD_CLUSTER_PENDING_RECV_EVENT 7 5862306a36Sopenharmony_ci#define MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD 8 5962306a36Sopenharmony_ci 6062306a36Sopenharmony_cistruct md_cluster_info { 6162306a36Sopenharmony_ci struct mddev *mddev; /* the md device which md_cluster_info belongs to */ 6262306a36Sopenharmony_ci /* dlm lock space and resources for clustered raid. */ 6362306a36Sopenharmony_ci dlm_lockspace_t *lockspace; 6462306a36Sopenharmony_ci int slot_number; 6562306a36Sopenharmony_ci struct completion completion; 6662306a36Sopenharmony_ci struct mutex recv_mutex; 6762306a36Sopenharmony_ci struct dlm_lock_resource *bitmap_lockres; 6862306a36Sopenharmony_ci struct dlm_lock_resource **other_bitmap_lockres; 6962306a36Sopenharmony_ci struct dlm_lock_resource *resync_lockres; 7062306a36Sopenharmony_ci struct list_head suspend_list; 7162306a36Sopenharmony_ci 7262306a36Sopenharmony_ci spinlock_t suspend_lock; 7362306a36Sopenharmony_ci /* record the region which write should be suspended */ 7462306a36Sopenharmony_ci sector_t suspend_lo; 7562306a36Sopenharmony_ci sector_t suspend_hi; 7662306a36Sopenharmony_ci int suspend_from; /* the slot which broadcast suspend_lo/hi */ 7762306a36Sopenharmony_ci 7862306a36Sopenharmony_ci struct md_thread __rcu *recovery_thread; 7962306a36Sopenharmony_ci unsigned long recovery_map; 8062306a36Sopenharmony_ci /* communication loc resources */ 8162306a36Sopenharmony_ci struct dlm_lock_resource *ack_lockres; 8262306a36Sopenharmony_ci struct dlm_lock_resource *message_lockres; 8362306a36Sopenharmony_ci struct dlm_lock_resource *token_lockres; 8462306a36Sopenharmony_ci struct dlm_lock_resource *no_new_dev_lockres; 8562306a36Sopenharmony_ci struct md_thread __rcu *recv_thread; 8662306a36Sopenharmony_ci struct completion newdisk_completion; 8762306a36Sopenharmony_ci wait_queue_head_t wait; 8862306a36Sopenharmony_ci unsigned long state; 8962306a36Sopenharmony_ci /* record the region in RESYNCING message */ 9062306a36Sopenharmony_ci sector_t sync_low; 9162306a36Sopenharmony_ci sector_t sync_hi; 9262306a36Sopenharmony_ci}; 9362306a36Sopenharmony_ci 9462306a36Sopenharmony_cienum msg_type { 9562306a36Sopenharmony_ci METADATA_UPDATED = 0, 9662306a36Sopenharmony_ci RESYNCING, 9762306a36Sopenharmony_ci NEWDISK, 9862306a36Sopenharmony_ci REMOVE, 9962306a36Sopenharmony_ci RE_ADD, 10062306a36Sopenharmony_ci BITMAP_NEEDS_SYNC, 10162306a36Sopenharmony_ci CHANGE_CAPACITY, 10262306a36Sopenharmony_ci BITMAP_RESIZE, 10362306a36Sopenharmony_ci}; 10462306a36Sopenharmony_ci 10562306a36Sopenharmony_cistruct cluster_msg { 10662306a36Sopenharmony_ci __le32 type; 10762306a36Sopenharmony_ci __le32 slot; 10862306a36Sopenharmony_ci /* TODO: Unionize this for smaller footprint */ 10962306a36Sopenharmony_ci __le64 low; 11062306a36Sopenharmony_ci __le64 high; 11162306a36Sopenharmony_ci char uuid[16]; 11262306a36Sopenharmony_ci __le32 raid_slot; 11362306a36Sopenharmony_ci}; 11462306a36Sopenharmony_ci 11562306a36Sopenharmony_cistatic void sync_ast(void *arg) 11662306a36Sopenharmony_ci{ 11762306a36Sopenharmony_ci struct dlm_lock_resource *res; 11862306a36Sopenharmony_ci 11962306a36Sopenharmony_ci res = arg; 12062306a36Sopenharmony_ci res->sync_locking_done = true; 12162306a36Sopenharmony_ci wake_up(&res->sync_locking); 12262306a36Sopenharmony_ci} 12362306a36Sopenharmony_ci 12462306a36Sopenharmony_cistatic int dlm_lock_sync(struct dlm_lock_resource *res, int mode) 12562306a36Sopenharmony_ci{ 12662306a36Sopenharmony_ci int ret = 0; 12762306a36Sopenharmony_ci 12862306a36Sopenharmony_ci ret = dlm_lock(res->ls, mode, &res->lksb, 12962306a36Sopenharmony_ci res->flags, res->name, strlen(res->name), 13062306a36Sopenharmony_ci 0, sync_ast, res, res->bast); 13162306a36Sopenharmony_ci if (ret) 13262306a36Sopenharmony_ci return ret; 13362306a36Sopenharmony_ci wait_event(res->sync_locking, res->sync_locking_done); 13462306a36Sopenharmony_ci res->sync_locking_done = false; 13562306a36Sopenharmony_ci if (res->lksb.sb_status == 0) 13662306a36Sopenharmony_ci res->mode = mode; 13762306a36Sopenharmony_ci return res->lksb.sb_status; 13862306a36Sopenharmony_ci} 13962306a36Sopenharmony_ci 14062306a36Sopenharmony_cistatic int dlm_unlock_sync(struct dlm_lock_resource *res) 14162306a36Sopenharmony_ci{ 14262306a36Sopenharmony_ci return dlm_lock_sync(res, DLM_LOCK_NL); 14362306a36Sopenharmony_ci} 14462306a36Sopenharmony_ci 14562306a36Sopenharmony_ci/* 14662306a36Sopenharmony_ci * An variation of dlm_lock_sync, which make lock request could 14762306a36Sopenharmony_ci * be interrupted 14862306a36Sopenharmony_ci */ 14962306a36Sopenharmony_cistatic int dlm_lock_sync_interruptible(struct dlm_lock_resource *res, int mode, 15062306a36Sopenharmony_ci struct mddev *mddev) 15162306a36Sopenharmony_ci{ 15262306a36Sopenharmony_ci int ret = 0; 15362306a36Sopenharmony_ci 15462306a36Sopenharmony_ci ret = dlm_lock(res->ls, mode, &res->lksb, 15562306a36Sopenharmony_ci res->flags, res->name, strlen(res->name), 15662306a36Sopenharmony_ci 0, sync_ast, res, res->bast); 15762306a36Sopenharmony_ci if (ret) 15862306a36Sopenharmony_ci return ret; 15962306a36Sopenharmony_ci 16062306a36Sopenharmony_ci wait_event(res->sync_locking, res->sync_locking_done 16162306a36Sopenharmony_ci || kthread_should_stop() 16262306a36Sopenharmony_ci || test_bit(MD_CLOSING, &mddev->flags)); 16362306a36Sopenharmony_ci if (!res->sync_locking_done) { 16462306a36Sopenharmony_ci /* 16562306a36Sopenharmony_ci * the convert queue contains the lock request when request is 16662306a36Sopenharmony_ci * interrupted, and sync_ast could still be run, so need to 16762306a36Sopenharmony_ci * cancel the request and reset completion 16862306a36Sopenharmony_ci */ 16962306a36Sopenharmony_ci ret = dlm_unlock(res->ls, res->lksb.sb_lkid, DLM_LKF_CANCEL, 17062306a36Sopenharmony_ci &res->lksb, res); 17162306a36Sopenharmony_ci res->sync_locking_done = false; 17262306a36Sopenharmony_ci if (unlikely(ret != 0)) 17362306a36Sopenharmony_ci pr_info("failed to cancel previous lock request " 17462306a36Sopenharmony_ci "%s return %d\n", res->name, ret); 17562306a36Sopenharmony_ci return -EPERM; 17662306a36Sopenharmony_ci } else 17762306a36Sopenharmony_ci res->sync_locking_done = false; 17862306a36Sopenharmony_ci if (res->lksb.sb_status == 0) 17962306a36Sopenharmony_ci res->mode = mode; 18062306a36Sopenharmony_ci return res->lksb.sb_status; 18162306a36Sopenharmony_ci} 18262306a36Sopenharmony_ci 18362306a36Sopenharmony_cistatic struct dlm_lock_resource *lockres_init(struct mddev *mddev, 18462306a36Sopenharmony_ci char *name, void (*bastfn)(void *arg, int mode), int with_lvb) 18562306a36Sopenharmony_ci{ 18662306a36Sopenharmony_ci struct dlm_lock_resource *res = NULL; 18762306a36Sopenharmony_ci int ret, namelen; 18862306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 18962306a36Sopenharmony_ci 19062306a36Sopenharmony_ci res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL); 19162306a36Sopenharmony_ci if (!res) 19262306a36Sopenharmony_ci return NULL; 19362306a36Sopenharmony_ci init_waitqueue_head(&res->sync_locking); 19462306a36Sopenharmony_ci res->sync_locking_done = false; 19562306a36Sopenharmony_ci res->ls = cinfo->lockspace; 19662306a36Sopenharmony_ci res->mddev = mddev; 19762306a36Sopenharmony_ci res->mode = DLM_LOCK_IV; 19862306a36Sopenharmony_ci namelen = strlen(name); 19962306a36Sopenharmony_ci res->name = kzalloc(namelen + 1, GFP_KERNEL); 20062306a36Sopenharmony_ci if (!res->name) { 20162306a36Sopenharmony_ci pr_err("md-cluster: Unable to allocate resource name for resource %s\n", name); 20262306a36Sopenharmony_ci goto out_err; 20362306a36Sopenharmony_ci } 20462306a36Sopenharmony_ci strscpy(res->name, name, namelen + 1); 20562306a36Sopenharmony_ci if (with_lvb) { 20662306a36Sopenharmony_ci res->lksb.sb_lvbptr = kzalloc(LVB_SIZE, GFP_KERNEL); 20762306a36Sopenharmony_ci if (!res->lksb.sb_lvbptr) { 20862306a36Sopenharmony_ci pr_err("md-cluster: Unable to allocate LVB for resource %s\n", name); 20962306a36Sopenharmony_ci goto out_err; 21062306a36Sopenharmony_ci } 21162306a36Sopenharmony_ci res->flags = DLM_LKF_VALBLK; 21262306a36Sopenharmony_ci } 21362306a36Sopenharmony_ci 21462306a36Sopenharmony_ci if (bastfn) 21562306a36Sopenharmony_ci res->bast = bastfn; 21662306a36Sopenharmony_ci 21762306a36Sopenharmony_ci res->flags |= DLM_LKF_EXPEDITE; 21862306a36Sopenharmony_ci 21962306a36Sopenharmony_ci ret = dlm_lock_sync(res, DLM_LOCK_NL); 22062306a36Sopenharmony_ci if (ret) { 22162306a36Sopenharmony_ci pr_err("md-cluster: Unable to lock NL on new lock resource %s\n", name); 22262306a36Sopenharmony_ci goto out_err; 22362306a36Sopenharmony_ci } 22462306a36Sopenharmony_ci res->flags &= ~DLM_LKF_EXPEDITE; 22562306a36Sopenharmony_ci res->flags |= DLM_LKF_CONVERT; 22662306a36Sopenharmony_ci 22762306a36Sopenharmony_ci return res; 22862306a36Sopenharmony_ciout_err: 22962306a36Sopenharmony_ci kfree(res->lksb.sb_lvbptr); 23062306a36Sopenharmony_ci kfree(res->name); 23162306a36Sopenharmony_ci kfree(res); 23262306a36Sopenharmony_ci return NULL; 23362306a36Sopenharmony_ci} 23462306a36Sopenharmony_ci 23562306a36Sopenharmony_cistatic void lockres_free(struct dlm_lock_resource *res) 23662306a36Sopenharmony_ci{ 23762306a36Sopenharmony_ci int ret = 0; 23862306a36Sopenharmony_ci 23962306a36Sopenharmony_ci if (!res) 24062306a36Sopenharmony_ci return; 24162306a36Sopenharmony_ci 24262306a36Sopenharmony_ci /* 24362306a36Sopenharmony_ci * use FORCEUNLOCK flag, so we can unlock even the lock is on the 24462306a36Sopenharmony_ci * waiting or convert queue 24562306a36Sopenharmony_ci */ 24662306a36Sopenharmony_ci ret = dlm_unlock(res->ls, res->lksb.sb_lkid, DLM_LKF_FORCEUNLOCK, 24762306a36Sopenharmony_ci &res->lksb, res); 24862306a36Sopenharmony_ci if (unlikely(ret != 0)) 24962306a36Sopenharmony_ci pr_err("failed to unlock %s return %d\n", res->name, ret); 25062306a36Sopenharmony_ci else 25162306a36Sopenharmony_ci wait_event(res->sync_locking, res->sync_locking_done); 25262306a36Sopenharmony_ci 25362306a36Sopenharmony_ci kfree(res->name); 25462306a36Sopenharmony_ci kfree(res->lksb.sb_lvbptr); 25562306a36Sopenharmony_ci kfree(res); 25662306a36Sopenharmony_ci} 25762306a36Sopenharmony_ci 25862306a36Sopenharmony_cistatic void add_resync_info(struct dlm_lock_resource *lockres, 25962306a36Sopenharmony_ci sector_t lo, sector_t hi) 26062306a36Sopenharmony_ci{ 26162306a36Sopenharmony_ci struct resync_info *ri; 26262306a36Sopenharmony_ci 26362306a36Sopenharmony_ci ri = (struct resync_info *)lockres->lksb.sb_lvbptr; 26462306a36Sopenharmony_ci ri->lo = cpu_to_le64(lo); 26562306a36Sopenharmony_ci ri->hi = cpu_to_le64(hi); 26662306a36Sopenharmony_ci} 26762306a36Sopenharmony_ci 26862306a36Sopenharmony_cistatic int read_resync_info(struct mddev *mddev, 26962306a36Sopenharmony_ci struct dlm_lock_resource *lockres) 27062306a36Sopenharmony_ci{ 27162306a36Sopenharmony_ci struct resync_info ri; 27262306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 27362306a36Sopenharmony_ci int ret = 0; 27462306a36Sopenharmony_ci 27562306a36Sopenharmony_ci dlm_lock_sync(lockres, DLM_LOCK_CR); 27662306a36Sopenharmony_ci memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info)); 27762306a36Sopenharmony_ci if (le64_to_cpu(ri.hi) > 0) { 27862306a36Sopenharmony_ci cinfo->suspend_hi = le64_to_cpu(ri.hi); 27962306a36Sopenharmony_ci cinfo->suspend_lo = le64_to_cpu(ri.lo); 28062306a36Sopenharmony_ci ret = 1; 28162306a36Sopenharmony_ci } 28262306a36Sopenharmony_ci dlm_unlock_sync(lockres); 28362306a36Sopenharmony_ci return ret; 28462306a36Sopenharmony_ci} 28562306a36Sopenharmony_ci 28662306a36Sopenharmony_cistatic void recover_bitmaps(struct md_thread *thread) 28762306a36Sopenharmony_ci{ 28862306a36Sopenharmony_ci struct mddev *mddev = thread->mddev; 28962306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 29062306a36Sopenharmony_ci struct dlm_lock_resource *bm_lockres; 29162306a36Sopenharmony_ci char str[64]; 29262306a36Sopenharmony_ci int slot, ret; 29362306a36Sopenharmony_ci sector_t lo, hi; 29462306a36Sopenharmony_ci 29562306a36Sopenharmony_ci while (cinfo->recovery_map) { 29662306a36Sopenharmony_ci slot = fls64((u64)cinfo->recovery_map) - 1; 29762306a36Sopenharmony_ci 29862306a36Sopenharmony_ci snprintf(str, 64, "bitmap%04d", slot); 29962306a36Sopenharmony_ci bm_lockres = lockres_init(mddev, str, NULL, 1); 30062306a36Sopenharmony_ci if (!bm_lockres) { 30162306a36Sopenharmony_ci pr_err("md-cluster: Cannot initialize bitmaps\n"); 30262306a36Sopenharmony_ci goto clear_bit; 30362306a36Sopenharmony_ci } 30462306a36Sopenharmony_ci 30562306a36Sopenharmony_ci ret = dlm_lock_sync_interruptible(bm_lockres, DLM_LOCK_PW, mddev); 30662306a36Sopenharmony_ci if (ret) { 30762306a36Sopenharmony_ci pr_err("md-cluster: Could not DLM lock %s: %d\n", 30862306a36Sopenharmony_ci str, ret); 30962306a36Sopenharmony_ci goto clear_bit; 31062306a36Sopenharmony_ci } 31162306a36Sopenharmony_ci ret = md_bitmap_copy_from_slot(mddev, slot, &lo, &hi, true); 31262306a36Sopenharmony_ci if (ret) { 31362306a36Sopenharmony_ci pr_err("md-cluster: Could not copy data from bitmap %d\n", slot); 31462306a36Sopenharmony_ci goto clear_bit; 31562306a36Sopenharmony_ci } 31662306a36Sopenharmony_ci 31762306a36Sopenharmony_ci /* Clear suspend_area associated with the bitmap */ 31862306a36Sopenharmony_ci spin_lock_irq(&cinfo->suspend_lock); 31962306a36Sopenharmony_ci cinfo->suspend_hi = 0; 32062306a36Sopenharmony_ci cinfo->suspend_lo = 0; 32162306a36Sopenharmony_ci cinfo->suspend_from = -1; 32262306a36Sopenharmony_ci spin_unlock_irq(&cinfo->suspend_lock); 32362306a36Sopenharmony_ci 32462306a36Sopenharmony_ci /* Kick off a reshape if needed */ 32562306a36Sopenharmony_ci if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && 32662306a36Sopenharmony_ci test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 32762306a36Sopenharmony_ci mddev->reshape_position != MaxSector) 32862306a36Sopenharmony_ci md_wakeup_thread(mddev->sync_thread); 32962306a36Sopenharmony_ci 33062306a36Sopenharmony_ci if (hi > 0) { 33162306a36Sopenharmony_ci if (lo < mddev->recovery_cp) 33262306a36Sopenharmony_ci mddev->recovery_cp = lo; 33362306a36Sopenharmony_ci /* wake up thread to continue resync in case resync 33462306a36Sopenharmony_ci * is not finished */ 33562306a36Sopenharmony_ci if (mddev->recovery_cp != MaxSector) { 33662306a36Sopenharmony_ci /* 33762306a36Sopenharmony_ci * clear the REMOTE flag since we will launch 33862306a36Sopenharmony_ci * resync thread in current node. 33962306a36Sopenharmony_ci */ 34062306a36Sopenharmony_ci clear_bit(MD_RESYNCING_REMOTE, 34162306a36Sopenharmony_ci &mddev->recovery); 34262306a36Sopenharmony_ci set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 34362306a36Sopenharmony_ci md_wakeup_thread(mddev->thread); 34462306a36Sopenharmony_ci } 34562306a36Sopenharmony_ci } 34662306a36Sopenharmony_ciclear_bit: 34762306a36Sopenharmony_ci lockres_free(bm_lockres); 34862306a36Sopenharmony_ci clear_bit(slot, &cinfo->recovery_map); 34962306a36Sopenharmony_ci } 35062306a36Sopenharmony_ci} 35162306a36Sopenharmony_ci 35262306a36Sopenharmony_cistatic void recover_prep(void *arg) 35362306a36Sopenharmony_ci{ 35462306a36Sopenharmony_ci struct mddev *mddev = arg; 35562306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 35662306a36Sopenharmony_ci set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state); 35762306a36Sopenharmony_ci} 35862306a36Sopenharmony_ci 35962306a36Sopenharmony_cistatic void __recover_slot(struct mddev *mddev, int slot) 36062306a36Sopenharmony_ci{ 36162306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 36262306a36Sopenharmony_ci 36362306a36Sopenharmony_ci set_bit(slot, &cinfo->recovery_map); 36462306a36Sopenharmony_ci if (!cinfo->recovery_thread) { 36562306a36Sopenharmony_ci rcu_assign_pointer(cinfo->recovery_thread, 36662306a36Sopenharmony_ci md_register_thread(recover_bitmaps, mddev, "recover")); 36762306a36Sopenharmony_ci if (!cinfo->recovery_thread) { 36862306a36Sopenharmony_ci pr_warn("md-cluster: Could not create recovery thread\n"); 36962306a36Sopenharmony_ci return; 37062306a36Sopenharmony_ci } 37162306a36Sopenharmony_ci } 37262306a36Sopenharmony_ci md_wakeup_thread(cinfo->recovery_thread); 37362306a36Sopenharmony_ci} 37462306a36Sopenharmony_ci 37562306a36Sopenharmony_cistatic void recover_slot(void *arg, struct dlm_slot *slot) 37662306a36Sopenharmony_ci{ 37762306a36Sopenharmony_ci struct mddev *mddev = arg; 37862306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 37962306a36Sopenharmony_ci 38062306a36Sopenharmony_ci pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n", 38162306a36Sopenharmony_ci mddev->bitmap_info.cluster_name, 38262306a36Sopenharmony_ci slot->nodeid, slot->slot, 38362306a36Sopenharmony_ci cinfo->slot_number); 38462306a36Sopenharmony_ci /* deduct one since dlm slot starts from one while the num of 38562306a36Sopenharmony_ci * cluster-md begins with 0 */ 38662306a36Sopenharmony_ci __recover_slot(mddev, slot->slot - 1); 38762306a36Sopenharmony_ci} 38862306a36Sopenharmony_ci 38962306a36Sopenharmony_cistatic void recover_done(void *arg, struct dlm_slot *slots, 39062306a36Sopenharmony_ci int num_slots, int our_slot, 39162306a36Sopenharmony_ci uint32_t generation) 39262306a36Sopenharmony_ci{ 39362306a36Sopenharmony_ci struct mddev *mddev = arg; 39462306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 39562306a36Sopenharmony_ci 39662306a36Sopenharmony_ci cinfo->slot_number = our_slot; 39762306a36Sopenharmony_ci /* completion is only need to be complete when node join cluster, 39862306a36Sopenharmony_ci * it doesn't need to run during another node's failure */ 39962306a36Sopenharmony_ci if (test_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state)) { 40062306a36Sopenharmony_ci complete(&cinfo->completion); 40162306a36Sopenharmony_ci clear_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state); 40262306a36Sopenharmony_ci } 40362306a36Sopenharmony_ci clear_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state); 40462306a36Sopenharmony_ci} 40562306a36Sopenharmony_ci 40662306a36Sopenharmony_ci/* the ops is called when node join the cluster, and do lock recovery 40762306a36Sopenharmony_ci * if node failure occurs */ 40862306a36Sopenharmony_cistatic const struct dlm_lockspace_ops md_ls_ops = { 40962306a36Sopenharmony_ci .recover_prep = recover_prep, 41062306a36Sopenharmony_ci .recover_slot = recover_slot, 41162306a36Sopenharmony_ci .recover_done = recover_done, 41262306a36Sopenharmony_ci}; 41362306a36Sopenharmony_ci 41462306a36Sopenharmony_ci/* 41562306a36Sopenharmony_ci * The BAST function for the ack lock resource 41662306a36Sopenharmony_ci * This function wakes up the receive thread in 41762306a36Sopenharmony_ci * order to receive and process the message. 41862306a36Sopenharmony_ci */ 41962306a36Sopenharmony_cistatic void ack_bast(void *arg, int mode) 42062306a36Sopenharmony_ci{ 42162306a36Sopenharmony_ci struct dlm_lock_resource *res = arg; 42262306a36Sopenharmony_ci struct md_cluster_info *cinfo = res->mddev->cluster_info; 42362306a36Sopenharmony_ci 42462306a36Sopenharmony_ci if (mode == DLM_LOCK_EX) { 42562306a36Sopenharmony_ci if (test_bit(MD_CLUSTER_ALREADY_IN_CLUSTER, &cinfo->state)) 42662306a36Sopenharmony_ci md_wakeup_thread(cinfo->recv_thread); 42762306a36Sopenharmony_ci else 42862306a36Sopenharmony_ci set_bit(MD_CLUSTER_PENDING_RECV_EVENT, &cinfo->state); 42962306a36Sopenharmony_ci } 43062306a36Sopenharmony_ci} 43162306a36Sopenharmony_ci 43262306a36Sopenharmony_cistatic void remove_suspend_info(struct mddev *mddev, int slot) 43362306a36Sopenharmony_ci{ 43462306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 43562306a36Sopenharmony_ci mddev->pers->quiesce(mddev, 1); 43662306a36Sopenharmony_ci spin_lock_irq(&cinfo->suspend_lock); 43762306a36Sopenharmony_ci cinfo->suspend_hi = 0; 43862306a36Sopenharmony_ci cinfo->suspend_lo = 0; 43962306a36Sopenharmony_ci spin_unlock_irq(&cinfo->suspend_lock); 44062306a36Sopenharmony_ci mddev->pers->quiesce(mddev, 0); 44162306a36Sopenharmony_ci} 44262306a36Sopenharmony_ci 44362306a36Sopenharmony_cistatic void process_suspend_info(struct mddev *mddev, 44462306a36Sopenharmony_ci int slot, sector_t lo, sector_t hi) 44562306a36Sopenharmony_ci{ 44662306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 44762306a36Sopenharmony_ci struct mdp_superblock_1 *sb = NULL; 44862306a36Sopenharmony_ci struct md_rdev *rdev; 44962306a36Sopenharmony_ci 45062306a36Sopenharmony_ci if (!hi) { 45162306a36Sopenharmony_ci /* 45262306a36Sopenharmony_ci * clear the REMOTE flag since resync or recovery is finished 45362306a36Sopenharmony_ci * in remote node. 45462306a36Sopenharmony_ci */ 45562306a36Sopenharmony_ci clear_bit(MD_RESYNCING_REMOTE, &mddev->recovery); 45662306a36Sopenharmony_ci remove_suspend_info(mddev, slot); 45762306a36Sopenharmony_ci set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 45862306a36Sopenharmony_ci md_wakeup_thread(mddev->thread); 45962306a36Sopenharmony_ci return; 46062306a36Sopenharmony_ci } 46162306a36Sopenharmony_ci 46262306a36Sopenharmony_ci rdev_for_each(rdev, mddev) 46362306a36Sopenharmony_ci if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) { 46462306a36Sopenharmony_ci sb = page_address(rdev->sb_page); 46562306a36Sopenharmony_ci break; 46662306a36Sopenharmony_ci } 46762306a36Sopenharmony_ci 46862306a36Sopenharmony_ci /* 46962306a36Sopenharmony_ci * The bitmaps are not same for different nodes 47062306a36Sopenharmony_ci * if RESYNCING is happening in one node, then 47162306a36Sopenharmony_ci * the node which received the RESYNCING message 47262306a36Sopenharmony_ci * probably will perform resync with the region 47362306a36Sopenharmony_ci * [lo, hi] again, so we could reduce resync time 47462306a36Sopenharmony_ci * a lot if we can ensure that the bitmaps among 47562306a36Sopenharmony_ci * different nodes are match up well. 47662306a36Sopenharmony_ci * 47762306a36Sopenharmony_ci * sync_low/hi is used to record the region which 47862306a36Sopenharmony_ci * arrived in the previous RESYNCING message, 47962306a36Sopenharmony_ci * 48062306a36Sopenharmony_ci * Call md_bitmap_sync_with_cluster to clear NEEDED_MASK 48162306a36Sopenharmony_ci * and set RESYNC_MASK since resync thread is running 48262306a36Sopenharmony_ci * in another node, so we don't need to do the resync 48362306a36Sopenharmony_ci * again with the same section. 48462306a36Sopenharmony_ci * 48562306a36Sopenharmony_ci * Skip md_bitmap_sync_with_cluster in case reshape 48662306a36Sopenharmony_ci * happening, because reshaping region is small and 48762306a36Sopenharmony_ci * we don't want to trigger lots of WARN. 48862306a36Sopenharmony_ci */ 48962306a36Sopenharmony_ci if (sb && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) 49062306a36Sopenharmony_ci md_bitmap_sync_with_cluster(mddev, cinfo->sync_low, 49162306a36Sopenharmony_ci cinfo->sync_hi, lo, hi); 49262306a36Sopenharmony_ci cinfo->sync_low = lo; 49362306a36Sopenharmony_ci cinfo->sync_hi = hi; 49462306a36Sopenharmony_ci 49562306a36Sopenharmony_ci mddev->pers->quiesce(mddev, 1); 49662306a36Sopenharmony_ci spin_lock_irq(&cinfo->suspend_lock); 49762306a36Sopenharmony_ci cinfo->suspend_from = slot; 49862306a36Sopenharmony_ci cinfo->suspend_lo = lo; 49962306a36Sopenharmony_ci cinfo->suspend_hi = hi; 50062306a36Sopenharmony_ci spin_unlock_irq(&cinfo->suspend_lock); 50162306a36Sopenharmony_ci mddev->pers->quiesce(mddev, 0); 50262306a36Sopenharmony_ci} 50362306a36Sopenharmony_ci 50462306a36Sopenharmony_cistatic void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) 50562306a36Sopenharmony_ci{ 50662306a36Sopenharmony_ci char disk_uuid[64]; 50762306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 50862306a36Sopenharmony_ci char event_name[] = "EVENT=ADD_DEVICE"; 50962306a36Sopenharmony_ci char raid_slot[16]; 51062306a36Sopenharmony_ci char *envp[] = {event_name, disk_uuid, raid_slot, NULL}; 51162306a36Sopenharmony_ci int len; 51262306a36Sopenharmony_ci 51362306a36Sopenharmony_ci len = snprintf(disk_uuid, 64, "DEVICE_UUID="); 51462306a36Sopenharmony_ci sprintf(disk_uuid + len, "%pU", cmsg->uuid); 51562306a36Sopenharmony_ci snprintf(raid_slot, 16, "RAID_DISK=%d", le32_to_cpu(cmsg->raid_slot)); 51662306a36Sopenharmony_ci pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot); 51762306a36Sopenharmony_ci init_completion(&cinfo->newdisk_completion); 51862306a36Sopenharmony_ci set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state); 51962306a36Sopenharmony_ci kobject_uevent_env(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE, envp); 52062306a36Sopenharmony_ci wait_for_completion_timeout(&cinfo->newdisk_completion, 52162306a36Sopenharmony_ci NEW_DEV_TIMEOUT); 52262306a36Sopenharmony_ci clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state); 52362306a36Sopenharmony_ci} 52462306a36Sopenharmony_ci 52562306a36Sopenharmony_ci 52662306a36Sopenharmony_cistatic void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg) 52762306a36Sopenharmony_ci{ 52862306a36Sopenharmony_ci int got_lock = 0; 52962306a36Sopenharmony_ci struct md_thread *thread; 53062306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 53162306a36Sopenharmony_ci mddev->good_device_nr = le32_to_cpu(msg->raid_slot); 53262306a36Sopenharmony_ci 53362306a36Sopenharmony_ci dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); 53462306a36Sopenharmony_ci 53562306a36Sopenharmony_ci /* daemaon thread must exist */ 53662306a36Sopenharmony_ci thread = rcu_dereference_protected(mddev->thread, true); 53762306a36Sopenharmony_ci wait_event(thread->wqueue, 53862306a36Sopenharmony_ci (got_lock = mddev_trylock(mddev)) || 53962306a36Sopenharmony_ci test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state)); 54062306a36Sopenharmony_ci md_reload_sb(mddev, mddev->good_device_nr); 54162306a36Sopenharmony_ci if (got_lock) 54262306a36Sopenharmony_ci mddev_unlock(mddev); 54362306a36Sopenharmony_ci} 54462306a36Sopenharmony_ci 54562306a36Sopenharmony_cistatic void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg) 54662306a36Sopenharmony_ci{ 54762306a36Sopenharmony_ci struct md_rdev *rdev; 54862306a36Sopenharmony_ci 54962306a36Sopenharmony_ci rcu_read_lock(); 55062306a36Sopenharmony_ci rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot)); 55162306a36Sopenharmony_ci if (rdev) { 55262306a36Sopenharmony_ci set_bit(ClusterRemove, &rdev->flags); 55362306a36Sopenharmony_ci set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 55462306a36Sopenharmony_ci md_wakeup_thread(mddev->thread); 55562306a36Sopenharmony_ci } 55662306a36Sopenharmony_ci else 55762306a36Sopenharmony_ci pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", 55862306a36Sopenharmony_ci __func__, __LINE__, le32_to_cpu(msg->raid_slot)); 55962306a36Sopenharmony_ci rcu_read_unlock(); 56062306a36Sopenharmony_ci} 56162306a36Sopenharmony_ci 56262306a36Sopenharmony_cistatic void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg) 56362306a36Sopenharmony_ci{ 56462306a36Sopenharmony_ci struct md_rdev *rdev; 56562306a36Sopenharmony_ci 56662306a36Sopenharmony_ci rcu_read_lock(); 56762306a36Sopenharmony_ci rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot)); 56862306a36Sopenharmony_ci if (rdev && test_bit(Faulty, &rdev->flags)) 56962306a36Sopenharmony_ci clear_bit(Faulty, &rdev->flags); 57062306a36Sopenharmony_ci else 57162306a36Sopenharmony_ci pr_warn("%s: %d Could not find disk(%d) which is faulty", 57262306a36Sopenharmony_ci __func__, __LINE__, le32_to_cpu(msg->raid_slot)); 57362306a36Sopenharmony_ci rcu_read_unlock(); 57462306a36Sopenharmony_ci} 57562306a36Sopenharmony_ci 57662306a36Sopenharmony_cistatic int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) 57762306a36Sopenharmony_ci{ 57862306a36Sopenharmony_ci int ret = 0; 57962306a36Sopenharmony_ci 58062306a36Sopenharmony_ci if (WARN(mddev->cluster_info->slot_number - 1 == le32_to_cpu(msg->slot), 58162306a36Sopenharmony_ci "node %d received its own msg\n", le32_to_cpu(msg->slot))) 58262306a36Sopenharmony_ci return -1; 58362306a36Sopenharmony_ci switch (le32_to_cpu(msg->type)) { 58462306a36Sopenharmony_ci case METADATA_UPDATED: 58562306a36Sopenharmony_ci process_metadata_update(mddev, msg); 58662306a36Sopenharmony_ci break; 58762306a36Sopenharmony_ci case CHANGE_CAPACITY: 58862306a36Sopenharmony_ci set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); 58962306a36Sopenharmony_ci break; 59062306a36Sopenharmony_ci case RESYNCING: 59162306a36Sopenharmony_ci set_bit(MD_RESYNCING_REMOTE, &mddev->recovery); 59262306a36Sopenharmony_ci process_suspend_info(mddev, le32_to_cpu(msg->slot), 59362306a36Sopenharmony_ci le64_to_cpu(msg->low), 59462306a36Sopenharmony_ci le64_to_cpu(msg->high)); 59562306a36Sopenharmony_ci break; 59662306a36Sopenharmony_ci case NEWDISK: 59762306a36Sopenharmony_ci process_add_new_disk(mddev, msg); 59862306a36Sopenharmony_ci break; 59962306a36Sopenharmony_ci case REMOVE: 60062306a36Sopenharmony_ci process_remove_disk(mddev, msg); 60162306a36Sopenharmony_ci break; 60262306a36Sopenharmony_ci case RE_ADD: 60362306a36Sopenharmony_ci process_readd_disk(mddev, msg); 60462306a36Sopenharmony_ci break; 60562306a36Sopenharmony_ci case BITMAP_NEEDS_SYNC: 60662306a36Sopenharmony_ci __recover_slot(mddev, le32_to_cpu(msg->slot)); 60762306a36Sopenharmony_ci break; 60862306a36Sopenharmony_ci case BITMAP_RESIZE: 60962306a36Sopenharmony_ci if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, 0, 0)) 61062306a36Sopenharmony_ci ret = md_bitmap_resize(mddev->bitmap, 61162306a36Sopenharmony_ci le64_to_cpu(msg->high), 0, 0); 61262306a36Sopenharmony_ci break; 61362306a36Sopenharmony_ci default: 61462306a36Sopenharmony_ci ret = -1; 61562306a36Sopenharmony_ci pr_warn("%s:%d Received unknown message from %d\n", 61662306a36Sopenharmony_ci __func__, __LINE__, msg->slot); 61762306a36Sopenharmony_ci } 61862306a36Sopenharmony_ci return ret; 61962306a36Sopenharmony_ci} 62062306a36Sopenharmony_ci 62162306a36Sopenharmony_ci/* 62262306a36Sopenharmony_ci * thread for receiving message 62362306a36Sopenharmony_ci */ 62462306a36Sopenharmony_cistatic void recv_daemon(struct md_thread *thread) 62562306a36Sopenharmony_ci{ 62662306a36Sopenharmony_ci struct md_cluster_info *cinfo = thread->mddev->cluster_info; 62762306a36Sopenharmony_ci struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres; 62862306a36Sopenharmony_ci struct dlm_lock_resource *message_lockres = cinfo->message_lockres; 62962306a36Sopenharmony_ci struct cluster_msg msg; 63062306a36Sopenharmony_ci int ret; 63162306a36Sopenharmony_ci 63262306a36Sopenharmony_ci mutex_lock(&cinfo->recv_mutex); 63362306a36Sopenharmony_ci /*get CR on Message*/ 63462306a36Sopenharmony_ci if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) { 63562306a36Sopenharmony_ci pr_err("md/raid1:failed to get CR on MESSAGE\n"); 63662306a36Sopenharmony_ci mutex_unlock(&cinfo->recv_mutex); 63762306a36Sopenharmony_ci return; 63862306a36Sopenharmony_ci } 63962306a36Sopenharmony_ci 64062306a36Sopenharmony_ci /* read lvb and wake up thread to process this message_lockres */ 64162306a36Sopenharmony_ci memcpy(&msg, message_lockres->lksb.sb_lvbptr, sizeof(struct cluster_msg)); 64262306a36Sopenharmony_ci ret = process_recvd_msg(thread->mddev, &msg); 64362306a36Sopenharmony_ci if (ret) 64462306a36Sopenharmony_ci goto out; 64562306a36Sopenharmony_ci 64662306a36Sopenharmony_ci /*release CR on ack_lockres*/ 64762306a36Sopenharmony_ci ret = dlm_unlock_sync(ack_lockres); 64862306a36Sopenharmony_ci if (unlikely(ret != 0)) 64962306a36Sopenharmony_ci pr_info("unlock ack failed return %d\n", ret); 65062306a36Sopenharmony_ci /*up-convert to PR on message_lockres*/ 65162306a36Sopenharmony_ci ret = dlm_lock_sync(message_lockres, DLM_LOCK_PR); 65262306a36Sopenharmony_ci if (unlikely(ret != 0)) 65362306a36Sopenharmony_ci pr_info("lock PR on msg failed return %d\n", ret); 65462306a36Sopenharmony_ci /*get CR on ack_lockres again*/ 65562306a36Sopenharmony_ci ret = dlm_lock_sync(ack_lockres, DLM_LOCK_CR); 65662306a36Sopenharmony_ci if (unlikely(ret != 0)) 65762306a36Sopenharmony_ci pr_info("lock CR on ack failed return %d\n", ret); 65862306a36Sopenharmony_ciout: 65962306a36Sopenharmony_ci /*release CR on message_lockres*/ 66062306a36Sopenharmony_ci ret = dlm_unlock_sync(message_lockres); 66162306a36Sopenharmony_ci if (unlikely(ret != 0)) 66262306a36Sopenharmony_ci pr_info("unlock msg failed return %d\n", ret); 66362306a36Sopenharmony_ci mutex_unlock(&cinfo->recv_mutex); 66462306a36Sopenharmony_ci} 66562306a36Sopenharmony_ci 66662306a36Sopenharmony_ci/* lock_token() 66762306a36Sopenharmony_ci * Takes the lock on the TOKEN lock resource so no other 66862306a36Sopenharmony_ci * node can communicate while the operation is underway. 66962306a36Sopenharmony_ci */ 67062306a36Sopenharmony_cistatic int lock_token(struct md_cluster_info *cinfo) 67162306a36Sopenharmony_ci{ 67262306a36Sopenharmony_ci int error; 67362306a36Sopenharmony_ci 67462306a36Sopenharmony_ci error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); 67562306a36Sopenharmony_ci if (error) { 67662306a36Sopenharmony_ci pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", 67762306a36Sopenharmony_ci __func__, __LINE__, error); 67862306a36Sopenharmony_ci } else { 67962306a36Sopenharmony_ci /* Lock the receive sequence */ 68062306a36Sopenharmony_ci mutex_lock(&cinfo->recv_mutex); 68162306a36Sopenharmony_ci } 68262306a36Sopenharmony_ci return error; 68362306a36Sopenharmony_ci} 68462306a36Sopenharmony_ci 68562306a36Sopenharmony_ci/* lock_comm() 68662306a36Sopenharmony_ci * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel. 68762306a36Sopenharmony_ci */ 68862306a36Sopenharmony_cistatic int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked) 68962306a36Sopenharmony_ci{ 69062306a36Sopenharmony_ci int rv, set_bit = 0; 69162306a36Sopenharmony_ci struct mddev *mddev = cinfo->mddev; 69262306a36Sopenharmony_ci 69362306a36Sopenharmony_ci /* 69462306a36Sopenharmony_ci * If resync thread run after raid1d thread, then process_metadata_update 69562306a36Sopenharmony_ci * could not continue if raid1d held reconfig_mutex (and raid1d is blocked 69662306a36Sopenharmony_ci * since another node already got EX on Token and waiting the EX of Ack), 69762306a36Sopenharmony_ci * so let resync wake up thread in case flag is set. 69862306a36Sopenharmony_ci */ 69962306a36Sopenharmony_ci if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, 70062306a36Sopenharmony_ci &cinfo->state)) { 70162306a36Sopenharmony_ci rv = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, 70262306a36Sopenharmony_ci &cinfo->state); 70362306a36Sopenharmony_ci WARN_ON_ONCE(rv); 70462306a36Sopenharmony_ci md_wakeup_thread(mddev->thread); 70562306a36Sopenharmony_ci set_bit = 1; 70662306a36Sopenharmony_ci } 70762306a36Sopenharmony_ci 70862306a36Sopenharmony_ci wait_event(cinfo->wait, 70962306a36Sopenharmony_ci !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state)); 71062306a36Sopenharmony_ci rv = lock_token(cinfo); 71162306a36Sopenharmony_ci if (set_bit) 71262306a36Sopenharmony_ci clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); 71362306a36Sopenharmony_ci return rv; 71462306a36Sopenharmony_ci} 71562306a36Sopenharmony_ci 71662306a36Sopenharmony_cistatic void unlock_comm(struct md_cluster_info *cinfo) 71762306a36Sopenharmony_ci{ 71862306a36Sopenharmony_ci WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX); 71962306a36Sopenharmony_ci mutex_unlock(&cinfo->recv_mutex); 72062306a36Sopenharmony_ci dlm_unlock_sync(cinfo->token_lockres); 72162306a36Sopenharmony_ci clear_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state); 72262306a36Sopenharmony_ci wake_up(&cinfo->wait); 72362306a36Sopenharmony_ci} 72462306a36Sopenharmony_ci 72562306a36Sopenharmony_ci/* __sendmsg() 72662306a36Sopenharmony_ci * This function performs the actual sending of the message. This function is 72762306a36Sopenharmony_ci * usually called after performing the encompassing operation 72862306a36Sopenharmony_ci * The function: 72962306a36Sopenharmony_ci * 1. Grabs the message lockresource in EX mode 73062306a36Sopenharmony_ci * 2. Copies the message to the message LVB 73162306a36Sopenharmony_ci * 3. Downconverts message lockresource to CW 73262306a36Sopenharmony_ci * 4. Upconverts ack lock resource from CR to EX. This forces the BAST on other nodes 73362306a36Sopenharmony_ci * and the other nodes read the message. The thread will wait here until all other 73462306a36Sopenharmony_ci * nodes have released ack lock resource. 73562306a36Sopenharmony_ci * 5. Downconvert ack lockresource to CR 73662306a36Sopenharmony_ci */ 73762306a36Sopenharmony_cistatic int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg) 73862306a36Sopenharmony_ci{ 73962306a36Sopenharmony_ci int error; 74062306a36Sopenharmony_ci int slot = cinfo->slot_number - 1; 74162306a36Sopenharmony_ci 74262306a36Sopenharmony_ci cmsg->slot = cpu_to_le32(slot); 74362306a36Sopenharmony_ci /*get EX on Message*/ 74462306a36Sopenharmony_ci error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_EX); 74562306a36Sopenharmony_ci if (error) { 74662306a36Sopenharmony_ci pr_err("md-cluster: failed to get EX on MESSAGE (%d)\n", error); 74762306a36Sopenharmony_ci goto failed_message; 74862306a36Sopenharmony_ci } 74962306a36Sopenharmony_ci 75062306a36Sopenharmony_ci memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg, 75162306a36Sopenharmony_ci sizeof(struct cluster_msg)); 75262306a36Sopenharmony_ci /*down-convert EX to CW on Message*/ 75362306a36Sopenharmony_ci error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CW); 75462306a36Sopenharmony_ci if (error) { 75562306a36Sopenharmony_ci pr_err("md-cluster: failed to convert EX to CW on MESSAGE(%d)\n", 75662306a36Sopenharmony_ci error); 75762306a36Sopenharmony_ci goto failed_ack; 75862306a36Sopenharmony_ci } 75962306a36Sopenharmony_ci 76062306a36Sopenharmony_ci /*up-convert CR to EX on Ack*/ 76162306a36Sopenharmony_ci error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_EX); 76262306a36Sopenharmony_ci if (error) { 76362306a36Sopenharmony_ci pr_err("md-cluster: failed to convert CR to EX on ACK(%d)\n", 76462306a36Sopenharmony_ci error); 76562306a36Sopenharmony_ci goto failed_ack; 76662306a36Sopenharmony_ci } 76762306a36Sopenharmony_ci 76862306a36Sopenharmony_ci /*down-convert EX to CR on Ack*/ 76962306a36Sopenharmony_ci error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR); 77062306a36Sopenharmony_ci if (error) { 77162306a36Sopenharmony_ci pr_err("md-cluster: failed to convert EX to CR on ACK(%d)\n", 77262306a36Sopenharmony_ci error); 77362306a36Sopenharmony_ci goto failed_ack; 77462306a36Sopenharmony_ci } 77562306a36Sopenharmony_ci 77662306a36Sopenharmony_cifailed_ack: 77762306a36Sopenharmony_ci error = dlm_unlock_sync(cinfo->message_lockres); 77862306a36Sopenharmony_ci if (unlikely(error != 0)) { 77962306a36Sopenharmony_ci pr_err("md-cluster: failed convert to NL on MESSAGE(%d)\n", 78062306a36Sopenharmony_ci error); 78162306a36Sopenharmony_ci /* in case the message can't be released due to some reason */ 78262306a36Sopenharmony_ci goto failed_ack; 78362306a36Sopenharmony_ci } 78462306a36Sopenharmony_cifailed_message: 78562306a36Sopenharmony_ci return error; 78662306a36Sopenharmony_ci} 78762306a36Sopenharmony_ci 78862306a36Sopenharmony_cistatic int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg, 78962306a36Sopenharmony_ci bool mddev_locked) 79062306a36Sopenharmony_ci{ 79162306a36Sopenharmony_ci int ret; 79262306a36Sopenharmony_ci 79362306a36Sopenharmony_ci ret = lock_comm(cinfo, mddev_locked); 79462306a36Sopenharmony_ci if (!ret) { 79562306a36Sopenharmony_ci ret = __sendmsg(cinfo, cmsg); 79662306a36Sopenharmony_ci unlock_comm(cinfo); 79762306a36Sopenharmony_ci } 79862306a36Sopenharmony_ci return ret; 79962306a36Sopenharmony_ci} 80062306a36Sopenharmony_ci 80162306a36Sopenharmony_cistatic int gather_all_resync_info(struct mddev *mddev, int total_slots) 80262306a36Sopenharmony_ci{ 80362306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 80462306a36Sopenharmony_ci int i, ret = 0; 80562306a36Sopenharmony_ci struct dlm_lock_resource *bm_lockres; 80662306a36Sopenharmony_ci char str[64]; 80762306a36Sopenharmony_ci sector_t lo, hi; 80862306a36Sopenharmony_ci 80962306a36Sopenharmony_ci 81062306a36Sopenharmony_ci for (i = 0; i < total_slots; i++) { 81162306a36Sopenharmony_ci memset(str, '\0', 64); 81262306a36Sopenharmony_ci snprintf(str, 64, "bitmap%04d", i); 81362306a36Sopenharmony_ci bm_lockres = lockres_init(mddev, str, NULL, 1); 81462306a36Sopenharmony_ci if (!bm_lockres) 81562306a36Sopenharmony_ci return -ENOMEM; 81662306a36Sopenharmony_ci if (i == (cinfo->slot_number - 1)) { 81762306a36Sopenharmony_ci lockres_free(bm_lockres); 81862306a36Sopenharmony_ci continue; 81962306a36Sopenharmony_ci } 82062306a36Sopenharmony_ci 82162306a36Sopenharmony_ci bm_lockres->flags |= DLM_LKF_NOQUEUE; 82262306a36Sopenharmony_ci ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); 82362306a36Sopenharmony_ci if (ret == -EAGAIN) { 82462306a36Sopenharmony_ci if (read_resync_info(mddev, bm_lockres)) { 82562306a36Sopenharmony_ci pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n", 82662306a36Sopenharmony_ci __func__, __LINE__, 82762306a36Sopenharmony_ci (unsigned long long) cinfo->suspend_lo, 82862306a36Sopenharmony_ci (unsigned long long) cinfo->suspend_hi, 82962306a36Sopenharmony_ci i); 83062306a36Sopenharmony_ci cinfo->suspend_from = i; 83162306a36Sopenharmony_ci } 83262306a36Sopenharmony_ci ret = 0; 83362306a36Sopenharmony_ci lockres_free(bm_lockres); 83462306a36Sopenharmony_ci continue; 83562306a36Sopenharmony_ci } 83662306a36Sopenharmony_ci if (ret) { 83762306a36Sopenharmony_ci lockres_free(bm_lockres); 83862306a36Sopenharmony_ci goto out; 83962306a36Sopenharmony_ci } 84062306a36Sopenharmony_ci 84162306a36Sopenharmony_ci /* Read the disk bitmap sb and check if it needs recovery */ 84262306a36Sopenharmony_ci ret = md_bitmap_copy_from_slot(mddev, i, &lo, &hi, false); 84362306a36Sopenharmony_ci if (ret) { 84462306a36Sopenharmony_ci pr_warn("md-cluster: Could not gather bitmaps from slot %d", i); 84562306a36Sopenharmony_ci lockres_free(bm_lockres); 84662306a36Sopenharmony_ci continue; 84762306a36Sopenharmony_ci } 84862306a36Sopenharmony_ci if ((hi > 0) && (lo < mddev->recovery_cp)) { 84962306a36Sopenharmony_ci set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 85062306a36Sopenharmony_ci mddev->recovery_cp = lo; 85162306a36Sopenharmony_ci md_check_recovery(mddev); 85262306a36Sopenharmony_ci } 85362306a36Sopenharmony_ci 85462306a36Sopenharmony_ci lockres_free(bm_lockres); 85562306a36Sopenharmony_ci } 85662306a36Sopenharmony_ciout: 85762306a36Sopenharmony_ci return ret; 85862306a36Sopenharmony_ci} 85962306a36Sopenharmony_ci 86062306a36Sopenharmony_cistatic int join(struct mddev *mddev, int nodes) 86162306a36Sopenharmony_ci{ 86262306a36Sopenharmony_ci struct md_cluster_info *cinfo; 86362306a36Sopenharmony_ci int ret, ops_rv; 86462306a36Sopenharmony_ci char str[64]; 86562306a36Sopenharmony_ci 86662306a36Sopenharmony_ci cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL); 86762306a36Sopenharmony_ci if (!cinfo) 86862306a36Sopenharmony_ci return -ENOMEM; 86962306a36Sopenharmony_ci 87062306a36Sopenharmony_ci INIT_LIST_HEAD(&cinfo->suspend_list); 87162306a36Sopenharmony_ci spin_lock_init(&cinfo->suspend_lock); 87262306a36Sopenharmony_ci init_completion(&cinfo->completion); 87362306a36Sopenharmony_ci set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state); 87462306a36Sopenharmony_ci init_waitqueue_head(&cinfo->wait); 87562306a36Sopenharmony_ci mutex_init(&cinfo->recv_mutex); 87662306a36Sopenharmony_ci 87762306a36Sopenharmony_ci mddev->cluster_info = cinfo; 87862306a36Sopenharmony_ci cinfo->mddev = mddev; 87962306a36Sopenharmony_ci 88062306a36Sopenharmony_ci memset(str, 0, 64); 88162306a36Sopenharmony_ci sprintf(str, "%pU", mddev->uuid); 88262306a36Sopenharmony_ci ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name, 88362306a36Sopenharmony_ci 0, LVB_SIZE, &md_ls_ops, mddev, 88462306a36Sopenharmony_ci &ops_rv, &cinfo->lockspace); 88562306a36Sopenharmony_ci if (ret) 88662306a36Sopenharmony_ci goto err; 88762306a36Sopenharmony_ci wait_for_completion(&cinfo->completion); 88862306a36Sopenharmony_ci if (nodes < cinfo->slot_number) { 88962306a36Sopenharmony_ci pr_err("md-cluster: Slot allotted(%d) is greater than available slots(%d).", 89062306a36Sopenharmony_ci cinfo->slot_number, nodes); 89162306a36Sopenharmony_ci ret = -ERANGE; 89262306a36Sopenharmony_ci goto err; 89362306a36Sopenharmony_ci } 89462306a36Sopenharmony_ci /* Initiate the communication resources */ 89562306a36Sopenharmony_ci ret = -ENOMEM; 89662306a36Sopenharmony_ci rcu_assign_pointer(cinfo->recv_thread, 89762306a36Sopenharmony_ci md_register_thread(recv_daemon, mddev, "cluster_recv")); 89862306a36Sopenharmony_ci if (!cinfo->recv_thread) { 89962306a36Sopenharmony_ci pr_err("md-cluster: cannot allocate memory for recv_thread!\n"); 90062306a36Sopenharmony_ci goto err; 90162306a36Sopenharmony_ci } 90262306a36Sopenharmony_ci cinfo->message_lockres = lockres_init(mddev, "message", NULL, 1); 90362306a36Sopenharmony_ci if (!cinfo->message_lockres) 90462306a36Sopenharmony_ci goto err; 90562306a36Sopenharmony_ci cinfo->token_lockres = lockres_init(mddev, "token", NULL, 0); 90662306a36Sopenharmony_ci if (!cinfo->token_lockres) 90762306a36Sopenharmony_ci goto err; 90862306a36Sopenharmony_ci cinfo->no_new_dev_lockres = lockres_init(mddev, "no-new-dev", NULL, 0); 90962306a36Sopenharmony_ci if (!cinfo->no_new_dev_lockres) 91062306a36Sopenharmony_ci goto err; 91162306a36Sopenharmony_ci 91262306a36Sopenharmony_ci ret = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); 91362306a36Sopenharmony_ci if (ret) { 91462306a36Sopenharmony_ci ret = -EAGAIN; 91562306a36Sopenharmony_ci pr_err("md-cluster: can't join cluster to avoid lock issue\n"); 91662306a36Sopenharmony_ci goto err; 91762306a36Sopenharmony_ci } 91862306a36Sopenharmony_ci cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0); 91962306a36Sopenharmony_ci if (!cinfo->ack_lockres) { 92062306a36Sopenharmony_ci ret = -ENOMEM; 92162306a36Sopenharmony_ci goto err; 92262306a36Sopenharmony_ci } 92362306a36Sopenharmony_ci /* get sync CR lock on ACK. */ 92462306a36Sopenharmony_ci if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR)) 92562306a36Sopenharmony_ci pr_err("md-cluster: failed to get a sync CR lock on ACK!(%d)\n", 92662306a36Sopenharmony_ci ret); 92762306a36Sopenharmony_ci dlm_unlock_sync(cinfo->token_lockres); 92862306a36Sopenharmony_ci /* get sync CR lock on no-new-dev. */ 92962306a36Sopenharmony_ci if (dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR)) 93062306a36Sopenharmony_ci pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!(%d)\n", ret); 93162306a36Sopenharmony_ci 93262306a36Sopenharmony_ci 93362306a36Sopenharmony_ci pr_info("md-cluster: Joined cluster %s slot %d\n", str, cinfo->slot_number); 93462306a36Sopenharmony_ci snprintf(str, 64, "bitmap%04d", cinfo->slot_number - 1); 93562306a36Sopenharmony_ci cinfo->bitmap_lockres = lockres_init(mddev, str, NULL, 1); 93662306a36Sopenharmony_ci if (!cinfo->bitmap_lockres) { 93762306a36Sopenharmony_ci ret = -ENOMEM; 93862306a36Sopenharmony_ci goto err; 93962306a36Sopenharmony_ci } 94062306a36Sopenharmony_ci if (dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW)) { 94162306a36Sopenharmony_ci pr_err("Failed to get bitmap lock\n"); 94262306a36Sopenharmony_ci ret = -EINVAL; 94362306a36Sopenharmony_ci goto err; 94462306a36Sopenharmony_ci } 94562306a36Sopenharmony_ci 94662306a36Sopenharmony_ci cinfo->resync_lockres = lockres_init(mddev, "resync", NULL, 0); 94762306a36Sopenharmony_ci if (!cinfo->resync_lockres) { 94862306a36Sopenharmony_ci ret = -ENOMEM; 94962306a36Sopenharmony_ci goto err; 95062306a36Sopenharmony_ci } 95162306a36Sopenharmony_ci 95262306a36Sopenharmony_ci return 0; 95362306a36Sopenharmony_cierr: 95462306a36Sopenharmony_ci set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); 95562306a36Sopenharmony_ci md_unregister_thread(mddev, &cinfo->recovery_thread); 95662306a36Sopenharmony_ci md_unregister_thread(mddev, &cinfo->recv_thread); 95762306a36Sopenharmony_ci lockres_free(cinfo->message_lockres); 95862306a36Sopenharmony_ci lockres_free(cinfo->token_lockres); 95962306a36Sopenharmony_ci lockres_free(cinfo->ack_lockres); 96062306a36Sopenharmony_ci lockres_free(cinfo->no_new_dev_lockres); 96162306a36Sopenharmony_ci lockres_free(cinfo->resync_lockres); 96262306a36Sopenharmony_ci lockres_free(cinfo->bitmap_lockres); 96362306a36Sopenharmony_ci if (cinfo->lockspace) 96462306a36Sopenharmony_ci dlm_release_lockspace(cinfo->lockspace, 2); 96562306a36Sopenharmony_ci mddev->cluster_info = NULL; 96662306a36Sopenharmony_ci kfree(cinfo); 96762306a36Sopenharmony_ci return ret; 96862306a36Sopenharmony_ci} 96962306a36Sopenharmony_ci 97062306a36Sopenharmony_cistatic void load_bitmaps(struct mddev *mddev, int total_slots) 97162306a36Sopenharmony_ci{ 97262306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 97362306a36Sopenharmony_ci 97462306a36Sopenharmony_ci /* load all the node's bitmap info for resync */ 97562306a36Sopenharmony_ci if (gather_all_resync_info(mddev, total_slots)) 97662306a36Sopenharmony_ci pr_err("md-cluster: failed to gather all resyn infos\n"); 97762306a36Sopenharmony_ci set_bit(MD_CLUSTER_ALREADY_IN_CLUSTER, &cinfo->state); 97862306a36Sopenharmony_ci /* wake up recv thread in case something need to be handled */ 97962306a36Sopenharmony_ci if (test_and_clear_bit(MD_CLUSTER_PENDING_RECV_EVENT, &cinfo->state)) 98062306a36Sopenharmony_ci md_wakeup_thread(cinfo->recv_thread); 98162306a36Sopenharmony_ci} 98262306a36Sopenharmony_ci 98362306a36Sopenharmony_cistatic void resync_bitmap(struct mddev *mddev) 98462306a36Sopenharmony_ci{ 98562306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 98662306a36Sopenharmony_ci struct cluster_msg cmsg = {0}; 98762306a36Sopenharmony_ci int err; 98862306a36Sopenharmony_ci 98962306a36Sopenharmony_ci cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC); 99062306a36Sopenharmony_ci err = sendmsg(cinfo, &cmsg, 1); 99162306a36Sopenharmony_ci if (err) 99262306a36Sopenharmony_ci pr_err("%s:%d: failed to send BITMAP_NEEDS_SYNC message (%d)\n", 99362306a36Sopenharmony_ci __func__, __LINE__, err); 99462306a36Sopenharmony_ci} 99562306a36Sopenharmony_ci 99662306a36Sopenharmony_cistatic void unlock_all_bitmaps(struct mddev *mddev); 99762306a36Sopenharmony_cistatic int leave(struct mddev *mddev) 99862306a36Sopenharmony_ci{ 99962306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 100062306a36Sopenharmony_ci 100162306a36Sopenharmony_ci if (!cinfo) 100262306a36Sopenharmony_ci return 0; 100362306a36Sopenharmony_ci 100462306a36Sopenharmony_ci /* 100562306a36Sopenharmony_ci * BITMAP_NEEDS_SYNC message should be sent when node 100662306a36Sopenharmony_ci * is leaving the cluster with dirty bitmap, also we 100762306a36Sopenharmony_ci * can only deliver it when dlm connection is available. 100862306a36Sopenharmony_ci * 100962306a36Sopenharmony_ci * Also, we should send BITMAP_NEEDS_SYNC message in 101062306a36Sopenharmony_ci * case reshaping is interrupted. 101162306a36Sopenharmony_ci */ 101262306a36Sopenharmony_ci if ((cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector) || 101362306a36Sopenharmony_ci (mddev->reshape_position != MaxSector && 101462306a36Sopenharmony_ci test_bit(MD_CLOSING, &mddev->flags))) 101562306a36Sopenharmony_ci resync_bitmap(mddev); 101662306a36Sopenharmony_ci 101762306a36Sopenharmony_ci set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); 101862306a36Sopenharmony_ci md_unregister_thread(mddev, &cinfo->recovery_thread); 101962306a36Sopenharmony_ci md_unregister_thread(mddev, &cinfo->recv_thread); 102062306a36Sopenharmony_ci lockres_free(cinfo->message_lockres); 102162306a36Sopenharmony_ci lockres_free(cinfo->token_lockres); 102262306a36Sopenharmony_ci lockres_free(cinfo->ack_lockres); 102362306a36Sopenharmony_ci lockres_free(cinfo->no_new_dev_lockres); 102462306a36Sopenharmony_ci lockres_free(cinfo->resync_lockres); 102562306a36Sopenharmony_ci lockres_free(cinfo->bitmap_lockres); 102662306a36Sopenharmony_ci unlock_all_bitmaps(mddev); 102762306a36Sopenharmony_ci dlm_release_lockspace(cinfo->lockspace, 2); 102862306a36Sopenharmony_ci kfree(cinfo); 102962306a36Sopenharmony_ci return 0; 103062306a36Sopenharmony_ci} 103162306a36Sopenharmony_ci 103262306a36Sopenharmony_ci/* slot_number(): Returns the MD slot number to use 103362306a36Sopenharmony_ci * DLM starts the slot numbers from 1, wheras cluster-md 103462306a36Sopenharmony_ci * wants the number to be from zero, so we deduct one 103562306a36Sopenharmony_ci */ 103662306a36Sopenharmony_cistatic int slot_number(struct mddev *mddev) 103762306a36Sopenharmony_ci{ 103862306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 103962306a36Sopenharmony_ci 104062306a36Sopenharmony_ci return cinfo->slot_number - 1; 104162306a36Sopenharmony_ci} 104262306a36Sopenharmony_ci 104362306a36Sopenharmony_ci/* 104462306a36Sopenharmony_ci * Check if the communication is already locked, else lock the communication 104562306a36Sopenharmony_ci * channel. 104662306a36Sopenharmony_ci * If it is already locked, token is in EX mode, and hence lock_token() 104762306a36Sopenharmony_ci * should not be called. 104862306a36Sopenharmony_ci */ 104962306a36Sopenharmony_cistatic int metadata_update_start(struct mddev *mddev) 105062306a36Sopenharmony_ci{ 105162306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 105262306a36Sopenharmony_ci int ret; 105362306a36Sopenharmony_ci 105462306a36Sopenharmony_ci /* 105562306a36Sopenharmony_ci * metadata_update_start is always called with the protection of 105662306a36Sopenharmony_ci * reconfig_mutex, so set WAITING_FOR_TOKEN here. 105762306a36Sopenharmony_ci */ 105862306a36Sopenharmony_ci ret = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, 105962306a36Sopenharmony_ci &cinfo->state); 106062306a36Sopenharmony_ci WARN_ON_ONCE(ret); 106162306a36Sopenharmony_ci md_wakeup_thread(mddev->thread); 106262306a36Sopenharmony_ci 106362306a36Sopenharmony_ci wait_event(cinfo->wait, 106462306a36Sopenharmony_ci !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state) || 106562306a36Sopenharmony_ci test_and_clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state)); 106662306a36Sopenharmony_ci 106762306a36Sopenharmony_ci /* If token is already locked, return 0 */ 106862306a36Sopenharmony_ci if (cinfo->token_lockres->mode == DLM_LOCK_EX) { 106962306a36Sopenharmony_ci clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); 107062306a36Sopenharmony_ci return 0; 107162306a36Sopenharmony_ci } 107262306a36Sopenharmony_ci 107362306a36Sopenharmony_ci ret = lock_token(cinfo); 107462306a36Sopenharmony_ci clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); 107562306a36Sopenharmony_ci return ret; 107662306a36Sopenharmony_ci} 107762306a36Sopenharmony_ci 107862306a36Sopenharmony_cistatic int metadata_update_finish(struct mddev *mddev) 107962306a36Sopenharmony_ci{ 108062306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 108162306a36Sopenharmony_ci struct cluster_msg cmsg; 108262306a36Sopenharmony_ci struct md_rdev *rdev; 108362306a36Sopenharmony_ci int ret = 0; 108462306a36Sopenharmony_ci int raid_slot = -1; 108562306a36Sopenharmony_ci 108662306a36Sopenharmony_ci memset(&cmsg, 0, sizeof(cmsg)); 108762306a36Sopenharmony_ci cmsg.type = cpu_to_le32(METADATA_UPDATED); 108862306a36Sopenharmony_ci /* Pick up a good active device number to send. 108962306a36Sopenharmony_ci */ 109062306a36Sopenharmony_ci rdev_for_each(rdev, mddev) 109162306a36Sopenharmony_ci if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) { 109262306a36Sopenharmony_ci raid_slot = rdev->desc_nr; 109362306a36Sopenharmony_ci break; 109462306a36Sopenharmony_ci } 109562306a36Sopenharmony_ci if (raid_slot >= 0) { 109662306a36Sopenharmony_ci cmsg.raid_slot = cpu_to_le32(raid_slot); 109762306a36Sopenharmony_ci ret = __sendmsg(cinfo, &cmsg); 109862306a36Sopenharmony_ci } else 109962306a36Sopenharmony_ci pr_warn("md-cluster: No good device id found to send\n"); 110062306a36Sopenharmony_ci clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state); 110162306a36Sopenharmony_ci unlock_comm(cinfo); 110262306a36Sopenharmony_ci return ret; 110362306a36Sopenharmony_ci} 110462306a36Sopenharmony_ci 110562306a36Sopenharmony_cistatic void metadata_update_cancel(struct mddev *mddev) 110662306a36Sopenharmony_ci{ 110762306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 110862306a36Sopenharmony_ci clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state); 110962306a36Sopenharmony_ci unlock_comm(cinfo); 111062306a36Sopenharmony_ci} 111162306a36Sopenharmony_ci 111262306a36Sopenharmony_cistatic int update_bitmap_size(struct mddev *mddev, sector_t size) 111362306a36Sopenharmony_ci{ 111462306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 111562306a36Sopenharmony_ci struct cluster_msg cmsg = {0}; 111662306a36Sopenharmony_ci int ret; 111762306a36Sopenharmony_ci 111862306a36Sopenharmony_ci cmsg.type = cpu_to_le32(BITMAP_RESIZE); 111962306a36Sopenharmony_ci cmsg.high = cpu_to_le64(size); 112062306a36Sopenharmony_ci ret = sendmsg(cinfo, &cmsg, 0); 112162306a36Sopenharmony_ci if (ret) 112262306a36Sopenharmony_ci pr_err("%s:%d: failed to send BITMAP_RESIZE message (%d)\n", 112362306a36Sopenharmony_ci __func__, __LINE__, ret); 112462306a36Sopenharmony_ci return ret; 112562306a36Sopenharmony_ci} 112662306a36Sopenharmony_ci 112762306a36Sopenharmony_cistatic int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsize) 112862306a36Sopenharmony_ci{ 112962306a36Sopenharmony_ci struct bitmap_counts *counts; 113062306a36Sopenharmony_ci char str[64]; 113162306a36Sopenharmony_ci struct dlm_lock_resource *bm_lockres; 113262306a36Sopenharmony_ci struct bitmap *bitmap = mddev->bitmap; 113362306a36Sopenharmony_ci unsigned long my_pages = bitmap->counts.pages; 113462306a36Sopenharmony_ci int i, rv; 113562306a36Sopenharmony_ci 113662306a36Sopenharmony_ci /* 113762306a36Sopenharmony_ci * We need to ensure all the nodes can grow to a larger 113862306a36Sopenharmony_ci * bitmap size before make the reshaping. 113962306a36Sopenharmony_ci */ 114062306a36Sopenharmony_ci rv = update_bitmap_size(mddev, newsize); 114162306a36Sopenharmony_ci if (rv) 114262306a36Sopenharmony_ci return rv; 114362306a36Sopenharmony_ci 114462306a36Sopenharmony_ci for (i = 0; i < mddev->bitmap_info.nodes; i++) { 114562306a36Sopenharmony_ci if (i == md_cluster_ops->slot_number(mddev)) 114662306a36Sopenharmony_ci continue; 114762306a36Sopenharmony_ci 114862306a36Sopenharmony_ci bitmap = get_bitmap_from_slot(mddev, i); 114962306a36Sopenharmony_ci if (IS_ERR(bitmap)) { 115062306a36Sopenharmony_ci pr_err("can't get bitmap from slot %d\n", i); 115162306a36Sopenharmony_ci bitmap = NULL; 115262306a36Sopenharmony_ci goto out; 115362306a36Sopenharmony_ci } 115462306a36Sopenharmony_ci counts = &bitmap->counts; 115562306a36Sopenharmony_ci 115662306a36Sopenharmony_ci /* 115762306a36Sopenharmony_ci * If we can hold the bitmap lock of one node then 115862306a36Sopenharmony_ci * the slot is not occupied, update the pages. 115962306a36Sopenharmony_ci */ 116062306a36Sopenharmony_ci snprintf(str, 64, "bitmap%04d", i); 116162306a36Sopenharmony_ci bm_lockres = lockres_init(mddev, str, NULL, 1); 116262306a36Sopenharmony_ci if (!bm_lockres) { 116362306a36Sopenharmony_ci pr_err("Cannot initialize %s lock\n", str); 116462306a36Sopenharmony_ci goto out; 116562306a36Sopenharmony_ci } 116662306a36Sopenharmony_ci bm_lockres->flags |= DLM_LKF_NOQUEUE; 116762306a36Sopenharmony_ci rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); 116862306a36Sopenharmony_ci if (!rv) 116962306a36Sopenharmony_ci counts->pages = my_pages; 117062306a36Sopenharmony_ci lockres_free(bm_lockres); 117162306a36Sopenharmony_ci 117262306a36Sopenharmony_ci if (my_pages != counts->pages) 117362306a36Sopenharmony_ci /* 117462306a36Sopenharmony_ci * Let's revert the bitmap size if one node 117562306a36Sopenharmony_ci * can't resize bitmap 117662306a36Sopenharmony_ci */ 117762306a36Sopenharmony_ci goto out; 117862306a36Sopenharmony_ci md_bitmap_free(bitmap); 117962306a36Sopenharmony_ci } 118062306a36Sopenharmony_ci 118162306a36Sopenharmony_ci return 0; 118262306a36Sopenharmony_ciout: 118362306a36Sopenharmony_ci md_bitmap_free(bitmap); 118462306a36Sopenharmony_ci update_bitmap_size(mddev, oldsize); 118562306a36Sopenharmony_ci return -1; 118662306a36Sopenharmony_ci} 118762306a36Sopenharmony_ci 118862306a36Sopenharmony_ci/* 118962306a36Sopenharmony_ci * return 0 if all the bitmaps have the same sync_size 119062306a36Sopenharmony_ci */ 119162306a36Sopenharmony_cistatic int cluster_check_sync_size(struct mddev *mddev) 119262306a36Sopenharmony_ci{ 119362306a36Sopenharmony_ci int i, rv; 119462306a36Sopenharmony_ci bitmap_super_t *sb; 119562306a36Sopenharmony_ci unsigned long my_sync_size, sync_size = 0; 119662306a36Sopenharmony_ci int node_num = mddev->bitmap_info.nodes; 119762306a36Sopenharmony_ci int current_slot = md_cluster_ops->slot_number(mddev); 119862306a36Sopenharmony_ci struct bitmap *bitmap = mddev->bitmap; 119962306a36Sopenharmony_ci char str[64]; 120062306a36Sopenharmony_ci struct dlm_lock_resource *bm_lockres; 120162306a36Sopenharmony_ci 120262306a36Sopenharmony_ci sb = kmap_atomic(bitmap->storage.sb_page); 120362306a36Sopenharmony_ci my_sync_size = sb->sync_size; 120462306a36Sopenharmony_ci kunmap_atomic(sb); 120562306a36Sopenharmony_ci 120662306a36Sopenharmony_ci for (i = 0; i < node_num; i++) { 120762306a36Sopenharmony_ci if (i == current_slot) 120862306a36Sopenharmony_ci continue; 120962306a36Sopenharmony_ci 121062306a36Sopenharmony_ci bitmap = get_bitmap_from_slot(mddev, i); 121162306a36Sopenharmony_ci if (IS_ERR(bitmap)) { 121262306a36Sopenharmony_ci pr_err("can't get bitmap from slot %d\n", i); 121362306a36Sopenharmony_ci return -1; 121462306a36Sopenharmony_ci } 121562306a36Sopenharmony_ci 121662306a36Sopenharmony_ci /* 121762306a36Sopenharmony_ci * If we can hold the bitmap lock of one node then 121862306a36Sopenharmony_ci * the slot is not occupied, update the sb. 121962306a36Sopenharmony_ci */ 122062306a36Sopenharmony_ci snprintf(str, 64, "bitmap%04d", i); 122162306a36Sopenharmony_ci bm_lockres = lockres_init(mddev, str, NULL, 1); 122262306a36Sopenharmony_ci if (!bm_lockres) { 122362306a36Sopenharmony_ci pr_err("md-cluster: Cannot initialize %s\n", str); 122462306a36Sopenharmony_ci md_bitmap_free(bitmap); 122562306a36Sopenharmony_ci return -1; 122662306a36Sopenharmony_ci } 122762306a36Sopenharmony_ci bm_lockres->flags |= DLM_LKF_NOQUEUE; 122862306a36Sopenharmony_ci rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); 122962306a36Sopenharmony_ci if (!rv) 123062306a36Sopenharmony_ci md_bitmap_update_sb(bitmap); 123162306a36Sopenharmony_ci lockres_free(bm_lockres); 123262306a36Sopenharmony_ci 123362306a36Sopenharmony_ci sb = kmap_atomic(bitmap->storage.sb_page); 123462306a36Sopenharmony_ci if (sync_size == 0) 123562306a36Sopenharmony_ci sync_size = sb->sync_size; 123662306a36Sopenharmony_ci else if (sync_size != sb->sync_size) { 123762306a36Sopenharmony_ci kunmap_atomic(sb); 123862306a36Sopenharmony_ci md_bitmap_free(bitmap); 123962306a36Sopenharmony_ci return -1; 124062306a36Sopenharmony_ci } 124162306a36Sopenharmony_ci kunmap_atomic(sb); 124262306a36Sopenharmony_ci md_bitmap_free(bitmap); 124362306a36Sopenharmony_ci } 124462306a36Sopenharmony_ci 124562306a36Sopenharmony_ci return (my_sync_size == sync_size) ? 0 : -1; 124662306a36Sopenharmony_ci} 124762306a36Sopenharmony_ci 124862306a36Sopenharmony_ci/* 124962306a36Sopenharmony_ci * Update the size for cluster raid is a little more complex, we perform it 125062306a36Sopenharmony_ci * by the steps: 125162306a36Sopenharmony_ci * 1. hold token lock and update superblock in initiator node. 125262306a36Sopenharmony_ci * 2. send METADATA_UPDATED msg to other nodes. 125362306a36Sopenharmony_ci * 3. The initiator node continues to check each bitmap's sync_size, if all 125462306a36Sopenharmony_ci * bitmaps have the same value of sync_size, then we can set capacity and 125562306a36Sopenharmony_ci * let other nodes to perform it. If one node can't update sync_size 125662306a36Sopenharmony_ci * accordingly, we need to revert to previous value. 125762306a36Sopenharmony_ci */ 125862306a36Sopenharmony_cistatic void update_size(struct mddev *mddev, sector_t old_dev_sectors) 125962306a36Sopenharmony_ci{ 126062306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 126162306a36Sopenharmony_ci struct cluster_msg cmsg; 126262306a36Sopenharmony_ci struct md_rdev *rdev; 126362306a36Sopenharmony_ci int ret = 0; 126462306a36Sopenharmony_ci int raid_slot = -1; 126562306a36Sopenharmony_ci 126662306a36Sopenharmony_ci md_update_sb(mddev, 1); 126762306a36Sopenharmony_ci if (lock_comm(cinfo, 1)) { 126862306a36Sopenharmony_ci pr_err("%s: lock_comm failed\n", __func__); 126962306a36Sopenharmony_ci return; 127062306a36Sopenharmony_ci } 127162306a36Sopenharmony_ci 127262306a36Sopenharmony_ci memset(&cmsg, 0, sizeof(cmsg)); 127362306a36Sopenharmony_ci cmsg.type = cpu_to_le32(METADATA_UPDATED); 127462306a36Sopenharmony_ci rdev_for_each(rdev, mddev) 127562306a36Sopenharmony_ci if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) { 127662306a36Sopenharmony_ci raid_slot = rdev->desc_nr; 127762306a36Sopenharmony_ci break; 127862306a36Sopenharmony_ci } 127962306a36Sopenharmony_ci if (raid_slot >= 0) { 128062306a36Sopenharmony_ci cmsg.raid_slot = cpu_to_le32(raid_slot); 128162306a36Sopenharmony_ci /* 128262306a36Sopenharmony_ci * We can only change capiticy after all the nodes can do it, 128362306a36Sopenharmony_ci * so need to wait after other nodes already received the msg 128462306a36Sopenharmony_ci * and handled the change 128562306a36Sopenharmony_ci */ 128662306a36Sopenharmony_ci ret = __sendmsg(cinfo, &cmsg); 128762306a36Sopenharmony_ci if (ret) { 128862306a36Sopenharmony_ci pr_err("%s:%d: failed to send METADATA_UPDATED msg\n", 128962306a36Sopenharmony_ci __func__, __LINE__); 129062306a36Sopenharmony_ci unlock_comm(cinfo); 129162306a36Sopenharmony_ci return; 129262306a36Sopenharmony_ci } 129362306a36Sopenharmony_ci } else { 129462306a36Sopenharmony_ci pr_err("md-cluster: No good device id found to send\n"); 129562306a36Sopenharmony_ci unlock_comm(cinfo); 129662306a36Sopenharmony_ci return; 129762306a36Sopenharmony_ci } 129862306a36Sopenharmony_ci 129962306a36Sopenharmony_ci /* 130062306a36Sopenharmony_ci * check the sync_size from other node's bitmap, if sync_size 130162306a36Sopenharmony_ci * have already updated in other nodes as expected, send an 130262306a36Sopenharmony_ci * empty metadata msg to permit the change of capacity 130362306a36Sopenharmony_ci */ 130462306a36Sopenharmony_ci if (cluster_check_sync_size(mddev) == 0) { 130562306a36Sopenharmony_ci memset(&cmsg, 0, sizeof(cmsg)); 130662306a36Sopenharmony_ci cmsg.type = cpu_to_le32(CHANGE_CAPACITY); 130762306a36Sopenharmony_ci ret = __sendmsg(cinfo, &cmsg); 130862306a36Sopenharmony_ci if (ret) 130962306a36Sopenharmony_ci pr_err("%s:%d: failed to send CHANGE_CAPACITY msg\n", 131062306a36Sopenharmony_ci __func__, __LINE__); 131162306a36Sopenharmony_ci set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); 131262306a36Sopenharmony_ci } else { 131362306a36Sopenharmony_ci /* revert to previous sectors */ 131462306a36Sopenharmony_ci ret = mddev->pers->resize(mddev, old_dev_sectors); 131562306a36Sopenharmony_ci ret = __sendmsg(cinfo, &cmsg); 131662306a36Sopenharmony_ci if (ret) 131762306a36Sopenharmony_ci pr_err("%s:%d: failed to send METADATA_UPDATED msg\n", 131862306a36Sopenharmony_ci __func__, __LINE__); 131962306a36Sopenharmony_ci } 132062306a36Sopenharmony_ci unlock_comm(cinfo); 132162306a36Sopenharmony_ci} 132262306a36Sopenharmony_ci 132362306a36Sopenharmony_cistatic int resync_start(struct mddev *mddev) 132462306a36Sopenharmony_ci{ 132562306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 132662306a36Sopenharmony_ci return dlm_lock_sync_interruptible(cinfo->resync_lockres, DLM_LOCK_EX, mddev); 132762306a36Sopenharmony_ci} 132862306a36Sopenharmony_ci 132962306a36Sopenharmony_cistatic void resync_info_get(struct mddev *mddev, sector_t *lo, sector_t *hi) 133062306a36Sopenharmony_ci{ 133162306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 133262306a36Sopenharmony_ci 133362306a36Sopenharmony_ci spin_lock_irq(&cinfo->suspend_lock); 133462306a36Sopenharmony_ci *lo = cinfo->suspend_lo; 133562306a36Sopenharmony_ci *hi = cinfo->suspend_hi; 133662306a36Sopenharmony_ci spin_unlock_irq(&cinfo->suspend_lock); 133762306a36Sopenharmony_ci} 133862306a36Sopenharmony_ci 133962306a36Sopenharmony_cistatic int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) 134062306a36Sopenharmony_ci{ 134162306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 134262306a36Sopenharmony_ci struct resync_info ri; 134362306a36Sopenharmony_ci struct cluster_msg cmsg = {0}; 134462306a36Sopenharmony_ci 134562306a36Sopenharmony_ci /* do not send zero again, if we have sent before */ 134662306a36Sopenharmony_ci if (hi == 0) { 134762306a36Sopenharmony_ci memcpy(&ri, cinfo->bitmap_lockres->lksb.sb_lvbptr, sizeof(struct resync_info)); 134862306a36Sopenharmony_ci if (le64_to_cpu(ri.hi) == 0) 134962306a36Sopenharmony_ci return 0; 135062306a36Sopenharmony_ci } 135162306a36Sopenharmony_ci 135262306a36Sopenharmony_ci add_resync_info(cinfo->bitmap_lockres, lo, hi); 135362306a36Sopenharmony_ci /* Re-acquire the lock to refresh LVB */ 135462306a36Sopenharmony_ci dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW); 135562306a36Sopenharmony_ci cmsg.type = cpu_to_le32(RESYNCING); 135662306a36Sopenharmony_ci cmsg.low = cpu_to_le64(lo); 135762306a36Sopenharmony_ci cmsg.high = cpu_to_le64(hi); 135862306a36Sopenharmony_ci 135962306a36Sopenharmony_ci /* 136062306a36Sopenharmony_ci * mddev_lock is held if resync_info_update is called from 136162306a36Sopenharmony_ci * resync_finish (md_reap_sync_thread -> resync_finish) 136262306a36Sopenharmony_ci */ 136362306a36Sopenharmony_ci if (lo == 0 && hi == 0) 136462306a36Sopenharmony_ci return sendmsg(cinfo, &cmsg, 1); 136562306a36Sopenharmony_ci else 136662306a36Sopenharmony_ci return sendmsg(cinfo, &cmsg, 0); 136762306a36Sopenharmony_ci} 136862306a36Sopenharmony_ci 136962306a36Sopenharmony_cistatic int resync_finish(struct mddev *mddev) 137062306a36Sopenharmony_ci{ 137162306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 137262306a36Sopenharmony_ci int ret = 0; 137362306a36Sopenharmony_ci 137462306a36Sopenharmony_ci clear_bit(MD_RESYNCING_REMOTE, &mddev->recovery); 137562306a36Sopenharmony_ci 137662306a36Sopenharmony_ci /* 137762306a36Sopenharmony_ci * If resync thread is interrupted so we can't say resync is finished, 137862306a36Sopenharmony_ci * another node will launch resync thread to continue. 137962306a36Sopenharmony_ci */ 138062306a36Sopenharmony_ci if (!test_bit(MD_CLOSING, &mddev->flags)) 138162306a36Sopenharmony_ci ret = resync_info_update(mddev, 0, 0); 138262306a36Sopenharmony_ci dlm_unlock_sync(cinfo->resync_lockres); 138362306a36Sopenharmony_ci return ret; 138462306a36Sopenharmony_ci} 138562306a36Sopenharmony_ci 138662306a36Sopenharmony_cistatic int area_resyncing(struct mddev *mddev, int direction, 138762306a36Sopenharmony_ci sector_t lo, sector_t hi) 138862306a36Sopenharmony_ci{ 138962306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 139062306a36Sopenharmony_ci int ret = 0; 139162306a36Sopenharmony_ci 139262306a36Sopenharmony_ci if ((direction == READ) && 139362306a36Sopenharmony_ci test_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state)) 139462306a36Sopenharmony_ci return 1; 139562306a36Sopenharmony_ci 139662306a36Sopenharmony_ci spin_lock_irq(&cinfo->suspend_lock); 139762306a36Sopenharmony_ci if (hi > cinfo->suspend_lo && lo < cinfo->suspend_hi) 139862306a36Sopenharmony_ci ret = 1; 139962306a36Sopenharmony_ci spin_unlock_irq(&cinfo->suspend_lock); 140062306a36Sopenharmony_ci return ret; 140162306a36Sopenharmony_ci} 140262306a36Sopenharmony_ci 140362306a36Sopenharmony_ci/* add_new_disk() - initiates a disk add 140462306a36Sopenharmony_ci * However, if this fails before writing md_update_sb(), 140562306a36Sopenharmony_ci * add_new_disk_cancel() must be called to release token lock 140662306a36Sopenharmony_ci */ 140762306a36Sopenharmony_cistatic int add_new_disk(struct mddev *mddev, struct md_rdev *rdev) 140862306a36Sopenharmony_ci{ 140962306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 141062306a36Sopenharmony_ci struct cluster_msg cmsg; 141162306a36Sopenharmony_ci int ret = 0; 141262306a36Sopenharmony_ci struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 141362306a36Sopenharmony_ci char *uuid = sb->device_uuid; 141462306a36Sopenharmony_ci 141562306a36Sopenharmony_ci memset(&cmsg, 0, sizeof(cmsg)); 141662306a36Sopenharmony_ci cmsg.type = cpu_to_le32(NEWDISK); 141762306a36Sopenharmony_ci memcpy(cmsg.uuid, uuid, 16); 141862306a36Sopenharmony_ci cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); 141962306a36Sopenharmony_ci if (lock_comm(cinfo, 1)) 142062306a36Sopenharmony_ci return -EAGAIN; 142162306a36Sopenharmony_ci ret = __sendmsg(cinfo, &cmsg); 142262306a36Sopenharmony_ci if (ret) { 142362306a36Sopenharmony_ci unlock_comm(cinfo); 142462306a36Sopenharmony_ci return ret; 142562306a36Sopenharmony_ci } 142662306a36Sopenharmony_ci cinfo->no_new_dev_lockres->flags |= DLM_LKF_NOQUEUE; 142762306a36Sopenharmony_ci ret = dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_EX); 142862306a36Sopenharmony_ci cinfo->no_new_dev_lockres->flags &= ~DLM_LKF_NOQUEUE; 142962306a36Sopenharmony_ci /* Some node does not "see" the device */ 143062306a36Sopenharmony_ci if (ret == -EAGAIN) 143162306a36Sopenharmony_ci ret = -ENOENT; 143262306a36Sopenharmony_ci if (ret) 143362306a36Sopenharmony_ci unlock_comm(cinfo); 143462306a36Sopenharmony_ci else { 143562306a36Sopenharmony_ci dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); 143662306a36Sopenharmony_ci /* Since MD_CHANGE_DEVS will be set in add_bound_rdev which 143762306a36Sopenharmony_ci * will run soon after add_new_disk, the below path will be 143862306a36Sopenharmony_ci * invoked: 143962306a36Sopenharmony_ci * md_wakeup_thread(mddev->thread) 144062306a36Sopenharmony_ci * -> conf->thread (raid1d) 144162306a36Sopenharmony_ci * -> md_check_recovery -> md_update_sb 144262306a36Sopenharmony_ci * -> metadata_update_start/finish 144362306a36Sopenharmony_ci * MD_CLUSTER_SEND_LOCKED_ALREADY will be cleared eventually. 144462306a36Sopenharmony_ci * 144562306a36Sopenharmony_ci * For other failure cases, metadata_update_cancel and 144662306a36Sopenharmony_ci * add_new_disk_cancel also clear below bit as well. 144762306a36Sopenharmony_ci * */ 144862306a36Sopenharmony_ci set_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state); 144962306a36Sopenharmony_ci wake_up(&cinfo->wait); 145062306a36Sopenharmony_ci } 145162306a36Sopenharmony_ci return ret; 145262306a36Sopenharmony_ci} 145362306a36Sopenharmony_ci 145462306a36Sopenharmony_cistatic void add_new_disk_cancel(struct mddev *mddev) 145562306a36Sopenharmony_ci{ 145662306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 145762306a36Sopenharmony_ci clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state); 145862306a36Sopenharmony_ci unlock_comm(cinfo); 145962306a36Sopenharmony_ci} 146062306a36Sopenharmony_ci 146162306a36Sopenharmony_cistatic int new_disk_ack(struct mddev *mddev, bool ack) 146262306a36Sopenharmony_ci{ 146362306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 146462306a36Sopenharmony_ci 146562306a36Sopenharmony_ci if (!test_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state)) { 146662306a36Sopenharmony_ci pr_warn("md-cluster(%s): Spurious cluster confirmation\n", mdname(mddev)); 146762306a36Sopenharmony_ci return -EINVAL; 146862306a36Sopenharmony_ci } 146962306a36Sopenharmony_ci 147062306a36Sopenharmony_ci if (ack) 147162306a36Sopenharmony_ci dlm_unlock_sync(cinfo->no_new_dev_lockres); 147262306a36Sopenharmony_ci complete(&cinfo->newdisk_completion); 147362306a36Sopenharmony_ci return 0; 147462306a36Sopenharmony_ci} 147562306a36Sopenharmony_ci 147662306a36Sopenharmony_cistatic int remove_disk(struct mddev *mddev, struct md_rdev *rdev) 147762306a36Sopenharmony_ci{ 147862306a36Sopenharmony_ci struct cluster_msg cmsg = {0}; 147962306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 148062306a36Sopenharmony_ci cmsg.type = cpu_to_le32(REMOVE); 148162306a36Sopenharmony_ci cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); 148262306a36Sopenharmony_ci return sendmsg(cinfo, &cmsg, 1); 148362306a36Sopenharmony_ci} 148462306a36Sopenharmony_ci 148562306a36Sopenharmony_cistatic int lock_all_bitmaps(struct mddev *mddev) 148662306a36Sopenharmony_ci{ 148762306a36Sopenharmony_ci int slot, my_slot, ret, held = 1, i = 0; 148862306a36Sopenharmony_ci char str[64]; 148962306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 149062306a36Sopenharmony_ci 149162306a36Sopenharmony_ci cinfo->other_bitmap_lockres = 149262306a36Sopenharmony_ci kcalloc(mddev->bitmap_info.nodes - 1, 149362306a36Sopenharmony_ci sizeof(struct dlm_lock_resource *), GFP_KERNEL); 149462306a36Sopenharmony_ci if (!cinfo->other_bitmap_lockres) { 149562306a36Sopenharmony_ci pr_err("md: can't alloc mem for other bitmap locks\n"); 149662306a36Sopenharmony_ci return 0; 149762306a36Sopenharmony_ci } 149862306a36Sopenharmony_ci 149962306a36Sopenharmony_ci my_slot = slot_number(mddev); 150062306a36Sopenharmony_ci for (slot = 0; slot < mddev->bitmap_info.nodes; slot++) { 150162306a36Sopenharmony_ci if (slot == my_slot) 150262306a36Sopenharmony_ci continue; 150362306a36Sopenharmony_ci 150462306a36Sopenharmony_ci memset(str, '\0', 64); 150562306a36Sopenharmony_ci snprintf(str, 64, "bitmap%04d", slot); 150662306a36Sopenharmony_ci cinfo->other_bitmap_lockres[i] = lockres_init(mddev, str, NULL, 1); 150762306a36Sopenharmony_ci if (!cinfo->other_bitmap_lockres[i]) 150862306a36Sopenharmony_ci return -ENOMEM; 150962306a36Sopenharmony_ci 151062306a36Sopenharmony_ci cinfo->other_bitmap_lockres[i]->flags |= DLM_LKF_NOQUEUE; 151162306a36Sopenharmony_ci ret = dlm_lock_sync(cinfo->other_bitmap_lockres[i], DLM_LOCK_PW); 151262306a36Sopenharmony_ci if (ret) 151362306a36Sopenharmony_ci held = -1; 151462306a36Sopenharmony_ci i++; 151562306a36Sopenharmony_ci } 151662306a36Sopenharmony_ci 151762306a36Sopenharmony_ci return held; 151862306a36Sopenharmony_ci} 151962306a36Sopenharmony_ci 152062306a36Sopenharmony_cistatic void unlock_all_bitmaps(struct mddev *mddev) 152162306a36Sopenharmony_ci{ 152262306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 152362306a36Sopenharmony_ci int i; 152462306a36Sopenharmony_ci 152562306a36Sopenharmony_ci /* release other node's bitmap lock if they are existed */ 152662306a36Sopenharmony_ci if (cinfo->other_bitmap_lockres) { 152762306a36Sopenharmony_ci for (i = 0; i < mddev->bitmap_info.nodes - 1; i++) { 152862306a36Sopenharmony_ci if (cinfo->other_bitmap_lockres[i]) { 152962306a36Sopenharmony_ci lockres_free(cinfo->other_bitmap_lockres[i]); 153062306a36Sopenharmony_ci } 153162306a36Sopenharmony_ci } 153262306a36Sopenharmony_ci kfree(cinfo->other_bitmap_lockres); 153362306a36Sopenharmony_ci cinfo->other_bitmap_lockres = NULL; 153462306a36Sopenharmony_ci } 153562306a36Sopenharmony_ci} 153662306a36Sopenharmony_ci 153762306a36Sopenharmony_cistatic int gather_bitmaps(struct md_rdev *rdev) 153862306a36Sopenharmony_ci{ 153962306a36Sopenharmony_ci int sn, err; 154062306a36Sopenharmony_ci sector_t lo, hi; 154162306a36Sopenharmony_ci struct cluster_msg cmsg = {0}; 154262306a36Sopenharmony_ci struct mddev *mddev = rdev->mddev; 154362306a36Sopenharmony_ci struct md_cluster_info *cinfo = mddev->cluster_info; 154462306a36Sopenharmony_ci 154562306a36Sopenharmony_ci cmsg.type = cpu_to_le32(RE_ADD); 154662306a36Sopenharmony_ci cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); 154762306a36Sopenharmony_ci err = sendmsg(cinfo, &cmsg, 1); 154862306a36Sopenharmony_ci if (err) 154962306a36Sopenharmony_ci goto out; 155062306a36Sopenharmony_ci 155162306a36Sopenharmony_ci for (sn = 0; sn < mddev->bitmap_info.nodes; sn++) { 155262306a36Sopenharmony_ci if (sn == (cinfo->slot_number - 1)) 155362306a36Sopenharmony_ci continue; 155462306a36Sopenharmony_ci err = md_bitmap_copy_from_slot(mddev, sn, &lo, &hi, false); 155562306a36Sopenharmony_ci if (err) { 155662306a36Sopenharmony_ci pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn); 155762306a36Sopenharmony_ci goto out; 155862306a36Sopenharmony_ci } 155962306a36Sopenharmony_ci if ((hi > 0) && (lo < mddev->recovery_cp)) 156062306a36Sopenharmony_ci mddev->recovery_cp = lo; 156162306a36Sopenharmony_ci } 156262306a36Sopenharmony_ciout: 156362306a36Sopenharmony_ci return err; 156462306a36Sopenharmony_ci} 156562306a36Sopenharmony_ci 156662306a36Sopenharmony_cistatic struct md_cluster_operations cluster_ops = { 156762306a36Sopenharmony_ci .join = join, 156862306a36Sopenharmony_ci .leave = leave, 156962306a36Sopenharmony_ci .slot_number = slot_number, 157062306a36Sopenharmony_ci .resync_start = resync_start, 157162306a36Sopenharmony_ci .resync_finish = resync_finish, 157262306a36Sopenharmony_ci .resync_info_update = resync_info_update, 157362306a36Sopenharmony_ci .resync_info_get = resync_info_get, 157462306a36Sopenharmony_ci .metadata_update_start = metadata_update_start, 157562306a36Sopenharmony_ci .metadata_update_finish = metadata_update_finish, 157662306a36Sopenharmony_ci .metadata_update_cancel = metadata_update_cancel, 157762306a36Sopenharmony_ci .area_resyncing = area_resyncing, 157862306a36Sopenharmony_ci .add_new_disk = add_new_disk, 157962306a36Sopenharmony_ci .add_new_disk_cancel = add_new_disk_cancel, 158062306a36Sopenharmony_ci .new_disk_ack = new_disk_ack, 158162306a36Sopenharmony_ci .remove_disk = remove_disk, 158262306a36Sopenharmony_ci .load_bitmaps = load_bitmaps, 158362306a36Sopenharmony_ci .gather_bitmaps = gather_bitmaps, 158462306a36Sopenharmony_ci .resize_bitmaps = resize_bitmaps, 158562306a36Sopenharmony_ci .lock_all_bitmaps = lock_all_bitmaps, 158662306a36Sopenharmony_ci .unlock_all_bitmaps = unlock_all_bitmaps, 158762306a36Sopenharmony_ci .update_size = update_size, 158862306a36Sopenharmony_ci}; 158962306a36Sopenharmony_ci 159062306a36Sopenharmony_cistatic int __init cluster_init(void) 159162306a36Sopenharmony_ci{ 159262306a36Sopenharmony_ci pr_warn("md-cluster: support raid1 and raid10 (limited support)\n"); 159362306a36Sopenharmony_ci pr_info("Registering Cluster MD functions\n"); 159462306a36Sopenharmony_ci register_md_cluster_operations(&cluster_ops, THIS_MODULE); 159562306a36Sopenharmony_ci return 0; 159662306a36Sopenharmony_ci} 159762306a36Sopenharmony_ci 159862306a36Sopenharmony_cistatic void cluster_exit(void) 159962306a36Sopenharmony_ci{ 160062306a36Sopenharmony_ci unregister_md_cluster_operations(); 160162306a36Sopenharmony_ci} 160262306a36Sopenharmony_ci 160362306a36Sopenharmony_cimodule_init(cluster_init); 160462306a36Sopenharmony_cimodule_exit(cluster_exit); 160562306a36Sopenharmony_ciMODULE_AUTHOR("SUSE"); 160662306a36Sopenharmony_ciMODULE_LICENSE("GPL"); 160762306a36Sopenharmony_ciMODULE_DESCRIPTION("Clustering support for MD"); 1608