18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci drbd_actlog.c 48c2ecf20Sopenharmony_ci 58c2ecf20Sopenharmony_ci This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 68c2ecf20Sopenharmony_ci 78c2ecf20Sopenharmony_ci Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. 88c2ecf20Sopenharmony_ci Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>. 98c2ecf20Sopenharmony_ci Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. 108c2ecf20Sopenharmony_ci 118c2ecf20Sopenharmony_ci 128c2ecf20Sopenharmony_ci */ 138c2ecf20Sopenharmony_ci 148c2ecf20Sopenharmony_ci#include <linux/slab.h> 158c2ecf20Sopenharmony_ci#include <linux/crc32c.h> 168c2ecf20Sopenharmony_ci#include <linux/drbd.h> 178c2ecf20Sopenharmony_ci#include <linux/drbd_limits.h> 188c2ecf20Sopenharmony_ci#include "drbd_int.h" 198c2ecf20Sopenharmony_ci 208c2ecf20Sopenharmony_ci 218c2ecf20Sopenharmony_cienum al_transaction_types { 228c2ecf20Sopenharmony_ci AL_TR_UPDATE = 0, 238c2ecf20Sopenharmony_ci AL_TR_INITIALIZED = 0xffff 248c2ecf20Sopenharmony_ci}; 258c2ecf20Sopenharmony_ci/* all fields on disc in big endian */ 268c2ecf20Sopenharmony_cistruct __packed al_transaction_on_disk { 278c2ecf20Sopenharmony_ci /* don't we all like magic */ 288c2ecf20Sopenharmony_ci __be32 magic; 298c2ecf20Sopenharmony_ci 308c2ecf20Sopenharmony_ci /* to identify the most recent transaction block 318c2ecf20Sopenharmony_ci * in the on disk ring buffer */ 328c2ecf20Sopenharmony_ci __be32 tr_number; 338c2ecf20Sopenharmony_ci 348c2ecf20Sopenharmony_ci /* checksum on the full 4k block, with this field set to 0. */ 358c2ecf20Sopenharmony_ci __be32 crc32c; 368c2ecf20Sopenharmony_ci 378c2ecf20Sopenharmony_ci /* type of transaction, special transaction types like: 388c2ecf20Sopenharmony_ci * purge-all, set-all-idle, set-all-active, ... to-be-defined 398c2ecf20Sopenharmony_ci * see also enum al_transaction_types */ 408c2ecf20Sopenharmony_ci __be16 transaction_type; 418c2ecf20Sopenharmony_ci 428c2ecf20Sopenharmony_ci /* we currently allow only a few thousand extents, 438c2ecf20Sopenharmony_ci * so 16bit will be enough for the slot number. */ 448c2ecf20Sopenharmony_ci 458c2ecf20Sopenharmony_ci /* how many updates in this transaction */ 468c2ecf20Sopenharmony_ci __be16 n_updates; 478c2ecf20Sopenharmony_ci 488c2ecf20Sopenharmony_ci /* maximum slot number, "al-extents" in drbd.conf speak. 498c2ecf20Sopenharmony_ci * Having this in each transaction should make reconfiguration 508c2ecf20Sopenharmony_ci * of that parameter easier. */ 518c2ecf20Sopenharmony_ci __be16 context_size; 528c2ecf20Sopenharmony_ci 538c2ecf20Sopenharmony_ci /* slot number the context starts with */ 548c2ecf20Sopenharmony_ci __be16 context_start_slot_nr; 558c2ecf20Sopenharmony_ci 568c2ecf20Sopenharmony_ci /* Some reserved bytes. Expected usage is a 64bit counter of 578c2ecf20Sopenharmony_ci * sectors-written since device creation, and other data generation tag 588c2ecf20Sopenharmony_ci * supporting usage */ 598c2ecf20Sopenharmony_ci __be32 __reserved[4]; 608c2ecf20Sopenharmony_ci 618c2ecf20Sopenharmony_ci /* --- 36 byte used --- */ 628c2ecf20Sopenharmony_ci 638c2ecf20Sopenharmony_ci /* Reserve space for up to AL_UPDATES_PER_TRANSACTION changes 648c2ecf20Sopenharmony_ci * in one transaction, then use the remaining byte in the 4k block for 658c2ecf20Sopenharmony_ci * context information. "Flexible" number of updates per transaction 668c2ecf20Sopenharmony_ci * does not help, as we have to account for the case when all update 678c2ecf20Sopenharmony_ci * slots are used anyways, so it would only complicate code without 688c2ecf20Sopenharmony_ci * additional benefit. 698c2ecf20Sopenharmony_ci */ 708c2ecf20Sopenharmony_ci __be16 update_slot_nr[AL_UPDATES_PER_TRANSACTION]; 718c2ecf20Sopenharmony_ci 728c2ecf20Sopenharmony_ci /* but the extent number is 32bit, which at an extent size of 4 MiB 738c2ecf20Sopenharmony_ci * allows to cover device sizes of up to 2**54 Byte (16 PiB) */ 748c2ecf20Sopenharmony_ci __be32 update_extent_nr[AL_UPDATES_PER_TRANSACTION]; 758c2ecf20Sopenharmony_ci 768c2ecf20Sopenharmony_ci /* --- 420 bytes used (36 + 64*6) --- */ 778c2ecf20Sopenharmony_ci 788c2ecf20Sopenharmony_ci /* 4096 - 420 = 3676 = 919 * 4 */ 798c2ecf20Sopenharmony_ci __be32 context[AL_CONTEXT_PER_TRANSACTION]; 808c2ecf20Sopenharmony_ci}; 818c2ecf20Sopenharmony_ci 828c2ecf20Sopenharmony_civoid *drbd_md_get_buffer(struct drbd_device *device, const char *intent) 838c2ecf20Sopenharmony_ci{ 848c2ecf20Sopenharmony_ci int r; 858c2ecf20Sopenharmony_ci 868c2ecf20Sopenharmony_ci wait_event(device->misc_wait, 878c2ecf20Sopenharmony_ci (r = atomic_cmpxchg(&device->md_io.in_use, 0, 1)) == 0 || 888c2ecf20Sopenharmony_ci device->state.disk <= D_FAILED); 898c2ecf20Sopenharmony_ci 908c2ecf20Sopenharmony_ci if (r) 918c2ecf20Sopenharmony_ci return NULL; 928c2ecf20Sopenharmony_ci 938c2ecf20Sopenharmony_ci device->md_io.current_use = intent; 948c2ecf20Sopenharmony_ci device->md_io.start_jif = jiffies; 958c2ecf20Sopenharmony_ci device->md_io.submit_jif = device->md_io.start_jif - 1; 968c2ecf20Sopenharmony_ci return page_address(device->md_io.page); 978c2ecf20Sopenharmony_ci} 988c2ecf20Sopenharmony_ci 998c2ecf20Sopenharmony_civoid drbd_md_put_buffer(struct drbd_device *device) 1008c2ecf20Sopenharmony_ci{ 1018c2ecf20Sopenharmony_ci if (atomic_dec_and_test(&device->md_io.in_use)) 1028c2ecf20Sopenharmony_ci wake_up(&device->misc_wait); 1038c2ecf20Sopenharmony_ci} 1048c2ecf20Sopenharmony_ci 1058c2ecf20Sopenharmony_civoid wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_backing_dev *bdev, 1068c2ecf20Sopenharmony_ci unsigned int *done) 1078c2ecf20Sopenharmony_ci{ 1088c2ecf20Sopenharmony_ci long dt; 1098c2ecf20Sopenharmony_ci 1108c2ecf20Sopenharmony_ci rcu_read_lock(); 1118c2ecf20Sopenharmony_ci dt = rcu_dereference(bdev->disk_conf)->disk_timeout; 1128c2ecf20Sopenharmony_ci rcu_read_unlock(); 1138c2ecf20Sopenharmony_ci dt = dt * HZ / 10; 1148c2ecf20Sopenharmony_ci if (dt == 0) 1158c2ecf20Sopenharmony_ci dt = MAX_SCHEDULE_TIMEOUT; 1168c2ecf20Sopenharmony_ci 1178c2ecf20Sopenharmony_ci dt = wait_event_timeout(device->misc_wait, 1188c2ecf20Sopenharmony_ci *done || test_bit(FORCE_DETACH, &device->flags), dt); 1198c2ecf20Sopenharmony_ci if (dt == 0) { 1208c2ecf20Sopenharmony_ci drbd_err(device, "meta-data IO operation timed out\n"); 1218c2ecf20Sopenharmony_ci drbd_chk_io_error(device, 1, DRBD_FORCE_DETACH); 1228c2ecf20Sopenharmony_ci } 1238c2ecf20Sopenharmony_ci} 1248c2ecf20Sopenharmony_ci 1258c2ecf20Sopenharmony_cistatic int _drbd_md_sync_page_io(struct drbd_device *device, 1268c2ecf20Sopenharmony_ci struct drbd_backing_dev *bdev, 1278c2ecf20Sopenharmony_ci sector_t sector, int op) 1288c2ecf20Sopenharmony_ci{ 1298c2ecf20Sopenharmony_ci struct bio *bio; 1308c2ecf20Sopenharmony_ci /* we do all our meta data IO in aligned 4k blocks. */ 1318c2ecf20Sopenharmony_ci const int size = 4096; 1328c2ecf20Sopenharmony_ci int err, op_flags = 0; 1338c2ecf20Sopenharmony_ci 1348c2ecf20Sopenharmony_ci device->md_io.done = 0; 1358c2ecf20Sopenharmony_ci device->md_io.error = -ENODEV; 1368c2ecf20Sopenharmony_ci 1378c2ecf20Sopenharmony_ci if ((op == REQ_OP_WRITE) && !test_bit(MD_NO_FUA, &device->flags)) 1388c2ecf20Sopenharmony_ci op_flags |= REQ_FUA | REQ_PREFLUSH; 1398c2ecf20Sopenharmony_ci op_flags |= REQ_SYNC; 1408c2ecf20Sopenharmony_ci 1418c2ecf20Sopenharmony_ci bio = bio_alloc_drbd(GFP_NOIO); 1428c2ecf20Sopenharmony_ci bio_set_dev(bio, bdev->md_bdev); 1438c2ecf20Sopenharmony_ci bio->bi_iter.bi_sector = sector; 1448c2ecf20Sopenharmony_ci err = -EIO; 1458c2ecf20Sopenharmony_ci if (bio_add_page(bio, device->md_io.page, size, 0) != size) 1468c2ecf20Sopenharmony_ci goto out; 1478c2ecf20Sopenharmony_ci bio->bi_private = device; 1488c2ecf20Sopenharmony_ci bio->bi_end_io = drbd_md_endio; 1498c2ecf20Sopenharmony_ci bio_set_op_attrs(bio, op, op_flags); 1508c2ecf20Sopenharmony_ci 1518c2ecf20Sopenharmony_ci if (op != REQ_OP_WRITE && device->state.disk == D_DISKLESS && device->ldev == NULL) 1528c2ecf20Sopenharmony_ci /* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */ 1538c2ecf20Sopenharmony_ci ; 1548c2ecf20Sopenharmony_ci else if (!get_ldev_if_state(device, D_ATTACHING)) { 1558c2ecf20Sopenharmony_ci /* Corresponding put_ldev in drbd_md_endio() */ 1568c2ecf20Sopenharmony_ci drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); 1578c2ecf20Sopenharmony_ci err = -ENODEV; 1588c2ecf20Sopenharmony_ci goto out; 1598c2ecf20Sopenharmony_ci } 1608c2ecf20Sopenharmony_ci 1618c2ecf20Sopenharmony_ci bio_get(bio); /* one bio_put() is in the completion handler */ 1628c2ecf20Sopenharmony_ci atomic_inc(&device->md_io.in_use); /* drbd_md_put_buffer() is in the completion handler */ 1638c2ecf20Sopenharmony_ci device->md_io.submit_jif = jiffies; 1648c2ecf20Sopenharmony_ci if (drbd_insert_fault(device, (op == REQ_OP_WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) 1658c2ecf20Sopenharmony_ci bio_io_error(bio); 1668c2ecf20Sopenharmony_ci else 1678c2ecf20Sopenharmony_ci submit_bio(bio); 1688c2ecf20Sopenharmony_ci wait_until_done_or_force_detached(device, bdev, &device->md_io.done); 1698c2ecf20Sopenharmony_ci if (!bio->bi_status) 1708c2ecf20Sopenharmony_ci err = device->md_io.error; 1718c2ecf20Sopenharmony_ci 1728c2ecf20Sopenharmony_ci out: 1738c2ecf20Sopenharmony_ci bio_put(bio); 1748c2ecf20Sopenharmony_ci return err; 1758c2ecf20Sopenharmony_ci} 1768c2ecf20Sopenharmony_ci 1778c2ecf20Sopenharmony_ciint drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bdev, 1788c2ecf20Sopenharmony_ci sector_t sector, int op) 1798c2ecf20Sopenharmony_ci{ 1808c2ecf20Sopenharmony_ci int err; 1818c2ecf20Sopenharmony_ci D_ASSERT(device, atomic_read(&device->md_io.in_use) == 1); 1828c2ecf20Sopenharmony_ci 1838c2ecf20Sopenharmony_ci BUG_ON(!bdev->md_bdev); 1848c2ecf20Sopenharmony_ci 1858c2ecf20Sopenharmony_ci dynamic_drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n", 1868c2ecf20Sopenharmony_ci current->comm, current->pid, __func__, 1878c2ecf20Sopenharmony_ci (unsigned long long)sector, (op == REQ_OP_WRITE) ? "WRITE" : "READ", 1888c2ecf20Sopenharmony_ci (void*)_RET_IP_ ); 1898c2ecf20Sopenharmony_ci 1908c2ecf20Sopenharmony_ci if (sector < drbd_md_first_sector(bdev) || 1918c2ecf20Sopenharmony_ci sector + 7 > drbd_md_last_sector(bdev)) 1928c2ecf20Sopenharmony_ci drbd_alert(device, "%s [%d]:%s(,%llus,%s) out of range md access!\n", 1938c2ecf20Sopenharmony_ci current->comm, current->pid, __func__, 1948c2ecf20Sopenharmony_ci (unsigned long long)sector, 1958c2ecf20Sopenharmony_ci (op == REQ_OP_WRITE) ? "WRITE" : "READ"); 1968c2ecf20Sopenharmony_ci 1978c2ecf20Sopenharmony_ci err = _drbd_md_sync_page_io(device, bdev, sector, op); 1988c2ecf20Sopenharmony_ci if (err) { 1998c2ecf20Sopenharmony_ci drbd_err(device, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n", 2008c2ecf20Sopenharmony_ci (unsigned long long)sector, 2018c2ecf20Sopenharmony_ci (op == REQ_OP_WRITE) ? "WRITE" : "READ", err); 2028c2ecf20Sopenharmony_ci } 2038c2ecf20Sopenharmony_ci return err; 2048c2ecf20Sopenharmony_ci} 2058c2ecf20Sopenharmony_ci 2068c2ecf20Sopenharmony_cistatic struct bm_extent *find_active_resync_extent(struct drbd_device *device, unsigned int enr) 2078c2ecf20Sopenharmony_ci{ 2088c2ecf20Sopenharmony_ci struct lc_element *tmp; 2098c2ecf20Sopenharmony_ci tmp = lc_find(device->resync, enr/AL_EXT_PER_BM_SECT); 2108c2ecf20Sopenharmony_ci if (unlikely(tmp != NULL)) { 2118c2ecf20Sopenharmony_ci struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); 2128c2ecf20Sopenharmony_ci if (test_bit(BME_NO_WRITES, &bm_ext->flags)) 2138c2ecf20Sopenharmony_ci return bm_ext; 2148c2ecf20Sopenharmony_ci } 2158c2ecf20Sopenharmony_ci return NULL; 2168c2ecf20Sopenharmony_ci} 2178c2ecf20Sopenharmony_ci 2188c2ecf20Sopenharmony_cistatic struct lc_element *_al_get(struct drbd_device *device, unsigned int enr, bool nonblock) 2198c2ecf20Sopenharmony_ci{ 2208c2ecf20Sopenharmony_ci struct lc_element *al_ext; 2218c2ecf20Sopenharmony_ci struct bm_extent *bm_ext; 2228c2ecf20Sopenharmony_ci int wake; 2238c2ecf20Sopenharmony_ci 2248c2ecf20Sopenharmony_ci spin_lock_irq(&device->al_lock); 2258c2ecf20Sopenharmony_ci bm_ext = find_active_resync_extent(device, enr); 2268c2ecf20Sopenharmony_ci if (bm_ext) { 2278c2ecf20Sopenharmony_ci wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags); 2288c2ecf20Sopenharmony_ci spin_unlock_irq(&device->al_lock); 2298c2ecf20Sopenharmony_ci if (wake) 2308c2ecf20Sopenharmony_ci wake_up(&device->al_wait); 2318c2ecf20Sopenharmony_ci return NULL; 2328c2ecf20Sopenharmony_ci } 2338c2ecf20Sopenharmony_ci if (nonblock) 2348c2ecf20Sopenharmony_ci al_ext = lc_try_get(device->act_log, enr); 2358c2ecf20Sopenharmony_ci else 2368c2ecf20Sopenharmony_ci al_ext = lc_get(device->act_log, enr); 2378c2ecf20Sopenharmony_ci spin_unlock_irq(&device->al_lock); 2388c2ecf20Sopenharmony_ci return al_ext; 2398c2ecf20Sopenharmony_ci} 2408c2ecf20Sopenharmony_ci 2418c2ecf20Sopenharmony_cibool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i) 2428c2ecf20Sopenharmony_ci{ 2438c2ecf20Sopenharmony_ci /* for bios crossing activity log extent boundaries, 2448c2ecf20Sopenharmony_ci * we may need to activate two extents in one go */ 2458c2ecf20Sopenharmony_ci unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); 2468c2ecf20Sopenharmony_ci unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); 2478c2ecf20Sopenharmony_ci 2488c2ecf20Sopenharmony_ci D_ASSERT(device, first <= last); 2498c2ecf20Sopenharmony_ci D_ASSERT(device, atomic_read(&device->local_cnt) > 0); 2508c2ecf20Sopenharmony_ci 2518c2ecf20Sopenharmony_ci /* FIXME figure out a fast path for bios crossing AL extent boundaries */ 2528c2ecf20Sopenharmony_ci if (first != last) 2538c2ecf20Sopenharmony_ci return false; 2548c2ecf20Sopenharmony_ci 2558c2ecf20Sopenharmony_ci return _al_get(device, first, true); 2568c2ecf20Sopenharmony_ci} 2578c2ecf20Sopenharmony_ci 2588c2ecf20Sopenharmony_cibool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i) 2598c2ecf20Sopenharmony_ci{ 2608c2ecf20Sopenharmony_ci /* for bios crossing activity log extent boundaries, 2618c2ecf20Sopenharmony_ci * we may need to activate two extents in one go */ 2628c2ecf20Sopenharmony_ci unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); 2638c2ecf20Sopenharmony_ci unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); 2648c2ecf20Sopenharmony_ci unsigned enr; 2658c2ecf20Sopenharmony_ci bool need_transaction = false; 2668c2ecf20Sopenharmony_ci 2678c2ecf20Sopenharmony_ci D_ASSERT(device, first <= last); 2688c2ecf20Sopenharmony_ci D_ASSERT(device, atomic_read(&device->local_cnt) > 0); 2698c2ecf20Sopenharmony_ci 2708c2ecf20Sopenharmony_ci for (enr = first; enr <= last; enr++) { 2718c2ecf20Sopenharmony_ci struct lc_element *al_ext; 2728c2ecf20Sopenharmony_ci wait_event(device->al_wait, 2738c2ecf20Sopenharmony_ci (al_ext = _al_get(device, enr, false)) != NULL); 2748c2ecf20Sopenharmony_ci if (al_ext->lc_number != enr) 2758c2ecf20Sopenharmony_ci need_transaction = true; 2768c2ecf20Sopenharmony_ci } 2778c2ecf20Sopenharmony_ci return need_transaction; 2788c2ecf20Sopenharmony_ci} 2798c2ecf20Sopenharmony_ci 2808c2ecf20Sopenharmony_ci#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT) 2818c2ecf20Sopenharmony_ci/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT 2828c2ecf20Sopenharmony_ci * are still coupled, or assume too much about their relation. 2838c2ecf20Sopenharmony_ci * Code below will not work if this is violated. 2848c2ecf20Sopenharmony_ci * Will be cleaned up with some followup patch. 2858c2ecf20Sopenharmony_ci */ 2868c2ecf20Sopenharmony_ci# error FIXME 2878c2ecf20Sopenharmony_ci#endif 2888c2ecf20Sopenharmony_ci 2898c2ecf20Sopenharmony_cistatic unsigned int al_extent_to_bm_page(unsigned int al_enr) 2908c2ecf20Sopenharmony_ci{ 2918c2ecf20Sopenharmony_ci return al_enr >> 2928c2ecf20Sopenharmony_ci /* bit to page */ 2938c2ecf20Sopenharmony_ci ((PAGE_SHIFT + 3) - 2948c2ecf20Sopenharmony_ci /* al extent number to bit */ 2958c2ecf20Sopenharmony_ci (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)); 2968c2ecf20Sopenharmony_ci} 2978c2ecf20Sopenharmony_ci 2988c2ecf20Sopenharmony_cistatic sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device) 2998c2ecf20Sopenharmony_ci{ 3008c2ecf20Sopenharmony_ci const unsigned int stripes = device->ldev->md.al_stripes; 3018c2ecf20Sopenharmony_ci const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k; 3028c2ecf20Sopenharmony_ci 3038c2ecf20Sopenharmony_ci /* transaction number, modulo on-disk ring buffer wrap around */ 3048c2ecf20Sopenharmony_ci unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k); 3058c2ecf20Sopenharmony_ci 3068c2ecf20Sopenharmony_ci /* ... to aligned 4k on disk block */ 3078c2ecf20Sopenharmony_ci t = ((t % stripes) * stripe_size_4kB) + t/stripes; 3088c2ecf20Sopenharmony_ci 3098c2ecf20Sopenharmony_ci /* ... to 512 byte sector in activity log */ 3108c2ecf20Sopenharmony_ci t *= 8; 3118c2ecf20Sopenharmony_ci 3128c2ecf20Sopenharmony_ci /* ... plus offset to the on disk position */ 3138c2ecf20Sopenharmony_ci return device->ldev->md.md_offset + device->ldev->md.al_offset + t; 3148c2ecf20Sopenharmony_ci} 3158c2ecf20Sopenharmony_ci 3168c2ecf20Sopenharmony_cistatic int __al_write_transaction(struct drbd_device *device, struct al_transaction_on_disk *buffer) 3178c2ecf20Sopenharmony_ci{ 3188c2ecf20Sopenharmony_ci struct lc_element *e; 3198c2ecf20Sopenharmony_ci sector_t sector; 3208c2ecf20Sopenharmony_ci int i, mx; 3218c2ecf20Sopenharmony_ci unsigned extent_nr; 3228c2ecf20Sopenharmony_ci unsigned crc = 0; 3238c2ecf20Sopenharmony_ci int err = 0; 3248c2ecf20Sopenharmony_ci 3258c2ecf20Sopenharmony_ci memset(buffer, 0, sizeof(*buffer)); 3268c2ecf20Sopenharmony_ci buffer->magic = cpu_to_be32(DRBD_AL_MAGIC); 3278c2ecf20Sopenharmony_ci buffer->tr_number = cpu_to_be32(device->al_tr_number); 3288c2ecf20Sopenharmony_ci 3298c2ecf20Sopenharmony_ci i = 0; 3308c2ecf20Sopenharmony_ci 3318c2ecf20Sopenharmony_ci drbd_bm_reset_al_hints(device); 3328c2ecf20Sopenharmony_ci 3338c2ecf20Sopenharmony_ci /* Even though no one can start to change this list 3348c2ecf20Sopenharmony_ci * once we set the LC_LOCKED -- from drbd_al_begin_io(), 3358c2ecf20Sopenharmony_ci * lc_try_lock_for_transaction() --, someone may still 3368c2ecf20Sopenharmony_ci * be in the process of changing it. */ 3378c2ecf20Sopenharmony_ci spin_lock_irq(&device->al_lock); 3388c2ecf20Sopenharmony_ci list_for_each_entry(e, &device->act_log->to_be_changed, list) { 3398c2ecf20Sopenharmony_ci if (i == AL_UPDATES_PER_TRANSACTION) { 3408c2ecf20Sopenharmony_ci i++; 3418c2ecf20Sopenharmony_ci break; 3428c2ecf20Sopenharmony_ci } 3438c2ecf20Sopenharmony_ci buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index); 3448c2ecf20Sopenharmony_ci buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number); 3458c2ecf20Sopenharmony_ci if (e->lc_number != LC_FREE) 3468c2ecf20Sopenharmony_ci drbd_bm_mark_for_writeout(device, 3478c2ecf20Sopenharmony_ci al_extent_to_bm_page(e->lc_number)); 3488c2ecf20Sopenharmony_ci i++; 3498c2ecf20Sopenharmony_ci } 3508c2ecf20Sopenharmony_ci spin_unlock_irq(&device->al_lock); 3518c2ecf20Sopenharmony_ci BUG_ON(i > AL_UPDATES_PER_TRANSACTION); 3528c2ecf20Sopenharmony_ci 3538c2ecf20Sopenharmony_ci buffer->n_updates = cpu_to_be16(i); 3548c2ecf20Sopenharmony_ci for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) { 3558c2ecf20Sopenharmony_ci buffer->update_slot_nr[i] = cpu_to_be16(-1); 3568c2ecf20Sopenharmony_ci buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE); 3578c2ecf20Sopenharmony_ci } 3588c2ecf20Sopenharmony_ci 3598c2ecf20Sopenharmony_ci buffer->context_size = cpu_to_be16(device->act_log->nr_elements); 3608c2ecf20Sopenharmony_ci buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle); 3618c2ecf20Sopenharmony_ci 3628c2ecf20Sopenharmony_ci mx = min_t(int, AL_CONTEXT_PER_TRANSACTION, 3638c2ecf20Sopenharmony_ci device->act_log->nr_elements - device->al_tr_cycle); 3648c2ecf20Sopenharmony_ci for (i = 0; i < mx; i++) { 3658c2ecf20Sopenharmony_ci unsigned idx = device->al_tr_cycle + i; 3668c2ecf20Sopenharmony_ci extent_nr = lc_element_by_index(device->act_log, idx)->lc_number; 3678c2ecf20Sopenharmony_ci buffer->context[i] = cpu_to_be32(extent_nr); 3688c2ecf20Sopenharmony_ci } 3698c2ecf20Sopenharmony_ci for (; i < AL_CONTEXT_PER_TRANSACTION; i++) 3708c2ecf20Sopenharmony_ci buffer->context[i] = cpu_to_be32(LC_FREE); 3718c2ecf20Sopenharmony_ci 3728c2ecf20Sopenharmony_ci device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION; 3738c2ecf20Sopenharmony_ci if (device->al_tr_cycle >= device->act_log->nr_elements) 3748c2ecf20Sopenharmony_ci device->al_tr_cycle = 0; 3758c2ecf20Sopenharmony_ci 3768c2ecf20Sopenharmony_ci sector = al_tr_number_to_on_disk_sector(device); 3778c2ecf20Sopenharmony_ci 3788c2ecf20Sopenharmony_ci crc = crc32c(0, buffer, 4096); 3798c2ecf20Sopenharmony_ci buffer->crc32c = cpu_to_be32(crc); 3808c2ecf20Sopenharmony_ci 3818c2ecf20Sopenharmony_ci if (drbd_bm_write_hinted(device)) 3828c2ecf20Sopenharmony_ci err = -EIO; 3838c2ecf20Sopenharmony_ci else { 3848c2ecf20Sopenharmony_ci bool write_al_updates; 3858c2ecf20Sopenharmony_ci rcu_read_lock(); 3868c2ecf20Sopenharmony_ci write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates; 3878c2ecf20Sopenharmony_ci rcu_read_unlock(); 3888c2ecf20Sopenharmony_ci if (write_al_updates) { 3898c2ecf20Sopenharmony_ci if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) { 3908c2ecf20Sopenharmony_ci err = -EIO; 3918c2ecf20Sopenharmony_ci drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR); 3928c2ecf20Sopenharmony_ci } else { 3938c2ecf20Sopenharmony_ci device->al_tr_number++; 3948c2ecf20Sopenharmony_ci device->al_writ_cnt++; 3958c2ecf20Sopenharmony_ci } 3968c2ecf20Sopenharmony_ci } 3978c2ecf20Sopenharmony_ci } 3988c2ecf20Sopenharmony_ci 3998c2ecf20Sopenharmony_ci return err; 4008c2ecf20Sopenharmony_ci} 4018c2ecf20Sopenharmony_ci 4028c2ecf20Sopenharmony_cistatic int al_write_transaction(struct drbd_device *device) 4038c2ecf20Sopenharmony_ci{ 4048c2ecf20Sopenharmony_ci struct al_transaction_on_disk *buffer; 4058c2ecf20Sopenharmony_ci int err; 4068c2ecf20Sopenharmony_ci 4078c2ecf20Sopenharmony_ci if (!get_ldev(device)) { 4088c2ecf20Sopenharmony_ci drbd_err(device, "disk is %s, cannot start al transaction\n", 4098c2ecf20Sopenharmony_ci drbd_disk_str(device->state.disk)); 4108c2ecf20Sopenharmony_ci return -EIO; 4118c2ecf20Sopenharmony_ci } 4128c2ecf20Sopenharmony_ci 4138c2ecf20Sopenharmony_ci /* The bitmap write may have failed, causing a state change. */ 4148c2ecf20Sopenharmony_ci if (device->state.disk < D_INCONSISTENT) { 4158c2ecf20Sopenharmony_ci drbd_err(device, 4168c2ecf20Sopenharmony_ci "disk is %s, cannot write al transaction\n", 4178c2ecf20Sopenharmony_ci drbd_disk_str(device->state.disk)); 4188c2ecf20Sopenharmony_ci put_ldev(device); 4198c2ecf20Sopenharmony_ci return -EIO; 4208c2ecf20Sopenharmony_ci } 4218c2ecf20Sopenharmony_ci 4228c2ecf20Sopenharmony_ci /* protects md_io_buffer, al_tr_cycle, ... */ 4238c2ecf20Sopenharmony_ci buffer = drbd_md_get_buffer(device, __func__); 4248c2ecf20Sopenharmony_ci if (!buffer) { 4258c2ecf20Sopenharmony_ci drbd_err(device, "disk failed while waiting for md_io buffer\n"); 4268c2ecf20Sopenharmony_ci put_ldev(device); 4278c2ecf20Sopenharmony_ci return -ENODEV; 4288c2ecf20Sopenharmony_ci } 4298c2ecf20Sopenharmony_ci 4308c2ecf20Sopenharmony_ci err = __al_write_transaction(device, buffer); 4318c2ecf20Sopenharmony_ci 4328c2ecf20Sopenharmony_ci drbd_md_put_buffer(device); 4338c2ecf20Sopenharmony_ci put_ldev(device); 4348c2ecf20Sopenharmony_ci 4358c2ecf20Sopenharmony_ci return err; 4368c2ecf20Sopenharmony_ci} 4378c2ecf20Sopenharmony_ci 4388c2ecf20Sopenharmony_ci 4398c2ecf20Sopenharmony_civoid drbd_al_begin_io_commit(struct drbd_device *device) 4408c2ecf20Sopenharmony_ci{ 4418c2ecf20Sopenharmony_ci bool locked = false; 4428c2ecf20Sopenharmony_ci 4438c2ecf20Sopenharmony_ci /* Serialize multiple transactions. 4448c2ecf20Sopenharmony_ci * This uses test_and_set_bit, memory barrier is implicit. 4458c2ecf20Sopenharmony_ci */ 4468c2ecf20Sopenharmony_ci wait_event(device->al_wait, 4478c2ecf20Sopenharmony_ci device->act_log->pending_changes == 0 || 4488c2ecf20Sopenharmony_ci (locked = lc_try_lock_for_transaction(device->act_log))); 4498c2ecf20Sopenharmony_ci 4508c2ecf20Sopenharmony_ci if (locked) { 4518c2ecf20Sopenharmony_ci /* Double check: it may have been committed by someone else, 4528c2ecf20Sopenharmony_ci * while we have been waiting for the lock. */ 4538c2ecf20Sopenharmony_ci if (device->act_log->pending_changes) { 4548c2ecf20Sopenharmony_ci bool write_al_updates; 4558c2ecf20Sopenharmony_ci 4568c2ecf20Sopenharmony_ci rcu_read_lock(); 4578c2ecf20Sopenharmony_ci write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates; 4588c2ecf20Sopenharmony_ci rcu_read_unlock(); 4598c2ecf20Sopenharmony_ci 4608c2ecf20Sopenharmony_ci if (write_al_updates) 4618c2ecf20Sopenharmony_ci al_write_transaction(device); 4628c2ecf20Sopenharmony_ci spin_lock_irq(&device->al_lock); 4638c2ecf20Sopenharmony_ci /* FIXME 4648c2ecf20Sopenharmony_ci if (err) 4658c2ecf20Sopenharmony_ci we need an "lc_cancel" here; 4668c2ecf20Sopenharmony_ci */ 4678c2ecf20Sopenharmony_ci lc_committed(device->act_log); 4688c2ecf20Sopenharmony_ci spin_unlock_irq(&device->al_lock); 4698c2ecf20Sopenharmony_ci } 4708c2ecf20Sopenharmony_ci lc_unlock(device->act_log); 4718c2ecf20Sopenharmony_ci wake_up(&device->al_wait); 4728c2ecf20Sopenharmony_ci } 4738c2ecf20Sopenharmony_ci} 4748c2ecf20Sopenharmony_ci 4758c2ecf20Sopenharmony_ci/* 4768c2ecf20Sopenharmony_ci * @delegate: delegate activity log I/O to the worker thread 4778c2ecf20Sopenharmony_ci */ 4788c2ecf20Sopenharmony_civoid drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i) 4798c2ecf20Sopenharmony_ci{ 4808c2ecf20Sopenharmony_ci if (drbd_al_begin_io_prepare(device, i)) 4818c2ecf20Sopenharmony_ci drbd_al_begin_io_commit(device); 4828c2ecf20Sopenharmony_ci} 4838c2ecf20Sopenharmony_ci 4848c2ecf20Sopenharmony_ciint drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i) 4858c2ecf20Sopenharmony_ci{ 4868c2ecf20Sopenharmony_ci struct lru_cache *al = device->act_log; 4878c2ecf20Sopenharmony_ci /* for bios crossing activity log extent boundaries, 4888c2ecf20Sopenharmony_ci * we may need to activate two extents in one go */ 4898c2ecf20Sopenharmony_ci unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); 4908c2ecf20Sopenharmony_ci unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); 4918c2ecf20Sopenharmony_ci unsigned nr_al_extents; 4928c2ecf20Sopenharmony_ci unsigned available_update_slots; 4938c2ecf20Sopenharmony_ci unsigned enr; 4948c2ecf20Sopenharmony_ci 4958c2ecf20Sopenharmony_ci D_ASSERT(device, first <= last); 4968c2ecf20Sopenharmony_ci 4978c2ecf20Sopenharmony_ci nr_al_extents = 1 + last - first; /* worst case: all touched extends are cold. */ 4988c2ecf20Sopenharmony_ci available_update_slots = min(al->nr_elements - al->used, 4998c2ecf20Sopenharmony_ci al->max_pending_changes - al->pending_changes); 5008c2ecf20Sopenharmony_ci 5018c2ecf20Sopenharmony_ci /* We want all necessary updates for a given request within the same transaction 5028c2ecf20Sopenharmony_ci * We could first check how many updates are *actually* needed, 5038c2ecf20Sopenharmony_ci * and use that instead of the worst-case nr_al_extents */ 5048c2ecf20Sopenharmony_ci if (available_update_slots < nr_al_extents) { 5058c2ecf20Sopenharmony_ci /* Too many activity log extents are currently "hot". 5068c2ecf20Sopenharmony_ci * 5078c2ecf20Sopenharmony_ci * If we have accumulated pending changes already, 5088c2ecf20Sopenharmony_ci * we made progress. 5098c2ecf20Sopenharmony_ci * 5108c2ecf20Sopenharmony_ci * If we cannot get even a single pending change through, 5118c2ecf20Sopenharmony_ci * stop the fast path until we made some progress, 5128c2ecf20Sopenharmony_ci * or requests to "cold" extents could be starved. */ 5138c2ecf20Sopenharmony_ci if (!al->pending_changes) 5148c2ecf20Sopenharmony_ci __set_bit(__LC_STARVING, &device->act_log->flags); 5158c2ecf20Sopenharmony_ci return -ENOBUFS; 5168c2ecf20Sopenharmony_ci } 5178c2ecf20Sopenharmony_ci 5188c2ecf20Sopenharmony_ci /* Is resync active in this area? */ 5198c2ecf20Sopenharmony_ci for (enr = first; enr <= last; enr++) { 5208c2ecf20Sopenharmony_ci struct lc_element *tmp; 5218c2ecf20Sopenharmony_ci tmp = lc_find(device->resync, enr/AL_EXT_PER_BM_SECT); 5228c2ecf20Sopenharmony_ci if (unlikely(tmp != NULL)) { 5238c2ecf20Sopenharmony_ci struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); 5248c2ecf20Sopenharmony_ci if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { 5258c2ecf20Sopenharmony_ci if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags)) 5268c2ecf20Sopenharmony_ci return -EBUSY; 5278c2ecf20Sopenharmony_ci return -EWOULDBLOCK; 5288c2ecf20Sopenharmony_ci } 5298c2ecf20Sopenharmony_ci } 5308c2ecf20Sopenharmony_ci } 5318c2ecf20Sopenharmony_ci 5328c2ecf20Sopenharmony_ci /* Checkout the refcounts. 5338c2ecf20Sopenharmony_ci * Given that we checked for available elements and update slots above, 5348c2ecf20Sopenharmony_ci * this has to be successful. */ 5358c2ecf20Sopenharmony_ci for (enr = first; enr <= last; enr++) { 5368c2ecf20Sopenharmony_ci struct lc_element *al_ext; 5378c2ecf20Sopenharmony_ci al_ext = lc_get_cumulative(device->act_log, enr); 5388c2ecf20Sopenharmony_ci if (!al_ext) 5398c2ecf20Sopenharmony_ci drbd_info(device, "LOGIC BUG for enr=%u\n", enr); 5408c2ecf20Sopenharmony_ci } 5418c2ecf20Sopenharmony_ci return 0; 5428c2ecf20Sopenharmony_ci} 5438c2ecf20Sopenharmony_ci 5448c2ecf20Sopenharmony_civoid drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i) 5458c2ecf20Sopenharmony_ci{ 5468c2ecf20Sopenharmony_ci /* for bios crossing activity log extent boundaries, 5478c2ecf20Sopenharmony_ci * we may need to activate two extents in one go */ 5488c2ecf20Sopenharmony_ci unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); 5498c2ecf20Sopenharmony_ci unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); 5508c2ecf20Sopenharmony_ci unsigned enr; 5518c2ecf20Sopenharmony_ci struct lc_element *extent; 5528c2ecf20Sopenharmony_ci unsigned long flags; 5538c2ecf20Sopenharmony_ci 5548c2ecf20Sopenharmony_ci D_ASSERT(device, first <= last); 5558c2ecf20Sopenharmony_ci spin_lock_irqsave(&device->al_lock, flags); 5568c2ecf20Sopenharmony_ci 5578c2ecf20Sopenharmony_ci for (enr = first; enr <= last; enr++) { 5588c2ecf20Sopenharmony_ci extent = lc_find(device->act_log, enr); 5598c2ecf20Sopenharmony_ci if (!extent) { 5608c2ecf20Sopenharmony_ci drbd_err(device, "al_complete_io() called on inactive extent %u\n", enr); 5618c2ecf20Sopenharmony_ci continue; 5628c2ecf20Sopenharmony_ci } 5638c2ecf20Sopenharmony_ci lc_put(device->act_log, extent); 5648c2ecf20Sopenharmony_ci } 5658c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&device->al_lock, flags); 5668c2ecf20Sopenharmony_ci wake_up(&device->al_wait); 5678c2ecf20Sopenharmony_ci} 5688c2ecf20Sopenharmony_ci 5698c2ecf20Sopenharmony_cistatic int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext) 5708c2ecf20Sopenharmony_ci{ 5718c2ecf20Sopenharmony_ci int rv; 5728c2ecf20Sopenharmony_ci 5738c2ecf20Sopenharmony_ci spin_lock_irq(&device->al_lock); 5748c2ecf20Sopenharmony_ci rv = (al_ext->refcnt == 0); 5758c2ecf20Sopenharmony_ci if (likely(rv)) 5768c2ecf20Sopenharmony_ci lc_del(device->act_log, al_ext); 5778c2ecf20Sopenharmony_ci spin_unlock_irq(&device->al_lock); 5788c2ecf20Sopenharmony_ci 5798c2ecf20Sopenharmony_ci return rv; 5808c2ecf20Sopenharmony_ci} 5818c2ecf20Sopenharmony_ci 5828c2ecf20Sopenharmony_ci/** 5838c2ecf20Sopenharmony_ci * drbd_al_shrink() - Removes all active extents form the activity log 5848c2ecf20Sopenharmony_ci * @device: DRBD device. 5858c2ecf20Sopenharmony_ci * 5868c2ecf20Sopenharmony_ci * Removes all active extents form the activity log, waiting until 5878c2ecf20Sopenharmony_ci * the reference count of each entry dropped to 0 first, of course. 5888c2ecf20Sopenharmony_ci * 5898c2ecf20Sopenharmony_ci * You need to lock device->act_log with lc_try_lock() / lc_unlock() 5908c2ecf20Sopenharmony_ci */ 5918c2ecf20Sopenharmony_civoid drbd_al_shrink(struct drbd_device *device) 5928c2ecf20Sopenharmony_ci{ 5938c2ecf20Sopenharmony_ci struct lc_element *al_ext; 5948c2ecf20Sopenharmony_ci int i; 5958c2ecf20Sopenharmony_ci 5968c2ecf20Sopenharmony_ci D_ASSERT(device, test_bit(__LC_LOCKED, &device->act_log->flags)); 5978c2ecf20Sopenharmony_ci 5988c2ecf20Sopenharmony_ci for (i = 0; i < device->act_log->nr_elements; i++) { 5998c2ecf20Sopenharmony_ci al_ext = lc_element_by_index(device->act_log, i); 6008c2ecf20Sopenharmony_ci if (al_ext->lc_number == LC_FREE) 6018c2ecf20Sopenharmony_ci continue; 6028c2ecf20Sopenharmony_ci wait_event(device->al_wait, _try_lc_del(device, al_ext)); 6038c2ecf20Sopenharmony_ci } 6048c2ecf20Sopenharmony_ci 6058c2ecf20Sopenharmony_ci wake_up(&device->al_wait); 6068c2ecf20Sopenharmony_ci} 6078c2ecf20Sopenharmony_ci 6088c2ecf20Sopenharmony_ciint drbd_al_initialize(struct drbd_device *device, void *buffer) 6098c2ecf20Sopenharmony_ci{ 6108c2ecf20Sopenharmony_ci struct al_transaction_on_disk *al = buffer; 6118c2ecf20Sopenharmony_ci struct drbd_md *md = &device->ldev->md; 6128c2ecf20Sopenharmony_ci int al_size_4k = md->al_stripes * md->al_stripe_size_4k; 6138c2ecf20Sopenharmony_ci int i; 6148c2ecf20Sopenharmony_ci 6158c2ecf20Sopenharmony_ci __al_write_transaction(device, al); 6168c2ecf20Sopenharmony_ci /* There may or may not have been a pending transaction. */ 6178c2ecf20Sopenharmony_ci spin_lock_irq(&device->al_lock); 6188c2ecf20Sopenharmony_ci lc_committed(device->act_log); 6198c2ecf20Sopenharmony_ci spin_unlock_irq(&device->al_lock); 6208c2ecf20Sopenharmony_ci 6218c2ecf20Sopenharmony_ci /* The rest of the transactions will have an empty "updates" list, and 6228c2ecf20Sopenharmony_ci * are written out only to provide the context, and to initialize the 6238c2ecf20Sopenharmony_ci * on-disk ring buffer. */ 6248c2ecf20Sopenharmony_ci for (i = 1; i < al_size_4k; i++) { 6258c2ecf20Sopenharmony_ci int err = __al_write_transaction(device, al); 6268c2ecf20Sopenharmony_ci if (err) 6278c2ecf20Sopenharmony_ci return err; 6288c2ecf20Sopenharmony_ci } 6298c2ecf20Sopenharmony_ci return 0; 6308c2ecf20Sopenharmony_ci} 6318c2ecf20Sopenharmony_ci 6328c2ecf20Sopenharmony_cistatic const char *drbd_change_sync_fname[] = { 6338c2ecf20Sopenharmony_ci [RECORD_RS_FAILED] = "drbd_rs_failed_io", 6348c2ecf20Sopenharmony_ci [SET_IN_SYNC] = "drbd_set_in_sync", 6358c2ecf20Sopenharmony_ci [SET_OUT_OF_SYNC] = "drbd_set_out_of_sync" 6368c2ecf20Sopenharmony_ci}; 6378c2ecf20Sopenharmony_ci 6388c2ecf20Sopenharmony_ci/* ATTENTION. The AL's extents are 4MB each, while the extents in the 6398c2ecf20Sopenharmony_ci * resync LRU-cache are 16MB each. 6408c2ecf20Sopenharmony_ci * The caller of this function has to hold an get_ldev() reference. 6418c2ecf20Sopenharmony_ci * 6428c2ecf20Sopenharmony_ci * Adjusts the caching members ->rs_left (success) or ->rs_failed (!success), 6438c2ecf20Sopenharmony_ci * potentially pulling in (and recounting the corresponding bits) 6448c2ecf20Sopenharmony_ci * this resync extent into the resync extent lru cache. 6458c2ecf20Sopenharmony_ci * 6468c2ecf20Sopenharmony_ci * Returns whether all bits have been cleared for this resync extent, 6478c2ecf20Sopenharmony_ci * precisely: (rs_left <= rs_failed) 6488c2ecf20Sopenharmony_ci * 6498c2ecf20Sopenharmony_ci * TODO will be obsoleted once we have a caching lru of the on disk bitmap 6508c2ecf20Sopenharmony_ci */ 6518c2ecf20Sopenharmony_cistatic bool update_rs_extent(struct drbd_device *device, 6528c2ecf20Sopenharmony_ci unsigned int enr, int count, 6538c2ecf20Sopenharmony_ci enum update_sync_bits_mode mode) 6548c2ecf20Sopenharmony_ci{ 6558c2ecf20Sopenharmony_ci struct lc_element *e; 6568c2ecf20Sopenharmony_ci 6578c2ecf20Sopenharmony_ci D_ASSERT(device, atomic_read(&device->local_cnt)); 6588c2ecf20Sopenharmony_ci 6598c2ecf20Sopenharmony_ci /* When setting out-of-sync bits, 6608c2ecf20Sopenharmony_ci * we don't need it cached (lc_find). 6618c2ecf20Sopenharmony_ci * But if it is present in the cache, 6628c2ecf20Sopenharmony_ci * we should update the cached bit count. 6638c2ecf20Sopenharmony_ci * Otherwise, that extent should be in the resync extent lru cache 6648c2ecf20Sopenharmony_ci * already -- or we want to pull it in if necessary -- (lc_get), 6658c2ecf20Sopenharmony_ci * then update and check rs_left and rs_failed. */ 6668c2ecf20Sopenharmony_ci if (mode == SET_OUT_OF_SYNC) 6678c2ecf20Sopenharmony_ci e = lc_find(device->resync, enr); 6688c2ecf20Sopenharmony_ci else 6698c2ecf20Sopenharmony_ci e = lc_get(device->resync, enr); 6708c2ecf20Sopenharmony_ci if (e) { 6718c2ecf20Sopenharmony_ci struct bm_extent *ext = lc_entry(e, struct bm_extent, lce); 6728c2ecf20Sopenharmony_ci if (ext->lce.lc_number == enr) { 6738c2ecf20Sopenharmony_ci if (mode == SET_IN_SYNC) 6748c2ecf20Sopenharmony_ci ext->rs_left -= count; 6758c2ecf20Sopenharmony_ci else if (mode == SET_OUT_OF_SYNC) 6768c2ecf20Sopenharmony_ci ext->rs_left += count; 6778c2ecf20Sopenharmony_ci else 6788c2ecf20Sopenharmony_ci ext->rs_failed += count; 6798c2ecf20Sopenharmony_ci if (ext->rs_left < ext->rs_failed) { 6808c2ecf20Sopenharmony_ci drbd_warn(device, "BAD! enr=%u rs_left=%d " 6818c2ecf20Sopenharmony_ci "rs_failed=%d count=%d cstate=%s\n", 6828c2ecf20Sopenharmony_ci ext->lce.lc_number, ext->rs_left, 6838c2ecf20Sopenharmony_ci ext->rs_failed, count, 6848c2ecf20Sopenharmony_ci drbd_conn_str(device->state.conn)); 6858c2ecf20Sopenharmony_ci 6868c2ecf20Sopenharmony_ci /* We don't expect to be able to clear more bits 6878c2ecf20Sopenharmony_ci * than have been set when we originally counted 6888c2ecf20Sopenharmony_ci * the set bits to cache that value in ext->rs_left. 6898c2ecf20Sopenharmony_ci * Whatever the reason (disconnect during resync, 6908c2ecf20Sopenharmony_ci * delayed local completion of an application write), 6918c2ecf20Sopenharmony_ci * try to fix it up by recounting here. */ 6928c2ecf20Sopenharmony_ci ext->rs_left = drbd_bm_e_weight(device, enr); 6938c2ecf20Sopenharmony_ci } 6948c2ecf20Sopenharmony_ci } else { 6958c2ecf20Sopenharmony_ci /* Normally this element should be in the cache, 6968c2ecf20Sopenharmony_ci * since drbd_rs_begin_io() pulled it already in. 6978c2ecf20Sopenharmony_ci * 6988c2ecf20Sopenharmony_ci * But maybe an application write finished, and we set 6998c2ecf20Sopenharmony_ci * something outside the resync lru_cache in sync. 7008c2ecf20Sopenharmony_ci */ 7018c2ecf20Sopenharmony_ci int rs_left = drbd_bm_e_weight(device, enr); 7028c2ecf20Sopenharmony_ci if (ext->flags != 0) { 7038c2ecf20Sopenharmony_ci drbd_warn(device, "changing resync lce: %d[%u;%02lx]" 7048c2ecf20Sopenharmony_ci " -> %d[%u;00]\n", 7058c2ecf20Sopenharmony_ci ext->lce.lc_number, ext->rs_left, 7068c2ecf20Sopenharmony_ci ext->flags, enr, rs_left); 7078c2ecf20Sopenharmony_ci ext->flags = 0; 7088c2ecf20Sopenharmony_ci } 7098c2ecf20Sopenharmony_ci if (ext->rs_failed) { 7108c2ecf20Sopenharmony_ci drbd_warn(device, "Kicking resync_lru element enr=%u " 7118c2ecf20Sopenharmony_ci "out with rs_failed=%d\n", 7128c2ecf20Sopenharmony_ci ext->lce.lc_number, ext->rs_failed); 7138c2ecf20Sopenharmony_ci } 7148c2ecf20Sopenharmony_ci ext->rs_left = rs_left; 7158c2ecf20Sopenharmony_ci ext->rs_failed = (mode == RECORD_RS_FAILED) ? count : 0; 7168c2ecf20Sopenharmony_ci /* we don't keep a persistent log of the resync lru, 7178c2ecf20Sopenharmony_ci * we can commit any change right away. */ 7188c2ecf20Sopenharmony_ci lc_committed(device->resync); 7198c2ecf20Sopenharmony_ci } 7208c2ecf20Sopenharmony_ci if (mode != SET_OUT_OF_SYNC) 7218c2ecf20Sopenharmony_ci lc_put(device->resync, &ext->lce); 7228c2ecf20Sopenharmony_ci /* no race, we are within the al_lock! */ 7238c2ecf20Sopenharmony_ci 7248c2ecf20Sopenharmony_ci if (ext->rs_left <= ext->rs_failed) { 7258c2ecf20Sopenharmony_ci ext->rs_failed = 0; 7268c2ecf20Sopenharmony_ci return true; 7278c2ecf20Sopenharmony_ci } 7288c2ecf20Sopenharmony_ci } else if (mode != SET_OUT_OF_SYNC) { 7298c2ecf20Sopenharmony_ci /* be quiet if lc_find() did not find it. */ 7308c2ecf20Sopenharmony_ci drbd_err(device, "lc_get() failed! locked=%d/%d flags=%lu\n", 7318c2ecf20Sopenharmony_ci device->resync_locked, 7328c2ecf20Sopenharmony_ci device->resync->nr_elements, 7338c2ecf20Sopenharmony_ci device->resync->flags); 7348c2ecf20Sopenharmony_ci } 7358c2ecf20Sopenharmony_ci return false; 7368c2ecf20Sopenharmony_ci} 7378c2ecf20Sopenharmony_ci 7388c2ecf20Sopenharmony_civoid drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go) 7398c2ecf20Sopenharmony_ci{ 7408c2ecf20Sopenharmony_ci unsigned long now = jiffies; 7418c2ecf20Sopenharmony_ci unsigned long last = device->rs_mark_time[device->rs_last_mark]; 7428c2ecf20Sopenharmony_ci int next = (device->rs_last_mark + 1) % DRBD_SYNC_MARKS; 7438c2ecf20Sopenharmony_ci if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) { 7448c2ecf20Sopenharmony_ci if (device->rs_mark_left[device->rs_last_mark] != still_to_go && 7458c2ecf20Sopenharmony_ci device->state.conn != C_PAUSED_SYNC_T && 7468c2ecf20Sopenharmony_ci device->state.conn != C_PAUSED_SYNC_S) { 7478c2ecf20Sopenharmony_ci device->rs_mark_time[next] = now; 7488c2ecf20Sopenharmony_ci device->rs_mark_left[next] = still_to_go; 7498c2ecf20Sopenharmony_ci device->rs_last_mark = next; 7508c2ecf20Sopenharmony_ci } 7518c2ecf20Sopenharmony_ci } 7528c2ecf20Sopenharmony_ci} 7538c2ecf20Sopenharmony_ci 7548c2ecf20Sopenharmony_ci/* It is called lazy update, so don't do write-out too often. */ 7558c2ecf20Sopenharmony_cistatic bool lazy_bitmap_update_due(struct drbd_device *device) 7568c2ecf20Sopenharmony_ci{ 7578c2ecf20Sopenharmony_ci return time_after(jiffies, device->rs_last_bcast + 2*HZ); 7588c2ecf20Sopenharmony_ci} 7598c2ecf20Sopenharmony_ci 7608c2ecf20Sopenharmony_cistatic void maybe_schedule_on_disk_bitmap_update(struct drbd_device *device, bool rs_done) 7618c2ecf20Sopenharmony_ci{ 7628c2ecf20Sopenharmony_ci if (rs_done) { 7638c2ecf20Sopenharmony_ci struct drbd_connection *connection = first_peer_device(device)->connection; 7648c2ecf20Sopenharmony_ci if (connection->agreed_pro_version <= 95 || 7658c2ecf20Sopenharmony_ci is_sync_target_state(device->state.conn)) 7668c2ecf20Sopenharmony_ci set_bit(RS_DONE, &device->flags); 7678c2ecf20Sopenharmony_ci /* and also set RS_PROGRESS below */ 7688c2ecf20Sopenharmony_ci 7698c2ecf20Sopenharmony_ci /* Else: rather wait for explicit notification via receive_state, 7708c2ecf20Sopenharmony_ci * to avoid uuids-rotated-too-fast causing full resync 7718c2ecf20Sopenharmony_ci * in next handshake, in case the replication link breaks 7728c2ecf20Sopenharmony_ci * at the most unfortunate time... */ 7738c2ecf20Sopenharmony_ci } else if (!lazy_bitmap_update_due(device)) 7748c2ecf20Sopenharmony_ci return; 7758c2ecf20Sopenharmony_ci 7768c2ecf20Sopenharmony_ci drbd_device_post_work(device, RS_PROGRESS); 7778c2ecf20Sopenharmony_ci} 7788c2ecf20Sopenharmony_ci 7798c2ecf20Sopenharmony_cistatic int update_sync_bits(struct drbd_device *device, 7808c2ecf20Sopenharmony_ci unsigned long sbnr, unsigned long ebnr, 7818c2ecf20Sopenharmony_ci enum update_sync_bits_mode mode) 7828c2ecf20Sopenharmony_ci{ 7838c2ecf20Sopenharmony_ci /* 7848c2ecf20Sopenharmony_ci * We keep a count of set bits per resync-extent in the ->rs_left 7858c2ecf20Sopenharmony_ci * caching member, so we need to loop and work within the resync extent 7868c2ecf20Sopenharmony_ci * alignment. Typically this loop will execute exactly once. 7878c2ecf20Sopenharmony_ci */ 7888c2ecf20Sopenharmony_ci unsigned long flags; 7898c2ecf20Sopenharmony_ci unsigned long count = 0; 7908c2ecf20Sopenharmony_ci unsigned int cleared = 0; 7918c2ecf20Sopenharmony_ci while (sbnr <= ebnr) { 7928c2ecf20Sopenharmony_ci /* set temporary boundary bit number to last bit number within 7938c2ecf20Sopenharmony_ci * the resync extent of the current start bit number, 7948c2ecf20Sopenharmony_ci * but cap at provided end bit number */ 7958c2ecf20Sopenharmony_ci unsigned long tbnr = min(ebnr, sbnr | BM_BLOCKS_PER_BM_EXT_MASK); 7968c2ecf20Sopenharmony_ci unsigned long c; 7978c2ecf20Sopenharmony_ci 7988c2ecf20Sopenharmony_ci if (mode == RECORD_RS_FAILED) 7998c2ecf20Sopenharmony_ci /* Only called from drbd_rs_failed_io(), bits 8008c2ecf20Sopenharmony_ci * supposedly still set. Recount, maybe some 8018c2ecf20Sopenharmony_ci * of the bits have been successfully cleared 8028c2ecf20Sopenharmony_ci * by application IO meanwhile. 8038c2ecf20Sopenharmony_ci */ 8048c2ecf20Sopenharmony_ci c = drbd_bm_count_bits(device, sbnr, tbnr); 8058c2ecf20Sopenharmony_ci else if (mode == SET_IN_SYNC) 8068c2ecf20Sopenharmony_ci c = drbd_bm_clear_bits(device, sbnr, tbnr); 8078c2ecf20Sopenharmony_ci else /* if (mode == SET_OUT_OF_SYNC) */ 8088c2ecf20Sopenharmony_ci c = drbd_bm_set_bits(device, sbnr, tbnr); 8098c2ecf20Sopenharmony_ci 8108c2ecf20Sopenharmony_ci if (c) { 8118c2ecf20Sopenharmony_ci spin_lock_irqsave(&device->al_lock, flags); 8128c2ecf20Sopenharmony_ci cleared += update_rs_extent(device, BM_BIT_TO_EXT(sbnr), c, mode); 8138c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&device->al_lock, flags); 8148c2ecf20Sopenharmony_ci count += c; 8158c2ecf20Sopenharmony_ci } 8168c2ecf20Sopenharmony_ci sbnr = tbnr + 1; 8178c2ecf20Sopenharmony_ci } 8188c2ecf20Sopenharmony_ci if (count) { 8198c2ecf20Sopenharmony_ci if (mode == SET_IN_SYNC) { 8208c2ecf20Sopenharmony_ci unsigned long still_to_go = drbd_bm_total_weight(device); 8218c2ecf20Sopenharmony_ci bool rs_is_done = (still_to_go <= device->rs_failed); 8228c2ecf20Sopenharmony_ci drbd_advance_rs_marks(device, still_to_go); 8238c2ecf20Sopenharmony_ci if (cleared || rs_is_done) 8248c2ecf20Sopenharmony_ci maybe_schedule_on_disk_bitmap_update(device, rs_is_done); 8258c2ecf20Sopenharmony_ci } else if (mode == RECORD_RS_FAILED) 8268c2ecf20Sopenharmony_ci device->rs_failed += count; 8278c2ecf20Sopenharmony_ci wake_up(&device->al_wait); 8288c2ecf20Sopenharmony_ci } 8298c2ecf20Sopenharmony_ci return count; 8308c2ecf20Sopenharmony_ci} 8318c2ecf20Sopenharmony_ci 8328c2ecf20Sopenharmony_cistatic bool plausible_request_size(int size) 8338c2ecf20Sopenharmony_ci{ 8348c2ecf20Sopenharmony_ci return size > 0 8358c2ecf20Sopenharmony_ci && size <= DRBD_MAX_BATCH_BIO_SIZE 8368c2ecf20Sopenharmony_ci && IS_ALIGNED(size, 512); 8378c2ecf20Sopenharmony_ci} 8388c2ecf20Sopenharmony_ci 8398c2ecf20Sopenharmony_ci/* clear the bit corresponding to the piece of storage in question: 8408c2ecf20Sopenharmony_ci * size byte of data starting from sector. Only clear a bits of the affected 8418c2ecf20Sopenharmony_ci * one ore more _aligned_ BM_BLOCK_SIZE blocks. 8428c2ecf20Sopenharmony_ci * 8438c2ecf20Sopenharmony_ci * called by worker on C_SYNC_TARGET and receiver on SyncSource. 8448c2ecf20Sopenharmony_ci * 8458c2ecf20Sopenharmony_ci */ 8468c2ecf20Sopenharmony_ciint __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, 8478c2ecf20Sopenharmony_ci enum update_sync_bits_mode mode) 8488c2ecf20Sopenharmony_ci{ 8498c2ecf20Sopenharmony_ci /* Is called from worker and receiver context _only_ */ 8508c2ecf20Sopenharmony_ci unsigned long sbnr, ebnr, lbnr; 8518c2ecf20Sopenharmony_ci unsigned long count = 0; 8528c2ecf20Sopenharmony_ci sector_t esector, nr_sectors; 8538c2ecf20Sopenharmony_ci 8548c2ecf20Sopenharmony_ci /* This would be an empty REQ_PREFLUSH, be silent. */ 8558c2ecf20Sopenharmony_ci if ((mode == SET_OUT_OF_SYNC) && size == 0) 8568c2ecf20Sopenharmony_ci return 0; 8578c2ecf20Sopenharmony_ci 8588c2ecf20Sopenharmony_ci if (!plausible_request_size(size)) { 8598c2ecf20Sopenharmony_ci drbd_err(device, "%s: sector=%llus size=%d nonsense!\n", 8608c2ecf20Sopenharmony_ci drbd_change_sync_fname[mode], 8618c2ecf20Sopenharmony_ci (unsigned long long)sector, size); 8628c2ecf20Sopenharmony_ci return 0; 8638c2ecf20Sopenharmony_ci } 8648c2ecf20Sopenharmony_ci 8658c2ecf20Sopenharmony_ci if (!get_ldev(device)) 8668c2ecf20Sopenharmony_ci return 0; /* no disk, no metadata, no bitmap to manipulate bits in */ 8678c2ecf20Sopenharmony_ci 8688c2ecf20Sopenharmony_ci nr_sectors = get_capacity(device->vdisk); 8698c2ecf20Sopenharmony_ci esector = sector + (size >> 9) - 1; 8708c2ecf20Sopenharmony_ci 8718c2ecf20Sopenharmony_ci if (!expect(sector < nr_sectors)) 8728c2ecf20Sopenharmony_ci goto out; 8738c2ecf20Sopenharmony_ci if (!expect(esector < nr_sectors)) 8748c2ecf20Sopenharmony_ci esector = nr_sectors - 1; 8758c2ecf20Sopenharmony_ci 8768c2ecf20Sopenharmony_ci lbnr = BM_SECT_TO_BIT(nr_sectors-1); 8778c2ecf20Sopenharmony_ci 8788c2ecf20Sopenharmony_ci if (mode == SET_IN_SYNC) { 8798c2ecf20Sopenharmony_ci /* Round up start sector, round down end sector. We make sure 8808c2ecf20Sopenharmony_ci * we only clear full, aligned, BM_BLOCK_SIZE blocks. */ 8818c2ecf20Sopenharmony_ci if (unlikely(esector < BM_SECT_PER_BIT-1)) 8828c2ecf20Sopenharmony_ci goto out; 8838c2ecf20Sopenharmony_ci if (unlikely(esector == (nr_sectors-1))) 8848c2ecf20Sopenharmony_ci ebnr = lbnr; 8858c2ecf20Sopenharmony_ci else 8868c2ecf20Sopenharmony_ci ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); 8878c2ecf20Sopenharmony_ci sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); 8888c2ecf20Sopenharmony_ci } else { 8898c2ecf20Sopenharmony_ci /* We set it out of sync, or record resync failure. 8908c2ecf20Sopenharmony_ci * Should not round anything here. */ 8918c2ecf20Sopenharmony_ci sbnr = BM_SECT_TO_BIT(sector); 8928c2ecf20Sopenharmony_ci ebnr = BM_SECT_TO_BIT(esector); 8938c2ecf20Sopenharmony_ci } 8948c2ecf20Sopenharmony_ci 8958c2ecf20Sopenharmony_ci count = update_sync_bits(device, sbnr, ebnr, mode); 8968c2ecf20Sopenharmony_ciout: 8978c2ecf20Sopenharmony_ci put_ldev(device); 8988c2ecf20Sopenharmony_ci return count; 8998c2ecf20Sopenharmony_ci} 9008c2ecf20Sopenharmony_ci 9018c2ecf20Sopenharmony_cistatic 9028c2ecf20Sopenharmony_cistruct bm_extent *_bme_get(struct drbd_device *device, unsigned int enr) 9038c2ecf20Sopenharmony_ci{ 9048c2ecf20Sopenharmony_ci struct lc_element *e; 9058c2ecf20Sopenharmony_ci struct bm_extent *bm_ext; 9068c2ecf20Sopenharmony_ci int wakeup = 0; 9078c2ecf20Sopenharmony_ci unsigned long rs_flags; 9088c2ecf20Sopenharmony_ci 9098c2ecf20Sopenharmony_ci spin_lock_irq(&device->al_lock); 9108c2ecf20Sopenharmony_ci if (device->resync_locked > device->resync->nr_elements/2) { 9118c2ecf20Sopenharmony_ci spin_unlock_irq(&device->al_lock); 9128c2ecf20Sopenharmony_ci return NULL; 9138c2ecf20Sopenharmony_ci } 9148c2ecf20Sopenharmony_ci e = lc_get(device->resync, enr); 9158c2ecf20Sopenharmony_ci bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; 9168c2ecf20Sopenharmony_ci if (bm_ext) { 9178c2ecf20Sopenharmony_ci if (bm_ext->lce.lc_number != enr) { 9188c2ecf20Sopenharmony_ci bm_ext->rs_left = drbd_bm_e_weight(device, enr); 9198c2ecf20Sopenharmony_ci bm_ext->rs_failed = 0; 9208c2ecf20Sopenharmony_ci lc_committed(device->resync); 9218c2ecf20Sopenharmony_ci wakeup = 1; 9228c2ecf20Sopenharmony_ci } 9238c2ecf20Sopenharmony_ci if (bm_ext->lce.refcnt == 1) 9248c2ecf20Sopenharmony_ci device->resync_locked++; 9258c2ecf20Sopenharmony_ci set_bit(BME_NO_WRITES, &bm_ext->flags); 9268c2ecf20Sopenharmony_ci } 9278c2ecf20Sopenharmony_ci rs_flags = device->resync->flags; 9288c2ecf20Sopenharmony_ci spin_unlock_irq(&device->al_lock); 9298c2ecf20Sopenharmony_ci if (wakeup) 9308c2ecf20Sopenharmony_ci wake_up(&device->al_wait); 9318c2ecf20Sopenharmony_ci 9328c2ecf20Sopenharmony_ci if (!bm_ext) { 9338c2ecf20Sopenharmony_ci if (rs_flags & LC_STARVING) 9348c2ecf20Sopenharmony_ci drbd_warn(device, "Have to wait for element" 9358c2ecf20Sopenharmony_ci " (resync LRU too small?)\n"); 9368c2ecf20Sopenharmony_ci BUG_ON(rs_flags & LC_LOCKED); 9378c2ecf20Sopenharmony_ci } 9388c2ecf20Sopenharmony_ci 9398c2ecf20Sopenharmony_ci return bm_ext; 9408c2ecf20Sopenharmony_ci} 9418c2ecf20Sopenharmony_ci 9428c2ecf20Sopenharmony_cistatic int _is_in_al(struct drbd_device *device, unsigned int enr) 9438c2ecf20Sopenharmony_ci{ 9448c2ecf20Sopenharmony_ci int rv; 9458c2ecf20Sopenharmony_ci 9468c2ecf20Sopenharmony_ci spin_lock_irq(&device->al_lock); 9478c2ecf20Sopenharmony_ci rv = lc_is_used(device->act_log, enr); 9488c2ecf20Sopenharmony_ci spin_unlock_irq(&device->al_lock); 9498c2ecf20Sopenharmony_ci 9508c2ecf20Sopenharmony_ci return rv; 9518c2ecf20Sopenharmony_ci} 9528c2ecf20Sopenharmony_ci 9538c2ecf20Sopenharmony_ci/** 9548c2ecf20Sopenharmony_ci * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED 9558c2ecf20Sopenharmony_ci * @device: DRBD device. 9568c2ecf20Sopenharmony_ci * @sector: The sector number. 9578c2ecf20Sopenharmony_ci * 9588c2ecf20Sopenharmony_ci * This functions sleeps on al_wait. Returns 0 on success, -EINTR if interrupted. 9598c2ecf20Sopenharmony_ci */ 9608c2ecf20Sopenharmony_ciint drbd_rs_begin_io(struct drbd_device *device, sector_t sector) 9618c2ecf20Sopenharmony_ci{ 9628c2ecf20Sopenharmony_ci unsigned int enr = BM_SECT_TO_EXT(sector); 9638c2ecf20Sopenharmony_ci struct bm_extent *bm_ext; 9648c2ecf20Sopenharmony_ci int i, sig; 9658c2ecf20Sopenharmony_ci bool sa; 9668c2ecf20Sopenharmony_ci 9678c2ecf20Sopenharmony_ciretry: 9688c2ecf20Sopenharmony_ci sig = wait_event_interruptible(device->al_wait, 9698c2ecf20Sopenharmony_ci (bm_ext = _bme_get(device, enr))); 9708c2ecf20Sopenharmony_ci if (sig) 9718c2ecf20Sopenharmony_ci return -EINTR; 9728c2ecf20Sopenharmony_ci 9738c2ecf20Sopenharmony_ci if (test_bit(BME_LOCKED, &bm_ext->flags)) 9748c2ecf20Sopenharmony_ci return 0; 9758c2ecf20Sopenharmony_ci 9768c2ecf20Sopenharmony_ci /* step aside only while we are above c-min-rate; unless disabled. */ 9778c2ecf20Sopenharmony_ci sa = drbd_rs_c_min_rate_throttle(device); 9788c2ecf20Sopenharmony_ci 9798c2ecf20Sopenharmony_ci for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { 9808c2ecf20Sopenharmony_ci sig = wait_event_interruptible(device->al_wait, 9818c2ecf20Sopenharmony_ci !_is_in_al(device, enr * AL_EXT_PER_BM_SECT + i) || 9828c2ecf20Sopenharmony_ci (sa && test_bit(BME_PRIORITY, &bm_ext->flags))); 9838c2ecf20Sopenharmony_ci 9848c2ecf20Sopenharmony_ci if (sig || (sa && test_bit(BME_PRIORITY, &bm_ext->flags))) { 9858c2ecf20Sopenharmony_ci spin_lock_irq(&device->al_lock); 9868c2ecf20Sopenharmony_ci if (lc_put(device->resync, &bm_ext->lce) == 0) { 9878c2ecf20Sopenharmony_ci bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */ 9888c2ecf20Sopenharmony_ci device->resync_locked--; 9898c2ecf20Sopenharmony_ci wake_up(&device->al_wait); 9908c2ecf20Sopenharmony_ci } 9918c2ecf20Sopenharmony_ci spin_unlock_irq(&device->al_lock); 9928c2ecf20Sopenharmony_ci if (sig) 9938c2ecf20Sopenharmony_ci return -EINTR; 9948c2ecf20Sopenharmony_ci if (schedule_timeout_interruptible(HZ/10)) 9958c2ecf20Sopenharmony_ci return -EINTR; 9968c2ecf20Sopenharmony_ci goto retry; 9978c2ecf20Sopenharmony_ci } 9988c2ecf20Sopenharmony_ci } 9998c2ecf20Sopenharmony_ci set_bit(BME_LOCKED, &bm_ext->flags); 10008c2ecf20Sopenharmony_ci return 0; 10018c2ecf20Sopenharmony_ci} 10028c2ecf20Sopenharmony_ci 10038c2ecf20Sopenharmony_ci/** 10048c2ecf20Sopenharmony_ci * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep 10058c2ecf20Sopenharmony_ci * @device: DRBD device. 10068c2ecf20Sopenharmony_ci * @sector: The sector number. 10078c2ecf20Sopenharmony_ci * 10088c2ecf20Sopenharmony_ci * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then 10098c2ecf20Sopenharmony_ci * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN 10108c2ecf20Sopenharmony_ci * if there is still application IO going on in this area. 10118c2ecf20Sopenharmony_ci */ 10128c2ecf20Sopenharmony_ciint drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector) 10138c2ecf20Sopenharmony_ci{ 10148c2ecf20Sopenharmony_ci unsigned int enr = BM_SECT_TO_EXT(sector); 10158c2ecf20Sopenharmony_ci const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT; 10168c2ecf20Sopenharmony_ci struct lc_element *e; 10178c2ecf20Sopenharmony_ci struct bm_extent *bm_ext; 10188c2ecf20Sopenharmony_ci int i; 10198c2ecf20Sopenharmony_ci bool throttle = drbd_rs_should_slow_down(device, sector, true); 10208c2ecf20Sopenharmony_ci 10218c2ecf20Sopenharmony_ci /* If we need to throttle, a half-locked (only marked BME_NO_WRITES, 10228c2ecf20Sopenharmony_ci * not yet BME_LOCKED) extent needs to be kicked out explicitly if we 10238c2ecf20Sopenharmony_ci * need to throttle. There is at most one such half-locked extent, 10248c2ecf20Sopenharmony_ci * which is remembered in resync_wenr. */ 10258c2ecf20Sopenharmony_ci 10268c2ecf20Sopenharmony_ci if (throttle && device->resync_wenr != enr) 10278c2ecf20Sopenharmony_ci return -EAGAIN; 10288c2ecf20Sopenharmony_ci 10298c2ecf20Sopenharmony_ci spin_lock_irq(&device->al_lock); 10308c2ecf20Sopenharmony_ci if (device->resync_wenr != LC_FREE && device->resync_wenr != enr) { 10318c2ecf20Sopenharmony_ci /* in case you have very heavy scattered io, it may 10328c2ecf20Sopenharmony_ci * stall the syncer undefined if we give up the ref count 10338c2ecf20Sopenharmony_ci * when we try again and requeue. 10348c2ecf20Sopenharmony_ci * 10358c2ecf20Sopenharmony_ci * if we don't give up the refcount, but the next time 10368c2ecf20Sopenharmony_ci * we are scheduled this extent has been "synced" by new 10378c2ecf20Sopenharmony_ci * application writes, we'd miss the lc_put on the 10388c2ecf20Sopenharmony_ci * extent we keep the refcount on. 10398c2ecf20Sopenharmony_ci * so we remembered which extent we had to try again, and 10408c2ecf20Sopenharmony_ci * if the next requested one is something else, we do 10418c2ecf20Sopenharmony_ci * the lc_put here... 10428c2ecf20Sopenharmony_ci * we also have to wake_up 10438c2ecf20Sopenharmony_ci */ 10448c2ecf20Sopenharmony_ci e = lc_find(device->resync, device->resync_wenr); 10458c2ecf20Sopenharmony_ci bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; 10468c2ecf20Sopenharmony_ci if (bm_ext) { 10478c2ecf20Sopenharmony_ci D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); 10488c2ecf20Sopenharmony_ci D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags)); 10498c2ecf20Sopenharmony_ci clear_bit(BME_NO_WRITES, &bm_ext->flags); 10508c2ecf20Sopenharmony_ci device->resync_wenr = LC_FREE; 10518c2ecf20Sopenharmony_ci if (lc_put(device->resync, &bm_ext->lce) == 0) { 10528c2ecf20Sopenharmony_ci bm_ext->flags = 0; 10538c2ecf20Sopenharmony_ci device->resync_locked--; 10548c2ecf20Sopenharmony_ci } 10558c2ecf20Sopenharmony_ci wake_up(&device->al_wait); 10568c2ecf20Sopenharmony_ci } else { 10578c2ecf20Sopenharmony_ci drbd_alert(device, "LOGIC BUG\n"); 10588c2ecf20Sopenharmony_ci } 10598c2ecf20Sopenharmony_ci } 10608c2ecf20Sopenharmony_ci /* TRY. */ 10618c2ecf20Sopenharmony_ci e = lc_try_get(device->resync, enr); 10628c2ecf20Sopenharmony_ci bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; 10638c2ecf20Sopenharmony_ci if (bm_ext) { 10648c2ecf20Sopenharmony_ci if (test_bit(BME_LOCKED, &bm_ext->flags)) 10658c2ecf20Sopenharmony_ci goto proceed; 10668c2ecf20Sopenharmony_ci if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) { 10678c2ecf20Sopenharmony_ci device->resync_locked++; 10688c2ecf20Sopenharmony_ci } else { 10698c2ecf20Sopenharmony_ci /* we did set the BME_NO_WRITES, 10708c2ecf20Sopenharmony_ci * but then could not set BME_LOCKED, 10718c2ecf20Sopenharmony_ci * so we tried again. 10728c2ecf20Sopenharmony_ci * drop the extra reference. */ 10738c2ecf20Sopenharmony_ci bm_ext->lce.refcnt--; 10748c2ecf20Sopenharmony_ci D_ASSERT(device, bm_ext->lce.refcnt > 0); 10758c2ecf20Sopenharmony_ci } 10768c2ecf20Sopenharmony_ci goto check_al; 10778c2ecf20Sopenharmony_ci } else { 10788c2ecf20Sopenharmony_ci /* do we rather want to try later? */ 10798c2ecf20Sopenharmony_ci if (device->resync_locked > device->resync->nr_elements-3) 10808c2ecf20Sopenharmony_ci goto try_again; 10818c2ecf20Sopenharmony_ci /* Do or do not. There is no try. -- Yoda */ 10828c2ecf20Sopenharmony_ci e = lc_get(device->resync, enr); 10838c2ecf20Sopenharmony_ci bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; 10848c2ecf20Sopenharmony_ci if (!bm_ext) { 10858c2ecf20Sopenharmony_ci const unsigned long rs_flags = device->resync->flags; 10868c2ecf20Sopenharmony_ci if (rs_flags & LC_STARVING) 10878c2ecf20Sopenharmony_ci drbd_warn(device, "Have to wait for element" 10888c2ecf20Sopenharmony_ci " (resync LRU too small?)\n"); 10898c2ecf20Sopenharmony_ci BUG_ON(rs_flags & LC_LOCKED); 10908c2ecf20Sopenharmony_ci goto try_again; 10918c2ecf20Sopenharmony_ci } 10928c2ecf20Sopenharmony_ci if (bm_ext->lce.lc_number != enr) { 10938c2ecf20Sopenharmony_ci bm_ext->rs_left = drbd_bm_e_weight(device, enr); 10948c2ecf20Sopenharmony_ci bm_ext->rs_failed = 0; 10958c2ecf20Sopenharmony_ci lc_committed(device->resync); 10968c2ecf20Sopenharmony_ci wake_up(&device->al_wait); 10978c2ecf20Sopenharmony_ci D_ASSERT(device, test_bit(BME_LOCKED, &bm_ext->flags) == 0); 10988c2ecf20Sopenharmony_ci } 10998c2ecf20Sopenharmony_ci set_bit(BME_NO_WRITES, &bm_ext->flags); 11008c2ecf20Sopenharmony_ci D_ASSERT(device, bm_ext->lce.refcnt == 1); 11018c2ecf20Sopenharmony_ci device->resync_locked++; 11028c2ecf20Sopenharmony_ci goto check_al; 11038c2ecf20Sopenharmony_ci } 11048c2ecf20Sopenharmony_cicheck_al: 11058c2ecf20Sopenharmony_ci for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { 11068c2ecf20Sopenharmony_ci if (lc_is_used(device->act_log, al_enr+i)) 11078c2ecf20Sopenharmony_ci goto try_again; 11088c2ecf20Sopenharmony_ci } 11098c2ecf20Sopenharmony_ci set_bit(BME_LOCKED, &bm_ext->flags); 11108c2ecf20Sopenharmony_ciproceed: 11118c2ecf20Sopenharmony_ci device->resync_wenr = LC_FREE; 11128c2ecf20Sopenharmony_ci spin_unlock_irq(&device->al_lock); 11138c2ecf20Sopenharmony_ci return 0; 11148c2ecf20Sopenharmony_ci 11158c2ecf20Sopenharmony_citry_again: 11168c2ecf20Sopenharmony_ci if (bm_ext) { 11178c2ecf20Sopenharmony_ci if (throttle) { 11188c2ecf20Sopenharmony_ci D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); 11198c2ecf20Sopenharmony_ci D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags)); 11208c2ecf20Sopenharmony_ci clear_bit(BME_NO_WRITES, &bm_ext->flags); 11218c2ecf20Sopenharmony_ci device->resync_wenr = LC_FREE; 11228c2ecf20Sopenharmony_ci if (lc_put(device->resync, &bm_ext->lce) == 0) { 11238c2ecf20Sopenharmony_ci bm_ext->flags = 0; 11248c2ecf20Sopenharmony_ci device->resync_locked--; 11258c2ecf20Sopenharmony_ci } 11268c2ecf20Sopenharmony_ci wake_up(&device->al_wait); 11278c2ecf20Sopenharmony_ci } else 11288c2ecf20Sopenharmony_ci device->resync_wenr = enr; 11298c2ecf20Sopenharmony_ci } 11308c2ecf20Sopenharmony_ci spin_unlock_irq(&device->al_lock); 11318c2ecf20Sopenharmony_ci return -EAGAIN; 11328c2ecf20Sopenharmony_ci} 11338c2ecf20Sopenharmony_ci 11348c2ecf20Sopenharmony_civoid drbd_rs_complete_io(struct drbd_device *device, sector_t sector) 11358c2ecf20Sopenharmony_ci{ 11368c2ecf20Sopenharmony_ci unsigned int enr = BM_SECT_TO_EXT(sector); 11378c2ecf20Sopenharmony_ci struct lc_element *e; 11388c2ecf20Sopenharmony_ci struct bm_extent *bm_ext; 11398c2ecf20Sopenharmony_ci unsigned long flags; 11408c2ecf20Sopenharmony_ci 11418c2ecf20Sopenharmony_ci spin_lock_irqsave(&device->al_lock, flags); 11428c2ecf20Sopenharmony_ci e = lc_find(device->resync, enr); 11438c2ecf20Sopenharmony_ci bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; 11448c2ecf20Sopenharmony_ci if (!bm_ext) { 11458c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&device->al_lock, flags); 11468c2ecf20Sopenharmony_ci if (__ratelimit(&drbd_ratelimit_state)) 11478c2ecf20Sopenharmony_ci drbd_err(device, "drbd_rs_complete_io() called, but extent not found\n"); 11488c2ecf20Sopenharmony_ci return; 11498c2ecf20Sopenharmony_ci } 11508c2ecf20Sopenharmony_ci 11518c2ecf20Sopenharmony_ci if (bm_ext->lce.refcnt == 0) { 11528c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&device->al_lock, flags); 11538c2ecf20Sopenharmony_ci drbd_err(device, "drbd_rs_complete_io(,%llu [=%u]) called, " 11548c2ecf20Sopenharmony_ci "but refcnt is 0!?\n", 11558c2ecf20Sopenharmony_ci (unsigned long long)sector, enr); 11568c2ecf20Sopenharmony_ci return; 11578c2ecf20Sopenharmony_ci } 11588c2ecf20Sopenharmony_ci 11598c2ecf20Sopenharmony_ci if (lc_put(device->resync, &bm_ext->lce) == 0) { 11608c2ecf20Sopenharmony_ci bm_ext->flags = 0; /* clear BME_LOCKED, BME_NO_WRITES and BME_PRIORITY */ 11618c2ecf20Sopenharmony_ci device->resync_locked--; 11628c2ecf20Sopenharmony_ci wake_up(&device->al_wait); 11638c2ecf20Sopenharmony_ci } 11648c2ecf20Sopenharmony_ci 11658c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&device->al_lock, flags); 11668c2ecf20Sopenharmony_ci} 11678c2ecf20Sopenharmony_ci 11688c2ecf20Sopenharmony_ci/** 11698c2ecf20Sopenharmony_ci * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED) 11708c2ecf20Sopenharmony_ci * @device: DRBD device. 11718c2ecf20Sopenharmony_ci */ 11728c2ecf20Sopenharmony_civoid drbd_rs_cancel_all(struct drbd_device *device) 11738c2ecf20Sopenharmony_ci{ 11748c2ecf20Sopenharmony_ci spin_lock_irq(&device->al_lock); 11758c2ecf20Sopenharmony_ci 11768c2ecf20Sopenharmony_ci if (get_ldev_if_state(device, D_FAILED)) { /* Makes sure ->resync is there. */ 11778c2ecf20Sopenharmony_ci lc_reset(device->resync); 11788c2ecf20Sopenharmony_ci put_ldev(device); 11798c2ecf20Sopenharmony_ci } 11808c2ecf20Sopenharmony_ci device->resync_locked = 0; 11818c2ecf20Sopenharmony_ci device->resync_wenr = LC_FREE; 11828c2ecf20Sopenharmony_ci spin_unlock_irq(&device->al_lock); 11838c2ecf20Sopenharmony_ci wake_up(&device->al_wait); 11848c2ecf20Sopenharmony_ci} 11858c2ecf20Sopenharmony_ci 11868c2ecf20Sopenharmony_ci/** 11878c2ecf20Sopenharmony_ci * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU 11888c2ecf20Sopenharmony_ci * @device: DRBD device. 11898c2ecf20Sopenharmony_ci * 11908c2ecf20Sopenharmony_ci * Returns 0 upon success, -EAGAIN if at least one reference count was 11918c2ecf20Sopenharmony_ci * not zero. 11928c2ecf20Sopenharmony_ci */ 11938c2ecf20Sopenharmony_ciint drbd_rs_del_all(struct drbd_device *device) 11948c2ecf20Sopenharmony_ci{ 11958c2ecf20Sopenharmony_ci struct lc_element *e; 11968c2ecf20Sopenharmony_ci struct bm_extent *bm_ext; 11978c2ecf20Sopenharmony_ci int i; 11988c2ecf20Sopenharmony_ci 11998c2ecf20Sopenharmony_ci spin_lock_irq(&device->al_lock); 12008c2ecf20Sopenharmony_ci 12018c2ecf20Sopenharmony_ci if (get_ldev_if_state(device, D_FAILED)) { 12028c2ecf20Sopenharmony_ci /* ok, ->resync is there. */ 12038c2ecf20Sopenharmony_ci for (i = 0; i < device->resync->nr_elements; i++) { 12048c2ecf20Sopenharmony_ci e = lc_element_by_index(device->resync, i); 12058c2ecf20Sopenharmony_ci bm_ext = lc_entry(e, struct bm_extent, lce); 12068c2ecf20Sopenharmony_ci if (bm_ext->lce.lc_number == LC_FREE) 12078c2ecf20Sopenharmony_ci continue; 12088c2ecf20Sopenharmony_ci if (bm_ext->lce.lc_number == device->resync_wenr) { 12098c2ecf20Sopenharmony_ci drbd_info(device, "dropping %u in drbd_rs_del_all, apparently" 12108c2ecf20Sopenharmony_ci " got 'synced' by application io\n", 12118c2ecf20Sopenharmony_ci device->resync_wenr); 12128c2ecf20Sopenharmony_ci D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); 12138c2ecf20Sopenharmony_ci D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags)); 12148c2ecf20Sopenharmony_ci clear_bit(BME_NO_WRITES, &bm_ext->flags); 12158c2ecf20Sopenharmony_ci device->resync_wenr = LC_FREE; 12168c2ecf20Sopenharmony_ci lc_put(device->resync, &bm_ext->lce); 12178c2ecf20Sopenharmony_ci } 12188c2ecf20Sopenharmony_ci if (bm_ext->lce.refcnt != 0) { 12198c2ecf20Sopenharmony_ci drbd_info(device, "Retrying drbd_rs_del_all() later. " 12208c2ecf20Sopenharmony_ci "refcnt=%d\n", bm_ext->lce.refcnt); 12218c2ecf20Sopenharmony_ci put_ldev(device); 12228c2ecf20Sopenharmony_ci spin_unlock_irq(&device->al_lock); 12238c2ecf20Sopenharmony_ci return -EAGAIN; 12248c2ecf20Sopenharmony_ci } 12258c2ecf20Sopenharmony_ci D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); 12268c2ecf20Sopenharmony_ci D_ASSERT(device, !test_bit(BME_NO_WRITES, &bm_ext->flags)); 12278c2ecf20Sopenharmony_ci lc_del(device->resync, &bm_ext->lce); 12288c2ecf20Sopenharmony_ci } 12298c2ecf20Sopenharmony_ci D_ASSERT(device, device->resync->used == 0); 12308c2ecf20Sopenharmony_ci put_ldev(device); 12318c2ecf20Sopenharmony_ci } 12328c2ecf20Sopenharmony_ci spin_unlock_irq(&device->al_lock); 12338c2ecf20Sopenharmony_ci wake_up(&device->al_wait); 12348c2ecf20Sopenharmony_ci 12358c2ecf20Sopenharmony_ci return 0; 12368c2ecf20Sopenharmony_ci} 1237