18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci drbd_req.c 48c2ecf20Sopenharmony_ci 58c2ecf20Sopenharmony_ci This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 68c2ecf20Sopenharmony_ci 78c2ecf20Sopenharmony_ci Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. 88c2ecf20Sopenharmony_ci Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. 98c2ecf20Sopenharmony_ci Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. 108c2ecf20Sopenharmony_ci 118c2ecf20Sopenharmony_ci 128c2ecf20Sopenharmony_ci */ 138c2ecf20Sopenharmony_ci 148c2ecf20Sopenharmony_ci#include <linux/module.h> 158c2ecf20Sopenharmony_ci 168c2ecf20Sopenharmony_ci#include <linux/slab.h> 178c2ecf20Sopenharmony_ci#include <linux/drbd.h> 188c2ecf20Sopenharmony_ci#include "drbd_int.h" 198c2ecf20Sopenharmony_ci#include "drbd_req.h" 208c2ecf20Sopenharmony_ci 218c2ecf20Sopenharmony_ci 228c2ecf20Sopenharmony_cistatic bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, int size); 238c2ecf20Sopenharmony_ci 248c2ecf20Sopenharmony_cistatic struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio *bio_src) 258c2ecf20Sopenharmony_ci{ 268c2ecf20Sopenharmony_ci struct drbd_request *req; 278c2ecf20Sopenharmony_ci 288c2ecf20Sopenharmony_ci req = mempool_alloc(&drbd_request_mempool, GFP_NOIO); 298c2ecf20Sopenharmony_ci if (!req) 308c2ecf20Sopenharmony_ci return NULL; 318c2ecf20Sopenharmony_ci memset(req, 0, sizeof(*req)); 328c2ecf20Sopenharmony_ci 338c2ecf20Sopenharmony_ci drbd_req_make_private_bio(req, bio_src); 348c2ecf20Sopenharmony_ci req->rq_state = (bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0) 358c2ecf20Sopenharmony_ci | (bio_op(bio_src) == REQ_OP_WRITE_SAME ? RQ_WSAME : 0) 368c2ecf20Sopenharmony_ci | (bio_op(bio_src) == REQ_OP_WRITE_ZEROES ? RQ_ZEROES : 0) 378c2ecf20Sopenharmony_ci | (bio_op(bio_src) == REQ_OP_DISCARD ? RQ_UNMAP : 0); 388c2ecf20Sopenharmony_ci req->device = device; 398c2ecf20Sopenharmony_ci req->master_bio = bio_src; 408c2ecf20Sopenharmony_ci req->epoch = 0; 418c2ecf20Sopenharmony_ci 428c2ecf20Sopenharmony_ci drbd_clear_interval(&req->i); 438c2ecf20Sopenharmony_ci req->i.sector = bio_src->bi_iter.bi_sector; 448c2ecf20Sopenharmony_ci req->i.size = bio_src->bi_iter.bi_size; 458c2ecf20Sopenharmony_ci req->i.local = true; 468c2ecf20Sopenharmony_ci req->i.waiting = false; 478c2ecf20Sopenharmony_ci 488c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&req->tl_requests); 498c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&req->w.list); 508c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&req->req_pending_master_completion); 518c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&req->req_pending_local); 528c2ecf20Sopenharmony_ci 538c2ecf20Sopenharmony_ci /* one reference to be put by __drbd_make_request */ 548c2ecf20Sopenharmony_ci atomic_set(&req->completion_ref, 1); 558c2ecf20Sopenharmony_ci /* one kref as long as completion_ref > 0 */ 568c2ecf20Sopenharmony_ci kref_init(&req->kref); 578c2ecf20Sopenharmony_ci return req; 588c2ecf20Sopenharmony_ci} 598c2ecf20Sopenharmony_ci 608c2ecf20Sopenharmony_cistatic void drbd_remove_request_interval(struct rb_root *root, 618c2ecf20Sopenharmony_ci struct drbd_request *req) 628c2ecf20Sopenharmony_ci{ 638c2ecf20Sopenharmony_ci struct drbd_device *device = req->device; 648c2ecf20Sopenharmony_ci struct drbd_interval *i = &req->i; 658c2ecf20Sopenharmony_ci 668c2ecf20Sopenharmony_ci drbd_remove_interval(root, i); 678c2ecf20Sopenharmony_ci 688c2ecf20Sopenharmony_ci /* Wake up any processes waiting for this request to complete. */ 698c2ecf20Sopenharmony_ci if (i->waiting) 708c2ecf20Sopenharmony_ci wake_up(&device->misc_wait); 718c2ecf20Sopenharmony_ci} 728c2ecf20Sopenharmony_ci 738c2ecf20Sopenharmony_civoid drbd_req_destroy(struct kref *kref) 748c2ecf20Sopenharmony_ci{ 758c2ecf20Sopenharmony_ci struct drbd_request *req = container_of(kref, struct drbd_request, kref); 768c2ecf20Sopenharmony_ci struct drbd_device *device = req->device; 778c2ecf20Sopenharmony_ci const unsigned s = req->rq_state; 788c2ecf20Sopenharmony_ci 798c2ecf20Sopenharmony_ci if ((req->master_bio && !(s & RQ_POSTPONED)) || 808c2ecf20Sopenharmony_ci atomic_read(&req->completion_ref) || 818c2ecf20Sopenharmony_ci (s & RQ_LOCAL_PENDING) || 828c2ecf20Sopenharmony_ci ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE))) { 838c2ecf20Sopenharmony_ci drbd_err(device, "drbd_req_destroy: Logic BUG rq_state = 0x%x, completion_ref = %d\n", 848c2ecf20Sopenharmony_ci s, atomic_read(&req->completion_ref)); 858c2ecf20Sopenharmony_ci return; 868c2ecf20Sopenharmony_ci } 878c2ecf20Sopenharmony_ci 888c2ecf20Sopenharmony_ci /* If called from mod_rq_state (expected normal case) or 898c2ecf20Sopenharmony_ci * drbd_send_and_submit (the less likely normal path), this holds the 908c2ecf20Sopenharmony_ci * req_lock, and req->tl_requests will typicaly be on ->transfer_log, 918c2ecf20Sopenharmony_ci * though it may be still empty (never added to the transfer log). 928c2ecf20Sopenharmony_ci * 938c2ecf20Sopenharmony_ci * If called from do_retry(), we do NOT hold the req_lock, but we are 948c2ecf20Sopenharmony_ci * still allowed to unconditionally list_del(&req->tl_requests), 958c2ecf20Sopenharmony_ci * because it will be on a local on-stack list only. */ 968c2ecf20Sopenharmony_ci list_del_init(&req->tl_requests); 978c2ecf20Sopenharmony_ci 988c2ecf20Sopenharmony_ci /* finally remove the request from the conflict detection 998c2ecf20Sopenharmony_ci * respective block_id verification interval tree. */ 1008c2ecf20Sopenharmony_ci if (!drbd_interval_empty(&req->i)) { 1018c2ecf20Sopenharmony_ci struct rb_root *root; 1028c2ecf20Sopenharmony_ci 1038c2ecf20Sopenharmony_ci if (s & RQ_WRITE) 1048c2ecf20Sopenharmony_ci root = &device->write_requests; 1058c2ecf20Sopenharmony_ci else 1068c2ecf20Sopenharmony_ci root = &device->read_requests; 1078c2ecf20Sopenharmony_ci drbd_remove_request_interval(root, req); 1088c2ecf20Sopenharmony_ci } else if (s & (RQ_NET_MASK & ~RQ_NET_DONE) && req->i.size != 0) 1098c2ecf20Sopenharmony_ci drbd_err(device, "drbd_req_destroy: Logic BUG: interval empty, but: rq_state=0x%x, sect=%llu, size=%u\n", 1108c2ecf20Sopenharmony_ci s, (unsigned long long)req->i.sector, req->i.size); 1118c2ecf20Sopenharmony_ci 1128c2ecf20Sopenharmony_ci /* if it was a write, we may have to set the corresponding 1138c2ecf20Sopenharmony_ci * bit(s) out-of-sync first. If it had a local part, we need to 1148c2ecf20Sopenharmony_ci * release the reference to the activity log. */ 1158c2ecf20Sopenharmony_ci if (s & RQ_WRITE) { 1168c2ecf20Sopenharmony_ci /* Set out-of-sync unless both OK flags are set 1178c2ecf20Sopenharmony_ci * (local only or remote failed). 1188c2ecf20Sopenharmony_ci * Other places where we set out-of-sync: 1198c2ecf20Sopenharmony_ci * READ with local io-error */ 1208c2ecf20Sopenharmony_ci 1218c2ecf20Sopenharmony_ci /* There is a special case: 1228c2ecf20Sopenharmony_ci * we may notice late that IO was suspended, 1238c2ecf20Sopenharmony_ci * and postpone, or schedule for retry, a write, 1248c2ecf20Sopenharmony_ci * before it even was submitted or sent. 1258c2ecf20Sopenharmony_ci * In that case we do not want to touch the bitmap at all. 1268c2ecf20Sopenharmony_ci */ 1278c2ecf20Sopenharmony_ci if ((s & (RQ_POSTPONED|RQ_LOCAL_MASK|RQ_NET_MASK)) != RQ_POSTPONED) { 1288c2ecf20Sopenharmony_ci if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK)) 1298c2ecf20Sopenharmony_ci drbd_set_out_of_sync(device, req->i.sector, req->i.size); 1308c2ecf20Sopenharmony_ci 1318c2ecf20Sopenharmony_ci if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS)) 1328c2ecf20Sopenharmony_ci drbd_set_in_sync(device, req->i.sector, req->i.size); 1338c2ecf20Sopenharmony_ci } 1348c2ecf20Sopenharmony_ci 1358c2ecf20Sopenharmony_ci /* one might be tempted to move the drbd_al_complete_io 1368c2ecf20Sopenharmony_ci * to the local io completion callback drbd_request_endio. 1378c2ecf20Sopenharmony_ci * but, if this was a mirror write, we may only 1388c2ecf20Sopenharmony_ci * drbd_al_complete_io after this is RQ_NET_DONE, 1398c2ecf20Sopenharmony_ci * otherwise the extent could be dropped from the al 1408c2ecf20Sopenharmony_ci * before it has actually been written on the peer. 1418c2ecf20Sopenharmony_ci * if we crash before our peer knows about the request, 1428c2ecf20Sopenharmony_ci * but after the extent has been dropped from the al, 1438c2ecf20Sopenharmony_ci * we would forget to resync the corresponding extent. 1448c2ecf20Sopenharmony_ci */ 1458c2ecf20Sopenharmony_ci if (s & RQ_IN_ACT_LOG) { 1468c2ecf20Sopenharmony_ci if (get_ldev_if_state(device, D_FAILED)) { 1478c2ecf20Sopenharmony_ci drbd_al_complete_io(device, &req->i); 1488c2ecf20Sopenharmony_ci put_ldev(device); 1498c2ecf20Sopenharmony_ci } else if (__ratelimit(&drbd_ratelimit_state)) { 1508c2ecf20Sopenharmony_ci drbd_warn(device, "Should have called drbd_al_complete_io(, %llu, %u), " 1518c2ecf20Sopenharmony_ci "but my Disk seems to have failed :(\n", 1528c2ecf20Sopenharmony_ci (unsigned long long) req->i.sector, req->i.size); 1538c2ecf20Sopenharmony_ci } 1548c2ecf20Sopenharmony_ci } 1558c2ecf20Sopenharmony_ci } 1568c2ecf20Sopenharmony_ci 1578c2ecf20Sopenharmony_ci mempool_free(req, &drbd_request_mempool); 1588c2ecf20Sopenharmony_ci} 1598c2ecf20Sopenharmony_ci 1608c2ecf20Sopenharmony_cistatic void wake_all_senders(struct drbd_connection *connection) 1618c2ecf20Sopenharmony_ci{ 1628c2ecf20Sopenharmony_ci wake_up(&connection->sender_work.q_wait); 1638c2ecf20Sopenharmony_ci} 1648c2ecf20Sopenharmony_ci 1658c2ecf20Sopenharmony_ci/* must hold resource->req_lock */ 1668c2ecf20Sopenharmony_civoid start_new_tl_epoch(struct drbd_connection *connection) 1678c2ecf20Sopenharmony_ci{ 1688c2ecf20Sopenharmony_ci /* no point closing an epoch, if it is empty, anyways. */ 1698c2ecf20Sopenharmony_ci if (connection->current_tle_writes == 0) 1708c2ecf20Sopenharmony_ci return; 1718c2ecf20Sopenharmony_ci 1728c2ecf20Sopenharmony_ci connection->current_tle_writes = 0; 1738c2ecf20Sopenharmony_ci atomic_inc(&connection->current_tle_nr); 1748c2ecf20Sopenharmony_ci wake_all_senders(connection); 1758c2ecf20Sopenharmony_ci} 1768c2ecf20Sopenharmony_ci 1778c2ecf20Sopenharmony_civoid complete_master_bio(struct drbd_device *device, 1788c2ecf20Sopenharmony_ci struct bio_and_error *m) 1798c2ecf20Sopenharmony_ci{ 1808c2ecf20Sopenharmony_ci if (unlikely(m->error)) 1818c2ecf20Sopenharmony_ci m->bio->bi_status = errno_to_blk_status(m->error); 1828c2ecf20Sopenharmony_ci bio_endio(m->bio); 1838c2ecf20Sopenharmony_ci dec_ap_bio(device); 1848c2ecf20Sopenharmony_ci} 1858c2ecf20Sopenharmony_ci 1868c2ecf20Sopenharmony_ci 1878c2ecf20Sopenharmony_ci/* Helper for __req_mod(). 1888c2ecf20Sopenharmony_ci * Set m->bio to the master bio, if it is fit to be completed, 1898c2ecf20Sopenharmony_ci * or leave it alone (it is initialized to NULL in __req_mod), 1908c2ecf20Sopenharmony_ci * if it has already been completed, or cannot be completed yet. 1918c2ecf20Sopenharmony_ci * If m->bio is set, the error status to be returned is placed in m->error. 1928c2ecf20Sopenharmony_ci */ 1938c2ecf20Sopenharmony_cistatic 1948c2ecf20Sopenharmony_civoid drbd_req_complete(struct drbd_request *req, struct bio_and_error *m) 1958c2ecf20Sopenharmony_ci{ 1968c2ecf20Sopenharmony_ci const unsigned s = req->rq_state; 1978c2ecf20Sopenharmony_ci struct drbd_device *device = req->device; 1988c2ecf20Sopenharmony_ci int error, ok; 1998c2ecf20Sopenharmony_ci 2008c2ecf20Sopenharmony_ci /* we must not complete the master bio, while it is 2018c2ecf20Sopenharmony_ci * still being processed by _drbd_send_zc_bio (drbd_send_dblock) 2028c2ecf20Sopenharmony_ci * not yet acknowledged by the peer 2038c2ecf20Sopenharmony_ci * not yet completed by the local io subsystem 2048c2ecf20Sopenharmony_ci * these flags may get cleared in any order by 2058c2ecf20Sopenharmony_ci * the worker, 2068c2ecf20Sopenharmony_ci * the receiver, 2078c2ecf20Sopenharmony_ci * the bio_endio completion callbacks. 2088c2ecf20Sopenharmony_ci */ 2098c2ecf20Sopenharmony_ci if ((s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) || 2108c2ecf20Sopenharmony_ci (s & RQ_NET_QUEUED) || (s & RQ_NET_PENDING) || 2118c2ecf20Sopenharmony_ci (s & RQ_COMPLETION_SUSP)) { 2128c2ecf20Sopenharmony_ci drbd_err(device, "drbd_req_complete: Logic BUG rq_state = 0x%x\n", s); 2138c2ecf20Sopenharmony_ci return; 2148c2ecf20Sopenharmony_ci } 2158c2ecf20Sopenharmony_ci 2168c2ecf20Sopenharmony_ci if (!req->master_bio) { 2178c2ecf20Sopenharmony_ci drbd_err(device, "drbd_req_complete: Logic BUG, master_bio == NULL!\n"); 2188c2ecf20Sopenharmony_ci return; 2198c2ecf20Sopenharmony_ci } 2208c2ecf20Sopenharmony_ci 2218c2ecf20Sopenharmony_ci /* 2228c2ecf20Sopenharmony_ci * figure out whether to report success or failure. 2238c2ecf20Sopenharmony_ci * 2248c2ecf20Sopenharmony_ci * report success when at least one of the operations succeeded. 2258c2ecf20Sopenharmony_ci * or, to put the other way, 2268c2ecf20Sopenharmony_ci * only report failure, when both operations failed. 2278c2ecf20Sopenharmony_ci * 2288c2ecf20Sopenharmony_ci * what to do about the failures is handled elsewhere. 2298c2ecf20Sopenharmony_ci * what we need to do here is just: complete the master_bio. 2308c2ecf20Sopenharmony_ci * 2318c2ecf20Sopenharmony_ci * local completion error, if any, has been stored as ERR_PTR 2328c2ecf20Sopenharmony_ci * in private_bio within drbd_request_endio. 2338c2ecf20Sopenharmony_ci */ 2348c2ecf20Sopenharmony_ci ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK); 2358c2ecf20Sopenharmony_ci error = PTR_ERR(req->private_bio); 2368c2ecf20Sopenharmony_ci 2378c2ecf20Sopenharmony_ci /* Before we can signal completion to the upper layers, 2388c2ecf20Sopenharmony_ci * we may need to close the current transfer log epoch. 2398c2ecf20Sopenharmony_ci * We are within the request lock, so we can simply compare 2408c2ecf20Sopenharmony_ci * the request epoch number with the current transfer log 2418c2ecf20Sopenharmony_ci * epoch number. If they match, increase the current_tle_nr, 2428c2ecf20Sopenharmony_ci * and reset the transfer log epoch write_cnt. 2438c2ecf20Sopenharmony_ci */ 2448c2ecf20Sopenharmony_ci if (op_is_write(bio_op(req->master_bio)) && 2458c2ecf20Sopenharmony_ci req->epoch == atomic_read(&first_peer_device(device)->connection->current_tle_nr)) 2468c2ecf20Sopenharmony_ci start_new_tl_epoch(first_peer_device(device)->connection); 2478c2ecf20Sopenharmony_ci 2488c2ecf20Sopenharmony_ci /* Update disk stats */ 2498c2ecf20Sopenharmony_ci bio_end_io_acct(req->master_bio, req->start_jif); 2508c2ecf20Sopenharmony_ci 2518c2ecf20Sopenharmony_ci /* If READ failed, 2528c2ecf20Sopenharmony_ci * have it be pushed back to the retry work queue, 2538c2ecf20Sopenharmony_ci * so it will re-enter __drbd_make_request(), 2548c2ecf20Sopenharmony_ci * and be re-assigned to a suitable local or remote path, 2558c2ecf20Sopenharmony_ci * or failed if we do not have access to good data anymore. 2568c2ecf20Sopenharmony_ci * 2578c2ecf20Sopenharmony_ci * Unless it was failed early by __drbd_make_request(), 2588c2ecf20Sopenharmony_ci * because no path was available, in which case 2598c2ecf20Sopenharmony_ci * it was not even added to the transfer_log. 2608c2ecf20Sopenharmony_ci * 2618c2ecf20Sopenharmony_ci * read-ahead may fail, and will not be retried. 2628c2ecf20Sopenharmony_ci * 2638c2ecf20Sopenharmony_ci * WRITE should have used all available paths already. 2648c2ecf20Sopenharmony_ci */ 2658c2ecf20Sopenharmony_ci if (!ok && 2668c2ecf20Sopenharmony_ci bio_op(req->master_bio) == REQ_OP_READ && 2678c2ecf20Sopenharmony_ci !(req->master_bio->bi_opf & REQ_RAHEAD) && 2688c2ecf20Sopenharmony_ci !list_empty(&req->tl_requests)) 2698c2ecf20Sopenharmony_ci req->rq_state |= RQ_POSTPONED; 2708c2ecf20Sopenharmony_ci 2718c2ecf20Sopenharmony_ci if (!(req->rq_state & RQ_POSTPONED)) { 2728c2ecf20Sopenharmony_ci m->error = ok ? 0 : (error ?: -EIO); 2738c2ecf20Sopenharmony_ci m->bio = req->master_bio; 2748c2ecf20Sopenharmony_ci req->master_bio = NULL; 2758c2ecf20Sopenharmony_ci /* We leave it in the tree, to be able to verify later 2768c2ecf20Sopenharmony_ci * write-acks in protocol != C during resync. 2778c2ecf20Sopenharmony_ci * But we mark it as "complete", so it won't be counted as 2788c2ecf20Sopenharmony_ci * conflict in a multi-primary setup. */ 2798c2ecf20Sopenharmony_ci req->i.completed = true; 2808c2ecf20Sopenharmony_ci } 2818c2ecf20Sopenharmony_ci 2828c2ecf20Sopenharmony_ci if (req->i.waiting) 2838c2ecf20Sopenharmony_ci wake_up(&device->misc_wait); 2848c2ecf20Sopenharmony_ci 2858c2ecf20Sopenharmony_ci /* Either we are about to complete to upper layers, 2868c2ecf20Sopenharmony_ci * or we will restart this request. 2878c2ecf20Sopenharmony_ci * In either case, the request object will be destroyed soon, 2888c2ecf20Sopenharmony_ci * so better remove it from all lists. */ 2898c2ecf20Sopenharmony_ci list_del_init(&req->req_pending_master_completion); 2908c2ecf20Sopenharmony_ci} 2918c2ecf20Sopenharmony_ci 2928c2ecf20Sopenharmony_ci/* still holds resource->req_lock */ 2938c2ecf20Sopenharmony_cistatic void drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put) 2948c2ecf20Sopenharmony_ci{ 2958c2ecf20Sopenharmony_ci struct drbd_device *device = req->device; 2968c2ecf20Sopenharmony_ci D_ASSERT(device, m || (req->rq_state & RQ_POSTPONED)); 2978c2ecf20Sopenharmony_ci 2988c2ecf20Sopenharmony_ci if (!put) 2998c2ecf20Sopenharmony_ci return; 3008c2ecf20Sopenharmony_ci 3018c2ecf20Sopenharmony_ci if (!atomic_sub_and_test(put, &req->completion_ref)) 3028c2ecf20Sopenharmony_ci return; 3038c2ecf20Sopenharmony_ci 3048c2ecf20Sopenharmony_ci drbd_req_complete(req, m); 3058c2ecf20Sopenharmony_ci 3068c2ecf20Sopenharmony_ci /* local completion may still come in later, 3078c2ecf20Sopenharmony_ci * we need to keep the req object around. */ 3088c2ecf20Sopenharmony_ci if (req->rq_state & RQ_LOCAL_ABORTED) 3098c2ecf20Sopenharmony_ci return; 3108c2ecf20Sopenharmony_ci 3118c2ecf20Sopenharmony_ci if (req->rq_state & RQ_POSTPONED) { 3128c2ecf20Sopenharmony_ci /* don't destroy the req object just yet, 3138c2ecf20Sopenharmony_ci * but queue it for retry */ 3148c2ecf20Sopenharmony_ci drbd_restart_request(req); 3158c2ecf20Sopenharmony_ci return; 3168c2ecf20Sopenharmony_ci } 3178c2ecf20Sopenharmony_ci 3188c2ecf20Sopenharmony_ci kref_put(&req->kref, drbd_req_destroy); 3198c2ecf20Sopenharmony_ci} 3208c2ecf20Sopenharmony_ci 3218c2ecf20Sopenharmony_cistatic void set_if_null_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req) 3228c2ecf20Sopenharmony_ci{ 3238c2ecf20Sopenharmony_ci struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; 3248c2ecf20Sopenharmony_ci if (!connection) 3258c2ecf20Sopenharmony_ci return; 3268c2ecf20Sopenharmony_ci if (connection->req_next == NULL) 3278c2ecf20Sopenharmony_ci connection->req_next = req; 3288c2ecf20Sopenharmony_ci} 3298c2ecf20Sopenharmony_ci 3308c2ecf20Sopenharmony_cistatic void advance_conn_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req) 3318c2ecf20Sopenharmony_ci{ 3328c2ecf20Sopenharmony_ci struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; 3338c2ecf20Sopenharmony_ci if (!connection) 3348c2ecf20Sopenharmony_ci return; 3358c2ecf20Sopenharmony_ci if (connection->req_next != req) 3368c2ecf20Sopenharmony_ci return; 3378c2ecf20Sopenharmony_ci list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) { 3388c2ecf20Sopenharmony_ci const unsigned s = req->rq_state; 3398c2ecf20Sopenharmony_ci if (s & RQ_NET_QUEUED) 3408c2ecf20Sopenharmony_ci break; 3418c2ecf20Sopenharmony_ci } 3428c2ecf20Sopenharmony_ci if (&req->tl_requests == &connection->transfer_log) 3438c2ecf20Sopenharmony_ci req = NULL; 3448c2ecf20Sopenharmony_ci connection->req_next = req; 3458c2ecf20Sopenharmony_ci} 3468c2ecf20Sopenharmony_ci 3478c2ecf20Sopenharmony_cistatic void set_if_null_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req) 3488c2ecf20Sopenharmony_ci{ 3498c2ecf20Sopenharmony_ci struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; 3508c2ecf20Sopenharmony_ci if (!connection) 3518c2ecf20Sopenharmony_ci return; 3528c2ecf20Sopenharmony_ci if (connection->req_ack_pending == NULL) 3538c2ecf20Sopenharmony_ci connection->req_ack_pending = req; 3548c2ecf20Sopenharmony_ci} 3558c2ecf20Sopenharmony_ci 3568c2ecf20Sopenharmony_cistatic void advance_conn_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req) 3578c2ecf20Sopenharmony_ci{ 3588c2ecf20Sopenharmony_ci struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; 3598c2ecf20Sopenharmony_ci if (!connection) 3608c2ecf20Sopenharmony_ci return; 3618c2ecf20Sopenharmony_ci if (connection->req_ack_pending != req) 3628c2ecf20Sopenharmony_ci return; 3638c2ecf20Sopenharmony_ci list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) { 3648c2ecf20Sopenharmony_ci const unsigned s = req->rq_state; 3658c2ecf20Sopenharmony_ci if ((s & RQ_NET_SENT) && (s & RQ_NET_PENDING)) 3668c2ecf20Sopenharmony_ci break; 3678c2ecf20Sopenharmony_ci } 3688c2ecf20Sopenharmony_ci if (&req->tl_requests == &connection->transfer_log) 3698c2ecf20Sopenharmony_ci req = NULL; 3708c2ecf20Sopenharmony_ci connection->req_ack_pending = req; 3718c2ecf20Sopenharmony_ci} 3728c2ecf20Sopenharmony_ci 3738c2ecf20Sopenharmony_cistatic void set_if_null_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req) 3748c2ecf20Sopenharmony_ci{ 3758c2ecf20Sopenharmony_ci struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; 3768c2ecf20Sopenharmony_ci if (!connection) 3778c2ecf20Sopenharmony_ci return; 3788c2ecf20Sopenharmony_ci if (connection->req_not_net_done == NULL) 3798c2ecf20Sopenharmony_ci connection->req_not_net_done = req; 3808c2ecf20Sopenharmony_ci} 3818c2ecf20Sopenharmony_ci 3828c2ecf20Sopenharmony_cistatic void advance_conn_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req) 3838c2ecf20Sopenharmony_ci{ 3848c2ecf20Sopenharmony_ci struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; 3858c2ecf20Sopenharmony_ci if (!connection) 3868c2ecf20Sopenharmony_ci return; 3878c2ecf20Sopenharmony_ci if (connection->req_not_net_done != req) 3888c2ecf20Sopenharmony_ci return; 3898c2ecf20Sopenharmony_ci list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) { 3908c2ecf20Sopenharmony_ci const unsigned s = req->rq_state; 3918c2ecf20Sopenharmony_ci if ((s & RQ_NET_SENT) && !(s & RQ_NET_DONE)) 3928c2ecf20Sopenharmony_ci break; 3938c2ecf20Sopenharmony_ci } 3948c2ecf20Sopenharmony_ci if (&req->tl_requests == &connection->transfer_log) 3958c2ecf20Sopenharmony_ci req = NULL; 3968c2ecf20Sopenharmony_ci connection->req_not_net_done = req; 3978c2ecf20Sopenharmony_ci} 3988c2ecf20Sopenharmony_ci 3998c2ecf20Sopenharmony_ci/* I'd like this to be the only place that manipulates 4008c2ecf20Sopenharmony_ci * req->completion_ref and req->kref. */ 4018c2ecf20Sopenharmony_cistatic void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, 4028c2ecf20Sopenharmony_ci int clear, int set) 4038c2ecf20Sopenharmony_ci{ 4048c2ecf20Sopenharmony_ci struct drbd_device *device = req->device; 4058c2ecf20Sopenharmony_ci struct drbd_peer_device *peer_device = first_peer_device(device); 4068c2ecf20Sopenharmony_ci unsigned s = req->rq_state; 4078c2ecf20Sopenharmony_ci int c_put = 0; 4088c2ecf20Sopenharmony_ci 4098c2ecf20Sopenharmony_ci if (drbd_suspended(device) && !((s | clear) & RQ_COMPLETION_SUSP)) 4108c2ecf20Sopenharmony_ci set |= RQ_COMPLETION_SUSP; 4118c2ecf20Sopenharmony_ci 4128c2ecf20Sopenharmony_ci /* apply */ 4138c2ecf20Sopenharmony_ci 4148c2ecf20Sopenharmony_ci req->rq_state &= ~clear; 4158c2ecf20Sopenharmony_ci req->rq_state |= set; 4168c2ecf20Sopenharmony_ci 4178c2ecf20Sopenharmony_ci /* no change? */ 4188c2ecf20Sopenharmony_ci if (req->rq_state == s) 4198c2ecf20Sopenharmony_ci return; 4208c2ecf20Sopenharmony_ci 4218c2ecf20Sopenharmony_ci /* intent: get references */ 4228c2ecf20Sopenharmony_ci 4238c2ecf20Sopenharmony_ci kref_get(&req->kref); 4248c2ecf20Sopenharmony_ci 4258c2ecf20Sopenharmony_ci if (!(s & RQ_LOCAL_PENDING) && (set & RQ_LOCAL_PENDING)) 4268c2ecf20Sopenharmony_ci atomic_inc(&req->completion_ref); 4278c2ecf20Sopenharmony_ci 4288c2ecf20Sopenharmony_ci if (!(s & RQ_NET_PENDING) && (set & RQ_NET_PENDING)) { 4298c2ecf20Sopenharmony_ci inc_ap_pending(device); 4308c2ecf20Sopenharmony_ci atomic_inc(&req->completion_ref); 4318c2ecf20Sopenharmony_ci } 4328c2ecf20Sopenharmony_ci 4338c2ecf20Sopenharmony_ci if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED)) { 4348c2ecf20Sopenharmony_ci atomic_inc(&req->completion_ref); 4358c2ecf20Sopenharmony_ci set_if_null_req_next(peer_device, req); 4368c2ecf20Sopenharmony_ci } 4378c2ecf20Sopenharmony_ci 4388c2ecf20Sopenharmony_ci if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK)) 4398c2ecf20Sopenharmony_ci kref_get(&req->kref); /* wait for the DONE */ 4408c2ecf20Sopenharmony_ci 4418c2ecf20Sopenharmony_ci if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) { 4428c2ecf20Sopenharmony_ci /* potentially already completed in the ack_receiver thread */ 4438c2ecf20Sopenharmony_ci if (!(s & RQ_NET_DONE)) { 4448c2ecf20Sopenharmony_ci atomic_add(req->i.size >> 9, &device->ap_in_flight); 4458c2ecf20Sopenharmony_ci set_if_null_req_not_net_done(peer_device, req); 4468c2ecf20Sopenharmony_ci } 4478c2ecf20Sopenharmony_ci if (req->rq_state & RQ_NET_PENDING) 4488c2ecf20Sopenharmony_ci set_if_null_req_ack_pending(peer_device, req); 4498c2ecf20Sopenharmony_ci } 4508c2ecf20Sopenharmony_ci 4518c2ecf20Sopenharmony_ci if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP)) 4528c2ecf20Sopenharmony_ci atomic_inc(&req->completion_ref); 4538c2ecf20Sopenharmony_ci 4548c2ecf20Sopenharmony_ci /* progress: put references */ 4558c2ecf20Sopenharmony_ci 4568c2ecf20Sopenharmony_ci if ((s & RQ_COMPLETION_SUSP) && (clear & RQ_COMPLETION_SUSP)) 4578c2ecf20Sopenharmony_ci ++c_put; 4588c2ecf20Sopenharmony_ci 4598c2ecf20Sopenharmony_ci if (!(s & RQ_LOCAL_ABORTED) && (set & RQ_LOCAL_ABORTED)) { 4608c2ecf20Sopenharmony_ci D_ASSERT(device, req->rq_state & RQ_LOCAL_PENDING); 4618c2ecf20Sopenharmony_ci ++c_put; 4628c2ecf20Sopenharmony_ci } 4638c2ecf20Sopenharmony_ci 4648c2ecf20Sopenharmony_ci if ((s & RQ_LOCAL_PENDING) && (clear & RQ_LOCAL_PENDING)) { 4658c2ecf20Sopenharmony_ci if (req->rq_state & RQ_LOCAL_ABORTED) 4668c2ecf20Sopenharmony_ci kref_put(&req->kref, drbd_req_destroy); 4678c2ecf20Sopenharmony_ci else 4688c2ecf20Sopenharmony_ci ++c_put; 4698c2ecf20Sopenharmony_ci list_del_init(&req->req_pending_local); 4708c2ecf20Sopenharmony_ci } 4718c2ecf20Sopenharmony_ci 4728c2ecf20Sopenharmony_ci if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) { 4738c2ecf20Sopenharmony_ci dec_ap_pending(device); 4748c2ecf20Sopenharmony_ci ++c_put; 4758c2ecf20Sopenharmony_ci req->acked_jif = jiffies; 4768c2ecf20Sopenharmony_ci advance_conn_req_ack_pending(peer_device, req); 4778c2ecf20Sopenharmony_ci } 4788c2ecf20Sopenharmony_ci 4798c2ecf20Sopenharmony_ci if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED)) { 4808c2ecf20Sopenharmony_ci ++c_put; 4818c2ecf20Sopenharmony_ci advance_conn_req_next(peer_device, req); 4828c2ecf20Sopenharmony_ci } 4838c2ecf20Sopenharmony_ci 4848c2ecf20Sopenharmony_ci if (!(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) { 4858c2ecf20Sopenharmony_ci if (s & RQ_NET_SENT) 4868c2ecf20Sopenharmony_ci atomic_sub(req->i.size >> 9, &device->ap_in_flight); 4878c2ecf20Sopenharmony_ci if (s & RQ_EXP_BARR_ACK) 4888c2ecf20Sopenharmony_ci kref_put(&req->kref, drbd_req_destroy); 4898c2ecf20Sopenharmony_ci req->net_done_jif = jiffies; 4908c2ecf20Sopenharmony_ci 4918c2ecf20Sopenharmony_ci /* in ahead/behind mode, or just in case, 4928c2ecf20Sopenharmony_ci * before we finally destroy this request, 4938c2ecf20Sopenharmony_ci * the caching pointers must not reference it anymore */ 4948c2ecf20Sopenharmony_ci advance_conn_req_next(peer_device, req); 4958c2ecf20Sopenharmony_ci advance_conn_req_ack_pending(peer_device, req); 4968c2ecf20Sopenharmony_ci advance_conn_req_not_net_done(peer_device, req); 4978c2ecf20Sopenharmony_ci } 4988c2ecf20Sopenharmony_ci 4998c2ecf20Sopenharmony_ci /* potentially complete and destroy */ 5008c2ecf20Sopenharmony_ci 5018c2ecf20Sopenharmony_ci /* If we made progress, retry conflicting peer requests, if any. */ 5028c2ecf20Sopenharmony_ci if (req->i.waiting) 5038c2ecf20Sopenharmony_ci wake_up(&device->misc_wait); 5048c2ecf20Sopenharmony_ci 5058c2ecf20Sopenharmony_ci drbd_req_put_completion_ref(req, m, c_put); 5068c2ecf20Sopenharmony_ci kref_put(&req->kref, drbd_req_destroy); 5078c2ecf20Sopenharmony_ci} 5088c2ecf20Sopenharmony_ci 5098c2ecf20Sopenharmony_cistatic void drbd_report_io_error(struct drbd_device *device, struct drbd_request *req) 5108c2ecf20Sopenharmony_ci{ 5118c2ecf20Sopenharmony_ci char b[BDEVNAME_SIZE]; 5128c2ecf20Sopenharmony_ci 5138c2ecf20Sopenharmony_ci if (!__ratelimit(&drbd_ratelimit_state)) 5148c2ecf20Sopenharmony_ci return; 5158c2ecf20Sopenharmony_ci 5168c2ecf20Sopenharmony_ci drbd_warn(device, "local %s IO error sector %llu+%u on %s\n", 5178c2ecf20Sopenharmony_ci (req->rq_state & RQ_WRITE) ? "WRITE" : "READ", 5188c2ecf20Sopenharmony_ci (unsigned long long)req->i.sector, 5198c2ecf20Sopenharmony_ci req->i.size >> 9, 5208c2ecf20Sopenharmony_ci bdevname(device->ldev->backing_bdev, b)); 5218c2ecf20Sopenharmony_ci} 5228c2ecf20Sopenharmony_ci 5238c2ecf20Sopenharmony_ci/* Helper for HANDED_OVER_TO_NETWORK. 5248c2ecf20Sopenharmony_ci * Is this a protocol A write (neither WRITE_ACK nor RECEIVE_ACK expected)? 5258c2ecf20Sopenharmony_ci * Is it also still "PENDING"? 5268c2ecf20Sopenharmony_ci * --> If so, clear PENDING and set NET_OK below. 5278c2ecf20Sopenharmony_ci * If it is a protocol A write, but not RQ_PENDING anymore, neg-ack was faster 5288c2ecf20Sopenharmony_ci * (and we must not set RQ_NET_OK) */ 5298c2ecf20Sopenharmony_cistatic inline bool is_pending_write_protocol_A(struct drbd_request *req) 5308c2ecf20Sopenharmony_ci{ 5318c2ecf20Sopenharmony_ci return (req->rq_state & 5328c2ecf20Sopenharmony_ci (RQ_WRITE|RQ_NET_PENDING|RQ_EXP_WRITE_ACK|RQ_EXP_RECEIVE_ACK)) 5338c2ecf20Sopenharmony_ci == (RQ_WRITE|RQ_NET_PENDING); 5348c2ecf20Sopenharmony_ci} 5358c2ecf20Sopenharmony_ci 5368c2ecf20Sopenharmony_ci/* obviously this could be coded as many single functions 5378c2ecf20Sopenharmony_ci * instead of one huge switch, 5388c2ecf20Sopenharmony_ci * or by putting the code directly in the respective locations 5398c2ecf20Sopenharmony_ci * (as it has been before). 5408c2ecf20Sopenharmony_ci * 5418c2ecf20Sopenharmony_ci * but having it this way 5428c2ecf20Sopenharmony_ci * enforces that it is all in this one place, where it is easier to audit, 5438c2ecf20Sopenharmony_ci * it makes it obvious that whatever "event" "happens" to a request should 5448c2ecf20Sopenharmony_ci * happen "atomically" within the req_lock, 5458c2ecf20Sopenharmony_ci * and it enforces that we have to think in a very structured manner 5468c2ecf20Sopenharmony_ci * about the "events" that may happen to a request during its life time ... 5478c2ecf20Sopenharmony_ci */ 5488c2ecf20Sopenharmony_ciint __req_mod(struct drbd_request *req, enum drbd_req_event what, 5498c2ecf20Sopenharmony_ci struct bio_and_error *m) 5508c2ecf20Sopenharmony_ci{ 5518c2ecf20Sopenharmony_ci struct drbd_device *const device = req->device; 5528c2ecf20Sopenharmony_ci struct drbd_peer_device *const peer_device = first_peer_device(device); 5538c2ecf20Sopenharmony_ci struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL; 5548c2ecf20Sopenharmony_ci struct net_conf *nc; 5558c2ecf20Sopenharmony_ci int p, rv = 0; 5568c2ecf20Sopenharmony_ci 5578c2ecf20Sopenharmony_ci if (m) 5588c2ecf20Sopenharmony_ci m->bio = NULL; 5598c2ecf20Sopenharmony_ci 5608c2ecf20Sopenharmony_ci switch (what) { 5618c2ecf20Sopenharmony_ci default: 5628c2ecf20Sopenharmony_ci drbd_err(device, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__); 5638c2ecf20Sopenharmony_ci break; 5648c2ecf20Sopenharmony_ci 5658c2ecf20Sopenharmony_ci /* does not happen... 5668c2ecf20Sopenharmony_ci * initialization done in drbd_req_new 5678c2ecf20Sopenharmony_ci case CREATED: 5688c2ecf20Sopenharmony_ci break; 5698c2ecf20Sopenharmony_ci */ 5708c2ecf20Sopenharmony_ci 5718c2ecf20Sopenharmony_ci case TO_BE_SENT: /* via network */ 5728c2ecf20Sopenharmony_ci /* reached via __drbd_make_request 5738c2ecf20Sopenharmony_ci * and from w_read_retry_remote */ 5748c2ecf20Sopenharmony_ci D_ASSERT(device, !(req->rq_state & RQ_NET_MASK)); 5758c2ecf20Sopenharmony_ci rcu_read_lock(); 5768c2ecf20Sopenharmony_ci nc = rcu_dereference(connection->net_conf); 5778c2ecf20Sopenharmony_ci p = nc->wire_protocol; 5788c2ecf20Sopenharmony_ci rcu_read_unlock(); 5798c2ecf20Sopenharmony_ci req->rq_state |= 5808c2ecf20Sopenharmony_ci p == DRBD_PROT_C ? RQ_EXP_WRITE_ACK : 5818c2ecf20Sopenharmony_ci p == DRBD_PROT_B ? RQ_EXP_RECEIVE_ACK : 0; 5828c2ecf20Sopenharmony_ci mod_rq_state(req, m, 0, RQ_NET_PENDING); 5838c2ecf20Sopenharmony_ci break; 5848c2ecf20Sopenharmony_ci 5858c2ecf20Sopenharmony_ci case TO_BE_SUBMITTED: /* locally */ 5868c2ecf20Sopenharmony_ci /* reached via __drbd_make_request */ 5878c2ecf20Sopenharmony_ci D_ASSERT(device, !(req->rq_state & RQ_LOCAL_MASK)); 5888c2ecf20Sopenharmony_ci mod_rq_state(req, m, 0, RQ_LOCAL_PENDING); 5898c2ecf20Sopenharmony_ci break; 5908c2ecf20Sopenharmony_ci 5918c2ecf20Sopenharmony_ci case COMPLETED_OK: 5928c2ecf20Sopenharmony_ci if (req->rq_state & RQ_WRITE) 5938c2ecf20Sopenharmony_ci device->writ_cnt += req->i.size >> 9; 5948c2ecf20Sopenharmony_ci else 5958c2ecf20Sopenharmony_ci device->read_cnt += req->i.size >> 9; 5968c2ecf20Sopenharmony_ci 5978c2ecf20Sopenharmony_ci mod_rq_state(req, m, RQ_LOCAL_PENDING, 5988c2ecf20Sopenharmony_ci RQ_LOCAL_COMPLETED|RQ_LOCAL_OK); 5998c2ecf20Sopenharmony_ci break; 6008c2ecf20Sopenharmony_ci 6018c2ecf20Sopenharmony_ci case ABORT_DISK_IO: 6028c2ecf20Sopenharmony_ci mod_rq_state(req, m, 0, RQ_LOCAL_ABORTED); 6038c2ecf20Sopenharmony_ci break; 6048c2ecf20Sopenharmony_ci 6058c2ecf20Sopenharmony_ci case WRITE_COMPLETED_WITH_ERROR: 6068c2ecf20Sopenharmony_ci drbd_report_io_error(device, req); 6078c2ecf20Sopenharmony_ci __drbd_chk_io_error(device, DRBD_WRITE_ERROR); 6088c2ecf20Sopenharmony_ci mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); 6098c2ecf20Sopenharmony_ci break; 6108c2ecf20Sopenharmony_ci 6118c2ecf20Sopenharmony_ci case READ_COMPLETED_WITH_ERROR: 6128c2ecf20Sopenharmony_ci drbd_set_out_of_sync(device, req->i.sector, req->i.size); 6138c2ecf20Sopenharmony_ci drbd_report_io_error(device, req); 6148c2ecf20Sopenharmony_ci __drbd_chk_io_error(device, DRBD_READ_ERROR); 6158c2ecf20Sopenharmony_ci fallthrough; 6168c2ecf20Sopenharmony_ci case READ_AHEAD_COMPLETED_WITH_ERROR: 6178c2ecf20Sopenharmony_ci /* it is legal to fail read-ahead, no __drbd_chk_io_error in that case. */ 6188c2ecf20Sopenharmony_ci mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); 6198c2ecf20Sopenharmony_ci break; 6208c2ecf20Sopenharmony_ci 6218c2ecf20Sopenharmony_ci case DISCARD_COMPLETED_NOTSUPP: 6228c2ecf20Sopenharmony_ci case DISCARD_COMPLETED_WITH_ERROR: 6238c2ecf20Sopenharmony_ci /* I'd rather not detach from local disk just because it 6248c2ecf20Sopenharmony_ci * failed a REQ_OP_DISCARD. */ 6258c2ecf20Sopenharmony_ci mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); 6268c2ecf20Sopenharmony_ci break; 6278c2ecf20Sopenharmony_ci 6288c2ecf20Sopenharmony_ci case QUEUE_FOR_NET_READ: 6298c2ecf20Sopenharmony_ci /* READ, and 6308c2ecf20Sopenharmony_ci * no local disk, 6318c2ecf20Sopenharmony_ci * or target area marked as invalid, 6328c2ecf20Sopenharmony_ci * or just got an io-error. */ 6338c2ecf20Sopenharmony_ci /* from __drbd_make_request 6348c2ecf20Sopenharmony_ci * or from bio_endio during read io-error recovery */ 6358c2ecf20Sopenharmony_ci 6368c2ecf20Sopenharmony_ci /* So we can verify the handle in the answer packet. 6378c2ecf20Sopenharmony_ci * Corresponding drbd_remove_request_interval is in 6388c2ecf20Sopenharmony_ci * drbd_req_complete() */ 6398c2ecf20Sopenharmony_ci D_ASSERT(device, drbd_interval_empty(&req->i)); 6408c2ecf20Sopenharmony_ci drbd_insert_interval(&device->read_requests, &req->i); 6418c2ecf20Sopenharmony_ci 6428c2ecf20Sopenharmony_ci set_bit(UNPLUG_REMOTE, &device->flags); 6438c2ecf20Sopenharmony_ci 6448c2ecf20Sopenharmony_ci D_ASSERT(device, req->rq_state & RQ_NET_PENDING); 6458c2ecf20Sopenharmony_ci D_ASSERT(device, (req->rq_state & RQ_LOCAL_MASK) == 0); 6468c2ecf20Sopenharmony_ci mod_rq_state(req, m, 0, RQ_NET_QUEUED); 6478c2ecf20Sopenharmony_ci req->w.cb = w_send_read_req; 6488c2ecf20Sopenharmony_ci drbd_queue_work(&connection->sender_work, 6498c2ecf20Sopenharmony_ci &req->w); 6508c2ecf20Sopenharmony_ci break; 6518c2ecf20Sopenharmony_ci 6528c2ecf20Sopenharmony_ci case QUEUE_FOR_NET_WRITE: 6538c2ecf20Sopenharmony_ci /* assert something? */ 6548c2ecf20Sopenharmony_ci /* from __drbd_make_request only */ 6558c2ecf20Sopenharmony_ci 6568c2ecf20Sopenharmony_ci /* Corresponding drbd_remove_request_interval is in 6578c2ecf20Sopenharmony_ci * drbd_req_complete() */ 6588c2ecf20Sopenharmony_ci D_ASSERT(device, drbd_interval_empty(&req->i)); 6598c2ecf20Sopenharmony_ci drbd_insert_interval(&device->write_requests, &req->i); 6608c2ecf20Sopenharmony_ci 6618c2ecf20Sopenharmony_ci /* NOTE 6628c2ecf20Sopenharmony_ci * In case the req ended up on the transfer log before being 6638c2ecf20Sopenharmony_ci * queued on the worker, it could lead to this request being 6648c2ecf20Sopenharmony_ci * missed during cleanup after connection loss. 6658c2ecf20Sopenharmony_ci * So we have to do both operations here, 6668c2ecf20Sopenharmony_ci * within the same lock that protects the transfer log. 6678c2ecf20Sopenharmony_ci * 6688c2ecf20Sopenharmony_ci * _req_add_to_epoch(req); this has to be after the 6698c2ecf20Sopenharmony_ci * _maybe_start_new_epoch(req); which happened in 6708c2ecf20Sopenharmony_ci * __drbd_make_request, because we now may set the bit 6718c2ecf20Sopenharmony_ci * again ourselves to close the current epoch. 6728c2ecf20Sopenharmony_ci * 6738c2ecf20Sopenharmony_ci * Add req to the (now) current epoch (barrier). */ 6748c2ecf20Sopenharmony_ci 6758c2ecf20Sopenharmony_ci /* otherwise we may lose an unplug, which may cause some remote 6768c2ecf20Sopenharmony_ci * io-scheduler timeout to expire, increasing maximum latency, 6778c2ecf20Sopenharmony_ci * hurting performance. */ 6788c2ecf20Sopenharmony_ci set_bit(UNPLUG_REMOTE, &device->flags); 6798c2ecf20Sopenharmony_ci 6808c2ecf20Sopenharmony_ci /* queue work item to send data */ 6818c2ecf20Sopenharmony_ci D_ASSERT(device, req->rq_state & RQ_NET_PENDING); 6828c2ecf20Sopenharmony_ci mod_rq_state(req, m, 0, RQ_NET_QUEUED|RQ_EXP_BARR_ACK); 6838c2ecf20Sopenharmony_ci req->w.cb = w_send_dblock; 6848c2ecf20Sopenharmony_ci drbd_queue_work(&connection->sender_work, 6858c2ecf20Sopenharmony_ci &req->w); 6868c2ecf20Sopenharmony_ci 6878c2ecf20Sopenharmony_ci /* close the epoch, in case it outgrew the limit */ 6888c2ecf20Sopenharmony_ci rcu_read_lock(); 6898c2ecf20Sopenharmony_ci nc = rcu_dereference(connection->net_conf); 6908c2ecf20Sopenharmony_ci p = nc->max_epoch_size; 6918c2ecf20Sopenharmony_ci rcu_read_unlock(); 6928c2ecf20Sopenharmony_ci if (connection->current_tle_writes >= p) 6938c2ecf20Sopenharmony_ci start_new_tl_epoch(connection); 6948c2ecf20Sopenharmony_ci 6958c2ecf20Sopenharmony_ci break; 6968c2ecf20Sopenharmony_ci 6978c2ecf20Sopenharmony_ci case QUEUE_FOR_SEND_OOS: 6988c2ecf20Sopenharmony_ci mod_rq_state(req, m, 0, RQ_NET_QUEUED); 6998c2ecf20Sopenharmony_ci req->w.cb = w_send_out_of_sync; 7008c2ecf20Sopenharmony_ci drbd_queue_work(&connection->sender_work, 7018c2ecf20Sopenharmony_ci &req->w); 7028c2ecf20Sopenharmony_ci break; 7038c2ecf20Sopenharmony_ci 7048c2ecf20Sopenharmony_ci case READ_RETRY_REMOTE_CANCELED: 7058c2ecf20Sopenharmony_ci case SEND_CANCELED: 7068c2ecf20Sopenharmony_ci case SEND_FAILED: 7078c2ecf20Sopenharmony_ci /* real cleanup will be done from tl_clear. just update flags 7088c2ecf20Sopenharmony_ci * so it is no longer marked as on the worker queue */ 7098c2ecf20Sopenharmony_ci mod_rq_state(req, m, RQ_NET_QUEUED, 0); 7108c2ecf20Sopenharmony_ci break; 7118c2ecf20Sopenharmony_ci 7128c2ecf20Sopenharmony_ci case HANDED_OVER_TO_NETWORK: 7138c2ecf20Sopenharmony_ci /* assert something? */ 7148c2ecf20Sopenharmony_ci if (is_pending_write_protocol_A(req)) 7158c2ecf20Sopenharmony_ci /* this is what is dangerous about protocol A: 7168c2ecf20Sopenharmony_ci * pretend it was successfully written on the peer. */ 7178c2ecf20Sopenharmony_ci mod_rq_state(req, m, RQ_NET_QUEUED|RQ_NET_PENDING, 7188c2ecf20Sopenharmony_ci RQ_NET_SENT|RQ_NET_OK); 7198c2ecf20Sopenharmony_ci else 7208c2ecf20Sopenharmony_ci mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT); 7218c2ecf20Sopenharmony_ci /* It is still not yet RQ_NET_DONE until the 7228c2ecf20Sopenharmony_ci * corresponding epoch barrier got acked as well, 7238c2ecf20Sopenharmony_ci * so we know what to dirty on connection loss. */ 7248c2ecf20Sopenharmony_ci break; 7258c2ecf20Sopenharmony_ci 7268c2ecf20Sopenharmony_ci case OOS_HANDED_TO_NETWORK: 7278c2ecf20Sopenharmony_ci /* Was not set PENDING, no longer QUEUED, so is now DONE 7288c2ecf20Sopenharmony_ci * as far as this connection is concerned. */ 7298c2ecf20Sopenharmony_ci mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_DONE); 7308c2ecf20Sopenharmony_ci break; 7318c2ecf20Sopenharmony_ci 7328c2ecf20Sopenharmony_ci case CONNECTION_LOST_WHILE_PENDING: 7338c2ecf20Sopenharmony_ci /* transfer log cleanup after connection loss */ 7348c2ecf20Sopenharmony_ci mod_rq_state(req, m, 7358c2ecf20Sopenharmony_ci RQ_NET_OK|RQ_NET_PENDING|RQ_COMPLETION_SUSP, 7368c2ecf20Sopenharmony_ci RQ_NET_DONE); 7378c2ecf20Sopenharmony_ci break; 7388c2ecf20Sopenharmony_ci 7398c2ecf20Sopenharmony_ci case CONFLICT_RESOLVED: 7408c2ecf20Sopenharmony_ci /* for superseded conflicting writes of multiple primaries, 7418c2ecf20Sopenharmony_ci * there is no need to keep anything in the tl, potential 7428c2ecf20Sopenharmony_ci * node crashes are covered by the activity log. 7438c2ecf20Sopenharmony_ci * 7448c2ecf20Sopenharmony_ci * If this request had been marked as RQ_POSTPONED before, 7458c2ecf20Sopenharmony_ci * it will actually not be completed, but "restarted", 7468c2ecf20Sopenharmony_ci * resubmitted from the retry worker context. */ 7478c2ecf20Sopenharmony_ci D_ASSERT(device, req->rq_state & RQ_NET_PENDING); 7488c2ecf20Sopenharmony_ci D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK); 7498c2ecf20Sopenharmony_ci mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_DONE|RQ_NET_OK); 7508c2ecf20Sopenharmony_ci break; 7518c2ecf20Sopenharmony_ci 7528c2ecf20Sopenharmony_ci case WRITE_ACKED_BY_PEER_AND_SIS: 7538c2ecf20Sopenharmony_ci req->rq_state |= RQ_NET_SIS; 7548c2ecf20Sopenharmony_ci case WRITE_ACKED_BY_PEER: 7558c2ecf20Sopenharmony_ci /* Normal operation protocol C: successfully written on peer. 7568c2ecf20Sopenharmony_ci * During resync, even in protocol != C, 7578c2ecf20Sopenharmony_ci * we requested an explicit write ack anyways. 7588c2ecf20Sopenharmony_ci * Which means we cannot even assert anything here. 7598c2ecf20Sopenharmony_ci * Nothing more to do here. 7608c2ecf20Sopenharmony_ci * We want to keep the tl in place for all protocols, to cater 7618c2ecf20Sopenharmony_ci * for volatile write-back caches on lower level devices. */ 7628c2ecf20Sopenharmony_ci goto ack_common; 7638c2ecf20Sopenharmony_ci case RECV_ACKED_BY_PEER: 7648c2ecf20Sopenharmony_ci D_ASSERT(device, req->rq_state & RQ_EXP_RECEIVE_ACK); 7658c2ecf20Sopenharmony_ci /* protocol B; pretends to be successfully written on peer. 7668c2ecf20Sopenharmony_ci * see also notes above in HANDED_OVER_TO_NETWORK about 7678c2ecf20Sopenharmony_ci * protocol != C */ 7688c2ecf20Sopenharmony_ci ack_common: 7698c2ecf20Sopenharmony_ci mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK); 7708c2ecf20Sopenharmony_ci break; 7718c2ecf20Sopenharmony_ci 7728c2ecf20Sopenharmony_ci case POSTPONE_WRITE: 7738c2ecf20Sopenharmony_ci D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK); 7748c2ecf20Sopenharmony_ci /* If this node has already detected the write conflict, the 7758c2ecf20Sopenharmony_ci * worker will be waiting on misc_wait. Wake it up once this 7768c2ecf20Sopenharmony_ci * request has completed locally. 7778c2ecf20Sopenharmony_ci */ 7788c2ecf20Sopenharmony_ci D_ASSERT(device, req->rq_state & RQ_NET_PENDING); 7798c2ecf20Sopenharmony_ci req->rq_state |= RQ_POSTPONED; 7808c2ecf20Sopenharmony_ci if (req->i.waiting) 7818c2ecf20Sopenharmony_ci wake_up(&device->misc_wait); 7828c2ecf20Sopenharmony_ci /* Do not clear RQ_NET_PENDING. This request will make further 7838c2ecf20Sopenharmony_ci * progress via restart_conflicting_writes() or 7848c2ecf20Sopenharmony_ci * fail_postponed_requests(). Hopefully. */ 7858c2ecf20Sopenharmony_ci break; 7868c2ecf20Sopenharmony_ci 7878c2ecf20Sopenharmony_ci case NEG_ACKED: 7888c2ecf20Sopenharmony_ci mod_rq_state(req, m, RQ_NET_OK|RQ_NET_PENDING, 0); 7898c2ecf20Sopenharmony_ci break; 7908c2ecf20Sopenharmony_ci 7918c2ecf20Sopenharmony_ci case FAIL_FROZEN_DISK_IO: 7928c2ecf20Sopenharmony_ci if (!(req->rq_state & RQ_LOCAL_COMPLETED)) 7938c2ecf20Sopenharmony_ci break; 7948c2ecf20Sopenharmony_ci mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0); 7958c2ecf20Sopenharmony_ci break; 7968c2ecf20Sopenharmony_ci 7978c2ecf20Sopenharmony_ci case RESTART_FROZEN_DISK_IO: 7988c2ecf20Sopenharmony_ci if (!(req->rq_state & RQ_LOCAL_COMPLETED)) 7998c2ecf20Sopenharmony_ci break; 8008c2ecf20Sopenharmony_ci 8018c2ecf20Sopenharmony_ci mod_rq_state(req, m, 8028c2ecf20Sopenharmony_ci RQ_COMPLETION_SUSP|RQ_LOCAL_COMPLETED, 8038c2ecf20Sopenharmony_ci RQ_LOCAL_PENDING); 8048c2ecf20Sopenharmony_ci 8058c2ecf20Sopenharmony_ci rv = MR_READ; 8068c2ecf20Sopenharmony_ci if (bio_data_dir(req->master_bio) == WRITE) 8078c2ecf20Sopenharmony_ci rv = MR_WRITE; 8088c2ecf20Sopenharmony_ci 8098c2ecf20Sopenharmony_ci get_ldev(device); /* always succeeds in this call path */ 8108c2ecf20Sopenharmony_ci req->w.cb = w_restart_disk_io; 8118c2ecf20Sopenharmony_ci drbd_queue_work(&connection->sender_work, 8128c2ecf20Sopenharmony_ci &req->w); 8138c2ecf20Sopenharmony_ci break; 8148c2ecf20Sopenharmony_ci 8158c2ecf20Sopenharmony_ci case RESEND: 8168c2ecf20Sopenharmony_ci /* Simply complete (local only) READs. */ 8178c2ecf20Sopenharmony_ci if (!(req->rq_state & RQ_WRITE) && !req->w.cb) { 8188c2ecf20Sopenharmony_ci mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0); 8198c2ecf20Sopenharmony_ci break; 8208c2ecf20Sopenharmony_ci } 8218c2ecf20Sopenharmony_ci 8228c2ecf20Sopenharmony_ci /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK 8238c2ecf20Sopenharmony_ci before the connection loss (B&C only); only P_BARRIER_ACK 8248c2ecf20Sopenharmony_ci (or the local completion?) was missing when we suspended. 8258c2ecf20Sopenharmony_ci Throwing them out of the TL here by pretending we got a BARRIER_ACK. 8268c2ecf20Sopenharmony_ci During connection handshake, we ensure that the peer was not rebooted. */ 8278c2ecf20Sopenharmony_ci if (!(req->rq_state & RQ_NET_OK)) { 8288c2ecf20Sopenharmony_ci /* FIXME could this possibly be a req->dw.cb == w_send_out_of_sync? 8298c2ecf20Sopenharmony_ci * in that case we must not set RQ_NET_PENDING. */ 8308c2ecf20Sopenharmony_ci 8318c2ecf20Sopenharmony_ci mod_rq_state(req, m, RQ_COMPLETION_SUSP, RQ_NET_QUEUED|RQ_NET_PENDING); 8328c2ecf20Sopenharmony_ci if (req->w.cb) { 8338c2ecf20Sopenharmony_ci /* w.cb expected to be w_send_dblock, or w_send_read_req */ 8348c2ecf20Sopenharmony_ci drbd_queue_work(&connection->sender_work, 8358c2ecf20Sopenharmony_ci &req->w); 8368c2ecf20Sopenharmony_ci rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ; 8378c2ecf20Sopenharmony_ci } /* else: FIXME can this happen? */ 8388c2ecf20Sopenharmony_ci break; 8398c2ecf20Sopenharmony_ci } 8408c2ecf20Sopenharmony_ci fallthrough; /* to BARRIER_ACKED */ 8418c2ecf20Sopenharmony_ci 8428c2ecf20Sopenharmony_ci case BARRIER_ACKED: 8438c2ecf20Sopenharmony_ci /* barrier ack for READ requests does not make sense */ 8448c2ecf20Sopenharmony_ci if (!(req->rq_state & RQ_WRITE)) 8458c2ecf20Sopenharmony_ci break; 8468c2ecf20Sopenharmony_ci 8478c2ecf20Sopenharmony_ci if (req->rq_state & RQ_NET_PENDING) { 8488c2ecf20Sopenharmony_ci /* barrier came in before all requests were acked. 8498c2ecf20Sopenharmony_ci * this is bad, because if the connection is lost now, 8508c2ecf20Sopenharmony_ci * we won't be able to clean them up... */ 8518c2ecf20Sopenharmony_ci drbd_err(device, "FIXME (BARRIER_ACKED but pending)\n"); 8528c2ecf20Sopenharmony_ci } 8538c2ecf20Sopenharmony_ci /* Allowed to complete requests, even while suspended. 8548c2ecf20Sopenharmony_ci * As this is called for all requests within a matching epoch, 8558c2ecf20Sopenharmony_ci * we need to filter, and only set RQ_NET_DONE for those that 8568c2ecf20Sopenharmony_ci * have actually been on the wire. */ 8578c2ecf20Sopenharmony_ci mod_rq_state(req, m, RQ_COMPLETION_SUSP, 8588c2ecf20Sopenharmony_ci (req->rq_state & RQ_NET_MASK) ? RQ_NET_DONE : 0); 8598c2ecf20Sopenharmony_ci break; 8608c2ecf20Sopenharmony_ci 8618c2ecf20Sopenharmony_ci case DATA_RECEIVED: 8628c2ecf20Sopenharmony_ci D_ASSERT(device, req->rq_state & RQ_NET_PENDING); 8638c2ecf20Sopenharmony_ci mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE); 8648c2ecf20Sopenharmony_ci break; 8658c2ecf20Sopenharmony_ci 8668c2ecf20Sopenharmony_ci case QUEUE_AS_DRBD_BARRIER: 8678c2ecf20Sopenharmony_ci start_new_tl_epoch(connection); 8688c2ecf20Sopenharmony_ci mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE); 8698c2ecf20Sopenharmony_ci break; 8708c2ecf20Sopenharmony_ci } 8718c2ecf20Sopenharmony_ci 8728c2ecf20Sopenharmony_ci return rv; 8738c2ecf20Sopenharmony_ci} 8748c2ecf20Sopenharmony_ci 8758c2ecf20Sopenharmony_ci/* we may do a local read if: 8768c2ecf20Sopenharmony_ci * - we are consistent (of course), 8778c2ecf20Sopenharmony_ci * - or we are generally inconsistent, 8788c2ecf20Sopenharmony_ci * BUT we are still/already IN SYNC for this area. 8798c2ecf20Sopenharmony_ci * since size may be bigger than BM_BLOCK_SIZE, 8808c2ecf20Sopenharmony_ci * we may need to check several bits. 8818c2ecf20Sopenharmony_ci */ 8828c2ecf20Sopenharmony_cistatic bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, int size) 8838c2ecf20Sopenharmony_ci{ 8848c2ecf20Sopenharmony_ci unsigned long sbnr, ebnr; 8858c2ecf20Sopenharmony_ci sector_t esector, nr_sectors; 8868c2ecf20Sopenharmony_ci 8878c2ecf20Sopenharmony_ci if (device->state.disk == D_UP_TO_DATE) 8888c2ecf20Sopenharmony_ci return true; 8898c2ecf20Sopenharmony_ci if (device->state.disk != D_INCONSISTENT) 8908c2ecf20Sopenharmony_ci return false; 8918c2ecf20Sopenharmony_ci esector = sector + (size >> 9) - 1; 8928c2ecf20Sopenharmony_ci nr_sectors = get_capacity(device->vdisk); 8938c2ecf20Sopenharmony_ci D_ASSERT(device, sector < nr_sectors); 8948c2ecf20Sopenharmony_ci D_ASSERT(device, esector < nr_sectors); 8958c2ecf20Sopenharmony_ci 8968c2ecf20Sopenharmony_ci sbnr = BM_SECT_TO_BIT(sector); 8978c2ecf20Sopenharmony_ci ebnr = BM_SECT_TO_BIT(esector); 8988c2ecf20Sopenharmony_ci 8998c2ecf20Sopenharmony_ci return drbd_bm_count_bits(device, sbnr, ebnr) == 0; 9008c2ecf20Sopenharmony_ci} 9018c2ecf20Sopenharmony_ci 9028c2ecf20Sopenharmony_cistatic bool remote_due_to_read_balancing(struct drbd_device *device, sector_t sector, 9038c2ecf20Sopenharmony_ci enum drbd_read_balancing rbm) 9048c2ecf20Sopenharmony_ci{ 9058c2ecf20Sopenharmony_ci struct backing_dev_info *bdi; 9068c2ecf20Sopenharmony_ci int stripe_shift; 9078c2ecf20Sopenharmony_ci 9088c2ecf20Sopenharmony_ci switch (rbm) { 9098c2ecf20Sopenharmony_ci case RB_CONGESTED_REMOTE: 9108c2ecf20Sopenharmony_ci bdi = device->ldev->backing_bdev->bd_disk->queue->backing_dev_info; 9118c2ecf20Sopenharmony_ci return bdi_read_congested(bdi); 9128c2ecf20Sopenharmony_ci case RB_LEAST_PENDING: 9138c2ecf20Sopenharmony_ci return atomic_read(&device->local_cnt) > 9148c2ecf20Sopenharmony_ci atomic_read(&device->ap_pending_cnt) + atomic_read(&device->rs_pending_cnt); 9158c2ecf20Sopenharmony_ci case RB_32K_STRIPING: /* stripe_shift = 15 */ 9168c2ecf20Sopenharmony_ci case RB_64K_STRIPING: 9178c2ecf20Sopenharmony_ci case RB_128K_STRIPING: 9188c2ecf20Sopenharmony_ci case RB_256K_STRIPING: 9198c2ecf20Sopenharmony_ci case RB_512K_STRIPING: 9208c2ecf20Sopenharmony_ci case RB_1M_STRIPING: /* stripe_shift = 20 */ 9218c2ecf20Sopenharmony_ci stripe_shift = (rbm - RB_32K_STRIPING + 15); 9228c2ecf20Sopenharmony_ci return (sector >> (stripe_shift - 9)) & 1; 9238c2ecf20Sopenharmony_ci case RB_ROUND_ROBIN: 9248c2ecf20Sopenharmony_ci return test_and_change_bit(READ_BALANCE_RR, &device->flags); 9258c2ecf20Sopenharmony_ci case RB_PREFER_REMOTE: 9268c2ecf20Sopenharmony_ci return true; 9278c2ecf20Sopenharmony_ci case RB_PREFER_LOCAL: 9288c2ecf20Sopenharmony_ci default: 9298c2ecf20Sopenharmony_ci return false; 9308c2ecf20Sopenharmony_ci } 9318c2ecf20Sopenharmony_ci} 9328c2ecf20Sopenharmony_ci 9338c2ecf20Sopenharmony_ci/* 9348c2ecf20Sopenharmony_ci * complete_conflicting_writes - wait for any conflicting write requests 9358c2ecf20Sopenharmony_ci * 9368c2ecf20Sopenharmony_ci * The write_requests tree contains all active write requests which we 9378c2ecf20Sopenharmony_ci * currently know about. Wait for any requests to complete which conflict with 9388c2ecf20Sopenharmony_ci * the new one. 9398c2ecf20Sopenharmony_ci * 9408c2ecf20Sopenharmony_ci * Only way out: remove the conflicting intervals from the tree. 9418c2ecf20Sopenharmony_ci */ 9428c2ecf20Sopenharmony_cistatic void complete_conflicting_writes(struct drbd_request *req) 9438c2ecf20Sopenharmony_ci{ 9448c2ecf20Sopenharmony_ci DEFINE_WAIT(wait); 9458c2ecf20Sopenharmony_ci struct drbd_device *device = req->device; 9468c2ecf20Sopenharmony_ci struct drbd_interval *i; 9478c2ecf20Sopenharmony_ci sector_t sector = req->i.sector; 9488c2ecf20Sopenharmony_ci int size = req->i.size; 9498c2ecf20Sopenharmony_ci 9508c2ecf20Sopenharmony_ci for (;;) { 9518c2ecf20Sopenharmony_ci drbd_for_each_overlap(i, &device->write_requests, sector, size) { 9528c2ecf20Sopenharmony_ci /* Ignore, if already completed to upper layers. */ 9538c2ecf20Sopenharmony_ci if (i->completed) 9548c2ecf20Sopenharmony_ci continue; 9558c2ecf20Sopenharmony_ci /* Handle the first found overlap. After the schedule 9568c2ecf20Sopenharmony_ci * we have to restart the tree walk. */ 9578c2ecf20Sopenharmony_ci break; 9588c2ecf20Sopenharmony_ci } 9598c2ecf20Sopenharmony_ci if (!i) /* if any */ 9608c2ecf20Sopenharmony_ci break; 9618c2ecf20Sopenharmony_ci 9628c2ecf20Sopenharmony_ci /* Indicate to wake up device->misc_wait on progress. */ 9638c2ecf20Sopenharmony_ci prepare_to_wait(&device->misc_wait, &wait, TASK_UNINTERRUPTIBLE); 9648c2ecf20Sopenharmony_ci i->waiting = true; 9658c2ecf20Sopenharmony_ci spin_unlock_irq(&device->resource->req_lock); 9668c2ecf20Sopenharmony_ci schedule(); 9678c2ecf20Sopenharmony_ci spin_lock_irq(&device->resource->req_lock); 9688c2ecf20Sopenharmony_ci } 9698c2ecf20Sopenharmony_ci finish_wait(&device->misc_wait, &wait); 9708c2ecf20Sopenharmony_ci} 9718c2ecf20Sopenharmony_ci 9728c2ecf20Sopenharmony_ci/* called within req_lock */ 9738c2ecf20Sopenharmony_cistatic void maybe_pull_ahead(struct drbd_device *device) 9748c2ecf20Sopenharmony_ci{ 9758c2ecf20Sopenharmony_ci struct drbd_connection *connection = first_peer_device(device)->connection; 9768c2ecf20Sopenharmony_ci struct net_conf *nc; 9778c2ecf20Sopenharmony_ci bool congested = false; 9788c2ecf20Sopenharmony_ci enum drbd_on_congestion on_congestion; 9798c2ecf20Sopenharmony_ci 9808c2ecf20Sopenharmony_ci rcu_read_lock(); 9818c2ecf20Sopenharmony_ci nc = rcu_dereference(connection->net_conf); 9828c2ecf20Sopenharmony_ci on_congestion = nc ? nc->on_congestion : OC_BLOCK; 9838c2ecf20Sopenharmony_ci rcu_read_unlock(); 9848c2ecf20Sopenharmony_ci if (on_congestion == OC_BLOCK || 9858c2ecf20Sopenharmony_ci connection->agreed_pro_version < 96) 9868c2ecf20Sopenharmony_ci return; 9878c2ecf20Sopenharmony_ci 9888c2ecf20Sopenharmony_ci if (on_congestion == OC_PULL_AHEAD && device->state.conn == C_AHEAD) 9898c2ecf20Sopenharmony_ci return; /* nothing to do ... */ 9908c2ecf20Sopenharmony_ci 9918c2ecf20Sopenharmony_ci /* If I don't even have good local storage, we can not reasonably try 9928c2ecf20Sopenharmony_ci * to pull ahead of the peer. We also need the local reference to make 9938c2ecf20Sopenharmony_ci * sure device->act_log is there. 9948c2ecf20Sopenharmony_ci */ 9958c2ecf20Sopenharmony_ci if (!get_ldev_if_state(device, D_UP_TO_DATE)) 9968c2ecf20Sopenharmony_ci return; 9978c2ecf20Sopenharmony_ci 9988c2ecf20Sopenharmony_ci if (nc->cong_fill && 9998c2ecf20Sopenharmony_ci atomic_read(&device->ap_in_flight) >= nc->cong_fill) { 10008c2ecf20Sopenharmony_ci drbd_info(device, "Congestion-fill threshold reached\n"); 10018c2ecf20Sopenharmony_ci congested = true; 10028c2ecf20Sopenharmony_ci } 10038c2ecf20Sopenharmony_ci 10048c2ecf20Sopenharmony_ci if (device->act_log->used >= nc->cong_extents) { 10058c2ecf20Sopenharmony_ci drbd_info(device, "Congestion-extents threshold reached\n"); 10068c2ecf20Sopenharmony_ci congested = true; 10078c2ecf20Sopenharmony_ci } 10088c2ecf20Sopenharmony_ci 10098c2ecf20Sopenharmony_ci if (congested) { 10108c2ecf20Sopenharmony_ci /* start a new epoch for non-mirrored writes */ 10118c2ecf20Sopenharmony_ci start_new_tl_epoch(first_peer_device(device)->connection); 10128c2ecf20Sopenharmony_ci 10138c2ecf20Sopenharmony_ci if (on_congestion == OC_PULL_AHEAD) 10148c2ecf20Sopenharmony_ci _drbd_set_state(_NS(device, conn, C_AHEAD), 0, NULL); 10158c2ecf20Sopenharmony_ci else /*nc->on_congestion == OC_DISCONNECT */ 10168c2ecf20Sopenharmony_ci _drbd_set_state(_NS(device, conn, C_DISCONNECTING), 0, NULL); 10178c2ecf20Sopenharmony_ci } 10188c2ecf20Sopenharmony_ci put_ldev(device); 10198c2ecf20Sopenharmony_ci} 10208c2ecf20Sopenharmony_ci 10218c2ecf20Sopenharmony_ci/* If this returns false, and req->private_bio is still set, 10228c2ecf20Sopenharmony_ci * this should be submitted locally. 10238c2ecf20Sopenharmony_ci * 10248c2ecf20Sopenharmony_ci * If it returns false, but req->private_bio is not set, 10258c2ecf20Sopenharmony_ci * we do not have access to good data :( 10268c2ecf20Sopenharmony_ci * 10278c2ecf20Sopenharmony_ci * Otherwise, this destroys req->private_bio, if any, 10288c2ecf20Sopenharmony_ci * and returns true. 10298c2ecf20Sopenharmony_ci */ 10308c2ecf20Sopenharmony_cistatic bool do_remote_read(struct drbd_request *req) 10318c2ecf20Sopenharmony_ci{ 10328c2ecf20Sopenharmony_ci struct drbd_device *device = req->device; 10338c2ecf20Sopenharmony_ci enum drbd_read_balancing rbm; 10348c2ecf20Sopenharmony_ci 10358c2ecf20Sopenharmony_ci if (req->private_bio) { 10368c2ecf20Sopenharmony_ci if (!drbd_may_do_local_read(device, 10378c2ecf20Sopenharmony_ci req->i.sector, req->i.size)) { 10388c2ecf20Sopenharmony_ci bio_put(req->private_bio); 10398c2ecf20Sopenharmony_ci req->private_bio = NULL; 10408c2ecf20Sopenharmony_ci put_ldev(device); 10418c2ecf20Sopenharmony_ci } 10428c2ecf20Sopenharmony_ci } 10438c2ecf20Sopenharmony_ci 10448c2ecf20Sopenharmony_ci if (device->state.pdsk != D_UP_TO_DATE) 10458c2ecf20Sopenharmony_ci return false; 10468c2ecf20Sopenharmony_ci 10478c2ecf20Sopenharmony_ci if (req->private_bio == NULL) 10488c2ecf20Sopenharmony_ci return true; 10498c2ecf20Sopenharmony_ci 10508c2ecf20Sopenharmony_ci /* TODO: improve read balancing decisions, take into account drbd 10518c2ecf20Sopenharmony_ci * protocol, pending requests etc. */ 10528c2ecf20Sopenharmony_ci 10538c2ecf20Sopenharmony_ci rcu_read_lock(); 10548c2ecf20Sopenharmony_ci rbm = rcu_dereference(device->ldev->disk_conf)->read_balancing; 10558c2ecf20Sopenharmony_ci rcu_read_unlock(); 10568c2ecf20Sopenharmony_ci 10578c2ecf20Sopenharmony_ci if (rbm == RB_PREFER_LOCAL && req->private_bio) 10588c2ecf20Sopenharmony_ci return false; /* submit locally */ 10598c2ecf20Sopenharmony_ci 10608c2ecf20Sopenharmony_ci if (remote_due_to_read_balancing(device, req->i.sector, rbm)) { 10618c2ecf20Sopenharmony_ci if (req->private_bio) { 10628c2ecf20Sopenharmony_ci bio_put(req->private_bio); 10638c2ecf20Sopenharmony_ci req->private_bio = NULL; 10648c2ecf20Sopenharmony_ci put_ldev(device); 10658c2ecf20Sopenharmony_ci } 10668c2ecf20Sopenharmony_ci return true; 10678c2ecf20Sopenharmony_ci } 10688c2ecf20Sopenharmony_ci 10698c2ecf20Sopenharmony_ci return false; 10708c2ecf20Sopenharmony_ci} 10718c2ecf20Sopenharmony_ci 10728c2ecf20Sopenharmony_cibool drbd_should_do_remote(union drbd_dev_state s) 10738c2ecf20Sopenharmony_ci{ 10748c2ecf20Sopenharmony_ci return s.pdsk == D_UP_TO_DATE || 10758c2ecf20Sopenharmony_ci (s.pdsk >= D_INCONSISTENT && 10768c2ecf20Sopenharmony_ci s.conn >= C_WF_BITMAP_T && 10778c2ecf20Sopenharmony_ci s.conn < C_AHEAD); 10788c2ecf20Sopenharmony_ci /* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T. 10798c2ecf20Sopenharmony_ci That is equivalent since before 96 IO was frozen in the C_WF_BITMAP* 10808c2ecf20Sopenharmony_ci states. */ 10818c2ecf20Sopenharmony_ci} 10828c2ecf20Sopenharmony_ci 10838c2ecf20Sopenharmony_cistatic bool drbd_should_send_out_of_sync(union drbd_dev_state s) 10848c2ecf20Sopenharmony_ci{ 10858c2ecf20Sopenharmony_ci return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S; 10868c2ecf20Sopenharmony_ci /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary 10878c2ecf20Sopenharmony_ci since we enter state C_AHEAD only if proto >= 96 */ 10888c2ecf20Sopenharmony_ci} 10898c2ecf20Sopenharmony_ci 10908c2ecf20Sopenharmony_ci/* returns number of connections (== 1, for drbd 8.4) 10918c2ecf20Sopenharmony_ci * expected to actually write this data, 10928c2ecf20Sopenharmony_ci * which does NOT include those that we are L_AHEAD for. */ 10938c2ecf20Sopenharmony_cistatic int drbd_process_write_request(struct drbd_request *req) 10948c2ecf20Sopenharmony_ci{ 10958c2ecf20Sopenharmony_ci struct drbd_device *device = req->device; 10968c2ecf20Sopenharmony_ci int remote, send_oos; 10978c2ecf20Sopenharmony_ci 10988c2ecf20Sopenharmony_ci remote = drbd_should_do_remote(device->state); 10998c2ecf20Sopenharmony_ci send_oos = drbd_should_send_out_of_sync(device->state); 11008c2ecf20Sopenharmony_ci 11018c2ecf20Sopenharmony_ci /* Need to replicate writes. Unless it is an empty flush, 11028c2ecf20Sopenharmony_ci * which is better mapped to a DRBD P_BARRIER packet, 11038c2ecf20Sopenharmony_ci * also for drbd wire protocol compatibility reasons. 11048c2ecf20Sopenharmony_ci * If this was a flush, just start a new epoch. 11058c2ecf20Sopenharmony_ci * Unless the current epoch was empty anyways, or we are not currently 11068c2ecf20Sopenharmony_ci * replicating, in which case there is no point. */ 11078c2ecf20Sopenharmony_ci if (unlikely(req->i.size == 0)) { 11088c2ecf20Sopenharmony_ci /* The only size==0 bios we expect are empty flushes. */ 11098c2ecf20Sopenharmony_ci D_ASSERT(device, req->master_bio->bi_opf & REQ_PREFLUSH); 11108c2ecf20Sopenharmony_ci if (remote) 11118c2ecf20Sopenharmony_ci _req_mod(req, QUEUE_AS_DRBD_BARRIER); 11128c2ecf20Sopenharmony_ci return remote; 11138c2ecf20Sopenharmony_ci } 11148c2ecf20Sopenharmony_ci 11158c2ecf20Sopenharmony_ci if (!remote && !send_oos) 11168c2ecf20Sopenharmony_ci return 0; 11178c2ecf20Sopenharmony_ci 11188c2ecf20Sopenharmony_ci D_ASSERT(device, !(remote && send_oos)); 11198c2ecf20Sopenharmony_ci 11208c2ecf20Sopenharmony_ci if (remote) { 11218c2ecf20Sopenharmony_ci _req_mod(req, TO_BE_SENT); 11228c2ecf20Sopenharmony_ci _req_mod(req, QUEUE_FOR_NET_WRITE); 11238c2ecf20Sopenharmony_ci } else if (drbd_set_out_of_sync(device, req->i.sector, req->i.size)) 11248c2ecf20Sopenharmony_ci _req_mod(req, QUEUE_FOR_SEND_OOS); 11258c2ecf20Sopenharmony_ci 11268c2ecf20Sopenharmony_ci return remote; 11278c2ecf20Sopenharmony_ci} 11288c2ecf20Sopenharmony_ci 11298c2ecf20Sopenharmony_cistatic void drbd_process_discard_or_zeroes_req(struct drbd_request *req, int flags) 11308c2ecf20Sopenharmony_ci{ 11318c2ecf20Sopenharmony_ci int err = drbd_issue_discard_or_zero_out(req->device, 11328c2ecf20Sopenharmony_ci req->i.sector, req->i.size >> 9, flags); 11338c2ecf20Sopenharmony_ci if (err) 11348c2ecf20Sopenharmony_ci req->private_bio->bi_status = BLK_STS_IOERR; 11358c2ecf20Sopenharmony_ci bio_endio(req->private_bio); 11368c2ecf20Sopenharmony_ci} 11378c2ecf20Sopenharmony_ci 11388c2ecf20Sopenharmony_cistatic void 11398c2ecf20Sopenharmony_cidrbd_submit_req_private_bio(struct drbd_request *req) 11408c2ecf20Sopenharmony_ci{ 11418c2ecf20Sopenharmony_ci struct drbd_device *device = req->device; 11428c2ecf20Sopenharmony_ci struct bio *bio = req->private_bio; 11438c2ecf20Sopenharmony_ci unsigned int type; 11448c2ecf20Sopenharmony_ci 11458c2ecf20Sopenharmony_ci if (bio_op(bio) != REQ_OP_READ) 11468c2ecf20Sopenharmony_ci type = DRBD_FAULT_DT_WR; 11478c2ecf20Sopenharmony_ci else if (bio->bi_opf & REQ_RAHEAD) 11488c2ecf20Sopenharmony_ci type = DRBD_FAULT_DT_RA; 11498c2ecf20Sopenharmony_ci else 11508c2ecf20Sopenharmony_ci type = DRBD_FAULT_DT_RD; 11518c2ecf20Sopenharmony_ci 11528c2ecf20Sopenharmony_ci bio_set_dev(bio, device->ldev->backing_bdev); 11538c2ecf20Sopenharmony_ci 11548c2ecf20Sopenharmony_ci /* State may have changed since we grabbed our reference on the 11558c2ecf20Sopenharmony_ci * ->ldev member. Double check, and short-circuit to endio. 11568c2ecf20Sopenharmony_ci * In case the last activity log transaction failed to get on 11578c2ecf20Sopenharmony_ci * stable storage, and this is a WRITE, we may not even submit 11588c2ecf20Sopenharmony_ci * this bio. */ 11598c2ecf20Sopenharmony_ci if (get_ldev(device)) { 11608c2ecf20Sopenharmony_ci if (drbd_insert_fault(device, type)) 11618c2ecf20Sopenharmony_ci bio_io_error(bio); 11628c2ecf20Sopenharmony_ci else if (bio_op(bio) == REQ_OP_WRITE_ZEROES) 11638c2ecf20Sopenharmony_ci drbd_process_discard_or_zeroes_req(req, EE_ZEROOUT | 11648c2ecf20Sopenharmony_ci ((bio->bi_opf & REQ_NOUNMAP) ? 0 : EE_TRIM)); 11658c2ecf20Sopenharmony_ci else if (bio_op(bio) == REQ_OP_DISCARD) 11668c2ecf20Sopenharmony_ci drbd_process_discard_or_zeroes_req(req, EE_TRIM); 11678c2ecf20Sopenharmony_ci else 11688c2ecf20Sopenharmony_ci submit_bio_noacct(bio); 11698c2ecf20Sopenharmony_ci put_ldev(device); 11708c2ecf20Sopenharmony_ci } else 11718c2ecf20Sopenharmony_ci bio_io_error(bio); 11728c2ecf20Sopenharmony_ci} 11738c2ecf20Sopenharmony_ci 11748c2ecf20Sopenharmony_cistatic void drbd_queue_write(struct drbd_device *device, struct drbd_request *req) 11758c2ecf20Sopenharmony_ci{ 11768c2ecf20Sopenharmony_ci spin_lock_irq(&device->resource->req_lock); 11778c2ecf20Sopenharmony_ci list_add_tail(&req->tl_requests, &device->submit.writes); 11788c2ecf20Sopenharmony_ci list_add_tail(&req->req_pending_master_completion, 11798c2ecf20Sopenharmony_ci &device->pending_master_completion[1 /* WRITE */]); 11808c2ecf20Sopenharmony_ci spin_unlock_irq(&device->resource->req_lock); 11818c2ecf20Sopenharmony_ci queue_work(device->submit.wq, &device->submit.worker); 11828c2ecf20Sopenharmony_ci /* do_submit() may sleep internally on al_wait, too */ 11838c2ecf20Sopenharmony_ci wake_up(&device->al_wait); 11848c2ecf20Sopenharmony_ci} 11858c2ecf20Sopenharmony_ci 11868c2ecf20Sopenharmony_ci/* returns the new drbd_request pointer, if the caller is expected to 11878c2ecf20Sopenharmony_ci * drbd_send_and_submit() it (to save latency), or NULL if we queued the 11888c2ecf20Sopenharmony_ci * request on the submitter thread. 11898c2ecf20Sopenharmony_ci * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request. 11908c2ecf20Sopenharmony_ci */ 11918c2ecf20Sopenharmony_cistatic struct drbd_request * 11928c2ecf20Sopenharmony_cidrbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long start_jif) 11938c2ecf20Sopenharmony_ci{ 11948c2ecf20Sopenharmony_ci const int rw = bio_data_dir(bio); 11958c2ecf20Sopenharmony_ci struct drbd_request *req; 11968c2ecf20Sopenharmony_ci 11978c2ecf20Sopenharmony_ci /* allocate outside of all locks; */ 11988c2ecf20Sopenharmony_ci req = drbd_req_new(device, bio); 11998c2ecf20Sopenharmony_ci if (!req) { 12008c2ecf20Sopenharmony_ci dec_ap_bio(device); 12018c2ecf20Sopenharmony_ci /* only pass the error to the upper layers. 12028c2ecf20Sopenharmony_ci * if user cannot handle io errors, that's not our business. */ 12038c2ecf20Sopenharmony_ci drbd_err(device, "could not kmalloc() req\n"); 12048c2ecf20Sopenharmony_ci bio->bi_status = BLK_STS_RESOURCE; 12058c2ecf20Sopenharmony_ci bio_endio(bio); 12068c2ecf20Sopenharmony_ci return ERR_PTR(-ENOMEM); 12078c2ecf20Sopenharmony_ci } 12088c2ecf20Sopenharmony_ci 12098c2ecf20Sopenharmony_ci /* Update disk stats */ 12108c2ecf20Sopenharmony_ci req->start_jif = bio_start_io_acct(req->master_bio); 12118c2ecf20Sopenharmony_ci 12128c2ecf20Sopenharmony_ci if (!get_ldev(device)) { 12138c2ecf20Sopenharmony_ci bio_put(req->private_bio); 12148c2ecf20Sopenharmony_ci req->private_bio = NULL; 12158c2ecf20Sopenharmony_ci } 12168c2ecf20Sopenharmony_ci 12178c2ecf20Sopenharmony_ci /* process discards always from our submitter thread */ 12188c2ecf20Sopenharmony_ci if (bio_op(bio) == REQ_OP_WRITE_ZEROES || 12198c2ecf20Sopenharmony_ci bio_op(bio) == REQ_OP_DISCARD) 12208c2ecf20Sopenharmony_ci goto queue_for_submitter_thread; 12218c2ecf20Sopenharmony_ci 12228c2ecf20Sopenharmony_ci if (rw == WRITE && req->private_bio && req->i.size 12238c2ecf20Sopenharmony_ci && !test_bit(AL_SUSPENDED, &device->flags)) { 12248c2ecf20Sopenharmony_ci if (!drbd_al_begin_io_fastpath(device, &req->i)) 12258c2ecf20Sopenharmony_ci goto queue_for_submitter_thread; 12268c2ecf20Sopenharmony_ci req->rq_state |= RQ_IN_ACT_LOG; 12278c2ecf20Sopenharmony_ci req->in_actlog_jif = jiffies; 12288c2ecf20Sopenharmony_ci } 12298c2ecf20Sopenharmony_ci return req; 12308c2ecf20Sopenharmony_ci 12318c2ecf20Sopenharmony_ci queue_for_submitter_thread: 12328c2ecf20Sopenharmony_ci atomic_inc(&device->ap_actlog_cnt); 12338c2ecf20Sopenharmony_ci drbd_queue_write(device, req); 12348c2ecf20Sopenharmony_ci return NULL; 12358c2ecf20Sopenharmony_ci} 12368c2ecf20Sopenharmony_ci 12378c2ecf20Sopenharmony_ci/* Require at least one path to current data. 12388c2ecf20Sopenharmony_ci * We don't want to allow writes on C_STANDALONE D_INCONSISTENT: 12398c2ecf20Sopenharmony_ci * We would not allow to read what was written, 12408c2ecf20Sopenharmony_ci * we would not have bumped the data generation uuids, 12418c2ecf20Sopenharmony_ci * we would cause data divergence for all the wrong reasons. 12428c2ecf20Sopenharmony_ci * 12438c2ecf20Sopenharmony_ci * If we don't see at least one D_UP_TO_DATE, we will fail this request, 12448c2ecf20Sopenharmony_ci * which either returns EIO, or, if OND_SUSPEND_IO is set, suspends IO, 12458c2ecf20Sopenharmony_ci * and queues for retry later. 12468c2ecf20Sopenharmony_ci */ 12478c2ecf20Sopenharmony_cistatic bool may_do_writes(struct drbd_device *device) 12488c2ecf20Sopenharmony_ci{ 12498c2ecf20Sopenharmony_ci const union drbd_dev_state s = device->state; 12508c2ecf20Sopenharmony_ci return s.disk == D_UP_TO_DATE || s.pdsk == D_UP_TO_DATE; 12518c2ecf20Sopenharmony_ci} 12528c2ecf20Sopenharmony_ci 12538c2ecf20Sopenharmony_cistruct drbd_plug_cb { 12548c2ecf20Sopenharmony_ci struct blk_plug_cb cb; 12558c2ecf20Sopenharmony_ci struct drbd_request *most_recent_req; 12568c2ecf20Sopenharmony_ci /* do we need more? */ 12578c2ecf20Sopenharmony_ci}; 12588c2ecf20Sopenharmony_ci 12598c2ecf20Sopenharmony_cistatic void drbd_unplug(struct blk_plug_cb *cb, bool from_schedule) 12608c2ecf20Sopenharmony_ci{ 12618c2ecf20Sopenharmony_ci struct drbd_plug_cb *plug = container_of(cb, struct drbd_plug_cb, cb); 12628c2ecf20Sopenharmony_ci struct drbd_resource *resource = plug->cb.data; 12638c2ecf20Sopenharmony_ci struct drbd_request *req = plug->most_recent_req; 12648c2ecf20Sopenharmony_ci 12658c2ecf20Sopenharmony_ci kfree(cb); 12668c2ecf20Sopenharmony_ci if (!req) 12678c2ecf20Sopenharmony_ci return; 12688c2ecf20Sopenharmony_ci 12698c2ecf20Sopenharmony_ci spin_lock_irq(&resource->req_lock); 12708c2ecf20Sopenharmony_ci /* In case the sender did not process it yet, raise the flag to 12718c2ecf20Sopenharmony_ci * have it followed with P_UNPLUG_REMOTE just after. */ 12728c2ecf20Sopenharmony_ci req->rq_state |= RQ_UNPLUG; 12738c2ecf20Sopenharmony_ci /* but also queue a generic unplug */ 12748c2ecf20Sopenharmony_ci drbd_queue_unplug(req->device); 12758c2ecf20Sopenharmony_ci kref_put(&req->kref, drbd_req_destroy); 12768c2ecf20Sopenharmony_ci spin_unlock_irq(&resource->req_lock); 12778c2ecf20Sopenharmony_ci} 12788c2ecf20Sopenharmony_ci 12798c2ecf20Sopenharmony_cistatic struct drbd_plug_cb* drbd_check_plugged(struct drbd_resource *resource) 12808c2ecf20Sopenharmony_ci{ 12818c2ecf20Sopenharmony_ci /* A lot of text to say 12828c2ecf20Sopenharmony_ci * return (struct drbd_plug_cb*)blk_check_plugged(); */ 12838c2ecf20Sopenharmony_ci struct drbd_plug_cb *plug; 12848c2ecf20Sopenharmony_ci struct blk_plug_cb *cb = blk_check_plugged(drbd_unplug, resource, sizeof(*plug)); 12858c2ecf20Sopenharmony_ci 12868c2ecf20Sopenharmony_ci if (cb) 12878c2ecf20Sopenharmony_ci plug = container_of(cb, struct drbd_plug_cb, cb); 12888c2ecf20Sopenharmony_ci else 12898c2ecf20Sopenharmony_ci plug = NULL; 12908c2ecf20Sopenharmony_ci return plug; 12918c2ecf20Sopenharmony_ci} 12928c2ecf20Sopenharmony_ci 12938c2ecf20Sopenharmony_cistatic void drbd_update_plug(struct drbd_plug_cb *plug, struct drbd_request *req) 12948c2ecf20Sopenharmony_ci{ 12958c2ecf20Sopenharmony_ci struct drbd_request *tmp = plug->most_recent_req; 12968c2ecf20Sopenharmony_ci /* Will be sent to some peer. 12978c2ecf20Sopenharmony_ci * Remember to tag it with UNPLUG_REMOTE on unplug */ 12988c2ecf20Sopenharmony_ci kref_get(&req->kref); 12998c2ecf20Sopenharmony_ci plug->most_recent_req = req; 13008c2ecf20Sopenharmony_ci if (tmp) 13018c2ecf20Sopenharmony_ci kref_put(&tmp->kref, drbd_req_destroy); 13028c2ecf20Sopenharmony_ci} 13038c2ecf20Sopenharmony_ci 13048c2ecf20Sopenharmony_cistatic void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req) 13058c2ecf20Sopenharmony_ci{ 13068c2ecf20Sopenharmony_ci struct drbd_resource *resource = device->resource; 13078c2ecf20Sopenharmony_ci const int rw = bio_data_dir(req->master_bio); 13088c2ecf20Sopenharmony_ci struct bio_and_error m = { NULL, }; 13098c2ecf20Sopenharmony_ci bool no_remote = false; 13108c2ecf20Sopenharmony_ci bool submit_private_bio = false; 13118c2ecf20Sopenharmony_ci 13128c2ecf20Sopenharmony_ci spin_lock_irq(&resource->req_lock); 13138c2ecf20Sopenharmony_ci if (rw == WRITE) { 13148c2ecf20Sopenharmony_ci /* This may temporarily give up the req_lock, 13158c2ecf20Sopenharmony_ci * but will re-aquire it before it returns here. 13168c2ecf20Sopenharmony_ci * Needs to be before the check on drbd_suspended() */ 13178c2ecf20Sopenharmony_ci complete_conflicting_writes(req); 13188c2ecf20Sopenharmony_ci /* no more giving up req_lock from now on! */ 13198c2ecf20Sopenharmony_ci 13208c2ecf20Sopenharmony_ci /* check for congestion, and potentially stop sending 13218c2ecf20Sopenharmony_ci * full data updates, but start sending "dirty bits" only. */ 13228c2ecf20Sopenharmony_ci maybe_pull_ahead(device); 13238c2ecf20Sopenharmony_ci } 13248c2ecf20Sopenharmony_ci 13258c2ecf20Sopenharmony_ci 13268c2ecf20Sopenharmony_ci if (drbd_suspended(device)) { 13278c2ecf20Sopenharmony_ci /* push back and retry: */ 13288c2ecf20Sopenharmony_ci req->rq_state |= RQ_POSTPONED; 13298c2ecf20Sopenharmony_ci if (req->private_bio) { 13308c2ecf20Sopenharmony_ci bio_put(req->private_bio); 13318c2ecf20Sopenharmony_ci req->private_bio = NULL; 13328c2ecf20Sopenharmony_ci put_ldev(device); 13338c2ecf20Sopenharmony_ci } 13348c2ecf20Sopenharmony_ci goto out; 13358c2ecf20Sopenharmony_ci } 13368c2ecf20Sopenharmony_ci 13378c2ecf20Sopenharmony_ci /* We fail READ early, if we can not serve it. 13388c2ecf20Sopenharmony_ci * We must do this before req is registered on any lists. 13398c2ecf20Sopenharmony_ci * Otherwise, drbd_req_complete() will queue failed READ for retry. */ 13408c2ecf20Sopenharmony_ci if (rw != WRITE) { 13418c2ecf20Sopenharmony_ci if (!do_remote_read(req) && !req->private_bio) 13428c2ecf20Sopenharmony_ci goto nodata; 13438c2ecf20Sopenharmony_ci } 13448c2ecf20Sopenharmony_ci 13458c2ecf20Sopenharmony_ci /* which transfer log epoch does this belong to? */ 13468c2ecf20Sopenharmony_ci req->epoch = atomic_read(&first_peer_device(device)->connection->current_tle_nr); 13478c2ecf20Sopenharmony_ci 13488c2ecf20Sopenharmony_ci /* no point in adding empty flushes to the transfer log, 13498c2ecf20Sopenharmony_ci * they are mapped to drbd barriers already. */ 13508c2ecf20Sopenharmony_ci if (likely(req->i.size!=0)) { 13518c2ecf20Sopenharmony_ci if (rw == WRITE) 13528c2ecf20Sopenharmony_ci first_peer_device(device)->connection->current_tle_writes++; 13538c2ecf20Sopenharmony_ci 13548c2ecf20Sopenharmony_ci list_add_tail(&req->tl_requests, &first_peer_device(device)->connection->transfer_log); 13558c2ecf20Sopenharmony_ci } 13568c2ecf20Sopenharmony_ci 13578c2ecf20Sopenharmony_ci if (rw == WRITE) { 13588c2ecf20Sopenharmony_ci if (req->private_bio && !may_do_writes(device)) { 13598c2ecf20Sopenharmony_ci bio_put(req->private_bio); 13608c2ecf20Sopenharmony_ci req->private_bio = NULL; 13618c2ecf20Sopenharmony_ci put_ldev(device); 13628c2ecf20Sopenharmony_ci goto nodata; 13638c2ecf20Sopenharmony_ci } 13648c2ecf20Sopenharmony_ci if (!drbd_process_write_request(req)) 13658c2ecf20Sopenharmony_ci no_remote = true; 13668c2ecf20Sopenharmony_ci } else { 13678c2ecf20Sopenharmony_ci /* We either have a private_bio, or we can read from remote. 13688c2ecf20Sopenharmony_ci * Otherwise we had done the goto nodata above. */ 13698c2ecf20Sopenharmony_ci if (req->private_bio == NULL) { 13708c2ecf20Sopenharmony_ci _req_mod(req, TO_BE_SENT); 13718c2ecf20Sopenharmony_ci _req_mod(req, QUEUE_FOR_NET_READ); 13728c2ecf20Sopenharmony_ci } else 13738c2ecf20Sopenharmony_ci no_remote = true; 13748c2ecf20Sopenharmony_ci } 13758c2ecf20Sopenharmony_ci 13768c2ecf20Sopenharmony_ci if (no_remote == false) { 13778c2ecf20Sopenharmony_ci struct drbd_plug_cb *plug = drbd_check_plugged(resource); 13788c2ecf20Sopenharmony_ci if (plug) 13798c2ecf20Sopenharmony_ci drbd_update_plug(plug, req); 13808c2ecf20Sopenharmony_ci } 13818c2ecf20Sopenharmony_ci 13828c2ecf20Sopenharmony_ci /* If it took the fast path in drbd_request_prepare, add it here. 13838c2ecf20Sopenharmony_ci * The slow path has added it already. */ 13848c2ecf20Sopenharmony_ci if (list_empty(&req->req_pending_master_completion)) 13858c2ecf20Sopenharmony_ci list_add_tail(&req->req_pending_master_completion, 13868c2ecf20Sopenharmony_ci &device->pending_master_completion[rw == WRITE]); 13878c2ecf20Sopenharmony_ci if (req->private_bio) { 13888c2ecf20Sopenharmony_ci /* needs to be marked within the same spinlock */ 13898c2ecf20Sopenharmony_ci req->pre_submit_jif = jiffies; 13908c2ecf20Sopenharmony_ci list_add_tail(&req->req_pending_local, 13918c2ecf20Sopenharmony_ci &device->pending_completion[rw == WRITE]); 13928c2ecf20Sopenharmony_ci _req_mod(req, TO_BE_SUBMITTED); 13938c2ecf20Sopenharmony_ci /* but we need to give up the spinlock to submit */ 13948c2ecf20Sopenharmony_ci submit_private_bio = true; 13958c2ecf20Sopenharmony_ci } else if (no_remote) { 13968c2ecf20Sopenharmony_cinodata: 13978c2ecf20Sopenharmony_ci if (__ratelimit(&drbd_ratelimit_state)) 13988c2ecf20Sopenharmony_ci drbd_err(device, "IO ERROR: neither local nor remote data, sector %llu+%u\n", 13998c2ecf20Sopenharmony_ci (unsigned long long)req->i.sector, req->i.size >> 9); 14008c2ecf20Sopenharmony_ci /* A write may have been queued for send_oos, however. 14018c2ecf20Sopenharmony_ci * So we can not simply free it, we must go through drbd_req_put_completion_ref() */ 14028c2ecf20Sopenharmony_ci } 14038c2ecf20Sopenharmony_ci 14048c2ecf20Sopenharmony_ciout: 14058c2ecf20Sopenharmony_ci drbd_req_put_completion_ref(req, &m, 1); 14068c2ecf20Sopenharmony_ci spin_unlock_irq(&resource->req_lock); 14078c2ecf20Sopenharmony_ci 14088c2ecf20Sopenharmony_ci /* Even though above is a kref_put(), this is safe. 14098c2ecf20Sopenharmony_ci * As long as we still need to submit our private bio, 14108c2ecf20Sopenharmony_ci * we hold a completion ref, and the request cannot disappear. 14118c2ecf20Sopenharmony_ci * If however this request did not even have a private bio to submit 14128c2ecf20Sopenharmony_ci * (e.g. remote read), req may already be invalid now. 14138c2ecf20Sopenharmony_ci * That's why we cannot check on req->private_bio. */ 14148c2ecf20Sopenharmony_ci if (submit_private_bio) 14158c2ecf20Sopenharmony_ci drbd_submit_req_private_bio(req); 14168c2ecf20Sopenharmony_ci if (m.bio) 14178c2ecf20Sopenharmony_ci complete_master_bio(device, &m); 14188c2ecf20Sopenharmony_ci} 14198c2ecf20Sopenharmony_ci 14208c2ecf20Sopenharmony_civoid __drbd_make_request(struct drbd_device *device, struct bio *bio, unsigned long start_jif) 14218c2ecf20Sopenharmony_ci{ 14228c2ecf20Sopenharmony_ci struct drbd_request *req = drbd_request_prepare(device, bio, start_jif); 14238c2ecf20Sopenharmony_ci if (IS_ERR_OR_NULL(req)) 14248c2ecf20Sopenharmony_ci return; 14258c2ecf20Sopenharmony_ci drbd_send_and_submit(device, req); 14268c2ecf20Sopenharmony_ci} 14278c2ecf20Sopenharmony_ci 14288c2ecf20Sopenharmony_cistatic void submit_fast_path(struct drbd_device *device, struct list_head *incoming) 14298c2ecf20Sopenharmony_ci{ 14308c2ecf20Sopenharmony_ci struct blk_plug plug; 14318c2ecf20Sopenharmony_ci struct drbd_request *req, *tmp; 14328c2ecf20Sopenharmony_ci 14338c2ecf20Sopenharmony_ci blk_start_plug(&plug); 14348c2ecf20Sopenharmony_ci list_for_each_entry_safe(req, tmp, incoming, tl_requests) { 14358c2ecf20Sopenharmony_ci const int rw = bio_data_dir(req->master_bio); 14368c2ecf20Sopenharmony_ci 14378c2ecf20Sopenharmony_ci if (rw == WRITE /* rw != WRITE should not even end up here! */ 14388c2ecf20Sopenharmony_ci && req->private_bio && req->i.size 14398c2ecf20Sopenharmony_ci && !test_bit(AL_SUSPENDED, &device->flags)) { 14408c2ecf20Sopenharmony_ci if (!drbd_al_begin_io_fastpath(device, &req->i)) 14418c2ecf20Sopenharmony_ci continue; 14428c2ecf20Sopenharmony_ci 14438c2ecf20Sopenharmony_ci req->rq_state |= RQ_IN_ACT_LOG; 14448c2ecf20Sopenharmony_ci req->in_actlog_jif = jiffies; 14458c2ecf20Sopenharmony_ci atomic_dec(&device->ap_actlog_cnt); 14468c2ecf20Sopenharmony_ci } 14478c2ecf20Sopenharmony_ci 14488c2ecf20Sopenharmony_ci list_del_init(&req->tl_requests); 14498c2ecf20Sopenharmony_ci drbd_send_and_submit(device, req); 14508c2ecf20Sopenharmony_ci } 14518c2ecf20Sopenharmony_ci blk_finish_plug(&plug); 14528c2ecf20Sopenharmony_ci} 14538c2ecf20Sopenharmony_ci 14548c2ecf20Sopenharmony_cistatic bool prepare_al_transaction_nonblock(struct drbd_device *device, 14558c2ecf20Sopenharmony_ci struct list_head *incoming, 14568c2ecf20Sopenharmony_ci struct list_head *pending, 14578c2ecf20Sopenharmony_ci struct list_head *later) 14588c2ecf20Sopenharmony_ci{ 14598c2ecf20Sopenharmony_ci struct drbd_request *req; 14608c2ecf20Sopenharmony_ci int wake = 0; 14618c2ecf20Sopenharmony_ci int err; 14628c2ecf20Sopenharmony_ci 14638c2ecf20Sopenharmony_ci spin_lock_irq(&device->al_lock); 14648c2ecf20Sopenharmony_ci while ((req = list_first_entry_or_null(incoming, struct drbd_request, tl_requests))) { 14658c2ecf20Sopenharmony_ci err = drbd_al_begin_io_nonblock(device, &req->i); 14668c2ecf20Sopenharmony_ci if (err == -ENOBUFS) 14678c2ecf20Sopenharmony_ci break; 14688c2ecf20Sopenharmony_ci if (err == -EBUSY) 14698c2ecf20Sopenharmony_ci wake = 1; 14708c2ecf20Sopenharmony_ci if (err) 14718c2ecf20Sopenharmony_ci list_move_tail(&req->tl_requests, later); 14728c2ecf20Sopenharmony_ci else 14738c2ecf20Sopenharmony_ci list_move_tail(&req->tl_requests, pending); 14748c2ecf20Sopenharmony_ci } 14758c2ecf20Sopenharmony_ci spin_unlock_irq(&device->al_lock); 14768c2ecf20Sopenharmony_ci if (wake) 14778c2ecf20Sopenharmony_ci wake_up(&device->al_wait); 14788c2ecf20Sopenharmony_ci return !list_empty(pending); 14798c2ecf20Sopenharmony_ci} 14808c2ecf20Sopenharmony_ci 14818c2ecf20Sopenharmony_cistatic void send_and_submit_pending(struct drbd_device *device, struct list_head *pending) 14828c2ecf20Sopenharmony_ci{ 14838c2ecf20Sopenharmony_ci struct blk_plug plug; 14848c2ecf20Sopenharmony_ci struct drbd_request *req; 14858c2ecf20Sopenharmony_ci 14868c2ecf20Sopenharmony_ci blk_start_plug(&plug); 14878c2ecf20Sopenharmony_ci while ((req = list_first_entry_or_null(pending, struct drbd_request, tl_requests))) { 14888c2ecf20Sopenharmony_ci req->rq_state |= RQ_IN_ACT_LOG; 14898c2ecf20Sopenharmony_ci req->in_actlog_jif = jiffies; 14908c2ecf20Sopenharmony_ci atomic_dec(&device->ap_actlog_cnt); 14918c2ecf20Sopenharmony_ci list_del_init(&req->tl_requests); 14928c2ecf20Sopenharmony_ci drbd_send_and_submit(device, req); 14938c2ecf20Sopenharmony_ci } 14948c2ecf20Sopenharmony_ci blk_finish_plug(&plug); 14958c2ecf20Sopenharmony_ci} 14968c2ecf20Sopenharmony_ci 14978c2ecf20Sopenharmony_civoid do_submit(struct work_struct *ws) 14988c2ecf20Sopenharmony_ci{ 14998c2ecf20Sopenharmony_ci struct drbd_device *device = container_of(ws, struct drbd_device, submit.worker); 15008c2ecf20Sopenharmony_ci LIST_HEAD(incoming); /* from drbd_make_request() */ 15018c2ecf20Sopenharmony_ci LIST_HEAD(pending); /* to be submitted after next AL-transaction commit */ 15028c2ecf20Sopenharmony_ci LIST_HEAD(busy); /* blocked by resync requests */ 15038c2ecf20Sopenharmony_ci 15048c2ecf20Sopenharmony_ci /* grab new incoming requests */ 15058c2ecf20Sopenharmony_ci spin_lock_irq(&device->resource->req_lock); 15068c2ecf20Sopenharmony_ci list_splice_tail_init(&device->submit.writes, &incoming); 15078c2ecf20Sopenharmony_ci spin_unlock_irq(&device->resource->req_lock); 15088c2ecf20Sopenharmony_ci 15098c2ecf20Sopenharmony_ci for (;;) { 15108c2ecf20Sopenharmony_ci DEFINE_WAIT(wait); 15118c2ecf20Sopenharmony_ci 15128c2ecf20Sopenharmony_ci /* move used-to-be-busy back to front of incoming */ 15138c2ecf20Sopenharmony_ci list_splice_init(&busy, &incoming); 15148c2ecf20Sopenharmony_ci submit_fast_path(device, &incoming); 15158c2ecf20Sopenharmony_ci if (list_empty(&incoming)) 15168c2ecf20Sopenharmony_ci break; 15178c2ecf20Sopenharmony_ci 15188c2ecf20Sopenharmony_ci for (;;) { 15198c2ecf20Sopenharmony_ci prepare_to_wait(&device->al_wait, &wait, TASK_UNINTERRUPTIBLE); 15208c2ecf20Sopenharmony_ci 15218c2ecf20Sopenharmony_ci list_splice_init(&busy, &incoming); 15228c2ecf20Sopenharmony_ci prepare_al_transaction_nonblock(device, &incoming, &pending, &busy); 15238c2ecf20Sopenharmony_ci if (!list_empty(&pending)) 15248c2ecf20Sopenharmony_ci break; 15258c2ecf20Sopenharmony_ci 15268c2ecf20Sopenharmony_ci schedule(); 15278c2ecf20Sopenharmony_ci 15288c2ecf20Sopenharmony_ci /* If all currently "hot" activity log extents are kept busy by 15298c2ecf20Sopenharmony_ci * incoming requests, we still must not totally starve new 15308c2ecf20Sopenharmony_ci * requests to "cold" extents. 15318c2ecf20Sopenharmony_ci * Something left on &incoming means there had not been 15328c2ecf20Sopenharmony_ci * enough update slots available, and the activity log 15338c2ecf20Sopenharmony_ci * has been marked as "starving". 15348c2ecf20Sopenharmony_ci * 15358c2ecf20Sopenharmony_ci * Try again now, without looking for new requests, 15368c2ecf20Sopenharmony_ci * effectively blocking all new requests until we made 15378c2ecf20Sopenharmony_ci * at least _some_ progress with what we currently have. 15388c2ecf20Sopenharmony_ci */ 15398c2ecf20Sopenharmony_ci if (!list_empty(&incoming)) 15408c2ecf20Sopenharmony_ci continue; 15418c2ecf20Sopenharmony_ci 15428c2ecf20Sopenharmony_ci /* Nothing moved to pending, but nothing left 15438c2ecf20Sopenharmony_ci * on incoming: all moved to busy! 15448c2ecf20Sopenharmony_ci * Grab new and iterate. */ 15458c2ecf20Sopenharmony_ci spin_lock_irq(&device->resource->req_lock); 15468c2ecf20Sopenharmony_ci list_splice_tail_init(&device->submit.writes, &incoming); 15478c2ecf20Sopenharmony_ci spin_unlock_irq(&device->resource->req_lock); 15488c2ecf20Sopenharmony_ci } 15498c2ecf20Sopenharmony_ci finish_wait(&device->al_wait, &wait); 15508c2ecf20Sopenharmony_ci 15518c2ecf20Sopenharmony_ci /* If the transaction was full, before all incoming requests 15528c2ecf20Sopenharmony_ci * had been processed, skip ahead to commit, and iterate 15538c2ecf20Sopenharmony_ci * without splicing in more incoming requests from upper layers. 15548c2ecf20Sopenharmony_ci * 15558c2ecf20Sopenharmony_ci * Else, if all incoming have been processed, 15568c2ecf20Sopenharmony_ci * they have become either "pending" (to be submitted after 15578c2ecf20Sopenharmony_ci * next transaction commit) or "busy" (blocked by resync). 15588c2ecf20Sopenharmony_ci * 15598c2ecf20Sopenharmony_ci * Maybe more was queued, while we prepared the transaction? 15608c2ecf20Sopenharmony_ci * Try to stuff those into this transaction as well. 15618c2ecf20Sopenharmony_ci * Be strictly non-blocking here, 15628c2ecf20Sopenharmony_ci * we already have something to commit. 15638c2ecf20Sopenharmony_ci * 15648c2ecf20Sopenharmony_ci * Commit if we don't make any more progres. 15658c2ecf20Sopenharmony_ci */ 15668c2ecf20Sopenharmony_ci 15678c2ecf20Sopenharmony_ci while (list_empty(&incoming)) { 15688c2ecf20Sopenharmony_ci LIST_HEAD(more_pending); 15698c2ecf20Sopenharmony_ci LIST_HEAD(more_incoming); 15708c2ecf20Sopenharmony_ci bool made_progress; 15718c2ecf20Sopenharmony_ci 15728c2ecf20Sopenharmony_ci /* It is ok to look outside the lock, 15738c2ecf20Sopenharmony_ci * it's only an optimization anyways */ 15748c2ecf20Sopenharmony_ci if (list_empty(&device->submit.writes)) 15758c2ecf20Sopenharmony_ci break; 15768c2ecf20Sopenharmony_ci 15778c2ecf20Sopenharmony_ci spin_lock_irq(&device->resource->req_lock); 15788c2ecf20Sopenharmony_ci list_splice_tail_init(&device->submit.writes, &more_incoming); 15798c2ecf20Sopenharmony_ci spin_unlock_irq(&device->resource->req_lock); 15808c2ecf20Sopenharmony_ci 15818c2ecf20Sopenharmony_ci if (list_empty(&more_incoming)) 15828c2ecf20Sopenharmony_ci break; 15838c2ecf20Sopenharmony_ci 15848c2ecf20Sopenharmony_ci made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending, &busy); 15858c2ecf20Sopenharmony_ci 15868c2ecf20Sopenharmony_ci list_splice_tail_init(&more_pending, &pending); 15878c2ecf20Sopenharmony_ci list_splice_tail_init(&more_incoming, &incoming); 15888c2ecf20Sopenharmony_ci if (!made_progress) 15898c2ecf20Sopenharmony_ci break; 15908c2ecf20Sopenharmony_ci } 15918c2ecf20Sopenharmony_ci 15928c2ecf20Sopenharmony_ci drbd_al_begin_io_commit(device); 15938c2ecf20Sopenharmony_ci send_and_submit_pending(device, &pending); 15948c2ecf20Sopenharmony_ci } 15958c2ecf20Sopenharmony_ci} 15968c2ecf20Sopenharmony_ci 15978c2ecf20Sopenharmony_ciblk_qc_t drbd_submit_bio(struct bio *bio) 15988c2ecf20Sopenharmony_ci{ 15998c2ecf20Sopenharmony_ci struct drbd_device *device = bio->bi_disk->private_data; 16008c2ecf20Sopenharmony_ci unsigned long start_jif; 16018c2ecf20Sopenharmony_ci 16028c2ecf20Sopenharmony_ci blk_queue_split(&bio); 16038c2ecf20Sopenharmony_ci 16048c2ecf20Sopenharmony_ci start_jif = jiffies; 16058c2ecf20Sopenharmony_ci 16068c2ecf20Sopenharmony_ci /* 16078c2ecf20Sopenharmony_ci * what we "blindly" assume: 16088c2ecf20Sopenharmony_ci */ 16098c2ecf20Sopenharmony_ci D_ASSERT(device, IS_ALIGNED(bio->bi_iter.bi_size, 512)); 16108c2ecf20Sopenharmony_ci 16118c2ecf20Sopenharmony_ci inc_ap_bio(device); 16128c2ecf20Sopenharmony_ci __drbd_make_request(device, bio, start_jif); 16138c2ecf20Sopenharmony_ci return BLK_QC_T_NONE; 16148c2ecf20Sopenharmony_ci} 16158c2ecf20Sopenharmony_ci 16168c2ecf20Sopenharmony_cistatic bool net_timeout_reached(struct drbd_request *net_req, 16178c2ecf20Sopenharmony_ci struct drbd_connection *connection, 16188c2ecf20Sopenharmony_ci unsigned long now, unsigned long ent, 16198c2ecf20Sopenharmony_ci unsigned int ko_count, unsigned int timeout) 16208c2ecf20Sopenharmony_ci{ 16218c2ecf20Sopenharmony_ci struct drbd_device *device = net_req->device; 16228c2ecf20Sopenharmony_ci 16238c2ecf20Sopenharmony_ci if (!time_after(now, net_req->pre_send_jif + ent)) 16248c2ecf20Sopenharmony_ci return false; 16258c2ecf20Sopenharmony_ci 16268c2ecf20Sopenharmony_ci if (time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) 16278c2ecf20Sopenharmony_ci return false; 16288c2ecf20Sopenharmony_ci 16298c2ecf20Sopenharmony_ci if (net_req->rq_state & RQ_NET_PENDING) { 16308c2ecf20Sopenharmony_ci drbd_warn(device, "Remote failed to finish a request within %ums > ko-count (%u) * timeout (%u * 0.1s)\n", 16318c2ecf20Sopenharmony_ci jiffies_to_msecs(now - net_req->pre_send_jif), ko_count, timeout); 16328c2ecf20Sopenharmony_ci return true; 16338c2ecf20Sopenharmony_ci } 16348c2ecf20Sopenharmony_ci 16358c2ecf20Sopenharmony_ci /* We received an ACK already (or are using protocol A), 16368c2ecf20Sopenharmony_ci * but are waiting for the epoch closing barrier ack. 16378c2ecf20Sopenharmony_ci * Check if we sent the barrier already. We should not blame the peer 16388c2ecf20Sopenharmony_ci * for being unresponsive, if we did not even ask it yet. */ 16398c2ecf20Sopenharmony_ci if (net_req->epoch == connection->send.current_epoch_nr) { 16408c2ecf20Sopenharmony_ci drbd_warn(device, 16418c2ecf20Sopenharmony_ci "We did not send a P_BARRIER for %ums > ko-count (%u) * timeout (%u * 0.1s); drbd kernel thread blocked?\n", 16428c2ecf20Sopenharmony_ci jiffies_to_msecs(now - net_req->pre_send_jif), ko_count, timeout); 16438c2ecf20Sopenharmony_ci return false; 16448c2ecf20Sopenharmony_ci } 16458c2ecf20Sopenharmony_ci 16468c2ecf20Sopenharmony_ci /* Worst case: we may have been blocked for whatever reason, then 16478c2ecf20Sopenharmony_ci * suddenly are able to send a lot of requests (and epoch separating 16488c2ecf20Sopenharmony_ci * barriers) in quick succession. 16498c2ecf20Sopenharmony_ci * The timestamp of the net_req may be much too old and not correspond 16508c2ecf20Sopenharmony_ci * to the sending time of the relevant unack'ed barrier packet, so 16518c2ecf20Sopenharmony_ci * would trigger a spurious timeout. The latest barrier packet may 16528c2ecf20Sopenharmony_ci * have a too recent timestamp to trigger the timeout, potentially miss 16538c2ecf20Sopenharmony_ci * a timeout. Right now we don't have a place to conveniently store 16548c2ecf20Sopenharmony_ci * these timestamps. 16558c2ecf20Sopenharmony_ci * But in this particular situation, the application requests are still 16568c2ecf20Sopenharmony_ci * completed to upper layers, DRBD should still "feel" responsive. 16578c2ecf20Sopenharmony_ci * No need yet to kill this connection, it may still recover. 16588c2ecf20Sopenharmony_ci * If not, eventually we will have queued enough into the network for 16598c2ecf20Sopenharmony_ci * us to block. From that point of view, the timestamp of the last sent 16608c2ecf20Sopenharmony_ci * barrier packet is relevant enough. 16618c2ecf20Sopenharmony_ci */ 16628c2ecf20Sopenharmony_ci if (time_after(now, connection->send.last_sent_barrier_jif + ent)) { 16638c2ecf20Sopenharmony_ci drbd_warn(device, "Remote failed to answer a P_BARRIER (sent at %lu jif; now=%lu jif) within %ums > ko-count (%u) * timeout (%u * 0.1s)\n", 16648c2ecf20Sopenharmony_ci connection->send.last_sent_barrier_jif, now, 16658c2ecf20Sopenharmony_ci jiffies_to_msecs(now - connection->send.last_sent_barrier_jif), ko_count, timeout); 16668c2ecf20Sopenharmony_ci return true; 16678c2ecf20Sopenharmony_ci } 16688c2ecf20Sopenharmony_ci return false; 16698c2ecf20Sopenharmony_ci} 16708c2ecf20Sopenharmony_ci 16718c2ecf20Sopenharmony_ci/* A request is considered timed out, if 16728c2ecf20Sopenharmony_ci * - we have some effective timeout from the configuration, 16738c2ecf20Sopenharmony_ci * with some state restrictions applied, 16748c2ecf20Sopenharmony_ci * - the oldest request is waiting for a response from the network 16758c2ecf20Sopenharmony_ci * resp. the local disk, 16768c2ecf20Sopenharmony_ci * - the oldest request is in fact older than the effective timeout, 16778c2ecf20Sopenharmony_ci * - the connection was established (resp. disk was attached) 16788c2ecf20Sopenharmony_ci * for longer than the timeout already. 16798c2ecf20Sopenharmony_ci * Note that for 32bit jiffies and very stable connections/disks, 16808c2ecf20Sopenharmony_ci * we may have a wrap around, which is catched by 16818c2ecf20Sopenharmony_ci * !time_in_range(now, last_..._jif, last_..._jif + timeout). 16828c2ecf20Sopenharmony_ci * 16838c2ecf20Sopenharmony_ci * Side effect: once per 32bit wrap-around interval, which means every 16848c2ecf20Sopenharmony_ci * ~198 days with 250 HZ, we have a window where the timeout would need 16858c2ecf20Sopenharmony_ci * to expire twice (worst case) to become effective. Good enough. 16868c2ecf20Sopenharmony_ci */ 16878c2ecf20Sopenharmony_ci 16888c2ecf20Sopenharmony_civoid request_timer_fn(struct timer_list *t) 16898c2ecf20Sopenharmony_ci{ 16908c2ecf20Sopenharmony_ci struct drbd_device *device = from_timer(device, t, request_timer); 16918c2ecf20Sopenharmony_ci struct drbd_connection *connection = first_peer_device(device)->connection; 16928c2ecf20Sopenharmony_ci struct drbd_request *req_read, *req_write, *req_peer; /* oldest request */ 16938c2ecf20Sopenharmony_ci struct net_conf *nc; 16948c2ecf20Sopenharmony_ci unsigned long oldest_submit_jif; 16958c2ecf20Sopenharmony_ci unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ 16968c2ecf20Sopenharmony_ci unsigned long now; 16978c2ecf20Sopenharmony_ci unsigned int ko_count = 0, timeout = 0; 16988c2ecf20Sopenharmony_ci 16998c2ecf20Sopenharmony_ci rcu_read_lock(); 17008c2ecf20Sopenharmony_ci nc = rcu_dereference(connection->net_conf); 17018c2ecf20Sopenharmony_ci if (nc && device->state.conn >= C_WF_REPORT_PARAMS) { 17028c2ecf20Sopenharmony_ci ko_count = nc->ko_count; 17038c2ecf20Sopenharmony_ci timeout = nc->timeout; 17048c2ecf20Sopenharmony_ci } 17058c2ecf20Sopenharmony_ci 17068c2ecf20Sopenharmony_ci if (get_ldev(device)) { /* implicit state.disk >= D_INCONSISTENT */ 17078c2ecf20Sopenharmony_ci dt = rcu_dereference(device->ldev->disk_conf)->disk_timeout * HZ / 10; 17088c2ecf20Sopenharmony_ci put_ldev(device); 17098c2ecf20Sopenharmony_ci } 17108c2ecf20Sopenharmony_ci rcu_read_unlock(); 17118c2ecf20Sopenharmony_ci 17128c2ecf20Sopenharmony_ci 17138c2ecf20Sopenharmony_ci ent = timeout * HZ/10 * ko_count; 17148c2ecf20Sopenharmony_ci et = min_not_zero(dt, ent); 17158c2ecf20Sopenharmony_ci 17168c2ecf20Sopenharmony_ci if (!et) 17178c2ecf20Sopenharmony_ci return; /* Recurring timer stopped */ 17188c2ecf20Sopenharmony_ci 17198c2ecf20Sopenharmony_ci now = jiffies; 17208c2ecf20Sopenharmony_ci nt = now + et; 17218c2ecf20Sopenharmony_ci 17228c2ecf20Sopenharmony_ci spin_lock_irq(&device->resource->req_lock); 17238c2ecf20Sopenharmony_ci req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local); 17248c2ecf20Sopenharmony_ci req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local); 17258c2ecf20Sopenharmony_ci 17268c2ecf20Sopenharmony_ci /* maybe the oldest request waiting for the peer is in fact still 17278c2ecf20Sopenharmony_ci * blocking in tcp sendmsg. That's ok, though, that's handled via the 17288c2ecf20Sopenharmony_ci * socket send timeout, requesting a ping, and bumping ko-count in 17298c2ecf20Sopenharmony_ci * we_should_drop_the_connection(). 17308c2ecf20Sopenharmony_ci */ 17318c2ecf20Sopenharmony_ci 17328c2ecf20Sopenharmony_ci /* check the oldest request we did successfully sent, 17338c2ecf20Sopenharmony_ci * but which is still waiting for an ACK. */ 17348c2ecf20Sopenharmony_ci req_peer = connection->req_ack_pending; 17358c2ecf20Sopenharmony_ci 17368c2ecf20Sopenharmony_ci /* if we don't have such request (e.g. protocoll A) 17378c2ecf20Sopenharmony_ci * check the oldest requests which is still waiting on its epoch 17388c2ecf20Sopenharmony_ci * closing barrier ack. */ 17398c2ecf20Sopenharmony_ci if (!req_peer) 17408c2ecf20Sopenharmony_ci req_peer = connection->req_not_net_done; 17418c2ecf20Sopenharmony_ci 17428c2ecf20Sopenharmony_ci /* evaluate the oldest peer request only in one timer! */ 17438c2ecf20Sopenharmony_ci if (req_peer && req_peer->device != device) 17448c2ecf20Sopenharmony_ci req_peer = NULL; 17458c2ecf20Sopenharmony_ci 17468c2ecf20Sopenharmony_ci /* do we have something to evaluate? */ 17478c2ecf20Sopenharmony_ci if (req_peer == NULL && req_write == NULL && req_read == NULL) 17488c2ecf20Sopenharmony_ci goto out; 17498c2ecf20Sopenharmony_ci 17508c2ecf20Sopenharmony_ci oldest_submit_jif = 17518c2ecf20Sopenharmony_ci (req_write && req_read) 17528c2ecf20Sopenharmony_ci ? ( time_before(req_write->pre_submit_jif, req_read->pre_submit_jif) 17538c2ecf20Sopenharmony_ci ? req_write->pre_submit_jif : req_read->pre_submit_jif ) 17548c2ecf20Sopenharmony_ci : req_write ? req_write->pre_submit_jif 17558c2ecf20Sopenharmony_ci : req_read ? req_read->pre_submit_jif : now; 17568c2ecf20Sopenharmony_ci 17578c2ecf20Sopenharmony_ci if (ent && req_peer && net_timeout_reached(req_peer, connection, now, ent, ko_count, timeout)) 17588c2ecf20Sopenharmony_ci _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_VERBOSE | CS_HARD); 17598c2ecf20Sopenharmony_ci 17608c2ecf20Sopenharmony_ci if (dt && oldest_submit_jif != now && 17618c2ecf20Sopenharmony_ci time_after(now, oldest_submit_jif + dt) && 17628c2ecf20Sopenharmony_ci !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) { 17638c2ecf20Sopenharmony_ci drbd_warn(device, "Local backing device failed to meet the disk-timeout\n"); 17648c2ecf20Sopenharmony_ci __drbd_chk_io_error(device, DRBD_FORCE_DETACH); 17658c2ecf20Sopenharmony_ci } 17668c2ecf20Sopenharmony_ci 17678c2ecf20Sopenharmony_ci /* Reschedule timer for the nearest not already expired timeout. 17688c2ecf20Sopenharmony_ci * Fallback to now + min(effective network timeout, disk timeout). */ 17698c2ecf20Sopenharmony_ci ent = (ent && req_peer && time_before(now, req_peer->pre_send_jif + ent)) 17708c2ecf20Sopenharmony_ci ? req_peer->pre_send_jif + ent : now + et; 17718c2ecf20Sopenharmony_ci dt = (dt && oldest_submit_jif != now && time_before(now, oldest_submit_jif + dt)) 17728c2ecf20Sopenharmony_ci ? oldest_submit_jif + dt : now + et; 17738c2ecf20Sopenharmony_ci nt = time_before(ent, dt) ? ent : dt; 17748c2ecf20Sopenharmony_ciout: 17758c2ecf20Sopenharmony_ci spin_unlock_irq(&device->resource->req_lock); 17768c2ecf20Sopenharmony_ci mod_timer(&device->request_timer, nt); 17778c2ecf20Sopenharmony_ci} 1778