// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Generic SCSI-3 ALUA SCSI Device Handler
 *
 * Copyright (C) 2007-2010 Hannes Reinecke, SUSE Linux Products GmbH.
 * All rights reserved.
 */
8#include <linux/slab.h>
9#include <linux/delay.h>
10#include <linux/module.h>
11#include <asm/unaligned.h>
12#include <scsi/scsi.h>
13#include <scsi/scsi_proto.h>
14#include <scsi/scsi_dbg.h>
15#include <scsi/scsi_eh.h>
16#include <scsi/scsi_dh.h>
17
/* Handler name (used as log prefix and registration name) and version */
#define ALUA_DH_NAME "alua"
#define ALUA_DH_VER "2.0"

/*
 * Bits of the "supported states" byte of an RTPG descriptor, as kept in
 * alua_port_group.valid_states.
 */
#define TPGS_SUPPORT_NONE		0x00
#define TPGS_SUPPORT_OPTIMIZED		0x01
#define TPGS_SUPPORT_NONOPTIMIZED	0x02
#define TPGS_SUPPORT_STANDBY		0x04
#define TPGS_SUPPORT_UNAVAILABLE	0x08
#define TPGS_SUPPORT_LBA_DEPENDENT	0x10
#define TPGS_SUPPORT_OFFLINE		0x40
#define TPGS_SUPPORT_TRANSITION		0x80
#define TPGS_SUPPORT_ALL		0xdf

/* Parameter data format field in byte 4 of the RTPG response */
#define RTPG_FMT_MASK			0x70
#define RTPG_FMT_EXT_HDR		0x10

/* TPGS modes as compared against the result of scsi_device_tpgs() */
#define TPGS_MODE_UNINITIALIZED		 -1
#define TPGS_MODE_NONE			0x0
#define TPGS_MODE_IMPLICIT		0x1
#define TPGS_MODE_EXPLICIT		0x2

/* Initial RTPG response buffer size in bytes */
#define ALUA_RTPG_SIZE			128
/* Command timeout and default transition timeout, in seconds */
#define ALUA_FAILOVER_TIMEOUT		60
/* Retry count handed to the SCSI midlayer for RTPG/STPG/TUR */
#define ALUA_FAILOVER_RETRIES		5
/* Delay in milliseconds before a freshly queued RTPG work item runs */
#define ALUA_RTPG_DELAY_MSECS		5
/* Delay in seconds between RTPG retries */
#define ALUA_RTPG_RETRY_DELAY		2

/* device handler flags */
#define ALUA_OPTIMIZE_STPG		0x01
#define ALUA_RTPG_EXT_HDR_UNSUPP	0x02
/* State machine flags */
#define ALUA_PG_RUN_RTPG		0x10
#define ALUA_PG_RUN_STPG		0x20
#define ALUA_PG_RUNNING			0x40

53static uint optimize_stpg;
54module_param(optimize_stpg, uint, S_IRUGO|S_IWUSR);
55MODULE_PARM_DESC(optimize_stpg, "Allow use of a non-optimized path, rather than sending a STPG, when implicit TPGS is supported (0=No,1=Yes). Default is 0.");
56
57static LIST_HEAD(port_group_list);
58static DEFINE_SPINLOCK(port_group_lock);
59static struct workqueue_struct *kaluad_wq;
60
61struct alua_port_group {
62	struct kref		kref;
63	struct rcu_head		rcu;
64	struct list_head	node;
65	struct list_head	dh_list;
66	unsigned char		device_id_str[256];
67	int			device_id_len;
68	int			group_id;
69	int			tpgs;
70	int			state;
71	int			pref;
72	int			valid_states;
73	unsigned		flags; /* used for optimizing STPG */
74	unsigned char		transition_tmo;
75	unsigned long		expiry;
76	unsigned long		interval;
77	struct delayed_work	rtpg_work;
78	spinlock_t		lock;
79	struct list_head	rtpg_list;
80	struct scsi_device	*rtpg_sdev;
81};
82
83struct alua_dh_data {
84	struct list_head	node;
85	struct alua_port_group __rcu *pg;
86	int			group_id;
87	spinlock_t		pg_lock;
88	struct scsi_device	*sdev;
89	int			init_error;
90	struct mutex		init_mutex;
91};
92
93struct alua_queue_data {
94	struct list_head	entry;
95	activate_complete	callback_fn;
96	void			*callback_data;
97};
98
99#define ALUA_POLICY_SWITCH_CURRENT	0
100#define ALUA_POLICY_SWITCH_ALL		1
101
102static void alua_rtpg_work(struct work_struct *work);
103static bool alua_rtpg_queue(struct alua_port_group *pg,
104			    struct scsi_device *sdev,
105			    struct alua_queue_data *qdata, bool force);
106static void alua_check(struct scsi_device *sdev, bool force);
107
108static void release_port_group(struct kref *kref)
109{
110	struct alua_port_group *pg;
111
112	pg = container_of(kref, struct alua_port_group, kref);
113	if (pg->rtpg_sdev)
114		flush_delayed_work(&pg->rtpg_work);
115	spin_lock(&port_group_lock);
116	list_del(&pg->node);
117	spin_unlock(&port_group_lock);
118	kfree_rcu(pg, rcu);
119}
120
121/*
122 * submit_rtpg - Issue a REPORT TARGET GROUP STATES command
123 * @sdev: sdev the command should be sent to
124 */
125static int submit_rtpg(struct scsi_device *sdev, unsigned char *buff,
126		       int bufflen, struct scsi_sense_hdr *sshdr, int flags)
127{
128	u8 cdb[MAX_COMMAND_SIZE];
129	int req_flags = REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT |
130		REQ_FAILFAST_DRIVER;
131
132	/* Prepare the command. */
133	memset(cdb, 0x0, MAX_COMMAND_SIZE);
134	cdb[0] = MAINTENANCE_IN;
135	if (!(flags & ALUA_RTPG_EXT_HDR_UNSUPP))
136		cdb[1] = MI_REPORT_TARGET_PGS | MI_EXT_HDR_PARAM_FMT;
137	else
138		cdb[1] = MI_REPORT_TARGET_PGS;
139	put_unaligned_be32(bufflen, &cdb[6]);
140
141	return scsi_execute(sdev, cdb, DMA_FROM_DEVICE, buff, bufflen, NULL,
142			sshdr, ALUA_FAILOVER_TIMEOUT * HZ,
143			ALUA_FAILOVER_RETRIES, req_flags, 0, NULL);
144}
145
146/*
147 * submit_stpg - Issue a SET TARGET PORT GROUP command
148 *
149 * Currently we're only setting the current target port group state
150 * to 'active/optimized' and let the array firmware figure out
151 * the states of the remaining groups.
152 */
153static int submit_stpg(struct scsi_device *sdev, int group_id,
154		       struct scsi_sense_hdr *sshdr)
155{
156	u8 cdb[MAX_COMMAND_SIZE];
157	unsigned char stpg_data[8];
158	int stpg_len = 8;
159	int req_flags = REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT |
160		REQ_FAILFAST_DRIVER;
161
162	/* Prepare the data buffer */
163	memset(stpg_data, 0, stpg_len);
164	stpg_data[4] = SCSI_ACCESS_STATE_OPTIMAL;
165	put_unaligned_be16(group_id, &stpg_data[6]);
166
167	/* Prepare the command. */
168	memset(cdb, 0x0, MAX_COMMAND_SIZE);
169	cdb[0] = MAINTENANCE_OUT;
170	cdb[1] = MO_SET_TARGET_PGS;
171	put_unaligned_be32(stpg_len, &cdb[6]);
172
173	return scsi_execute(sdev, cdb, DMA_TO_DEVICE, stpg_data, stpg_len, NULL,
174			sshdr, ALUA_FAILOVER_TIMEOUT * HZ,
175			ALUA_FAILOVER_RETRIES, req_flags, 0, NULL);
176}
177
178static struct alua_port_group *alua_find_get_pg(char *id_str, size_t id_size,
179						int group_id)
180{
181	struct alua_port_group *pg;
182
183	if (!id_str || !id_size || !strlen(id_str))
184		return NULL;
185
186	list_for_each_entry(pg, &port_group_list, node) {
187		if (pg->group_id != group_id)
188			continue;
189		if (!pg->device_id_len || pg->device_id_len != id_size)
190			continue;
191		if (strncmp(pg->device_id_str, id_str, id_size))
192			continue;
193		if (!kref_get_unless_zero(&pg->kref))
194			continue;
195		return pg;
196	}
197
198	return NULL;
199}
200
201/*
202 * alua_alloc_pg - Allocate a new port_group structure
203 * @sdev: scsi device
204 * @group_id: port group id
205 * @tpgs: target port group settings
206 *
207 * Allocate a new port_group structure for a given
208 * device.
209 */
210static struct alua_port_group *alua_alloc_pg(struct scsi_device *sdev,
211					     int group_id, int tpgs)
212{
213	struct alua_port_group *pg, *tmp_pg;
214
215	pg = kzalloc(sizeof(struct alua_port_group), GFP_KERNEL);
216	if (!pg)
217		return ERR_PTR(-ENOMEM);
218
219	pg->device_id_len = scsi_vpd_lun_id(sdev, pg->device_id_str,
220					    sizeof(pg->device_id_str));
221	if (pg->device_id_len <= 0) {
222		/*
223		 * TPGS supported but no device identification found.
224		 * Generate private device identification.
225		 */
226		sdev_printk(KERN_INFO, sdev,
227			    "%s: No device descriptors found\n",
228			    ALUA_DH_NAME);
229		pg->device_id_str[0] = '\0';
230		pg->device_id_len = 0;
231	}
232	pg->group_id = group_id;
233	pg->tpgs = tpgs;
234	pg->state = SCSI_ACCESS_STATE_OPTIMAL;
235	pg->valid_states = TPGS_SUPPORT_ALL;
236	if (optimize_stpg)
237		pg->flags |= ALUA_OPTIMIZE_STPG;
238	kref_init(&pg->kref);
239	INIT_DELAYED_WORK(&pg->rtpg_work, alua_rtpg_work);
240	INIT_LIST_HEAD(&pg->rtpg_list);
241	INIT_LIST_HEAD(&pg->node);
242	INIT_LIST_HEAD(&pg->dh_list);
243	spin_lock_init(&pg->lock);
244
245	spin_lock(&port_group_lock);
246	tmp_pg = alua_find_get_pg(pg->device_id_str, pg->device_id_len,
247				  group_id);
248	if (tmp_pg) {
249		spin_unlock(&port_group_lock);
250		kfree(pg);
251		return tmp_pg;
252	}
253
254	list_add(&pg->node, &port_group_list);
255	spin_unlock(&port_group_lock);
256
257	return pg;
258}
259
260/*
261 * alua_check_tpgs - Evaluate TPGS setting
262 * @sdev: device to be checked
263 *
264 * Examine the TPGS setting of the sdev to find out if ALUA
265 * is supported.
266 */
267static int alua_check_tpgs(struct scsi_device *sdev)
268{
269	int tpgs = TPGS_MODE_NONE;
270
271	/*
272	 * ALUA support for non-disk devices is fraught with
273	 * difficulties, so disable it for now.
274	 */
275	if (sdev->type != TYPE_DISK) {
276		sdev_printk(KERN_INFO, sdev,
277			    "%s: disable for non-disk devices\n",
278			    ALUA_DH_NAME);
279		return tpgs;
280	}
281
282	tpgs = scsi_device_tpgs(sdev);
283	switch (tpgs) {
284	case TPGS_MODE_EXPLICIT|TPGS_MODE_IMPLICIT:
285		sdev_printk(KERN_INFO, sdev,
286			    "%s: supports implicit and explicit TPGS\n",
287			    ALUA_DH_NAME);
288		break;
289	case TPGS_MODE_EXPLICIT:
290		sdev_printk(KERN_INFO, sdev, "%s: supports explicit TPGS\n",
291			    ALUA_DH_NAME);
292		break;
293	case TPGS_MODE_IMPLICIT:
294		sdev_printk(KERN_INFO, sdev, "%s: supports implicit TPGS\n",
295			    ALUA_DH_NAME);
296		break;
297	case TPGS_MODE_NONE:
298		sdev_printk(KERN_INFO, sdev, "%s: not supported\n",
299			    ALUA_DH_NAME);
300		break;
301	default:
302		sdev_printk(KERN_INFO, sdev,
303			    "%s: unsupported TPGS setting %d\n",
304			    ALUA_DH_NAME, tpgs);
305		tpgs = TPGS_MODE_NONE;
306		break;
307	}
308
309	return tpgs;
310}
311
312/*
313 * alua_check_vpd - Evaluate INQUIRY vpd page 0x83
314 * @sdev: device to be checked
315 *
316 * Extract the relative target port and the target port group
317 * descriptor from the list of identificators.
318 */
319static int alua_check_vpd(struct scsi_device *sdev, struct alua_dh_data *h,
320			  int tpgs)
321{
322	int rel_port = -1, group_id;
323	struct alua_port_group *pg, *old_pg = NULL;
324	bool pg_updated = false;
325	unsigned long flags;
326
327	group_id = scsi_vpd_tpg_id(sdev, &rel_port);
328	if (group_id < 0) {
329		/*
330		 * Internal error; TPGS supported but required
331		 * VPD identification descriptors not present.
332		 * Disable ALUA support
333		 */
334		sdev_printk(KERN_INFO, sdev,
335			    "%s: No target port descriptors found\n",
336			    ALUA_DH_NAME);
337		return SCSI_DH_DEV_UNSUPP;
338	}
339
340	pg = alua_alloc_pg(sdev, group_id, tpgs);
341	if (IS_ERR(pg)) {
342		if (PTR_ERR(pg) == -ENOMEM)
343			return SCSI_DH_NOMEM;
344		return SCSI_DH_DEV_UNSUPP;
345	}
346	if (pg->device_id_len)
347		sdev_printk(KERN_INFO, sdev,
348			    "%s: device %s port group %x rel port %x\n",
349			    ALUA_DH_NAME, pg->device_id_str,
350			    group_id, rel_port);
351	else
352		sdev_printk(KERN_INFO, sdev,
353			    "%s: port group %x rel port %x\n",
354			    ALUA_DH_NAME, group_id, rel_port);
355
356	/* Check for existing port group references */
357	spin_lock(&h->pg_lock);
358	old_pg = rcu_dereference_protected(h->pg, lockdep_is_held(&h->pg_lock));
359	if (old_pg != pg) {
360		/* port group has changed. Update to new port group */
361		if (h->pg) {
362			spin_lock_irqsave(&old_pg->lock, flags);
363			list_del_rcu(&h->node);
364			spin_unlock_irqrestore(&old_pg->lock, flags);
365		}
366		rcu_assign_pointer(h->pg, pg);
367		pg_updated = true;
368	}
369
370	spin_lock_irqsave(&pg->lock, flags);
371	if (pg_updated)
372		list_add_rcu(&h->node, &pg->dh_list);
373	spin_unlock_irqrestore(&pg->lock, flags);
374
375	alua_rtpg_queue(rcu_dereference_protected(h->pg,
376						  lockdep_is_held(&h->pg_lock)),
377			sdev, NULL, true);
378	spin_unlock(&h->pg_lock);
379
380	if (old_pg)
381		kref_put(&old_pg->kref, release_port_group);
382
383	return SCSI_DH_OK;
384}
385
386static char print_alua_state(unsigned char state)
387{
388	switch (state) {
389	case SCSI_ACCESS_STATE_OPTIMAL:
390		return 'A';
391	case SCSI_ACCESS_STATE_ACTIVE:
392		return 'N';
393	case SCSI_ACCESS_STATE_STANDBY:
394		return 'S';
395	case SCSI_ACCESS_STATE_UNAVAILABLE:
396		return 'U';
397	case SCSI_ACCESS_STATE_LBA:
398		return 'L';
399	case SCSI_ACCESS_STATE_OFFLINE:
400		return 'O';
401	case SCSI_ACCESS_STATE_TRANSITIONING:
402		return 'T';
403	default:
404		return 'X';
405	}
406}
407
408static enum scsi_disposition alua_check_sense(struct scsi_device *sdev,
409					      struct scsi_sense_hdr *sense_hdr)
410{
411	switch (sense_hdr->sense_key) {
412	case NOT_READY:
413		if (sense_hdr->asc == 0x04 && sense_hdr->ascq == 0x0a) {
414			/*
415			 * LUN Not Accessible - ALUA state transition
416			 */
417			alua_check(sdev, false);
418			return NEEDS_RETRY;
419		}
420		break;
421	case UNIT_ATTENTION:
422		if (sense_hdr->asc == 0x29 && sense_hdr->ascq == 0x00) {
423			/*
424			 * Power On, Reset, or Bus Device Reset.
425			 * Might have obscured a state transition,
426			 * so schedule a recheck.
427			 */
428			alua_check(sdev, true);
429			return ADD_TO_MLQUEUE;
430		}
431		if (sense_hdr->asc == 0x29 && sense_hdr->ascq == 0x04)
432			/*
433			 * Device internal reset
434			 */
435			return ADD_TO_MLQUEUE;
436		if (sense_hdr->asc == 0x2a && sense_hdr->ascq == 0x01)
437			/*
438			 * Mode Parameters Changed
439			 */
440			return ADD_TO_MLQUEUE;
441		if (sense_hdr->asc == 0x2a && sense_hdr->ascq == 0x06) {
442			/*
443			 * ALUA state changed
444			 */
445			alua_check(sdev, true);
446			return ADD_TO_MLQUEUE;
447		}
448		if (sense_hdr->asc == 0x2a && sense_hdr->ascq == 0x07) {
449			/*
450			 * Implicit ALUA state transition failed
451			 */
452			alua_check(sdev, true);
453			return ADD_TO_MLQUEUE;
454		}
455		if (sense_hdr->asc == 0x3f && sense_hdr->ascq == 0x03)
456			/*
457			 * Inquiry data has changed
458			 */
459			return ADD_TO_MLQUEUE;
460		if (sense_hdr->asc == 0x3f && sense_hdr->ascq == 0x0e)
461			/*
462			 * REPORTED_LUNS_DATA_HAS_CHANGED is reported
463			 * when switching controllers on targets like
464			 * Intel Multi-Flex. We can just retry.
465			 */
466			return ADD_TO_MLQUEUE;
467		break;
468	}
469
470	return SCSI_RETURN_NOT_HANDLED;
471}
472
473/*
474 * alua_tur - Send a TEST UNIT READY
475 * @sdev: device to which the TEST UNIT READY command should be send
476 *
477 * Send a TEST UNIT READY to @sdev to figure out the device state
478 * Returns SCSI_DH_RETRY if the sense code is NOT READY/ALUA TRANSITIONING,
479 * SCSI_DH_OK if no error occurred, and SCSI_DH_IO otherwise.
480 */
481static int alua_tur(struct scsi_device *sdev)
482{
483	struct scsi_sense_hdr sense_hdr;
484	int retval;
485
486	retval = scsi_test_unit_ready(sdev, ALUA_FAILOVER_TIMEOUT * HZ,
487				      ALUA_FAILOVER_RETRIES, &sense_hdr);
488	if (sense_hdr.sense_key == NOT_READY &&
489	    sense_hdr.asc == 0x04 && sense_hdr.ascq == 0x0a)
490		return SCSI_DH_RETRY;
491	else if (retval)
492		return SCSI_DH_IO;
493	else
494		return SCSI_DH_OK;
495}
496
497/*
498 * alua_rtpg - Evaluate REPORT TARGET GROUP STATES
499 * @sdev: the device to be evaluated.
500 *
501 * Evaluate the Target Port Group State.
502 * Returns SCSI_DH_DEV_OFFLINED if the path is
503 * found to be unusable.
504 */
505static int alua_rtpg(struct scsi_device *sdev, struct alua_port_group *pg)
506{
507	struct scsi_sense_hdr sense_hdr;
508	struct alua_port_group *tmp_pg;
509	int len, k, off, bufflen = ALUA_RTPG_SIZE;
510	unsigned char *desc, *buff;
511	unsigned err;
512	int retval;
513	unsigned int tpg_desc_tbl_off;
514	unsigned char orig_transition_tmo;
515	unsigned long flags;
516	bool transitioning_sense = false;
517
518	if (!pg->expiry) {
519		unsigned long transition_tmo = ALUA_FAILOVER_TIMEOUT * HZ;
520
521		if (pg->transition_tmo)
522			transition_tmo = pg->transition_tmo * HZ;
523
524		pg->expiry = round_jiffies_up(jiffies + transition_tmo);
525	}
526
527	buff = kzalloc(bufflen, GFP_KERNEL);
528	if (!buff)
529		return SCSI_DH_DEV_TEMP_BUSY;
530
531 retry:
532	err = 0;
533	retval = submit_rtpg(sdev, buff, bufflen, &sense_hdr, pg->flags);
534
535	if (retval) {
536		/*
537		 * Some (broken) implementations have a habit of returning
538		 * an error during things like firmware update etc.
539		 * But if the target only supports active/optimized there's
540		 * not much we can do; it's not that we can switch paths
541		 * or anything.
542		 * So ignore any errors to avoid spurious failures during
543		 * path failover.
544		 */
545		if ((pg->valid_states & ~TPGS_SUPPORT_OPTIMIZED) == 0) {
546			sdev_printk(KERN_INFO, sdev,
547				    "%s: ignoring rtpg result %d\n",
548				    ALUA_DH_NAME, retval);
549			kfree(buff);
550			return SCSI_DH_OK;
551		}
552		if (retval < 0 || !scsi_sense_valid(&sense_hdr)) {
553			sdev_printk(KERN_INFO, sdev,
554				    "%s: rtpg failed, result %d\n",
555				    ALUA_DH_NAME, retval);
556			kfree(buff);
557			if (retval < 0)
558				return SCSI_DH_DEV_TEMP_BUSY;
559			return SCSI_DH_IO;
560		}
561
562		/*
563		 * submit_rtpg() has failed on existing arrays
564		 * when requesting extended header info, and
565		 * the array doesn't support extended headers,
566		 * even though it shouldn't according to T10.
567		 * The retry without rtpg_ext_hdr_req set
568		 * handles this.
569		 * Note:  some arrays return a sense key of ILLEGAL_REQUEST
570		 * with ASC 00h if they don't support the extended header.
571		 */
572		if (!(pg->flags & ALUA_RTPG_EXT_HDR_UNSUPP) &&
573		    sense_hdr.sense_key == ILLEGAL_REQUEST) {
574			pg->flags |= ALUA_RTPG_EXT_HDR_UNSUPP;
575			goto retry;
576		}
577		/*
578		 * If the array returns with 'ALUA state transition'
579		 * sense code here it cannot return RTPG data during
580		 * transition. So set the state to 'transitioning' directly.
581		 */
582		if (sense_hdr.sense_key == NOT_READY &&
583		    sense_hdr.asc == 0x04 && sense_hdr.ascq == 0x0a) {
584			transitioning_sense = true;
585			goto skip_rtpg;
586		}
587		/*
588		 * Retry on any other UNIT ATTENTION occurred.
589		 */
590		if (sense_hdr.sense_key == UNIT_ATTENTION)
591			err = SCSI_DH_RETRY;
592		if (err == SCSI_DH_RETRY &&
593		    pg->expiry != 0 && time_before(jiffies, pg->expiry)) {
594			sdev_printk(KERN_ERR, sdev, "%s: rtpg retry\n",
595				    ALUA_DH_NAME);
596			scsi_print_sense_hdr(sdev, ALUA_DH_NAME, &sense_hdr);
597			kfree(buff);
598			return err;
599		}
600		sdev_printk(KERN_ERR, sdev, "%s: rtpg failed\n",
601			    ALUA_DH_NAME);
602		scsi_print_sense_hdr(sdev, ALUA_DH_NAME, &sense_hdr);
603		kfree(buff);
604		pg->expiry = 0;
605		return SCSI_DH_IO;
606	}
607
608	len = get_unaligned_be32(&buff[0]) + 4;
609
610	if (len > bufflen) {
611		/* Resubmit with the correct length */
612		kfree(buff);
613		bufflen = len;
614		buff = kmalloc(bufflen, GFP_KERNEL);
615		if (!buff) {
616			sdev_printk(KERN_WARNING, sdev,
617				    "%s: kmalloc buffer failed\n",__func__);
618			/* Temporary failure, bypass */
619			pg->expiry = 0;
620			return SCSI_DH_DEV_TEMP_BUSY;
621		}
622		goto retry;
623	}
624
625	orig_transition_tmo = pg->transition_tmo;
626	if ((buff[4] & RTPG_FMT_MASK) == RTPG_FMT_EXT_HDR && buff[5] != 0)
627		pg->transition_tmo = buff[5];
628	else
629		pg->transition_tmo = ALUA_FAILOVER_TIMEOUT;
630
631	if (orig_transition_tmo != pg->transition_tmo) {
632		sdev_printk(KERN_INFO, sdev,
633			    "%s: transition timeout set to %d seconds\n",
634			    ALUA_DH_NAME, pg->transition_tmo);
635		pg->expiry = jiffies + pg->transition_tmo * HZ;
636	}
637
638	if ((buff[4] & RTPG_FMT_MASK) == RTPG_FMT_EXT_HDR)
639		tpg_desc_tbl_off = 8;
640	else
641		tpg_desc_tbl_off = 4;
642
643	for (k = tpg_desc_tbl_off, desc = buff + tpg_desc_tbl_off;
644	     k < len;
645	     k += off, desc += off) {
646		u16 group_id = get_unaligned_be16(&desc[2]);
647
648		spin_lock_irqsave(&port_group_lock, flags);
649		tmp_pg = alua_find_get_pg(pg->device_id_str, pg->device_id_len,
650					  group_id);
651		spin_unlock_irqrestore(&port_group_lock, flags);
652		if (tmp_pg) {
653			if (spin_trylock_irqsave(&tmp_pg->lock, flags)) {
654				if ((tmp_pg == pg) ||
655				    !(tmp_pg->flags & ALUA_PG_RUNNING)) {
656					struct alua_dh_data *h;
657
658					tmp_pg->state = desc[0] & 0x0f;
659					tmp_pg->pref = desc[0] >> 7;
660					rcu_read_lock();
661					list_for_each_entry_rcu(h,
662						&tmp_pg->dh_list, node) {
663						if (!h->sdev)
664							continue;
665						h->sdev->access_state = desc[0];
666					}
667					rcu_read_unlock();
668				}
669				if (tmp_pg == pg)
670					tmp_pg->valid_states = desc[1];
671				spin_unlock_irqrestore(&tmp_pg->lock, flags);
672			}
673			kref_put(&tmp_pg->kref, release_port_group);
674		}
675		off = 8 + (desc[7] * 4);
676	}
677
678 skip_rtpg:
679	spin_lock_irqsave(&pg->lock, flags);
680	if (transitioning_sense)
681		pg->state = SCSI_ACCESS_STATE_TRANSITIONING;
682
683	sdev_printk(KERN_INFO, sdev,
684		    "%s: port group %02x state %c %s supports %c%c%c%c%c%c%c\n",
685		    ALUA_DH_NAME, pg->group_id, print_alua_state(pg->state),
686		    pg->pref ? "preferred" : "non-preferred",
687		    pg->valid_states&TPGS_SUPPORT_TRANSITION?'T':'t',
688		    pg->valid_states&TPGS_SUPPORT_OFFLINE?'O':'o',
689		    pg->valid_states&TPGS_SUPPORT_LBA_DEPENDENT?'L':'l',
690		    pg->valid_states&TPGS_SUPPORT_UNAVAILABLE?'U':'u',
691		    pg->valid_states&TPGS_SUPPORT_STANDBY?'S':'s',
692		    pg->valid_states&TPGS_SUPPORT_NONOPTIMIZED?'N':'n',
693		    pg->valid_states&TPGS_SUPPORT_OPTIMIZED?'A':'a');
694
695	switch (pg->state) {
696	case SCSI_ACCESS_STATE_TRANSITIONING:
697		if (time_before(jiffies, pg->expiry)) {
698			/* State transition, retry */
699			pg->interval = ALUA_RTPG_RETRY_DELAY;
700			err = SCSI_DH_RETRY;
701		} else {
702			struct alua_dh_data *h;
703
704			/* Transitioning time exceeded, set port to standby */
705			err = SCSI_DH_IO;
706			pg->state = SCSI_ACCESS_STATE_STANDBY;
707			pg->expiry = 0;
708			rcu_read_lock();
709			list_for_each_entry_rcu(h, &pg->dh_list, node) {
710				if (!h->sdev)
711					continue;
712				h->sdev->access_state =
713					(pg->state & SCSI_ACCESS_STATE_MASK);
714				if (pg->pref)
715					h->sdev->access_state |=
716						SCSI_ACCESS_STATE_PREFERRED;
717			}
718			rcu_read_unlock();
719		}
720		break;
721	case SCSI_ACCESS_STATE_OFFLINE:
722		/* Path unusable */
723		err = SCSI_DH_DEV_OFFLINED;
724		pg->expiry = 0;
725		break;
726	default:
727		/* Useable path if active */
728		err = SCSI_DH_OK;
729		pg->expiry = 0;
730		break;
731	}
732	spin_unlock_irqrestore(&pg->lock, flags);
733	kfree(buff);
734	return err;
735}
736
737/*
738 * alua_stpg - Issue a SET TARGET PORT GROUP command
739 *
740 * Issue a SET TARGET PORT GROUP command and evaluate the
741 * response. Returns SCSI_DH_RETRY per default to trigger
742 * a re-evaluation of the target group state or SCSI_DH_OK
743 * if no further action needs to be taken.
744 */
745static unsigned alua_stpg(struct scsi_device *sdev, struct alua_port_group *pg)
746{
747	int retval;
748	struct scsi_sense_hdr sense_hdr;
749
750	if (!(pg->tpgs & TPGS_MODE_EXPLICIT)) {
751		/* Only implicit ALUA supported, retry */
752		return SCSI_DH_RETRY;
753	}
754	switch (pg->state) {
755	case SCSI_ACCESS_STATE_OPTIMAL:
756		return SCSI_DH_OK;
757	case SCSI_ACCESS_STATE_ACTIVE:
758		if ((pg->flags & ALUA_OPTIMIZE_STPG) &&
759		    !pg->pref &&
760		    (pg->tpgs & TPGS_MODE_IMPLICIT))
761			return SCSI_DH_OK;
762		break;
763	case SCSI_ACCESS_STATE_STANDBY:
764	case SCSI_ACCESS_STATE_UNAVAILABLE:
765		break;
766	case SCSI_ACCESS_STATE_OFFLINE:
767		return SCSI_DH_IO;
768	case SCSI_ACCESS_STATE_TRANSITIONING:
769		break;
770	default:
771		sdev_printk(KERN_INFO, sdev,
772			    "%s: stpg failed, unhandled TPGS state %d",
773			    ALUA_DH_NAME, pg->state);
774		return SCSI_DH_NOSYS;
775	}
776	retval = submit_stpg(sdev, pg->group_id, &sense_hdr);
777
778	if (retval) {
779		if (retval < 0 || !scsi_sense_valid(&sense_hdr)) {
780			sdev_printk(KERN_INFO, sdev,
781				    "%s: stpg failed, result %d",
782				    ALUA_DH_NAME, retval);
783			if (retval < 0)
784				return SCSI_DH_DEV_TEMP_BUSY;
785		} else {
786			sdev_printk(KERN_INFO, sdev, "%s: stpg failed\n",
787				    ALUA_DH_NAME);
788			scsi_print_sense_hdr(sdev, ALUA_DH_NAME, &sense_hdr);
789		}
790	}
791	/* Retry RTPG */
792	return SCSI_DH_RETRY;
793}
794
795static void alua_rtpg_work(struct work_struct *work)
796{
797	struct alua_port_group *pg =
798		container_of(work, struct alua_port_group, rtpg_work.work);
799	struct scsi_device *sdev;
800	LIST_HEAD(qdata_list);
801	int err = SCSI_DH_OK;
802	struct alua_queue_data *qdata, *tmp;
803	unsigned long flags;
804
805	spin_lock_irqsave(&pg->lock, flags);
806	sdev = pg->rtpg_sdev;
807	if (!sdev) {
808		WARN_ON(pg->flags & ALUA_PG_RUN_RTPG);
809		WARN_ON(pg->flags & ALUA_PG_RUN_STPG);
810		spin_unlock_irqrestore(&pg->lock, flags);
811		kref_put(&pg->kref, release_port_group);
812		return;
813	}
814	pg->flags |= ALUA_PG_RUNNING;
815	if (pg->flags & ALUA_PG_RUN_RTPG) {
816		int state = pg->state;
817
818		pg->flags &= ~ALUA_PG_RUN_RTPG;
819		spin_unlock_irqrestore(&pg->lock, flags);
820		if (state == SCSI_ACCESS_STATE_TRANSITIONING) {
821			if (alua_tur(sdev) == SCSI_DH_RETRY) {
822				spin_lock_irqsave(&pg->lock, flags);
823				pg->flags &= ~ALUA_PG_RUNNING;
824				pg->flags |= ALUA_PG_RUN_RTPG;
825				if (!pg->interval)
826					pg->interval = ALUA_RTPG_RETRY_DELAY;
827				spin_unlock_irqrestore(&pg->lock, flags);
828				queue_delayed_work(kaluad_wq, &pg->rtpg_work,
829						   pg->interval * HZ);
830				return;
831			}
832			/* Send RTPG on failure or if TUR indicates SUCCESS */
833		}
834		err = alua_rtpg(sdev, pg);
835		spin_lock_irqsave(&pg->lock, flags);
836		if (err == SCSI_DH_RETRY || pg->flags & ALUA_PG_RUN_RTPG) {
837			pg->flags &= ~ALUA_PG_RUNNING;
838			if (!pg->interval && !(pg->flags & ALUA_PG_RUN_RTPG))
839				pg->interval = ALUA_RTPG_RETRY_DELAY;
840			pg->flags |= ALUA_PG_RUN_RTPG;
841			spin_unlock_irqrestore(&pg->lock, flags);
842			queue_delayed_work(kaluad_wq, &pg->rtpg_work,
843					   pg->interval * HZ);
844			return;
845		}
846		if (err != SCSI_DH_OK)
847			pg->flags &= ~ALUA_PG_RUN_STPG;
848	}
849	if (pg->flags & ALUA_PG_RUN_STPG) {
850		pg->flags &= ~ALUA_PG_RUN_STPG;
851		spin_unlock_irqrestore(&pg->lock, flags);
852		err = alua_stpg(sdev, pg);
853		spin_lock_irqsave(&pg->lock, flags);
854		if (err == SCSI_DH_RETRY || pg->flags & ALUA_PG_RUN_RTPG) {
855			pg->flags |= ALUA_PG_RUN_RTPG;
856			pg->interval = 0;
857			pg->flags &= ~ALUA_PG_RUNNING;
858			spin_unlock_irqrestore(&pg->lock, flags);
859			queue_delayed_work(kaluad_wq, &pg->rtpg_work,
860					   pg->interval * HZ);
861			return;
862		}
863	}
864
865	list_splice_init(&pg->rtpg_list, &qdata_list);
866	pg->rtpg_sdev = NULL;
867	spin_unlock_irqrestore(&pg->lock, flags);
868
869	list_for_each_entry_safe(qdata, tmp, &qdata_list, entry) {
870		list_del(&qdata->entry);
871		if (qdata->callback_fn)
872			qdata->callback_fn(qdata->callback_data, err);
873		kfree(qdata);
874	}
875	spin_lock_irqsave(&pg->lock, flags);
876	pg->flags &= ~ALUA_PG_RUNNING;
877	spin_unlock_irqrestore(&pg->lock, flags);
878	scsi_device_put(sdev);
879	kref_put(&pg->kref, release_port_group);
880}
881
882/**
883 * alua_rtpg_queue() - cause RTPG to be submitted asynchronously
884 * @pg: ALUA port group associated with @sdev.
885 * @sdev: SCSI device for which to submit an RTPG.
886 * @qdata: Information about the callback to invoke after the RTPG.
887 * @force: Whether or not to submit an RTPG if a work item that will submit an
888 *         RTPG already has been scheduled.
889 *
890 * Returns true if and only if alua_rtpg_work() will be called asynchronously.
891 * That function is responsible for calling @qdata->fn().
892 */
893static bool alua_rtpg_queue(struct alua_port_group *pg,
894			    struct scsi_device *sdev,
895			    struct alua_queue_data *qdata, bool force)
896{
897	int start_queue = 0;
898	unsigned long flags;
899	if (WARN_ON_ONCE(!pg) || scsi_device_get(sdev))
900		return false;
901
902	spin_lock_irqsave(&pg->lock, flags);
903	if (qdata) {
904		list_add_tail(&qdata->entry, &pg->rtpg_list);
905		pg->flags |= ALUA_PG_RUN_STPG;
906		force = true;
907	}
908	if (pg->rtpg_sdev == NULL) {
909		pg->interval = 0;
910		pg->flags |= ALUA_PG_RUN_RTPG;
911		kref_get(&pg->kref);
912		pg->rtpg_sdev = sdev;
913		start_queue = 1;
914	} else if (!(pg->flags & ALUA_PG_RUN_RTPG) && force) {
915		pg->flags |= ALUA_PG_RUN_RTPG;
916		/* Do not queue if the worker is already running */
917		if (!(pg->flags & ALUA_PG_RUNNING)) {
918			kref_get(&pg->kref);
919			start_queue = 1;
920		}
921	}
922
923	spin_unlock_irqrestore(&pg->lock, flags);
924
925	if (start_queue) {
926		if (queue_delayed_work(kaluad_wq, &pg->rtpg_work,
927				msecs_to_jiffies(ALUA_RTPG_DELAY_MSECS)))
928			sdev = NULL;
929		else
930			kref_put(&pg->kref, release_port_group);
931	}
932	if (sdev)
933		scsi_device_put(sdev);
934
935	return true;
936}
937
938/*
939 * alua_initialize - Initialize ALUA state
940 * @sdev: the device to be initialized
941 *
942 * For the prep_fn to work correctly we have
943 * to initialize the ALUA state for the device.
944 */
945static int alua_initialize(struct scsi_device *sdev, struct alua_dh_data *h)
946{
947	int err = SCSI_DH_DEV_UNSUPP, tpgs;
948
949	mutex_lock(&h->init_mutex);
950	tpgs = alua_check_tpgs(sdev);
951	if (tpgs != TPGS_MODE_NONE)
952		err = alua_check_vpd(sdev, h, tpgs);
953	h->init_error = err;
954	mutex_unlock(&h->init_mutex);
955	return err;
956}
957/*
958 * alua_set_params - set/unset the optimize flag
959 * @sdev: device on the path to be activated
960 * params - parameters in the following format
961 *      "no_of_params\0param1\0param2\0param3\0...\0"
962 * For example, to set the flag pass the following parameters
963 * from multipath.conf
964 *     hardware_handler        "2 alua 1"
965 */
966static int alua_set_params(struct scsi_device *sdev, const char *params)
967{
968	struct alua_dh_data *h = sdev->handler_data;
969	struct alua_port_group *pg = NULL;
970	unsigned int optimize = 0, argc;
971	const char *p = params;
972	int result = SCSI_DH_OK;
973	unsigned long flags;
974
975	if ((sscanf(params, "%u", &argc) != 1) || (argc != 1))
976		return -EINVAL;
977
978	while (*p++)
979		;
980	if ((sscanf(p, "%u", &optimize) != 1) || (optimize > 1))
981		return -EINVAL;
982
983	rcu_read_lock();
984	pg = rcu_dereference(h->pg);
985	if (!pg) {
986		rcu_read_unlock();
987		return -ENXIO;
988	}
989	spin_lock_irqsave(&pg->lock, flags);
990	if (optimize)
991		pg->flags |= ALUA_OPTIMIZE_STPG;
992	else
993		pg->flags &= ~ALUA_OPTIMIZE_STPG;
994	spin_unlock_irqrestore(&pg->lock, flags);
995	rcu_read_unlock();
996
997	return result;
998}
999
1000/*
1001 * alua_activate - activate a path
1002 * @sdev: device on the path to be activated
1003 *
1004 * We're currently switching the port group to be activated only and
1005 * let the array figure out the rest.
1006 * There may be other arrays which require us to switch all port groups
1007 * based on a certain policy. But until we actually encounter them it
1008 * should be okay.
1009 */
1010static int alua_activate(struct scsi_device *sdev,
1011			activate_complete fn, void *data)
1012{
1013	struct alua_dh_data *h = sdev->handler_data;
1014	int err = SCSI_DH_OK;
1015	struct alua_queue_data *qdata;
1016	struct alua_port_group *pg;
1017
1018	qdata = kzalloc(sizeof(*qdata), GFP_KERNEL);
1019	if (!qdata) {
1020		err = SCSI_DH_RES_TEMP_UNAVAIL;
1021		goto out;
1022	}
1023	qdata->callback_fn = fn;
1024	qdata->callback_data = data;
1025
1026	mutex_lock(&h->init_mutex);
1027	rcu_read_lock();
1028	pg = rcu_dereference(h->pg);
1029	if (!pg || !kref_get_unless_zero(&pg->kref)) {
1030		rcu_read_unlock();
1031		kfree(qdata);
1032		err = h->init_error;
1033		mutex_unlock(&h->init_mutex);
1034		goto out;
1035	}
1036	rcu_read_unlock();
1037	mutex_unlock(&h->init_mutex);
1038
1039	if (alua_rtpg_queue(pg, sdev, qdata, true)) {
1040		fn = NULL;
1041	} else {
1042		kfree(qdata);
1043		err = SCSI_DH_DEV_OFFLINED;
1044	}
1045	kref_put(&pg->kref, release_port_group);
1046out:
1047	if (fn)
1048		fn(data, err);
1049	return 0;
1050}
1051
1052/*
1053 * alua_check - check path status
1054 * @sdev: device on the path to be checked
1055 *
1056 * Check the device status
1057 */
1058static void alua_check(struct scsi_device *sdev, bool force)
1059{
1060	struct alua_dh_data *h = sdev->handler_data;
1061	struct alua_port_group *pg;
1062
1063	rcu_read_lock();
1064	pg = rcu_dereference(h->pg);
1065	if (!pg || !kref_get_unless_zero(&pg->kref)) {
1066		rcu_read_unlock();
1067		return;
1068	}
1069	rcu_read_unlock();
1070
1071	alua_rtpg_queue(pg, sdev, NULL, force);
1072	kref_put(&pg->kref, release_port_group);
1073}
1074
1075/*
1076 * alua_prep_fn - request callback
1077 *
1078 * Fail I/O to all paths not in state
1079 * active/optimized or active/non-optimized.
1080 */
1081static blk_status_t alua_prep_fn(struct scsi_device *sdev, struct request *req)
1082{
1083	struct alua_dh_data *h = sdev->handler_data;
1084	struct alua_port_group *pg;
1085	unsigned char state = SCSI_ACCESS_STATE_OPTIMAL;
1086
1087	rcu_read_lock();
1088	pg = rcu_dereference(h->pg);
1089	if (pg)
1090		state = pg->state;
1091	rcu_read_unlock();
1092
1093	switch (state) {
1094	case SCSI_ACCESS_STATE_OPTIMAL:
1095	case SCSI_ACCESS_STATE_ACTIVE:
1096	case SCSI_ACCESS_STATE_LBA:
1097		return BLK_STS_OK;
1098	case SCSI_ACCESS_STATE_TRANSITIONING:
1099		return BLK_STS_RESOURCE;
1100	default:
1101		req->rq_flags |= RQF_QUIET;
1102		return BLK_STS_IOERR;
1103	}
1104}
1105
1106static void alua_rescan(struct scsi_device *sdev)
1107{
1108	struct alua_dh_data *h = sdev->handler_data;
1109
1110	alua_initialize(sdev, h);
1111}
1112
1113/*
1114 * alua_bus_attach - Attach device handler
1115 * @sdev: device to be attached to
1116 */
1117static int alua_bus_attach(struct scsi_device *sdev)
1118{
1119	struct alua_dh_data *h;
1120	int err;
1121
1122	h = kzalloc(sizeof(*h) , GFP_KERNEL);
1123	if (!h)
1124		return SCSI_DH_NOMEM;
1125	spin_lock_init(&h->pg_lock);
1126	rcu_assign_pointer(h->pg, NULL);
1127	h->init_error = SCSI_DH_OK;
1128	h->sdev = sdev;
1129	INIT_LIST_HEAD(&h->node);
1130
1131	mutex_init(&h->init_mutex);
1132	err = alua_initialize(sdev, h);
1133	if (err != SCSI_DH_OK && err != SCSI_DH_DEV_OFFLINED)
1134		goto failed;
1135
1136	sdev->handler_data = h;
1137	return SCSI_DH_OK;
1138failed:
1139	kfree(h);
1140	return err;
1141}
1142
1143/*
1144 * alua_bus_detach - Detach device handler
1145 * @sdev: device to be detached from
1146 */
1147static void alua_bus_detach(struct scsi_device *sdev)
1148{
1149	struct alua_dh_data *h = sdev->handler_data;
1150	struct alua_port_group *pg;
1151
1152	spin_lock(&h->pg_lock);
1153	pg = rcu_dereference_protected(h->pg, lockdep_is_held(&h->pg_lock));
1154	rcu_assign_pointer(h->pg, NULL);
1155	spin_unlock(&h->pg_lock);
1156	if (pg) {
1157		spin_lock_irq(&pg->lock);
1158		list_del_rcu(&h->node);
1159		spin_unlock_irq(&pg->lock);
1160		kref_put(&pg->kref, release_port_group);
1161	}
1162	sdev->handler_data = NULL;
1163	synchronize_rcu();
1164	kfree(h);
1165}
1166
1167static struct scsi_device_handler alua_dh = {
1168	.name = ALUA_DH_NAME,
1169	.module = THIS_MODULE,
1170	.attach = alua_bus_attach,
1171	.detach = alua_bus_detach,
1172	.prep_fn = alua_prep_fn,
1173	.check_sense = alua_check_sense,
1174	.activate = alua_activate,
1175	.rescan = alua_rescan,
1176	.set_params = alua_set_params,
1177};
1178
1179static int __init alua_init(void)
1180{
1181	int r;
1182
1183	kaluad_wq = alloc_workqueue("kaluad", WQ_MEM_RECLAIM, 0);
1184	if (!kaluad_wq)
1185		return -ENOMEM;
1186
1187	r = scsi_register_device_handler(&alua_dh);
1188	if (r != 0) {
1189		printk(KERN_ERR "%s: Failed to register scsi device handler",
1190			ALUA_DH_NAME);
1191		destroy_workqueue(kaluad_wq);
1192	}
1193	return r;
1194}
1195
1196static void __exit alua_exit(void)
1197{
1198	scsi_unregister_device_handler(&alua_dh);
1199	destroy_workqueue(kaluad_wq);
1200}
1201
1202module_init(alua_init);
1203module_exit(alua_exit);
1204
1205MODULE_DESCRIPTION("DM Multipath ALUA support");
1206MODULE_AUTHOR("Hannes Reinecke <hare@suse.de>");
1207MODULE_LICENSE("GPL");
1208MODULE_VERSION(ALUA_DH_VER);
1209