1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3**  IA64 System Bus Adapter (SBA) I/O MMU manager
4**
5**	(c) Copyright 2002-2005 Alex Williamson
6**	(c) Copyright 2002-2003 Grant Grundler
7**	(c) Copyright 2002-2005 Hewlett-Packard Company
8**
9**	Portions (c) 2000 Grant Grundler (from parisc I/O MMU code)
10**	Portions (c) 1999 Dave S. Miller (from sparc64 I/O MMU code)
11**
12**
13**
14** This module initializes the IOC (I/O Controller) found on HP
15** McKinley machines and their successors.
16**
17*/
18
19#include <linux/types.h>
20#include <linux/kernel.h>
21#include <linux/module.h>
22#include <linux/spinlock.h>
23#include <linux/slab.h>
24#include <linux/init.h>
25#include <linux/mm.h>
26#include <linux/string.h>
27#include <linux/pci.h>
28#include <linux/proc_fs.h>
29#include <linux/seq_file.h>
30#include <linux/acpi.h>
31#include <linux/efi.h>
32#include <linux/nodemask.h>
33#include <linux/bitops.h>         /* hweight64() */
34#include <linux/crash_dump.h>
35#include <linux/iommu-helper.h>
36#include <linux/dma-map-ops.h>
37#include <linux/prefetch.h>
38#include <linux/swiotlb.h>
39
40#include <asm/delay.h>		/* ia64_get_itc() */
41#include <asm/io.h>
42#include <asm/page.h>		/* PAGE_OFFSET */
43#include <asm/dma.h>
44
45#include <asm/acpi-ext.h>
46
47#define PFX "IOC: "
48
49/*
50** Enabling timing search of the pdir resource map.  Output in /proc.
51** Disabled by default to optimize performance.
52*/
53#undef PDIR_SEARCH_TIMING
54
55/*
56** This option allows cards capable of 64bit DMA to bypass the IOMMU.  If
57** not defined, all DMA will be 32bit and go through the TLB.
58** There's potentially a conflict in the bio merge code with us
59** advertising an iommu, but then bypassing it.  Since I/O MMU bypassing
60** appears to give more performance than bio-level virtual merging, we'll
61** do the former for now.  NOTE: BYPASS_SG also needs to be undef'd to
62** completely restrict DMA to the IOMMU.
63*/
64#define ALLOW_IOV_BYPASS
65
66/*
67** This option specifically allows/disallows bypassing scatterlists with
68** multiple entries.  Coalescing these entries can allow better DMA streaming
69** and in some cases shows better performance than entirely bypassing the
70** IOMMU.  Performance increase on the order of 1-2% sequential output/input
71** using bonnie++ on a RAID0 MD device (sym2 & mpt).
72*/
73#undef ALLOW_IOV_BYPASS_SG
74
75/*
76** If a device prefetches beyond the end of a valid pdir entry, it will cause
77** a hard failure, ie. MCA.  Version 3.0 and later of the zx1 LBA should
78** disconnect on 4k boundaries and prevent such issues.  If the device is
79** particularly aggressive, this option will keep the entire pdir valid such
80** that prefetching will hit a valid address.  This could severely impact
81** error containment, and is therefore off by default.  The page that is
82** used for spill-over is poisoned, so that should help debugging somewhat.
83*/
84#undef FULL_VALID_PDIR
85
86#define ENABLE_MARK_CLEAN
87
88/*
89** The number of debug flags is a clue - this code is fragile.  NOTE: since
90** tightening the use of res_lock the resource bitmap and actual pdir are no
91** longer guaranteed to stay in sync.  The sanity checking code isn't going to
92** like that.
93*/
94#undef DEBUG_SBA_INIT
95#undef DEBUG_SBA_RUN
96#undef DEBUG_SBA_RUN_SG
97#undef DEBUG_SBA_RESOURCE
98#undef ASSERT_PDIR_SANITY
99#undef DEBUG_LARGE_SG_ENTRIES
100#undef DEBUG_BYPASS
101
102#if defined(FULL_VALID_PDIR) && defined(ASSERT_PDIR_SANITY)
103#error FULL_VALID_PDIR and ASSERT_PDIR_SANITY are mutually exclusive
104#endif
105
106#define SBA_INLINE	__inline__
107/* #define SBA_INLINE */
108
109#ifdef DEBUG_SBA_INIT
110#define DBG_INIT(x...)	printk(x)
111#else
112#define DBG_INIT(x...)
113#endif
114
115#ifdef DEBUG_SBA_RUN
116#define DBG_RUN(x...)	printk(x)
117#else
118#define DBG_RUN(x...)
119#endif
120
121#ifdef DEBUG_SBA_RUN_SG
122#define DBG_RUN_SG(x...)	printk(x)
123#else
124#define DBG_RUN_SG(x...)
125#endif
126
127
128#ifdef DEBUG_SBA_RESOURCE
129#define DBG_RES(x...)	printk(x)
130#else
131#define DBG_RES(x...)
132#endif
133
134#ifdef DEBUG_BYPASS
135#define DBG_BYPASS(x...)	printk(x)
136#else
137#define DBG_BYPASS(x...)
138#endif
139
140#ifdef ASSERT_PDIR_SANITY
141#define ASSERT(expr) \
142        if(!(expr)) { \
143                printk( "\n" __FILE__ ":%d: Assertion " #expr " failed!\n",__LINE__); \
144                panic(#expr); \
145        }
146#else
147#define ASSERT(expr)
148#endif
149
150/*
151** The number of pdir entries to "free" before issuing
152** a read to PCOM register to flush out PCOM writes.
153** Interacts with allocation granularity (ie 4 or 8 entries
154** allocated and free'd/purged at a time might make this
155** less interesting).
156*/
157#define DELAYED_RESOURCE_CNT	64
158
159#define PCI_DEVICE_ID_HP_SX2000_IOC	0x12ec
160
161#define ZX1_IOC_ID	((PCI_DEVICE_ID_HP_ZX1_IOC << 16) | PCI_VENDOR_ID_HP)
162#define ZX2_IOC_ID	((PCI_DEVICE_ID_HP_ZX2_IOC << 16) | PCI_VENDOR_ID_HP)
163#define REO_IOC_ID	((PCI_DEVICE_ID_HP_REO_IOC << 16) | PCI_VENDOR_ID_HP)
164#define SX1000_IOC_ID	((PCI_DEVICE_ID_HP_SX1000_IOC << 16) | PCI_VENDOR_ID_HP)
165#define SX2000_IOC_ID	((PCI_DEVICE_ID_HP_SX2000_IOC << 16) | PCI_VENDOR_ID_HP)
166
167#define ZX1_IOC_OFFSET	0x1000	/* ACPI reports SBA, we want IOC */
168
169#define IOC_FUNC_ID	0x000
170#define IOC_FCLASS	0x008	/* function class, bist, header, rev... */
171#define IOC_IBASE	0x300	/* IO TLB */
172#define IOC_IMASK	0x308
173#define IOC_PCOM	0x310
174#define IOC_TCNFG	0x318
175#define IOC_PDIR_BASE	0x320
176
177#define IOC_ROPE0_CFG	0x500
178#define   IOC_ROPE_AO	  0x10	/* Allow "Relaxed Ordering" */
179
180
181/* AGP GART driver looks for this */
182#define ZX1_SBA_IOMMU_COOKIE	0x0000badbadc0ffeeUL
183
184/*
185** The zx1 IOC supports 4/8/16/64KB page sizes (see TCNFG register)
186**
187** Some IOCs (sx1000) can run at the above pages sizes, but are
188** really only supported using the IOC at a 4k page size.
189**
190** iovp_size could only be greater than PAGE_SIZE if we are
191** confident the drivers really only touch the next physical
192** page iff that driver instance owns it.
193*/
194static unsigned long iovp_size;
195static unsigned long iovp_shift;
196static unsigned long iovp_mask;
197
198struct ioc {
199	void __iomem	*ioc_hpa;	/* I/O MMU base address */
200	char		*res_map;	/* resource map, bit == pdir entry */
201	u64		*pdir_base;	/* physical base address */
202	unsigned long	ibase;		/* pdir IOV Space base */
203	unsigned long	imask;		/* pdir IOV Space mask */
204
205	unsigned long	*res_hint;	/* next avail IOVP - circular search */
206	unsigned long	dma_mask;
207	spinlock_t	res_lock;	/* protects the resource bitmap, but must be held when */
208					/* clearing pdir to prevent races with allocations. */
209	unsigned int	res_bitshift;	/* from the RIGHT! */
210	unsigned int	res_size;	/* size of resource map in bytes */
211#ifdef CONFIG_NUMA
212	unsigned int	node;		/* node where this IOC lives */
213#endif
214#if DELAYED_RESOURCE_CNT > 0
215	spinlock_t	saved_lock;	/* may want to try to get this on a separate cacheline */
216					/* than res_lock for bigger systems. */
217	int		saved_cnt;
218	struct sba_dma_pair {
219		dma_addr_t	iova;
220		size_t		size;
221	} saved[DELAYED_RESOURCE_CNT];
222#endif
223
224#ifdef PDIR_SEARCH_TIMING
225#define SBA_SEARCH_SAMPLE	0x100
226	unsigned long avg_search[SBA_SEARCH_SAMPLE];
227	unsigned long avg_idx;	/* current index into avg_search */
228#endif
229
230	/* Stuff we don't need in performance path */
231	struct ioc	*next;		/* list of IOC's in system */
232	acpi_handle	handle;		/* for multiple IOC's */
233	const char 	*name;
234	unsigned int	func_id;
235	unsigned int	rev;		/* HW revision of chip */
236	u32		iov_size;
237	unsigned int	pdir_size;	/* in bytes, determined by IOV Space size */
238	struct pci_dev	*sac_only_dev;
239};
240
241static struct ioc *ioc_list, *ioc_found;
242static int reserve_sba_gart = 1;
243
244static SBA_INLINE void sba_mark_invalid(struct ioc *, dma_addr_t, size_t);
245static SBA_INLINE void sba_free_range(struct ioc *, dma_addr_t, size_t);
246
247#define sba_sg_address(sg)	sg_virt((sg))
248
249#ifdef FULL_VALID_PDIR
250static u64 prefetch_spill_page;
251#endif
252
253#define GET_IOC(dev)	((dev_is_pci(dev))						\
254			 ? ((struct ioc *) PCI_CONTROLLER(to_pci_dev(dev))->iommu) : NULL)
255
256/*
257** DMA_CHUNK_SIZE is used by the SCSI mid-layer to break up
258** (or rather not merge) DMAs into manageable chunks.
259** On parisc, this is more of the software/tuning constraint
260** rather than the HW. I/O MMU allocation algorithms can be
261** faster with smaller sizes (to some degree).
262*/
263#define DMA_CHUNK_SIZE  (BITS_PER_LONG*iovp_size)
264
265#define ROUNDUP(x,y) ((x + ((y)-1)) & ~((y)-1))
266
267/************************************
268** SBA register read and write support
269**
270** BE WARNED: register writes are posted.
271**  (ie follow writes which must reach HW with a read)
272**
273*/
274#define READ_REG(addr)       __raw_readq(addr)
275#define WRITE_REG(val, addr) __raw_writeq(val, addr)
276
277#ifdef DEBUG_SBA_INIT
278
279/**
280 * sba_dump_tlb - debugging only - print IOMMU operating parameters
281 * @hpa: base address of the IOMMU
282 *
283 * Print the size/location of the IO MMU PDIR.
284 */
285static void
286sba_dump_tlb(char *hpa)
287{
288	DBG_INIT("IO TLB at 0x%p\n", (void *)hpa);
289	DBG_INIT("IOC_IBASE    : %016lx\n", READ_REG(hpa+IOC_IBASE));
290	DBG_INIT("IOC_IMASK    : %016lx\n", READ_REG(hpa+IOC_IMASK));
291	DBG_INIT("IOC_TCNFG    : %016lx\n", READ_REG(hpa+IOC_TCNFG));
292	DBG_INIT("IOC_PDIR_BASE: %016lx\n", READ_REG(hpa+IOC_PDIR_BASE));
293	DBG_INIT("\n");
294}
295#endif
296
297
298#ifdef ASSERT_PDIR_SANITY
299
300/**
301 * sba_dump_pdir_entry - debugging only - print one IOMMU PDIR entry
302 * @ioc: IO MMU structure which owns the pdir we are interested in.
303 * @msg: text to print ont the output line.
304 * @pide: pdir index.
305 *
306 * Print one entry of the IO MMU PDIR in human readable form.
307 */
308static void
309sba_dump_pdir_entry(struct ioc *ioc, char *msg, uint pide)
310{
311	/* start printing from lowest pde in rval */
312	u64 *ptr = &ioc->pdir_base[pide  & ~(BITS_PER_LONG - 1)];
313	unsigned long *rptr = (unsigned long *) &ioc->res_map[(pide >>3) & -sizeof(unsigned long)];
314	uint rcnt;
315
316	printk(KERN_DEBUG "SBA: %s rp %p bit %d rval 0x%lx\n",
317		 msg, rptr, pide & (BITS_PER_LONG - 1), *rptr);
318
319	rcnt = 0;
320	while (rcnt < BITS_PER_LONG) {
321		printk(KERN_DEBUG "%s %2d %p %016Lx\n",
322		       (rcnt == (pide & (BITS_PER_LONG - 1)))
323		       ? "    -->" : "       ",
324		       rcnt, ptr, (unsigned long long) *ptr );
325		rcnt++;
326		ptr++;
327	}
328	printk(KERN_DEBUG "%s", msg);
329}
330
331
332/**
333 * sba_check_pdir - debugging only - consistency checker
334 * @ioc: IO MMU structure which owns the pdir we are interested in.
335 * @msg: text to print ont the output line.
336 *
337 * Verify the resource map and pdir state is consistent
338 */
339static int
340sba_check_pdir(struct ioc *ioc, char *msg)
341{
342	u64 *rptr_end = (u64 *) &(ioc->res_map[ioc->res_size]);
343	u64 *rptr = (u64 *) ioc->res_map;	/* resource map ptr */
344	u64 *pptr = ioc->pdir_base;	/* pdir ptr */
345	uint pide = 0;
346
347	while (rptr < rptr_end) {
348		u64 rval;
349		int rcnt; /* number of bits we might check */
350
351		rval = *rptr;
352		rcnt = 64;
353
354		while (rcnt) {
355			/* Get last byte and highest bit from that */
356			u32 pde = ((u32)((*pptr >> (63)) & 0x1));
357			if ((rval & 0x1) ^ pde)
358			{
359				/*
360				** BUMMER!  -- res_map != pdir --
361				** Dump rval and matching pdir entries
362				*/
363				sba_dump_pdir_entry(ioc, msg, pide);
364				return(1);
365			}
366			rcnt--;
367			rval >>= 1;	/* try the next bit */
368			pptr++;
369			pide++;
370		}
371		rptr++;	/* look at next word of res_map */
372	}
373	/* It'd be nice if we always got here :^) */
374	return 0;
375}
376
377
378/**
379 * sba_dump_sg - debugging only - print Scatter-Gather list
380 * @ioc: IO MMU structure which owns the pdir we are interested in.
381 * @startsg: head of the SG list
382 * @nents: number of entries in SG list
383 *
384 * print the SG list so we can verify it's correct by hand.
385 */
386static void
387sba_dump_sg( struct ioc *ioc, struct scatterlist *startsg, int nents)
388{
389	while (nents-- > 0) {
390		printk(KERN_DEBUG " %d : DMA %08lx/%05x CPU %p\n", nents,
391		       startsg->dma_address, startsg->dma_length,
392		       sba_sg_address(startsg));
393		startsg = sg_next(startsg);
394	}
395}
396
397static void
398sba_check_sg( struct ioc *ioc, struct scatterlist *startsg, int nents)
399{
400	struct scatterlist *the_sg = startsg;
401	int the_nents = nents;
402
403	while (the_nents-- > 0) {
404		if (sba_sg_address(the_sg) == 0x0UL)
405			sba_dump_sg(NULL, startsg, nents);
406		the_sg = sg_next(the_sg);
407	}
408}
409
410#endif /* ASSERT_PDIR_SANITY */
411
412
413
414
415/**************************************************************
416*
417*   I/O Pdir Resource Management
418*
419*   Bits set in the resource map are in use.
420*   Each bit can represent a number of pages.
421*   LSbs represent lower addresses (IOVA's).
422*
423***************************************************************/
424#define PAGES_PER_RANGE 1	/* could increase this to 4 or 8 if needed */
425
426/* Convert from IOVP to IOVA and vice versa. */
427#define SBA_IOVA(ioc,iovp,offset) ((ioc->ibase) | (iovp) | (offset))
428#define SBA_IOVP(ioc,iova) ((iova) & ~(ioc->ibase))
429
430#define PDIR_ENTRY_SIZE	sizeof(u64)
431
432#define PDIR_INDEX(iovp)   ((iovp)>>iovp_shift)
433
434#define RESMAP_MASK(n)    ~(~0UL << (n))
435#define RESMAP_IDX_MASK   (sizeof(unsigned long) - 1)
436
437
438/**
439 * For most cases the normal get_order is sufficient, however it limits us
440 * to PAGE_SIZE being the minimum mapping alignment and TC flush granularity.
441 * It only incurs about 1 clock cycle to use this one with the static variable
442 * and makes the code more intuitive.
443 */
444static SBA_INLINE int
445get_iovp_order (unsigned long size)
446{
447	long double d = size - 1;
448	long order;
449
450	order = ia64_getf_exp(d);
451	order = order - iovp_shift - 0xffff + 1;
452	if (order < 0)
453		order = 0;
454	return order;
455}
456
457static unsigned long ptr_to_pide(struct ioc *ioc, unsigned long *res_ptr,
458				 unsigned int bitshiftcnt)
459{
460	return (((unsigned long)res_ptr - (unsigned long)ioc->res_map) << 3)
461		+ bitshiftcnt;
462}
463
464/**
465 * sba_search_bitmap - find free space in IO PDIR resource bitmap
466 * @ioc: IO MMU structure which owns the pdir we are interested in.
467 * @bits_wanted: number of entries we need.
468 * @use_hint: use res_hint to indicate where to start looking
469 *
470 * Find consecutive free bits in resource bitmap.
471 * Each bit represents one entry in the IO Pdir.
472 * Cool perf optimization: search for log2(size) bits at a time.
473 */
474static SBA_INLINE unsigned long
475sba_search_bitmap(struct ioc *ioc, struct device *dev,
476		  unsigned long bits_wanted, int use_hint)
477{
478	unsigned long *res_ptr;
479	unsigned long *res_end = (unsigned long *) &(ioc->res_map[ioc->res_size]);
480	unsigned long flags, pide = ~0UL, tpide;
481	unsigned long boundary_size;
482	unsigned long shift;
483	int ret;
484
485	ASSERT(((unsigned long) ioc->res_hint & (sizeof(unsigned long) - 1UL)) == 0);
486	ASSERT(res_ptr < res_end);
487
488	boundary_size = dma_get_seg_boundary_nr_pages(dev, iovp_shift);
489
490	BUG_ON(ioc->ibase & ~iovp_mask);
491	shift = ioc->ibase >> iovp_shift;
492
493	spin_lock_irqsave(&ioc->res_lock, flags);
494
495	/* Allow caller to force a search through the entire resource space */
496	if (likely(use_hint)) {
497		res_ptr = ioc->res_hint;
498	} else {
499		res_ptr = (ulong *)ioc->res_map;
500		ioc->res_bitshift = 0;
501	}
502
503	/*
504	 * N.B.  REO/Grande defect AR2305 can cause TLB fetch timeouts
505	 * if a TLB entry is purged while in use.  sba_mark_invalid()
506	 * purges IOTLB entries in power-of-two sizes, so we also
507	 * allocate IOVA space in power-of-two sizes.
508	 */
509	bits_wanted = 1UL << get_iovp_order(bits_wanted << iovp_shift);
510
511	if (likely(bits_wanted == 1)) {
512		unsigned int bitshiftcnt;
513		for(; res_ptr < res_end ; res_ptr++) {
514			if (likely(*res_ptr != ~0UL)) {
515				bitshiftcnt = ffz(*res_ptr);
516				*res_ptr |= (1UL << bitshiftcnt);
517				pide = ptr_to_pide(ioc, res_ptr, bitshiftcnt);
518				ioc->res_bitshift = bitshiftcnt + bits_wanted;
519				goto found_it;
520			}
521		}
522		goto not_found;
523
524	}
525
526	if (likely(bits_wanted <= BITS_PER_LONG/2)) {
527		/*
528		** Search the resource bit map on well-aligned values.
529		** "o" is the alignment.
530		** We need the alignment to invalidate I/O TLB using
531		** SBA HW features in the unmap path.
532		*/
533		unsigned long o = 1 << get_iovp_order(bits_wanted << iovp_shift);
534		uint bitshiftcnt = ROUNDUP(ioc->res_bitshift, o);
535		unsigned long mask, base_mask;
536
537		base_mask = RESMAP_MASK(bits_wanted);
538		mask = base_mask << bitshiftcnt;
539
540		DBG_RES("%s() o %ld %p", __func__, o, res_ptr);
541		for(; res_ptr < res_end ; res_ptr++)
542		{
543			DBG_RES("    %p %lx %lx\n", res_ptr, mask, *res_ptr);
544			ASSERT(0 != mask);
545			for (; mask ; mask <<= o, bitshiftcnt += o) {
546				tpide = ptr_to_pide(ioc, res_ptr, bitshiftcnt);
547				ret = iommu_is_span_boundary(tpide, bits_wanted,
548							     shift,
549							     boundary_size);
550				if ((0 == ((*res_ptr) & mask)) && !ret) {
551					*res_ptr |= mask;     /* mark resources busy! */
552					pide = tpide;
553					ioc->res_bitshift = bitshiftcnt + bits_wanted;
554					goto found_it;
555				}
556			}
557
558			bitshiftcnt = 0;
559			mask = base_mask;
560
561		}
562
563	} else {
564		int qwords, bits, i;
565		unsigned long *end;
566
567		qwords = bits_wanted >> 6; /* /64 */
568		bits = bits_wanted - (qwords * BITS_PER_LONG);
569
570		end = res_end - qwords;
571
572		for (; res_ptr < end; res_ptr++) {
573			tpide = ptr_to_pide(ioc, res_ptr, 0);
574			ret = iommu_is_span_boundary(tpide, bits_wanted,
575						     shift, boundary_size);
576			if (ret)
577				goto next_ptr;
578			for (i = 0 ; i < qwords ; i++) {
579				if (res_ptr[i] != 0)
580					goto next_ptr;
581			}
582			if (bits && res_ptr[i] && (__ffs(res_ptr[i]) < bits))
583				continue;
584
585			/* Found it, mark it */
586			for (i = 0 ; i < qwords ; i++)
587				res_ptr[i] = ~0UL;
588			res_ptr[i] |= RESMAP_MASK(bits);
589
590			pide = tpide;
591			res_ptr += qwords;
592			ioc->res_bitshift = bits;
593			goto found_it;
594next_ptr:
595			;
596		}
597	}
598
599not_found:
600	prefetch(ioc->res_map);
601	ioc->res_hint = (unsigned long *) ioc->res_map;
602	ioc->res_bitshift = 0;
603	spin_unlock_irqrestore(&ioc->res_lock, flags);
604	return (pide);
605
606found_it:
607	ioc->res_hint = res_ptr;
608	spin_unlock_irqrestore(&ioc->res_lock, flags);
609	return (pide);
610}
611
612
613/**
614 * sba_alloc_range - find free bits and mark them in IO PDIR resource bitmap
615 * @ioc: IO MMU structure which owns the pdir we are interested in.
616 * @size: number of bytes to create a mapping for
617 *
618 * Given a size, find consecutive unmarked and then mark those bits in the
619 * resource bit map.
620 */
621static int
622sba_alloc_range(struct ioc *ioc, struct device *dev, size_t size)
623{
624	unsigned int pages_needed = size >> iovp_shift;
625#ifdef PDIR_SEARCH_TIMING
626	unsigned long itc_start;
627#endif
628	unsigned long pide;
629
630	ASSERT(pages_needed);
631	ASSERT(0 == (size & ~iovp_mask));
632
633#ifdef PDIR_SEARCH_TIMING
634	itc_start = ia64_get_itc();
635#endif
636	/*
637	** "seek and ye shall find"...praying never hurts either...
638	*/
639	pide = sba_search_bitmap(ioc, dev, pages_needed, 1);
640	if (unlikely(pide >= (ioc->res_size << 3))) {
641		pide = sba_search_bitmap(ioc, dev, pages_needed, 0);
642		if (unlikely(pide >= (ioc->res_size << 3))) {
643#if DELAYED_RESOURCE_CNT > 0
644			unsigned long flags;
645
646			/*
647			** With delayed resource freeing, we can give this one more shot.  We're
648			** getting close to being in trouble here, so do what we can to make this
649			** one count.
650			*/
651			spin_lock_irqsave(&ioc->saved_lock, flags);
652			if (ioc->saved_cnt > 0) {
653				struct sba_dma_pair *d;
654				int cnt = ioc->saved_cnt;
655
656				d = &(ioc->saved[ioc->saved_cnt - 1]);
657
658				spin_lock(&ioc->res_lock);
659				while (cnt--) {
660					sba_mark_invalid(ioc, d->iova, d->size);
661					sba_free_range(ioc, d->iova, d->size);
662					d--;
663				}
664				ioc->saved_cnt = 0;
665				READ_REG(ioc->ioc_hpa+IOC_PCOM);	/* flush purges */
666				spin_unlock(&ioc->res_lock);
667			}
668			spin_unlock_irqrestore(&ioc->saved_lock, flags);
669
670			pide = sba_search_bitmap(ioc, dev, pages_needed, 0);
671			if (unlikely(pide >= (ioc->res_size << 3))) {
672				printk(KERN_WARNING "%s: I/O MMU @ %p is"
673				       "out of mapping resources, %u %u %lx\n",
674				       __func__, ioc->ioc_hpa, ioc->res_size,
675				       pages_needed, dma_get_seg_boundary(dev));
676				return -1;
677			}
678#else
679			printk(KERN_WARNING "%s: I/O MMU @ %p is"
680			       "out of mapping resources, %u %u %lx\n",
681			       __func__, ioc->ioc_hpa, ioc->res_size,
682			       pages_needed, dma_get_seg_boundary(dev));
683			return -1;
684#endif
685		}
686	}
687
688#ifdef PDIR_SEARCH_TIMING
689	ioc->avg_search[ioc->avg_idx++] = (ia64_get_itc() - itc_start) / pages_needed;
690	ioc->avg_idx &= SBA_SEARCH_SAMPLE - 1;
691#endif
692
693	prefetchw(&(ioc->pdir_base[pide]));
694
695#ifdef ASSERT_PDIR_SANITY
696	/* verify the first enable bit is clear */
697	if(0x00 != ((u8 *) ioc->pdir_base)[pide*PDIR_ENTRY_SIZE + 7]) {
698		sba_dump_pdir_entry(ioc, "sba_search_bitmap() botched it?", pide);
699	}
700#endif
701
702	DBG_RES("%s(%x) %d -> %lx hint %x/%x\n",
703		__func__, size, pages_needed, pide,
704		(uint) ((unsigned long) ioc->res_hint - (unsigned long) ioc->res_map),
705		ioc->res_bitshift );
706
707	return (pide);
708}
709
710
711/**
712 * sba_free_range - unmark bits in IO PDIR resource bitmap
713 * @ioc: IO MMU structure which owns the pdir we are interested in.
714 * @iova: IO virtual address which was previously allocated.
715 * @size: number of bytes to create a mapping for
716 *
717 * clear bits in the ioc's resource map
718 */
719static SBA_INLINE void
720sba_free_range(struct ioc *ioc, dma_addr_t iova, size_t size)
721{
722	unsigned long iovp = SBA_IOVP(ioc, iova);
723	unsigned int pide = PDIR_INDEX(iovp);
724	unsigned int ridx = pide >> 3;	/* convert bit to byte address */
725	unsigned long *res_ptr = (unsigned long *) &((ioc)->res_map[ridx & ~RESMAP_IDX_MASK]);
726	int bits_not_wanted = size >> iovp_shift;
727	unsigned long m;
728
729	/* Round up to power-of-two size: see AR2305 note above */
730	bits_not_wanted = 1UL << get_iovp_order(bits_not_wanted << iovp_shift);
731	for (; bits_not_wanted > 0 ; res_ptr++) {
732
733		if (unlikely(bits_not_wanted > BITS_PER_LONG)) {
734
735			/* these mappings start 64bit aligned */
736			*res_ptr = 0UL;
737			bits_not_wanted -= BITS_PER_LONG;
738			pide += BITS_PER_LONG;
739
740		} else {
741
742			/* 3-bits "bit" address plus 2 (or 3) bits for "byte" == bit in word */
743			m = RESMAP_MASK(bits_not_wanted) << (pide & (BITS_PER_LONG - 1));
744			bits_not_wanted = 0;
745
746			DBG_RES("%s( ,%x,%x) %x/%lx %x %p %lx\n", __func__, (uint) iova, size,
747			        bits_not_wanted, m, pide, res_ptr, *res_ptr);
748
749			ASSERT(m != 0);
750			ASSERT(bits_not_wanted);
751			ASSERT((*res_ptr & m) == m); /* verify same bits are set */
752			*res_ptr &= ~m;
753		}
754	}
755}
756
757
758/**************************************************************
759*
760*   "Dynamic DMA Mapping" support (aka "Coherent I/O")
761*
762***************************************************************/
763
764/**
765 * sba_io_pdir_entry - fill in one IO PDIR entry
766 * @pdir_ptr:  pointer to IO PDIR entry
767 * @vba: Virtual CPU address of buffer to map
768 *
769 * SBA Mapping Routine
770 *
771 * Given a virtual address (vba, arg1) sba_io_pdir_entry()
772 * loads the I/O PDIR entry pointed to by pdir_ptr (arg0).
773 * Each IO Pdir entry consists of 8 bytes as shown below
774 * (LSB == bit 0):
775 *
776 *  63                    40                                 11    7        0
777 * +-+---------------------+----------------------------------+----+--------+
778 * |V|        U            |            PPN[39:12]            | U  |   FF   |
779 * +-+---------------------+----------------------------------+----+--------+
780 *
781 *  V  == Valid Bit
782 *  U  == Unused
783 * PPN == Physical Page Number
784 *
785 * The physical address fields are filled with the results of virt_to_phys()
786 * on the vba.
787 */
788
789#if 1
790#define sba_io_pdir_entry(pdir_ptr, vba) *pdir_ptr = ((vba & ~0xE000000000000FFFULL)	\
791						      | 0x8000000000000000ULL)
792#else
793void SBA_INLINE
794sba_io_pdir_entry(u64 *pdir_ptr, unsigned long vba)
795{
796	*pdir_ptr = ((vba & ~0xE000000000000FFFULL) | 0x80000000000000FFULL);
797}
798#endif
799
800#ifdef ENABLE_MARK_CLEAN
801/*
802 * Since DMA is i-cache coherent, any (complete) pages that were written via
803 * DMA can be marked as "clean" so that lazy_mmu_prot_update() doesn't have to
804 * flush them when they get mapped into an executable vm-area.
805 */
806static void mark_clean(void *addr, size_t size)
807{
808	struct folio *folio = virt_to_folio(addr);
809	ssize_t left = size;
810	size_t offset = offset_in_folio(folio, addr);
811
812	if (offset) {
813		left -= folio_size(folio) - offset;
814		if (left <= 0)
815			return;
816		folio = folio_next(folio);
817	}
818
819	while (left >= folio_size(folio)) {
820		left -= folio_size(folio);
821		set_bit(PG_arch_1, &folio->flags);
822		if (!left)
823			break;
824		folio = folio_next(folio);
825	}
826}
827#endif
828
829/**
830 * sba_mark_invalid - invalidate one or more IO PDIR entries
831 * @ioc: IO MMU structure which owns the pdir we are interested in.
832 * @iova:  IO Virtual Address mapped earlier
833 * @byte_cnt:  number of bytes this mapping covers.
834 *
835 * Marking the IO PDIR entry(ies) as Invalid and invalidate
836 * corresponding IO TLB entry. The PCOM (Purge Command Register)
837 * is to purge stale entries in the IO TLB when unmapping entries.
838 *
839 * The PCOM register supports purging of multiple pages, with a minium
840 * of 1 page and a maximum of 2GB. Hardware requires the address be
841 * aligned to the size of the range being purged. The size of the range
842 * must be a power of 2. The "Cool perf optimization" in the
843 * allocation routine helps keep that true.
844 */
845static SBA_INLINE void
846sba_mark_invalid(struct ioc *ioc, dma_addr_t iova, size_t byte_cnt)
847{
848	u32 iovp = (u32) SBA_IOVP(ioc,iova);
849
850	int off = PDIR_INDEX(iovp);
851
852	/* Must be non-zero and rounded up */
853	ASSERT(byte_cnt > 0);
854	ASSERT(0 == (byte_cnt & ~iovp_mask));
855
856#ifdef ASSERT_PDIR_SANITY
857	/* Assert first pdir entry is set */
858	if (!(ioc->pdir_base[off] >> 60)) {
859		sba_dump_pdir_entry(ioc,"sba_mark_invalid()", PDIR_INDEX(iovp));
860	}
861#endif
862
863	if (byte_cnt <= iovp_size)
864	{
865		ASSERT(off < ioc->pdir_size);
866
867		iovp |= iovp_shift;     /* set "size" field for PCOM */
868
869#ifndef FULL_VALID_PDIR
870		/*
871		** clear I/O PDIR entry "valid" bit
872		** Do NOT clear the rest - save it for debugging.
873		** We should only clear bits that have previously
874		** been enabled.
875		*/
876		ioc->pdir_base[off] &= ~(0x80000000000000FFULL);
877#else
878		/*
879  		** If we want to maintain the PDIR as valid, put in
880		** the spill page so devices prefetching won't
881		** cause a hard fail.
882		*/
883		ioc->pdir_base[off] = (0x80000000000000FFULL | prefetch_spill_page);
884#endif
885	} else {
886		u32 t = get_iovp_order(byte_cnt) + iovp_shift;
887
888		iovp |= t;
889		ASSERT(t <= 31);   /* 2GB! Max value of "size" field */
890
891		do {
892			/* verify this pdir entry is enabled */
893			ASSERT(ioc->pdir_base[off]  >> 63);
894#ifndef FULL_VALID_PDIR
895			/* clear I/O Pdir entry "valid" bit first */
896			ioc->pdir_base[off] &= ~(0x80000000000000FFULL);
897#else
898			ioc->pdir_base[off] = (0x80000000000000FFULL | prefetch_spill_page);
899#endif
900			off++;
901			byte_cnt -= iovp_size;
902		} while (byte_cnt > 0);
903	}
904
905	WRITE_REG(iovp | ioc->ibase, ioc->ioc_hpa+IOC_PCOM);
906}
907
908/**
909 * sba_map_page - map one buffer and return IOVA for DMA
910 * @dev: instance of PCI owned by the driver that's asking.
911 * @page: page to map
912 * @poff: offset into page
913 * @size: number of bytes to map
914 * @dir: dma direction
915 * @attrs: optional dma attributes
916 *
917 * See Documentation/core-api/dma-api-howto.rst
918 */
919static dma_addr_t sba_map_page(struct device *dev, struct page *page,
920			       unsigned long poff, size_t size,
921			       enum dma_data_direction dir,
922			       unsigned long attrs)
923{
924	struct ioc *ioc;
925	void *addr = page_address(page) + poff;
926	dma_addr_t iovp;
927	dma_addr_t offset;
928	u64 *pdir_start;
929	int pide;
930#ifdef ASSERT_PDIR_SANITY
931	unsigned long flags;
932#endif
933#ifdef ALLOW_IOV_BYPASS
934	unsigned long pci_addr = virt_to_phys(addr);
935#endif
936
937#ifdef ALLOW_IOV_BYPASS
938	ASSERT(to_pci_dev(dev)->dma_mask);
939	/*
940 	** Check if the PCI device can DMA to ptr... if so, just return ptr
941 	*/
942	if (likely((pci_addr & ~to_pci_dev(dev)->dma_mask) == 0)) {
943		/*
944 		** Device is bit capable of DMA'ing to the buffer...
945		** just return the PCI address of ptr
946 		*/
947		DBG_BYPASS("sba_map_page() bypass mask/addr: "
948			   "0x%lx/0x%lx\n",
949		           to_pci_dev(dev)->dma_mask, pci_addr);
950		return pci_addr;
951	}
952#endif
953	ioc = GET_IOC(dev);
954	ASSERT(ioc);
955
956	prefetch(ioc->res_hint);
957
958	ASSERT(size > 0);
959	ASSERT(size <= DMA_CHUNK_SIZE);
960
961	/* save offset bits */
962	offset = ((dma_addr_t) (long) addr) & ~iovp_mask;
963
964	/* round up to nearest iovp_size */
965	size = (size + offset + ~iovp_mask) & iovp_mask;
966
967#ifdef ASSERT_PDIR_SANITY
968	spin_lock_irqsave(&ioc->res_lock, flags);
969	if (sba_check_pdir(ioc,"Check before sba_map_page()"))
970		panic("Sanity check failed");
971	spin_unlock_irqrestore(&ioc->res_lock, flags);
972#endif
973
974	pide = sba_alloc_range(ioc, dev, size);
975	if (pide < 0)
976		return DMA_MAPPING_ERROR;
977
978	iovp = (dma_addr_t) pide << iovp_shift;
979
980	DBG_RUN("%s() 0x%p -> 0x%lx\n", __func__, addr, (long) iovp | offset);
981
982	pdir_start = &(ioc->pdir_base[pide]);
983
984	while (size > 0) {
985		ASSERT(((u8 *)pdir_start)[7] == 0); /* verify availability */
986		sba_io_pdir_entry(pdir_start, (unsigned long) addr);
987
988		DBG_RUN("     pdir 0x%p %lx\n", pdir_start, *pdir_start);
989
990		addr += iovp_size;
991		size -= iovp_size;
992		pdir_start++;
993	}
994	/* force pdir update */
995	wmb();
996
997	/* form complete address */
998#ifdef ASSERT_PDIR_SANITY
999	spin_lock_irqsave(&ioc->res_lock, flags);
1000	sba_check_pdir(ioc,"Check after sba_map_page()");
1001	spin_unlock_irqrestore(&ioc->res_lock, flags);
1002#endif
1003	return SBA_IOVA(ioc, iovp, offset);
1004}
1005
1006#ifdef ENABLE_MARK_CLEAN
1007static SBA_INLINE void
1008sba_mark_clean(struct ioc *ioc, dma_addr_t iova, size_t size)
1009{
1010	u32	iovp = (u32) SBA_IOVP(ioc,iova);
1011	int	off = PDIR_INDEX(iovp);
1012	void	*addr;
1013
1014	if (size <= iovp_size) {
1015		addr = phys_to_virt(ioc->pdir_base[off] &
1016		                    ~0xE000000000000FFFULL);
1017		mark_clean(addr, size);
1018	} else {
1019		do {
1020			addr = phys_to_virt(ioc->pdir_base[off] &
1021			                    ~0xE000000000000FFFULL);
1022			mark_clean(addr, min(size, iovp_size));
1023			off++;
1024			size -= iovp_size;
1025		} while (size > 0);
1026	}
1027}
1028#endif
1029
1030/**
1031 * sba_unmap_page - unmap one IOVA and free resources
1032 * @dev: instance of PCI owned by the driver that's asking.
1033 * @iova:  IOVA of driver buffer previously mapped.
1034 * @size:  number of bytes mapped in driver buffer.
1035 * @dir:  R/W or both.
1036 * @attrs: optional dma attributes
1037 *
1038 * See Documentation/core-api/dma-api-howto.rst
1039 */
1040static void sba_unmap_page(struct device *dev, dma_addr_t iova, size_t size,
1041			   enum dma_data_direction dir, unsigned long attrs)
1042{
1043	struct ioc *ioc;
1044#if DELAYED_RESOURCE_CNT > 0
1045	struct sba_dma_pair *d;
1046#endif
1047	unsigned long flags;
1048	dma_addr_t offset;
1049
1050	ioc = GET_IOC(dev);
1051	ASSERT(ioc);
1052
1053#ifdef ALLOW_IOV_BYPASS
1054	if (likely((iova & ioc->imask) != ioc->ibase)) {
1055		/*
1056		** Address does not fall w/in IOVA, must be bypassing
1057		*/
1058		DBG_BYPASS("sba_unmap_page() bypass addr: 0x%lx\n",
1059			   iova);
1060
1061#ifdef ENABLE_MARK_CLEAN
1062		if (dir == DMA_FROM_DEVICE) {
1063			mark_clean(phys_to_virt(iova), size);
1064		}
1065#endif
1066		return;
1067	}
1068#endif
1069	offset = iova & ~iovp_mask;
1070
1071	DBG_RUN("%s() iovp 0x%lx/%x\n", __func__, (long) iova, size);
1072
1073	iova ^= offset;        /* clear offset bits */
1074	size += offset;
1075	size = ROUNDUP(size, iovp_size);
1076
1077#ifdef ENABLE_MARK_CLEAN
1078	if (dir == DMA_FROM_DEVICE)
1079		sba_mark_clean(ioc, iova, size);
1080#endif
1081
1082#if DELAYED_RESOURCE_CNT > 0
1083	spin_lock_irqsave(&ioc->saved_lock, flags);
1084	d = &(ioc->saved[ioc->saved_cnt]);
1085	d->iova = iova;
1086	d->size = size;
1087	if (unlikely(++(ioc->saved_cnt) >= DELAYED_RESOURCE_CNT)) {
1088		int cnt = ioc->saved_cnt;
1089		spin_lock(&ioc->res_lock);
1090		while (cnt--) {
1091			sba_mark_invalid(ioc, d->iova, d->size);
1092			sba_free_range(ioc, d->iova, d->size);
1093			d--;
1094		}
1095		ioc->saved_cnt = 0;
1096		READ_REG(ioc->ioc_hpa+IOC_PCOM);	/* flush purges */
1097		spin_unlock(&ioc->res_lock);
1098	}
1099	spin_unlock_irqrestore(&ioc->saved_lock, flags);
1100#else /* DELAYED_RESOURCE_CNT == 0 */
1101	spin_lock_irqsave(&ioc->res_lock, flags);
1102	sba_mark_invalid(ioc, iova, size);
1103	sba_free_range(ioc, iova, size);
1104	READ_REG(ioc->ioc_hpa+IOC_PCOM);	/* flush purges */
1105	spin_unlock_irqrestore(&ioc->res_lock, flags);
1106#endif /* DELAYED_RESOURCE_CNT == 0 */
1107}
1108
1109/**
1110 * sba_alloc_coherent - allocate/map shared mem for DMA
1111 * @dev: instance of PCI owned by the driver that's asking.
1112 * @size:  number of bytes mapped in driver buffer.
1113 * @dma_handle:  IOVA of new buffer.
1114 *
1115 * See Documentation/core-api/dma-api-howto.rst
1116 */
1117static void *
1118sba_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
1119		   gfp_t flags, unsigned long attrs)
1120{
1121	struct page *page;
1122	struct ioc *ioc;
1123	int node = -1;
1124	void *addr;
1125
1126	ioc = GET_IOC(dev);
1127	ASSERT(ioc);
1128#ifdef CONFIG_NUMA
1129	node = ioc->node;
1130#endif
1131
1132	page = alloc_pages_node(node, flags, get_order(size));
1133	if (unlikely(!page))
1134		return NULL;
1135
1136	addr = page_address(page);
1137	memset(addr, 0, size);
1138	*dma_handle = page_to_phys(page);
1139
1140#ifdef ALLOW_IOV_BYPASS
1141	ASSERT(dev->coherent_dma_mask);
1142	/*
1143 	** Check if the PCI device can DMA to ptr... if so, just return ptr
1144 	*/
1145	if (likely((*dma_handle & ~dev->coherent_dma_mask) == 0)) {
1146		DBG_BYPASS("sba_alloc_coherent() bypass mask/addr: 0x%lx/0x%lx\n",
1147		           dev->coherent_dma_mask, *dma_handle);
1148
1149		return addr;
1150	}
1151#endif
1152
1153	/*
1154	 * If device can't bypass or bypass is disabled, pass the 32bit fake
1155	 * device to map single to get an iova mapping.
1156	 */
1157	*dma_handle = sba_map_page(&ioc->sac_only_dev->dev, page, 0, size,
1158			DMA_BIDIRECTIONAL, 0);
1159	if (dma_mapping_error(dev, *dma_handle))
1160		return NULL;
1161	return addr;
1162}
1163
1164
1165/**
1166 * sba_free_coherent - free/unmap shared mem for DMA
1167 * @dev: instance of PCI owned by the driver that's asking.
1168 * @size:  number of bytes mapped in driver buffer.
1169 * @vaddr:  virtual address IOVA of "consistent" buffer.
1170 * @dma_handler:  IO virtual address of "consistent" buffer.
1171 *
1172 * See Documentation/core-api/dma-api-howto.rst
1173 */
1174static void sba_free_coherent(struct device *dev, size_t size, void *vaddr,
1175			      dma_addr_t dma_handle, unsigned long attrs)
1176{
1177	sba_unmap_page(dev, dma_handle, size, 0, 0);
1178	free_pages((unsigned long) vaddr, get_order(size));
1179}
1180
1181
1182/*
1183** Since 0 is a valid pdir_base index value, can't use that
1184** to determine if a value is valid or not. Use a flag to indicate
1185** the SG list entry contains a valid pdir index.
1186*/
1187#define PIDE_FLAG 0x1UL
1188
1189#ifdef DEBUG_LARGE_SG_ENTRIES
1190int dump_run_sg = 0;
1191#endif
1192
1193
1194/**
1195 * sba_fill_pdir - write allocated SG entries into IO PDIR
1196 * @ioc: IO MMU structure which owns the pdir we are interested in.
1197 * @startsg:  list of IOVA/size pairs
1198 * @nents: number of entries in startsg list
1199 *
1200 * Take preprocessed SG list and write corresponding entries
1201 * in the IO PDIR.
1202 */
1203
1204static SBA_INLINE int
1205sba_fill_pdir(
1206	struct ioc *ioc,
1207	struct scatterlist *startsg,
1208	int nents)
1209{
1210	struct scatterlist *dma_sg = startsg;	/* pointer to current DMA */
1211	int n_mappings = 0;
1212	u64 *pdirp = NULL;
1213	unsigned long dma_offset = 0;
1214
1215	while (nents-- > 0) {
1216		int     cnt = startsg->dma_length;
1217		startsg->dma_length = 0;
1218
1219#ifdef DEBUG_LARGE_SG_ENTRIES
1220		if (dump_run_sg)
1221			printk(" %2d : %08lx/%05x %p\n",
1222				nents, startsg->dma_address, cnt,
1223				sba_sg_address(startsg));
1224#else
1225		DBG_RUN_SG(" %d : %08lx/%05x %p\n",
1226				nents, startsg->dma_address, cnt,
1227				sba_sg_address(startsg));
1228#endif
1229		/*
1230		** Look for the start of a new DMA stream
1231		*/
1232		if (startsg->dma_address & PIDE_FLAG) {
1233			u32 pide = startsg->dma_address & ~PIDE_FLAG;
1234			dma_offset = (unsigned long) pide & ~iovp_mask;
1235			startsg->dma_address = 0;
1236			if (n_mappings)
1237				dma_sg = sg_next(dma_sg);
1238			dma_sg->dma_address = pide | ioc->ibase;
1239			pdirp = &(ioc->pdir_base[pide >> iovp_shift]);
1240			n_mappings++;
1241		}
1242
1243		/*
1244		** Look for a VCONTIG chunk
1245		*/
1246		if (cnt) {
1247			unsigned long vaddr = (unsigned long) sba_sg_address(startsg);
1248			ASSERT(pdirp);
1249
1250			/* Since multiple Vcontig blocks could make up
1251			** one DMA stream, *add* cnt to dma_len.
1252			*/
1253			dma_sg->dma_length += cnt;
1254			cnt += dma_offset;
1255			dma_offset=0;	/* only want offset on first chunk */
1256			cnt = ROUNDUP(cnt, iovp_size);
1257			do {
1258				sba_io_pdir_entry(pdirp, vaddr);
1259				vaddr += iovp_size;
1260				cnt -= iovp_size;
1261				pdirp++;
1262			} while (cnt > 0);
1263		}
1264		startsg = sg_next(startsg);
1265	}
1266	/* force pdir update */
1267	wmb();
1268
1269#ifdef DEBUG_LARGE_SG_ENTRIES
1270	dump_run_sg = 0;
1271#endif
1272	return(n_mappings);
1273}
1274
1275
1276/*
1277** Two address ranges are DMA contiguous *iff* "end of prev" and
1278** "start of next" are both on an IOV page boundary.
1279**
1280** (shift left is a quick trick to mask off upper bits)
1281*/
1282#define DMA_CONTIG(__X, __Y) \
1283	(((((unsigned long) __X) | ((unsigned long) __Y)) << (BITS_PER_LONG - iovp_shift)) == 0UL)
1284
1285
1286/**
1287 * sba_coalesce_chunks - preprocess the SG list
1288 * @ioc: IO MMU structure which owns the pdir we are interested in.
1289 * @startsg:  list of IOVA/size pairs
1290 * @nents: number of entries in startsg list
1291 *
1292 * First pass is to walk the SG list and determine where the breaks are
1293 * in the DMA stream. Allocates PDIR entries but does not fill them.
1294 * Returns the number of DMA chunks.
1295 *
1296 * Doing the fill separate from the coalescing/allocation keeps the
1297 * code simpler. Future enhancement could make one pass through
1298 * the sglist do both.
1299 */
1300static SBA_INLINE int
1301sba_coalesce_chunks(struct ioc *ioc, struct device *dev,
1302	struct scatterlist *startsg,
1303	int nents)
1304{
1305	struct scatterlist *vcontig_sg;    /* VCONTIG chunk head */
1306	unsigned long vcontig_len;         /* len of VCONTIG chunk */
1307	unsigned long vcontig_end;
1308	struct scatterlist *dma_sg;        /* next DMA stream head */
1309	unsigned long dma_offset, dma_len; /* start/len of DMA stream */
1310	int n_mappings = 0;
1311	unsigned int max_seg_size = dma_get_max_seg_size(dev);
1312	int idx;
1313
1314	while (nents > 0) {
1315		unsigned long vaddr = (unsigned long) sba_sg_address(startsg);
1316
1317		/*
1318		** Prepare for first/next DMA stream
1319		*/
1320		dma_sg = vcontig_sg = startsg;
1321		dma_len = vcontig_len = vcontig_end = startsg->length;
1322		vcontig_end +=  vaddr;
1323		dma_offset = vaddr & ~iovp_mask;
1324
1325		/* PARANOID: clear entries */
1326		startsg->dma_address = startsg->dma_length = 0;
1327
1328		/*
1329		** This loop terminates one iteration "early" since
1330		** it's always looking one "ahead".
1331		*/
1332		while (--nents > 0) {
1333			unsigned long vaddr;	/* tmp */
1334
1335			startsg = sg_next(startsg);
1336
1337			/* PARANOID */
1338			startsg->dma_address = startsg->dma_length = 0;
1339
1340			/* catch brokenness in SCSI layer */
1341			ASSERT(startsg->length <= DMA_CHUNK_SIZE);
1342
1343			/*
1344			** First make sure current dma stream won't
1345			** exceed DMA_CHUNK_SIZE if we coalesce the
1346			** next entry.
1347			*/
1348			if (((dma_len + dma_offset + startsg->length + ~iovp_mask) & iovp_mask)
1349			    > DMA_CHUNK_SIZE)
1350				break;
1351
1352			if (dma_len + startsg->length > max_seg_size)
1353				break;
1354
1355			/*
1356			** Then look for virtually contiguous blocks.
1357			**
1358			** append the next transaction?
1359			*/
1360			vaddr = (unsigned long) sba_sg_address(startsg);
1361			if  (vcontig_end == vaddr)
1362			{
1363				vcontig_len += startsg->length;
1364				vcontig_end += startsg->length;
1365				dma_len     += startsg->length;
1366				continue;
1367			}
1368
1369#ifdef DEBUG_LARGE_SG_ENTRIES
1370			dump_run_sg = (vcontig_len > iovp_size);
1371#endif
1372
1373			/*
1374			** Not virtually contiguous.
1375			** Terminate prev chunk.
1376			** Start a new chunk.
1377			**
1378			** Once we start a new VCONTIG chunk, dma_offset
1379			** can't change. And we need the offset from the first
1380			** chunk - not the last one. Ergo Successive chunks
1381			** must start on page boundaries and dove tail
1382			** with it's predecessor.
1383			*/
1384			vcontig_sg->dma_length = vcontig_len;
1385
1386			vcontig_sg = startsg;
1387			vcontig_len = startsg->length;
1388
1389			/*
1390			** 3) do the entries end/start on page boundaries?
1391			**    Don't update vcontig_end until we've checked.
1392			*/
1393			if (DMA_CONTIG(vcontig_end, vaddr))
1394			{
1395				vcontig_end = vcontig_len + vaddr;
1396				dma_len += vcontig_len;
1397				continue;
1398			} else {
1399				break;
1400			}
1401		}
1402
1403		/*
1404		** End of DMA Stream
1405		** Terminate last VCONTIG block.
1406		** Allocate space for DMA stream.
1407		*/
1408		vcontig_sg->dma_length = vcontig_len;
1409		dma_len = (dma_len + dma_offset + ~iovp_mask) & iovp_mask;
1410		ASSERT(dma_len <= DMA_CHUNK_SIZE);
1411		idx = sba_alloc_range(ioc, dev, dma_len);
1412		if (idx < 0) {
1413			dma_sg->dma_length = 0;
1414			return -1;
1415		}
1416		dma_sg->dma_address = (dma_addr_t)(PIDE_FLAG | (idx << iovp_shift)
1417						   | dma_offset);
1418		n_mappings++;
1419	}
1420
1421	return n_mappings;
1422}
1423
1424static void sba_unmap_sg_attrs(struct device *dev, struct scatterlist *sglist,
1425			       int nents, enum dma_data_direction dir,
1426			       unsigned long attrs);
1427/**
1428 * sba_map_sg - map Scatter/Gather list
1429 * @dev: instance of PCI owned by the driver that's asking.
1430 * @sglist:  array of buffer/length pairs
1431 * @nents:  number of entries in list
1432 * @dir:  R/W or both.
1433 * @attrs: optional dma attributes
1434 *
1435 * See Documentation/core-api/dma-api-howto.rst
1436 */
1437static int sba_map_sg_attrs(struct device *dev, struct scatterlist *sglist,
1438			    int nents, enum dma_data_direction dir,
1439			    unsigned long attrs)
1440{
1441	struct ioc *ioc;
1442	int coalesced, filled = 0;
1443#ifdef ASSERT_PDIR_SANITY
1444	unsigned long flags;
1445#endif
1446#ifdef ALLOW_IOV_BYPASS_SG
1447	struct scatterlist *sg;
1448#endif
1449
1450	DBG_RUN_SG("%s() START %d entries\n", __func__, nents);
1451	ioc = GET_IOC(dev);
1452	ASSERT(ioc);
1453
1454#ifdef ALLOW_IOV_BYPASS_SG
1455	ASSERT(to_pci_dev(dev)->dma_mask);
1456	if (likely((ioc->dma_mask & ~to_pci_dev(dev)->dma_mask) == 0)) {
1457		for_each_sg(sglist, sg, nents, filled) {
1458			sg->dma_length = sg->length;
1459			sg->dma_address = virt_to_phys(sba_sg_address(sg));
1460		}
1461		return filled;
1462	}
1463#endif
1464	/* Fast path single entry scatterlists. */
1465	if (nents == 1) {
1466		sglist->dma_length = sglist->length;
1467		sglist->dma_address = sba_map_page(dev, sg_page(sglist),
1468				sglist->offset, sglist->length, dir, attrs);
1469		if (dma_mapping_error(dev, sglist->dma_address))
1470			return -EIO;
1471		return 1;
1472	}
1473
1474#ifdef ASSERT_PDIR_SANITY
1475	spin_lock_irqsave(&ioc->res_lock, flags);
1476	if (sba_check_pdir(ioc,"Check before sba_map_sg_attrs()"))
1477	{
1478		sba_dump_sg(ioc, sglist, nents);
1479		panic("Check before sba_map_sg_attrs()");
1480	}
1481	spin_unlock_irqrestore(&ioc->res_lock, flags);
1482#endif
1483
1484	prefetch(ioc->res_hint);
1485
1486	/*
1487	** First coalesce the chunks and allocate I/O pdir space
1488	**
1489	** If this is one DMA stream, we can properly map using the
1490	** correct virtual address associated with each DMA page.
1491	** w/o this association, we wouldn't have coherent DMA!
1492	** Access to the virtual address is what forces a two pass algorithm.
1493	*/
1494	coalesced = sba_coalesce_chunks(ioc, dev, sglist, nents);
1495	if (coalesced < 0) {
1496		sba_unmap_sg_attrs(dev, sglist, nents, dir, attrs);
1497		return -ENOMEM;
1498	}
1499
1500	/*
1501	** Program the I/O Pdir
1502	**
1503	** map the virtual addresses to the I/O Pdir
1504	** o dma_address will contain the pdir index
1505	** o dma_len will contain the number of bytes to map
1506	** o address contains the virtual address.
1507	*/
1508	filled = sba_fill_pdir(ioc, sglist, nents);
1509
1510#ifdef ASSERT_PDIR_SANITY
1511	spin_lock_irqsave(&ioc->res_lock, flags);
1512	if (sba_check_pdir(ioc,"Check after sba_map_sg_attrs()"))
1513	{
1514		sba_dump_sg(ioc, sglist, nents);
1515		panic("Check after sba_map_sg_attrs()\n");
1516	}
1517	spin_unlock_irqrestore(&ioc->res_lock, flags);
1518#endif
1519
1520	ASSERT(coalesced == filled);
1521	DBG_RUN_SG("%s() DONE %d mappings\n", __func__, filled);
1522
1523	return filled;
1524}
1525
1526/**
1527 * sba_unmap_sg_attrs - unmap Scatter/Gather list
1528 * @dev: instance of PCI owned by the driver that's asking.
1529 * @sglist:  array of buffer/length pairs
1530 * @nents:  number of entries in list
1531 * @dir:  R/W or both.
1532 * @attrs: optional dma attributes
1533 *
1534 * See Documentation/core-api/dma-api-howto.rst
1535 */
1536static void sba_unmap_sg_attrs(struct device *dev, struct scatterlist *sglist,
1537			       int nents, enum dma_data_direction dir,
1538			       unsigned long attrs)
1539{
1540#ifdef ASSERT_PDIR_SANITY
1541	struct ioc *ioc;
1542	unsigned long flags;
1543#endif
1544
1545	DBG_RUN_SG("%s() START %d entries,  %p,%x\n",
1546		   __func__, nents, sba_sg_address(sglist), sglist->length);
1547
1548#ifdef ASSERT_PDIR_SANITY
1549	ioc = GET_IOC(dev);
1550	ASSERT(ioc);
1551
1552	spin_lock_irqsave(&ioc->res_lock, flags);
1553	sba_check_pdir(ioc,"Check before sba_unmap_sg_attrs()");
1554	spin_unlock_irqrestore(&ioc->res_lock, flags);
1555#endif
1556
1557	while (nents && sglist->dma_length) {
1558
1559		sba_unmap_page(dev, sglist->dma_address, sglist->dma_length,
1560			       dir, attrs);
1561		sglist = sg_next(sglist);
1562		nents--;
1563	}
1564
1565	DBG_RUN_SG("%s() DONE (nents %d)\n", __func__,  nents);
1566
1567#ifdef ASSERT_PDIR_SANITY
1568	spin_lock_irqsave(&ioc->res_lock, flags);
1569	sba_check_pdir(ioc,"Check after sba_unmap_sg_attrs()");
1570	spin_unlock_irqrestore(&ioc->res_lock, flags);
1571#endif
1572
1573}
1574
1575/**************************************************************
1576*
1577*   Initialization and claim
1578*
1579***************************************************************/
1580
1581static void
1582ioc_iova_init(struct ioc *ioc)
1583{
1584	int tcnfg;
1585	int agp_found = 0;
1586	struct pci_dev *device = NULL;
1587#ifdef FULL_VALID_PDIR
1588	unsigned long index;
1589#endif
1590
1591	/*
1592	** Firmware programs the base and size of a "safe IOVA space"
1593	** (one that doesn't overlap memory or LMMIO space) in the
1594	** IBASE and IMASK registers.
1595	*/
1596	ioc->ibase = READ_REG(ioc->ioc_hpa + IOC_IBASE) & ~0x1UL;
1597	ioc->imask = READ_REG(ioc->ioc_hpa + IOC_IMASK) | 0xFFFFFFFF00000000UL;
1598
1599	ioc->iov_size = ~ioc->imask + 1;
1600
1601	DBG_INIT("%s() hpa %p IOV base 0x%lx mask 0x%lx (%dMB)\n",
1602		__func__, ioc->ioc_hpa, ioc->ibase, ioc->imask,
1603		ioc->iov_size >> 20);
1604
1605	switch (iovp_size) {
1606		case  4*1024: tcnfg = 0; break;
1607		case  8*1024: tcnfg = 1; break;
1608		case 16*1024: tcnfg = 2; break;
1609		case 64*1024: tcnfg = 3; break;
1610		default:
1611			panic(PFX "Unsupported IOTLB page size %ldK",
1612				iovp_size >> 10);
1613			break;
1614	}
1615	WRITE_REG(tcnfg, ioc->ioc_hpa + IOC_TCNFG);
1616
1617	ioc->pdir_size = (ioc->iov_size / iovp_size) * PDIR_ENTRY_SIZE;
1618	ioc->pdir_base = (void *) __get_free_pages(GFP_KERNEL,
1619						   get_order(ioc->pdir_size));
1620	if (!ioc->pdir_base)
1621		panic(PFX "Couldn't allocate I/O Page Table\n");
1622
1623	memset(ioc->pdir_base, 0, ioc->pdir_size);
1624
1625	DBG_INIT("%s() IOV page size %ldK pdir %p size %x\n", __func__,
1626		iovp_size >> 10, ioc->pdir_base, ioc->pdir_size);
1627
1628	ASSERT(ALIGN((unsigned long) ioc->pdir_base, 4*1024) == (unsigned long) ioc->pdir_base);
1629	WRITE_REG(virt_to_phys(ioc->pdir_base), ioc->ioc_hpa + IOC_PDIR_BASE);
1630
1631	/*
1632	** If an AGP device is present, only use half of the IOV space
1633	** for PCI DMA.  Unfortunately we can't know ahead of time
1634	** whether GART support will actually be used, for now we
1635	** can just key on an AGP device found in the system.
1636	** We program the next pdir index after we stop w/ a key for
1637	** the GART code to handshake on.
1638	*/
1639	for_each_pci_dev(device)
1640		agp_found |= pci_find_capability(device, PCI_CAP_ID_AGP);
1641
1642	if (agp_found && reserve_sba_gart) {
1643		printk(KERN_INFO PFX "reserving %dMb of IOVA space at 0x%lx for agpgart\n",
1644		      ioc->iov_size/2 >> 20, ioc->ibase + ioc->iov_size/2);
1645		ioc->pdir_size /= 2;
1646		((u64 *)ioc->pdir_base)[PDIR_INDEX(ioc->iov_size/2)] = ZX1_SBA_IOMMU_COOKIE;
1647	}
1648#ifdef FULL_VALID_PDIR
1649	/*
1650  	** Check to see if the spill page has been allocated, we don't need more than
1651	** one across multiple SBAs.
1652	*/
1653	if (!prefetch_spill_page) {
1654		char *spill_poison = "SBAIOMMU POISON";
1655		int poison_size = 16;
1656		void *poison_addr, *addr;
1657
1658		addr = (void *)__get_free_pages(GFP_KERNEL, get_order(iovp_size));
1659		if (!addr)
1660			panic(PFX "Couldn't allocate PDIR spill page\n");
1661
1662		poison_addr = addr;
1663		for ( ; (u64) poison_addr < addr + iovp_size; poison_addr += poison_size)
1664			memcpy(poison_addr, spill_poison, poison_size);
1665
1666		prefetch_spill_page = virt_to_phys(addr);
1667
1668		DBG_INIT("%s() prefetch spill addr: 0x%lx\n", __func__, prefetch_spill_page);
1669	}
1670	/*
1671  	** Set all the PDIR entries valid w/ the spill page as the target
1672	*/
1673	for (index = 0 ; index < (ioc->pdir_size / PDIR_ENTRY_SIZE) ; index++)
1674		((u64 *)ioc->pdir_base)[index] = (0x80000000000000FF | prefetch_spill_page);
1675#endif
1676
1677	/* Clear I/O TLB of any possible entries */
1678	WRITE_REG(ioc->ibase | (get_iovp_order(ioc->iov_size) + iovp_shift), ioc->ioc_hpa + IOC_PCOM);
1679	READ_REG(ioc->ioc_hpa + IOC_PCOM);
1680
1681	/* Enable IOVA translation */
1682	WRITE_REG(ioc->ibase | 1, ioc->ioc_hpa + IOC_IBASE);
1683	READ_REG(ioc->ioc_hpa + IOC_IBASE);
1684}
1685
1686static void __init
1687ioc_resource_init(struct ioc *ioc)
1688{
1689	spin_lock_init(&ioc->res_lock);
1690#if DELAYED_RESOURCE_CNT > 0
1691	spin_lock_init(&ioc->saved_lock);
1692#endif
1693
1694	/* resource map size dictated by pdir_size */
1695	ioc->res_size = ioc->pdir_size / PDIR_ENTRY_SIZE; /* entries */
1696	ioc->res_size >>= 3;  /* convert bit count to byte count */
1697	DBG_INIT("%s() res_size 0x%x\n", __func__, ioc->res_size);
1698
1699	ioc->res_map = (char *) __get_free_pages(GFP_KERNEL,
1700						 get_order(ioc->res_size));
1701	if (!ioc->res_map)
1702		panic(PFX "Couldn't allocate resource map\n");
1703
1704	memset(ioc->res_map, 0, ioc->res_size);
1705	/* next available IOVP - circular search */
1706	ioc->res_hint = (unsigned long *) ioc->res_map;
1707
1708#ifdef ASSERT_PDIR_SANITY
1709	/* Mark first bit busy - ie no IOVA 0 */
1710	ioc->res_map[0] = 0x1;
1711	ioc->pdir_base[0] = 0x8000000000000000ULL | ZX1_SBA_IOMMU_COOKIE;
1712#endif
1713#ifdef FULL_VALID_PDIR
1714	/* Mark the last resource used so we don't prefetch beyond IOVA space */
1715	ioc->res_map[ioc->res_size - 1] |= 0x80UL; /* res_map is chars */
1716	ioc->pdir_base[(ioc->pdir_size / PDIR_ENTRY_SIZE) - 1] = (0x80000000000000FF
1717							      | prefetch_spill_page);
1718#endif
1719
1720	DBG_INIT("%s() res_map %x %p\n", __func__,
1721		 ioc->res_size, (void *) ioc->res_map);
1722}
1723
1724static void __init
1725ioc_sac_init(struct ioc *ioc)
1726{
1727	struct pci_dev *sac = NULL;
1728	struct pci_controller *controller = NULL;
1729
1730	/*
1731	 * pci_alloc_coherent() must return a DMA address which is
1732	 * SAC (single address cycle) addressable, so allocate a
1733	 * pseudo-device to enforce that.
1734	 */
1735	sac = kzalloc(sizeof(*sac), GFP_KERNEL);
1736	if (!sac)
1737		panic(PFX "Couldn't allocate struct pci_dev");
1738
1739	controller = kzalloc(sizeof(*controller), GFP_KERNEL);
1740	if (!controller)
1741		panic(PFX "Couldn't allocate struct pci_controller");
1742
1743	controller->iommu = ioc;
1744	sac->sysdata = controller;
1745	sac->dma_mask = 0xFFFFFFFFUL;
1746	sac->dev.bus = &pci_bus_type;
1747	ioc->sac_only_dev = sac;
1748}
1749
1750static void __init
1751ioc_zx1_init(struct ioc *ioc)
1752{
1753	unsigned long rope_config;
1754	unsigned int i;
1755
1756	if (ioc->rev < 0x20)
1757		panic(PFX "IOC 2.0 or later required for IOMMU support\n");
1758
1759	/* 38 bit memory controller + extra bit for range displaced by MMIO */
1760	ioc->dma_mask = (0x1UL << 39) - 1;
1761
1762	/*
1763	** Clear ROPE(N)_CONFIG AO bit.
1764	** Disables "NT Ordering" (~= !"Relaxed Ordering")
1765	** Overrides bit 1 in DMA Hint Sets.
1766	** Improves netperf UDP_STREAM by ~10% for tg3 on bcm5701.
1767	*/
1768	for (i=0; i<(8*8); i+=8) {
1769		rope_config = READ_REG(ioc->ioc_hpa + IOC_ROPE0_CFG + i);
1770		rope_config &= ~IOC_ROPE_AO;
1771		WRITE_REG(rope_config, ioc->ioc_hpa + IOC_ROPE0_CFG + i);
1772	}
1773}
1774
1775typedef void (initfunc)(struct ioc *);
1776
1777struct ioc_iommu {
1778	u32 func_id;
1779	char *name;
1780	initfunc *init;
1781};
1782
1783static struct ioc_iommu ioc_iommu_info[] __initdata = {
1784	{ ZX1_IOC_ID, "zx1", ioc_zx1_init },
1785	{ ZX2_IOC_ID, "zx2", NULL },
1786	{ SX1000_IOC_ID, "sx1000", NULL },
1787	{ SX2000_IOC_ID, "sx2000", NULL },
1788};
1789
1790static void __init ioc_init(unsigned long hpa, struct ioc *ioc)
1791{
1792	struct ioc_iommu *info;
1793
1794	ioc->next = ioc_list;
1795	ioc_list = ioc;
1796
1797	ioc->ioc_hpa = ioremap(hpa, 0x1000);
1798
1799	ioc->func_id = READ_REG(ioc->ioc_hpa + IOC_FUNC_ID);
1800	ioc->rev = READ_REG(ioc->ioc_hpa + IOC_FCLASS) & 0xFFUL;
1801	ioc->dma_mask = 0xFFFFFFFFFFFFFFFFUL;	/* conservative */
1802
1803	for (info = ioc_iommu_info; info < ioc_iommu_info + ARRAY_SIZE(ioc_iommu_info); info++) {
1804		if (ioc->func_id == info->func_id) {
1805			ioc->name = info->name;
1806			if (info->init)
1807				(info->init)(ioc);
1808		}
1809	}
1810
	iovp_size = (1 << iovp_shift);
	iovp_mask = ~(iovp_size - 1);

	DBG_INIT("%s: PAGE_SIZE %ldK, iovp_size %ldK\n", __func__,
		PAGE_SIZE >> 10, iovp_size >> 10);

	if (!ioc->name) {
		ioc->name = kmalloc(24, GFP_KERNEL);
		if (ioc->name)
			sprintf((char *) ioc->name, "Unknown (%04x:%04x)",
				ioc->func_id & 0xFFFF, (ioc->func_id >> 16) & 0xFFFF);
		else
			ioc->name = "Unknown";
	}

	ioc_iova_init(ioc);
	ioc_resource_init(ioc);
	ioc_sac_init(ioc);

	printk(KERN_INFO PFX
		"%s %d.%d HPA 0x%lx IOVA space %dMB at 0x%lx\n",
		ioc->name, (ioc->rev >> 4) & 0xF, ioc->rev & 0xF,
		hpa, ioc->iov_size >> 20, ioc->ibase);
}



/**************************************************************************
**
**   SBA initialization code (HW and SW)
**
**   o identify SBA chip itself
**   o FIXME: initialize DMA hints for reasonable defaults
**
**************************************************************************/

#ifdef CONFIG_PROC_FS
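/*
** seq_file iterators for /proc/bus/mckinley: walk ioc_list, emitting one
** record per IOC.
*/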
static void *
ioc_start(struct seq_file *s, loff_t *pos)
{
	struct ioc *ioc;
	loff_t n = *pos;

	for (ioc = ioc_list; ioc; ioc = ioc->next)
		if (!n--)
			return ioc;

	return NULL;
}

static void *
ioc_next(struct seq_file *s, void *v, loff_t *pos)
{
	struct ioc *ioc = v;

	++*pos;
	return ioc->next;
}

static void
ioc_stop(struct seq_file *s, void *v)
{
}

static int
ioc_show(struct seq_file *s, void *v)
{
	struct ioc *ioc = v;
	unsigned long *res_ptr = (unsigned long *)ioc->res_map;
	int i, used = 0;

	seq_printf(s, "Hewlett Packard %s IOC rev %d.%d\n",
		ioc->name, ((ioc->rev >> 4) & 0xF), (ioc->rev & 0xF));
#ifdef CONFIG_NUMA
	if (ioc->node != NUMA_NO_NODE)
		seq_printf(s, "NUMA node       : %d\n", ioc->node);
#endif
	seq_printf(s, "IOVA size       : %ld MB\n", ((ioc->pdir_size >> 3) * iovp_size)/(1024*1024));
	seq_printf(s, "IOVA page size  : %ld KB\n", iovp_size/1024);

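	/* Each set bit in the resource map is one allocated IOVA page. */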
	for (i = 0; i < (ioc->res_size / sizeof(unsigned long)); ++i, ++res_ptr)
		used += hweight64(*res_ptr);

	seq_printf(s, "PDIR size       : %d entries\n", ioc->pdir_size >> 3);
	seq_printf(s, "PDIR used       : %d entries\n", used);

#ifdef PDIR_SEARCH_TIMING
	{
		unsigned long i = 0, avg = 0, min, max;
		min = max = ioc->avg_search[0];
		for (i = 0; i < SBA_SEARCH_SAMPLE; i++) {
			avg += ioc->avg_search[i];
			if (ioc->avg_search[i] > max)
				max = ioc->avg_search[i];
			if (ioc->avg_search[i] < min)
				min = ioc->avg_search[i];
		}
		avg /= SBA_SEARCH_SAMPLE;
		seq_printf(s, "Bitmap search   : %ld/%ld/%ld (min/avg/max CPU Cycles/IOVA page)\n",
		           min, avg, max);
	}
#endif
#ifndef ALLOW_IOV_BYPASS
	seq_printf(s, "IOVA bypass disabled\n");
#endif
	return 0;
}

static const struct seq_operations ioc_seq_ops = {
	.start = ioc_start,
	.next  = ioc_next,
	.stop  = ioc_stop,
	.show  = ioc_show
};

static void __init
ioc_proc_init(void)
{
	struct proc_dir_entry *dir;

	dir = proc_mkdir("bus/mckinley", NULL);
	if (!dir)
		return;

	proc_create_seq(ioc_list->name, 0, dir, &ioc_seq_ops);
}
#endif

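/**
 * sba_connect_bus - associate a PCI bus with the IOC that encloses it
 * @bus: PCI bus to connect
 *
 * Walks up the ACPI namespace from the bus's companion device until a
 * handle matches a previously claimed IOC, then records that IOC in the
 * bus's pci_controller.
 */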
static void
sba_connect_bus(struct pci_bus *bus)
{
	acpi_handle handle, parent;
	acpi_status status;
	struct ioc *ioc;

	if (!PCI_CONTROLLER(bus))
		panic(PFX "no sysdata on bus %d!\n", bus->number);

	if (PCI_CONTROLLER(bus)->iommu)
		return;

	handle = acpi_device_handle(PCI_CONTROLLER(bus)->companion);
	if (!handle)
		return;

	/*
	 * The IOC scope encloses PCI root bridges in the ACPI
	 * namespace, so work our way out until we find an IOC we
	 * claimed previously.
	 */
	do {
		for (ioc = ioc_list; ioc; ioc = ioc->next)
			if (ioc->handle == handle) {
				PCI_CONTROLLER(bus)->iommu = ioc;
				return;
			}

		status = acpi_get_parent(handle, &parent);
		handle = parent;
	} while (ACPI_SUCCESS(status));

	printk(KERN_WARNING "No IOC for PCI Bus %04x:%02x in ACPI\n", pci_domain_nr(bus), bus->number);
}

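/**
 * sba_map_ioc_to_node - record which NUMA node an IOC lives on
 * @ioc: IO MMU structure
 * @handle: ACPI handle of the IOC
 *
 * No-op unless CONFIG_NUMA is enabled; offline nodes are treated as
 * NUMA_NO_NODE.
 */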
static void __init
sba_map_ioc_to_node(struct ioc *ioc, acpi_handle handle)
{
#ifdef CONFIG_NUMA
	unsigned int node;

	node = acpi_get_node(handle);
	if (node != NUMA_NO_NODE && !node_online(node))
		node = NUMA_NO_NODE;

	ioc->node = node;
#endif
}

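/**
 * acpi_sba_ioc_add - finish bringing up an IOC found by the ACPI scan
 * @ioc: IOC taken from the head of the ioc_found list
 *
 * Locates the IOC's CSR space, picks an IOMMU page size (unless one was
 * forced on the command line), and hands the IOC to ioc_init().
 */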
static void __init acpi_sba_ioc_add(struct ioc *ioc)
{
	acpi_handle handle = ioc->handle;
	acpi_status status;
	u64 hpa, length;
	struct acpi_device_info *adi;

	ioc_found = ioc->next;
	status = hp_acpi_csr_space(handle, &hpa, &length);
	if (ACPI_FAILURE(status))
		goto err;

	status = acpi_get_object_info(handle, &adi);
	if (ACPI_FAILURE(status))
		goto err;

	/*
	 * For HWP0001, only the SBA appears in the ACPI namespace.  It
	 * encloses the PCI root bridges, and its CSR space includes the
	 * IOC function.
	 */
	if (strncmp("HWP0001", adi->hardware_id.string, 7) == 0) {
		hpa += ZX1_IOC_OFFSET;
		/* zx1-based systems default to kernel-page-sized IOMMU pages */
		if (!iovp_shift)
			iovp_shift = min(PAGE_SHIFT, 16);
	}
	kfree(adi);

	/*
	 * Default to a 4k IOMMU page size if nothing above and no cmdline
	 * option has set iovp_shift.
	 */
	if (!iovp_shift)
		iovp_shift = 12;

	ioc_init(hpa, ioc);
	/* setup NUMA node association */
	sba_map_ioc_to_node(ioc, handle);
	return;

 err:
	kfree(ioc);
}

static const struct acpi_device_id hp_ioc_iommu_device_ids[] = {
	{"HWP0001", 0},
	{"HWP0004", 0},
	{"", 0},
};

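/**
 * acpi_sba_ioc_attach - note an IOC discovered during the ACPI scan
 * @device: ACPI device that matched hp_ioc_iommu_device_ids
 * @not_used: matching table entry (unused)
 *
 * Only queues the handle on the ioc_found list; real initialization is
 * deferred to sba_init(), which runs after acpi_scan_init().
 */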
static int acpi_sba_ioc_attach(struct acpi_device *device,
			       const struct acpi_device_id *not_used)
{
	struct ioc *ioc;

	ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
	if (!ioc)
		return -ENOMEM;

	ioc->next = ioc_found;
	ioc_found = ioc;
	ioc->handle = device->handle;
	return 1;
}

static struct acpi_scan_handler acpi_sba_ioc_handler = {
	.ids	= hp_ioc_iommu_device_ids,
	.attach	= acpi_sba_ioc_attach,
};

static int __init acpi_sba_ioc_init_acpi(void)
{
	return acpi_scan_add_handler(&acpi_sba_ioc_handler);
}
/* This has to run before acpi_scan_init(). */
arch_initcall(acpi_sba_ioc_init_acpi);

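/**
 * sba_dma_supported - check whether a device's DMA mask is usable
 * @dev: device in question
 * @mask: proposed DMA mask
 *
 * Accept any mask that covers at least the low 32 bits.
 */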
static int sba_dma_supported(struct device *dev, u64 mask)
{
	/* make sure it's at least 32bit capable */
	return ((mask & 0xFFFFFFFFUL) == 0xFFFFFFFFUL);
}

static const struct dma_map_ops sba_dma_ops = {
	.alloc			= sba_alloc_coherent,
	.free			= sba_free_coherent,
	.map_page		= sba_map_page,
	.unmap_page		= sba_unmap_page,
	.map_sg			= sba_map_sg_attrs,
	.unmap_sg		= sba_unmap_sg_attrs,
	.dma_supported		= sba_dma_supported,
	.mmap			= dma_common_mmap,
	.get_sgtable		= dma_common_get_sgtable,
	.alloc_pages		= dma_common_alloc_pages,
	.free_pages		= dma_common_free_pages,
};

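/**
 * sba_init - late initialization of all discovered IOCs
 *
 * Skipped on kdump kernels (swiotlb is used instead).  Initializes every
 * IOC queued by the ACPI scan handler, associates the IOCs with their
 * PCI buses, and installs sba_dma_ops.
 */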
static int __init
sba_init(void)
{
	/*
	 * If we are booting a kdump kernel, the sba_iommu will cause devices
	 * that were not shut down properly to MCA as soon as they are turned
	 * back on.  Our only option for a successful kdump kernel boot is to
	 * use swiotlb.
	 */
	if (is_kdump_kernel())
		return 0;

	/*
	 * ioc_found should be populated by the acpi_sba_ioc_handler's .attach()
	 * routine, but that only happens if acpi_scan_init() has already run.
	 */
	while (ioc_found)
		acpi_sba_ioc_add(ioc_found);

	if (!ioc_list)
		return 0;

	{
		struct pci_bus *b = NULL;
		while ((b = pci_find_next_bus(b)) != NULL)
			sba_connect_bus(b);
	}

	/* no need for swiotlb with the iommu */
	swiotlb_exit();
	dma_ops = &sba_dma_ops;

#ifdef CONFIG_PROC_FS
	ioc_proc_init();
#endif
	return 0;
}

subsys_initcall(sba_init); /* must be initialized after ACPI etc., but before any drivers... */

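/*
** "nosbagart" boot option: clear reserve_sba_gart so the SBA GART region
** is not reserved.
*/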
static int __init
nosbagart(char *str)
{
	reserve_sba_gart = 0;
	return 1;
}

__setup("nosbagart", nosbagart);

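/**
 * sba_page_override - parse the "sbapagesize=" boot option
 * @str: requested IOMMU page size in bytes (memparse() suffixes accepted)
 *
 * Only 4K, 8K, 16K and 64K IOMMU pages are supported.  For example,
 * booting with "sbapagesize=64k" yields iovp_shift == 16.
 */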
static int __init
sba_page_override(char *str)
{
	unsigned long page_size;

	page_size = memparse(str, &str);
	switch (page_size) {
	case 4096:
	case 8192:
	case 16384:
	case 65536:
		iovp_shift = ffs(page_size) - 1;
		break;
	default:
		printk(KERN_WARNING "%s: unknown/unsupported iommu page size %ld\n",
		       __func__, page_size);
	}

	return 1;
}

__setup("sbapagesize=", sba_page_override);
