Merge branch 'for-linus' of master.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband
author Linus Torvalds <torvalds@g5.osdl.org>
Sun, 2 Apr 2006 19:51:22 +0000 (12:51 -0700)
committer Linus Torvalds <torvalds@g5.osdl.org>
Sun, 2 Apr 2006 19:51:22 +0000 (12:51 -0700)
* 'for-linus' of master.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband:
  IB/ipath: kbuild infrastructure
  IB/ipath: infiniband verbs support
  IB/ipath: misc infiniband code, part 2
  IB/ipath: misc infiniband code, part 1
  IB/ipath: infiniband RC protocol support
  IB/ipath: infiniband UC and UD protocol support
  IB/ipath: infiniband header files
  IB/ipath: layering interfaces used by higher-level driver code
  IB/ipath: support for userspace apps using core driver
  IB/ipath: sysfs and ipathfs support for core driver
  IB/ipath: misc driver support code
  IB/ipath: chip initialisation code, and diag support
  IB/ipath: support for PCI Express devices
  IB/ipath: support for HyperTransport devices
  IB/ipath: core driver header files
  IB/ipath: core device driver

40 files changed:
MAINTAINERS
drivers/Makefile
drivers/infiniband/Kconfig
drivers/infiniband/Makefile
drivers/infiniband/hw/ipath/Kconfig [new file with mode: 0644]
drivers/infiniband/hw/ipath/Makefile [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_common.h [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_cq.c [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_debug.h [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_diag.c [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_driver.c [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_eeprom.c [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_file_ops.c [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_fs.c [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_ht400.c [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_init_chip.c [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_intr.c [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_kernel.h [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_keys.c [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_layer.c [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_layer.h [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_mad.c [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_mr.c [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_pe800.c [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_qp.c [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_rc.c [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_registers.h [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_ruc.c [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_srq.c [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_stats.c [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_sysfs.c [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_uc.c [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_ud.c [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_user_pages.c [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_verbs.c [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_verbs.h [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_verbs_mcast.c [new file with mode: 0644]
drivers/infiniband/hw/ipath/ipath_wc_x86_64.c [new file with mode: 0644]
drivers/infiniband/hw/ipath/ips_common.h [new file with mode: 0644]
drivers/infiniband/hw/ipath/verbs_debug.h [new file with mode: 0644]

diff --git a/MAINTAINERS b/MAINTAINERS
index c946581..f97657b 100644 (file)
@@ -1451,6 +1451,12 @@ P:       Juanjo Ciarlante
 M:     jjciarla@raiz.uncu.edu.ar
 S:     Maintained
 
+IPATH DRIVER
+P:     Bryan O'Sullivan
+M:     support@pathscale.com
+L:     openib-general@openib.org
+S:     Supported
+
 IPX NETWORK LAYER
 P:     Arnaldo Carvalho de Melo
 M:     acme@conectiva.com.br
diff --git a/drivers/Makefile b/drivers/Makefile
index 55205c8..447d8e6 100644 (file)
@@ -69,6 +69,7 @@ obj-$(CONFIG_CPU_FREQ)                += cpufreq/
 obj-$(CONFIG_MMC)              += mmc/
 obj-$(CONFIG_NEW_LEDS)         += leds/
 obj-$(CONFIG_INFINIBAND)       += infiniband/
+obj-$(CONFIG_IPATH_CORE)       += infiniband/
 obj-$(CONFIG_SGI_SN)           += sn/
 obj-y                          += firmware/
 obj-$(CONFIG_CRYPTO)           += crypto/
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index bdf0891..afc612b 100644 (file)
@@ -30,6 +30,7 @@ config INFINIBAND_USER_ACCESS
          <http://www.openib.org>.
 
 source "drivers/infiniband/hw/mthca/Kconfig"
+source "drivers/infiniband/hw/ipath/Kconfig"
 
 source "drivers/infiniband/ulp/ipoib/Kconfig"
 
diff --git a/drivers/infiniband/Makefile b/drivers/infiniband/Makefile
index a43fb34..eea2732 100644 (file)
@@ -1,4 +1,5 @@
 obj-$(CONFIG_INFINIBAND)               += core/
 obj-$(CONFIG_INFINIBAND_MTHCA)         += hw/mthca/
+obj-$(CONFIG_IPATH_CORE)               += hw/ipath/
 obj-$(CONFIG_INFINIBAND_IPOIB)         += ulp/ipoib/
 obj-$(CONFIG_INFINIBAND_SRP)           += ulp/srp/
diff --git a/drivers/infiniband/hw/ipath/Kconfig b/drivers/infiniband/hw/ipath/Kconfig
new file mode 100644 (file)
index 0000000..9ea67c4
--- /dev/null
@@ -0,0 +1,16 @@
+config IPATH_CORE
+       tristate "PathScale InfiniPath Driver"
+       depends on 64BIT && PCI_MSI && NET
+       ---help---
+       This is a low-level driver for PathScale InfiniPath host channel
+       adapters (HCAs) based on the HT-400 and PE-800 chips.
+
+config INFINIBAND_IPATH
+       tristate "PathScale InfiniPath Verbs Driver"
+       depends on IPATH_CORE && INFINIBAND
+       ---help---
+       This is a driver that provides InfiniBand verbs support for
+       PathScale InfiniPath host channel adapters (HCAs).  This
+       allows these devices to be used with both kernel upper level
+       protocols such as IP-over-InfiniBand as well as with userspace
+       applications (in conjunction with InfiniBand userspace access).
diff --git a/drivers/infiniband/hw/ipath/Makefile b/drivers/infiniband/hw/ipath/Makefile
new file mode 100644 (file)
index 0000000..b4d084a
--- /dev/null
@@ -0,0 +1,36 @@
+EXTRA_CFLAGS += -DIPATH_IDSTR='"PathScale kernel.org driver"' \
+       -DIPATH_KERN_TYPE=0
+
+obj-$(CONFIG_IPATH_CORE) += ipath_core.o
+obj-$(CONFIG_INFINIBAND_IPATH) += ib_ipath.o
+
+ipath_core-y := \
+       ipath_diag.o \
+       ipath_driver.o \
+       ipath_eeprom.o \
+       ipath_file_ops.o \
+       ipath_fs.o \
+       ipath_ht400.o \
+       ipath_init_chip.o \
+       ipath_intr.o \
+       ipath_layer.o \
+       ipath_pe800.o \
+       ipath_stats.o \
+       ipath_sysfs.o \
+       ipath_user_pages.o
+
+ipath_core-$(CONFIG_X86_64) += ipath_wc_x86_64.o
+
+ib_ipath-y := \
+       ipath_cq.o \
+       ipath_keys.o \
+       ipath_mad.o \
+       ipath_mr.o \
+       ipath_qp.o \
+       ipath_rc.o \
+       ipath_ruc.o \
+       ipath_srq.o \
+       ipath_uc.o \
+       ipath_ud.o \
+       ipath_verbs.o \
+       ipath_verbs_mcast.o
diff --git a/drivers/infiniband/hw/ipath/ipath_common.h b/drivers/infiniband/hw/ipath/ipath_common.h
new file mode 100644 (file)
index 0000000..48a5524
--- /dev/null
@@ -0,0 +1,616 @@
+/*
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPATH_COMMON_H
+#define _IPATH_COMMON_H
+
+/*
+ * This file contains defines, structures, etc. that are used
+ * to communicate between kernel and user code.
+ */
+
+/* This is the IEEE-assigned OUI for PathScale, Inc. */
+#define IPATH_SRC_OUI_1 0x00
+#define IPATH_SRC_OUI_2 0x11
+#define IPATH_SRC_OUI_3 0x75
+
+/* version of protocol header (known to chip also). In the long run,
+ * we should be able to generate and accept a range of version numbers;
+ * for now we only accept one, and it's compiled in.
+ */
+#define IPS_PROTO_VERSION 2
+
+/*
+ * These are compile time constants that you may want to enable or disable
+ * if you are trying to debug problems with code or performance.
+ * IPATH_VERBOSE_TRACING define as 1 if you want additional tracing in
+ * fastpath code
+ * IPATH_TRACE_REGWRITES define as 1 if you want register writes to be
+ * traced in fastpath code
+ * _IPATH_TRACING define as 0 if you want to remove all tracing in a
+ * compilation unit
+ * _IPATH_DEBUGGING define as 0 if you want to remove debug prints
+ */
+
+/*
+ * The value in the BTH QP field that InfiniPath uses to differentiate
+ * an infinipath protocol IB packet vs standard IB transport
+ */
+#define IPATH_KD_QP 0x656b79
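A minimal illustrative sketch (not part of this patch): receive code can tell an InfiniPath-protocol packet from standard IB transport by comparing the 24-bit destination QP in the BTH against IPATH_KD_QP.  The bth pointer and helper name below are hypothetical.

static inline int ipath_is_kd_packet(const __be32 *bth)
{
        /* destination QP is the low 24 bits of the second BTH dword */
        return (be32_to_cpu(bth[1]) & 0xffffff) == IPATH_KD_QP;
}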
+
+/*
+ * valid states passed to ipath_set_linkstate() user call
+ */
+#define IPATH_IB_LINKDOWN              0
+#define IPATH_IB_LINKARM               1
+#define IPATH_IB_LINKACTIVE            2
+#define IPATH_IB_LINKINIT              3
+#define IPATH_IB_LINKDOWN_SLEEP                4
+#define IPATH_IB_LINKDOWN_DISABLE      5
+
+/*
+ * stats maintained by the driver.  For now, at least, this is global
+ * to all minor devices.
+ */
+struct infinipath_stats {
+       /* number of interrupts taken */
+       __u64 sps_ints;
+       /* number of interrupts for errors */
+       __u64 sps_errints;
+       /* number of errors from chip (not incl. packet errors or CRC) */
+       __u64 sps_errs;
+       /* number of packet errors from chip other than CRC */
+       __u64 sps_pkterrs;
+       /* number of packets with CRC errors (ICRC and VCRC) */
+       __u64 sps_crcerrs;
+       /* number of hardware errors reported (parity, etc.) */
+       __u64 sps_hwerrs;
+       /* number of times IB link changed state unexpectedly */
+       __u64 sps_iblink;
+       /* no longer used; left for compatibility */
+       __u64 sps_unused3;
+       /* number of kernel (port0) packets received */
+       __u64 sps_port0pkts;
+       /* number of "ethernet" packets sent by driver */
+       __u64 sps_ether_spkts;
+       /* number of "ethernet" packets received by driver */
+       __u64 sps_ether_rpkts;
+       /* number of SMA packets sent by driver */
+       __u64 sps_sma_spkts;
+       /* number of SMA packets received by driver */
+       __u64 sps_sma_rpkts;
+       /* number of times all ports rcvhdrq was full and packet dropped */
+       __u64 sps_hdrqfull;
+       /* number of times all ports egrtid was full and packet dropped */
+       __u64 sps_etidfull;
+       /*
+        * number of times we tried to send from driver, but no pio buffers
+        * avail
+        */
+       __u64 sps_nopiobufs;
+       /* number of ports currently open */
+       __u64 sps_ports;
+       /* list of pkeys (other than default) accepted (0 means not set) */
+       __u16 sps_pkeys[4];
+       /* lids for up to 4 infinipaths, indexed by infinipath # */
+       __u16 sps_lid[4];
+       /* number of user ports per chip (not IB ports) */
+       __u32 sps_nports;
+       /* not our interrupt, or already handled */
+       __u32 sps_nullintr;
+       /* max number of packets handled per receive call */
+       __u32 sps_maxpkts_call;
+       /* avg number of packets handled per receive call */
+       __u32 sps_avgpkts_call;
+       /* total number of pages locked */
+       __u64 sps_pagelocks;
+       /* total number of pages unlocked */
+       __u64 sps_pageunlocks;
+       /*
+        * Number of packets dropped in kernel other than errors (ether
+        * packets if ipath not configured, sma/mad, etc.)
+        */
+       __u64 sps_krdrops;
+       /* mlids for up to 4 infinipaths, indexed by infinipath # */
+       __u16 sps_mlid[4];
+       /* pad for future growth */
+       __u64 __sps_pad[45];
+};
+
+/*
+ * These are the status bits readable (in ascii form, 64bit value)
+ * from the "status" sysfs file.
+ */
+#define IPATH_STATUS_INITTED       0x1 /* basic initialization done */
+#define IPATH_STATUS_DISABLED      0x2 /* hardware disabled */
+/* Device has been disabled via admin request */
+#define IPATH_STATUS_ADMIN_DISABLED    0x4
+#define IPATH_STATUS_OIB_SMA       0x8 /* ipath_mad kernel SMA running */
+#define IPATH_STATUS_SMA          0x10 /* user SMA running */
+/* Chip has been found and initted */
+#define IPATH_STATUS_CHIP_PRESENT 0x20
+/* IB link is at ACTIVE, usable for data traffic */
+#define IPATH_STATUS_IB_READY     0x40
+/* link is configured, LID, MTU, etc. have been set */
+#define IPATH_STATUS_IB_CONF      0x80
+/* no link established, probably no cable */
+#define IPATH_STATUS_IB_NOCABLE  0x100
+/* A Fatal hardware error has occurred. */
+#define IPATH_STATUS_HWERROR     0x200
+
+/*
+ * The list of usermode accessible registers.  Also see Reg_* later in file.
+ */
+typedef enum _ipath_ureg {
+       /* (RO)  DMA RcvHdr to be used next. */
+       ur_rcvhdrtail = 0,
+       /* (RW)  RcvHdr entry to be processed next by host. */
+       ur_rcvhdrhead = 1,
+       /* (RO)  Index of next Eager index to use. */
+       ur_rcvegrindextail = 2,
+       /* (RW)  Eager TID to be processed next */
+       ur_rcvegrindexhead = 3,
+       /* For internal use only; max register number. */
+       _IPATH_UregMax
+} ipath_ureg;
+
+/* bit values for spi_runtime_flags */
+#define IPATH_RUNTIME_HT       0x1
+#define IPATH_RUNTIME_PCIE     0x2
+#define IPATH_RUNTIME_FORCE_WC_ORDER   0x4
+#define IPATH_RUNTIME_RCVHDR_COPY      0x8
+
+/*
+ * This structure is returned by ipath_userinit() immediately after
+ * open to get implementation-specific info, and info specific to this
+ * instance.
+ *
+ * This struct must have explicit pad fields where type sizes
+ * may result in different alignments between 32 and 64 bit
+ * programs, since the 64 bit kernel requires the user code
+ * to have matching offsets
+ */
+struct ipath_base_info {
+       /* version of hardware, for feature checking. */
+       __u32 spi_hw_version;
+       /* version of software, for feature checking. */
+       __u32 spi_sw_version;
+       /* InfiniPath port assigned, goes into sent packets */
+       __u32 spi_port;
+       /*
+        * IB MTU, packets IB data must be less than this.
+        * The MTU is in bytes, and will be a multiple of 4 bytes.
+        */
+       __u32 spi_mtu;
+       /*
+        * Size of a PIO buffer.  Any given packet's total size must be less
+        * than this (in words).  Included is the starting control word, so
+        * if 513 is returned, then total pkt size is 512 words or less.
+        */
+       __u32 spi_piosize;
+       /* size of the TID cache in infinipath, in entries */
+       __u32 spi_tidcnt;
+       /* size of the TID Eager list in infinipath, in entries */
+       __u32 spi_tidegrcnt;
+       /* size of a single receive header queue entry. */
+       __u32 spi_rcvhdrent_size;
+       /*
+        * Count of receive header queue entries allocated.
+        * This may be less than the spu_rcvhdrcnt passed in!.
+        */
+       __u32 spi_rcvhdr_cnt;
+
+       /* per-chip and other runtime features bitmap (IPATH_RUNTIME_*) */
+       __u32 spi_runtime_flags;
+
+       /* address where receive buffer queue is mapped into */
+       __u64 spi_rcvhdr_base;
+
+       /* user program. */
+
+       /* base address of eager TID receive buffers. */
+       __u64 spi_rcv_egrbufs;
+
+       /* Allocated by initialization code, not by protocol. */
+
+       /*
+        * Size of each TID buffer in host memory, starting at
+        * spi_rcv_egrbufs.  The buffers are virtually contiguous.
+        */
+       __u32 spi_rcv_egrbufsize;
+       /*
+        * The special QP (queue pair) value that identifies an infinipath
+        * protocol packet from standard IB packets.  More, probably much
+        * more, to be added.
+        */
+       __u32 spi_qpair;
+
+       /*
+        * User register base for init code, not to be used directly by
+        * protocol or applications.
+        */
+       __u64 __spi_uregbase;
+       /*
+        * Maximum buffer size in bytes that can be used in a single TID
+        * entry (assuming the buffer is aligned to this boundary).  This is
+        * the minimum of what the hardware and software support.  Guaranteed
+        * to be a power of 2.
+        */
+       __u32 spi_tid_maxsize;
+       /*
+        * alignment of each pio send buffer (byte count
+        * to add to spi_piobufbase to get to second buffer)
+        */
+       __u32 spi_pioalign;
+       /*
+        * The index of the first pio buffer available to this process;
+        * needed to do lookup in spi_pioavailaddr; not added to
+        * spi_piobufbase.
+        */
+       __u32 spi_pioindex;
+        /* number of buffers mapped for this process */
+       __u32 spi_piocnt;
+
+       /*
+        * Base address of writeonly pio buffers for this process.
+        * Each buffer has spi_piosize words, and is aligned on spi_pioalign
+        * boundaries.  spi_piocnt buffers are mapped from this address
+        */
+       __u64 spi_piobufbase;
+
+       /*
+        * Base address of readonly memory copy of the pioavail registers.
+        * There are 2 bits for each buffer.
+        */
+       __u64 spi_pioavailaddr;
+
+       /*
+        * Address where driver updates a copy of the interface and driver
+        * status (IPATH_STATUS_*) as a 64 bit value.  It's followed by a
+        * string indicating hardware error, if there was one.
+        */
+       __u64 spi_status;
+
+       /* number of chip ports available to user processes */
+       __u32 spi_nports;
+       /* unit number of chip we are using */
+       __u32 spi_unit;
+       /* num bufs in each contiguous set */
+       __u32 spi_rcv_egrperchunk;
+       /* size in bytes of each contiguous set */
+       __u32 spi_rcv_egrchunksize;
+       /* total size of mmap to cover full rcvegrbuffers */
+       __u32 spi_rcv_egrbuftotlen;
+} __attribute__ ((aligned(8)));
+
+
+/*
+ * This version number is given to the driver by the user code during
+ * initialization in the spu_userversion field of ipath_user_info, so
+ * the driver can check for compatibility with user code.
+ *
+ * The major version changes when data structures
+ * change in an incompatible way.  The driver must be the same or higher
+ * for initialization to succeed.  In some cases, a higher version
+ * driver will not interoperate with older software, and initialization
+ * will return an error.
+ */
+#define IPATH_USER_SWMAJOR 1
+
+/*
+ * Minor version differences are always compatible
+ * within a major version; however, if the user software is newer
+ * than driver software, some new features and/or structure fields
+ * may not be implemented; the user code must deal with this if it
+ * cares, or it must abort after initialization reports the difference
+ */
+#define IPATH_USER_SWMINOR 2
+
+#define IPATH_USER_SWVERSION ((IPATH_USER_SWMAJOR<<16) | IPATH_USER_SWMINOR)
+
+#define IPATH_KERN_TYPE 0
+
+/*
+ * Similarly, this is the kernel version going back to the user.  It's
+ * slightly different, in that we want to tell if the driver was built as
+ * part of a PathScale release, or from the driver from OpenIB, kernel.org,
+ * or a standard distribution, for support reasons.  The high bit is 0 for
+ * non-PathScale, and 1 for PathScale-built/supplied.
+ *
+ * It's returned by the driver to the user code during initialization in the
+ * spi_sw_version field of ipath_base_info, so the user code can in turn
+ * check for compatibility with the kernel.
+*/
+#define IPATH_KERN_SWVERSION ((IPATH_KERN_TYPE<<31) | IPATH_USER_SWVERSION)
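Illustrative sketch only (not part of this patch): given the packing above, either version word can be decomposed with simple shifts.  The helper names are hypothetical.

static inline unsigned ipath_sw_version_major(__u32 swversion)
{
        return (swversion >> 16) & 0x7fff;      /* bits 30..16: major */
}

static inline unsigned ipath_sw_version_minor(__u32 swversion)
{
        return swversion & 0xffff;              /* bits 15..0: minor */
}

static inline int ipath_built_by_pathscale(__u32 kern_swversion)
{
        return (kern_swversion >> 31) & 1;      /* IPATH_KERN_TYPE bit */
}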
+
+/*
+ * This structure is passed to ipath_userinit() to tell the driver where
+ * user code buffers are, sizes, etc.   The offsets and sizes of the
+ * fields must remain unchanged, for binary compatibility.  It can
+ * be extended, if userversion is changed so user code can tell, if needed
+ */
+struct ipath_user_info {
+       /*
+        * version of user software, to detect compatibility issues.
+        * Should be set to IPATH_USER_SWVERSION.
+        */
+       __u32 spu_userversion;
+
+       /* desired number of receive header queue entries */
+       __u32 spu_rcvhdrcnt;
+
+       /* size of struct base_info to write to */
+       __u32 spu_base_info_size;
+
+       /*
+        * number of words in KD protocol header
+        * This tells InfiniPath how many words to copy to rcvhdrq.  If 0,
+        * kernel uses a default.  Once set, attempts to set any other value
+        * are an error (EAGAIN) until driver is reloaded.
+        */
+       __u32 spu_rcvhdrsize;
+
+       /*
+        * cache line aligned (64 byte) user address to
+        * which the rcvhdrtail register will be written by infinipath
+        * whenever it changes, so that no chip registers are read in
+        * the performance path.
+        */
+       __u64 spu_rcvhdraddr;
+
+       /*
+        * address of struct base_info to write to
+        */
+       __u64 spu_base_info;
+
+} __attribute__ ((aligned(8)));
+
+/* User commands. */
+
+#define IPATH_CMD_MIN          16
+
+#define IPATH_CMD_USER_INIT    16      /* set up userspace */
+#define IPATH_CMD_PORT_INFO    17      /* find out what resources we got */
+#define IPATH_CMD_RECV_CTRL    18      /* control receipt of packets */
+#define IPATH_CMD_TID_UPDATE   19      /* update expected TID entries */
+#define IPATH_CMD_TID_FREE     20      /* free expected TID entries */
+#define IPATH_CMD_SET_PART_KEY 21      /* add partition key */
+
+#define IPATH_CMD_MAX          21
+
+struct ipath_port_info {
+       __u32 num_active;       /* number of active units */
+       __u32 unit;             /* unit (chip) assigned to caller */
+       __u32 port;             /* port on unit assigned to caller */
+};
+
+struct ipath_tid_info {
+       __u32 tidcnt;
+       /* make structure same size in 32 and 64 bit */
+       __u32 tid__unused;
+       /* virtual address of first page in transfer */
+       __u64 tidvaddr;
+       /* pointer (same size 32/64 bit) to __u16 tid array */
+       __u64 tidlist;
+
+       /*
+        * pointer (same size 32/64 bit) to bitmap of TIDs used
+        * for this call; checked for being large enough at open
+        */
+       __u64 tidmap;
+};
+
+struct ipath_cmd {
+       __u32 type;                     /* command type */
+       union {
+               struct ipath_tid_info tid_info;
+               struct ipath_user_info user_info;
+               /* address in userspace of struct ipath_port_info to
+                  write result to */
+               __u64 port_info;
+               /* enable/disable receipt of packets */
+               __u32 recv_ctrl;
+               /* partition key to set */
+               __u16 part_key;
+       } cmd;
+};
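As an illustrative sketch (not part of this patch), user code fills in struct ipath_cmd and hands it to the driver; the submission path is assumed here to be a plain write() on the opened device file, handled by ipath_file_ops.c (not shown in this excerpt).  The function name and error handling are hypothetical; <string.h> and <unistd.h> are assumed.

static int example_enable_recv(int fd)
{
        struct ipath_cmd cmd;

        memset(&cmd, 0, sizeof(cmd));
        cmd.type = IPATH_CMD_RECV_CTRL;
        cmd.cmd.recv_ctrl = 1;          /* nonzero enables receipt of packets */

        return write(fd, &cmd, sizeof(cmd)) == (ssize_t) sizeof(cmd) ? 0 : -1;
}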
+
+struct ipath_iovec {
+       /* Pointer to data, but same size 32 and 64 bit */
+       __u64 iov_base;
+
+       /*
+        * Length of data; don't need 64 bits, but want
+        * ipath_sendpkt to remain same size as before 32 bit changes, so...
+        */
+       __u64 iov_len;
+};
+
+/*
+ * Describes a single packet for send.  Each packet can have one or more
+ * buffers, but the total length (exclusive of IB headers) must be less
+ * than the MTU, and if using the PIO method, entire packet length,
+ * including IB headers, must be less than the ipath_piosize value (words).
+ * Use of this necessitates including sys/uio.h
+ */
+struct __ipath_sendpkt {
+       __u32 sps_flags;        /* flags for packet (TBD) */
+       __u32 sps_cnt;          /* number of entries to use in sps_iov */
+       /* array of iov's describing packet. TEMPORARY */
+       struct ipath_iovec sps_iov[4];
+};
+
+/* Passed into SMA special file's ->read and ->write methods. */
+struct ipath_sma_pkt
+{
+       __u32 unit;     /* unit on which to send packet */
+       __u64 data;     /* address of payload in userspace */
+       __u32 len;      /* length of payload */
+};
+
+/*
+ * Data layout in I2C flash (for GUID, etc.)
+ * All fields are little-endian binary unless otherwise stated
+ */
+#define IPATH_FLASH_VERSION 1
+struct ipath_flash {
+       /* flash layout version (IPATH_FLASH_VERSION) */
+       __u8 if_fversion;
+       /* checksum protecting if_length bytes */
+       __u8 if_csum;
+       /*
+        * valid length (in use, protected by if_csum), including
+        * if_fversion and if_csum themselves
+        */
+       __u8 if_length;
+       /* the GUID, in network order */
+       __u8 if_guid[8];
+       /* number of GUIDs to use, starting from if_guid */
+       __u8 if_numguid;
+       /* the board serial number, in ASCII */
+       char if_serial[12];
+       /* board mfg date (YYYYMMDD ASCII) */
+       char if_mfgdate[8];
+       /* last board rework/test date (YYYYMMDD ASCII) */
+       char if_testdate[8];
+       /* logging of error counts, TBD */
+       __u8 if_errcntp[4];
+       /* powered on hours, updated at driver unload */
+       __u8 if_powerhour[2];
+       /* ASCII free-form comment field */
+       char if_comment[32];
+       /* 78 bytes used, min flash size is 128 bytes */
+       __u8 if_future[50];
+};
+
+/*
+ * These are the counters implemented in the chip, and are listed in order.
+ * The InterCaps naming is taken straight from the chip spec.
+ */
+struct infinipath_counters {
+       __u64 LBIntCnt;
+       __u64 LBFlowStallCnt;
+       __u64 Reserved1;
+       __u64 TxUnsupVLErrCnt;
+       __u64 TxDataPktCnt;
+       __u64 TxFlowPktCnt;
+       __u64 TxDwordCnt;
+       __u64 TxLenErrCnt;
+       __u64 TxMaxMinLenErrCnt;
+       __u64 TxUnderrunCnt;
+       __u64 TxFlowStallCnt;
+       __u64 TxDroppedPktCnt;
+       __u64 RxDroppedPktCnt;
+       __u64 RxDataPktCnt;
+       __u64 RxFlowPktCnt;
+       __u64 RxDwordCnt;
+       __u64 RxLenErrCnt;
+       __u64 RxMaxMinLenErrCnt;
+       __u64 RxICRCErrCnt;
+       __u64 RxVCRCErrCnt;
+       __u64 RxFlowCtrlErrCnt;
+       __u64 RxBadFormatCnt;
+       __u64 RxLinkProblemCnt;
+       __u64 RxEBPCnt;
+       __u64 RxLPCRCErrCnt;
+       __u64 RxBufOvflCnt;
+       __u64 RxTIDFullErrCnt;
+       __u64 RxTIDValidErrCnt;
+       __u64 RxPKeyMismatchCnt;
+       __u64 RxP0HdrEgrOvflCnt;
+       __u64 RxP1HdrEgrOvflCnt;
+       __u64 RxP2HdrEgrOvflCnt;
+       __u64 RxP3HdrEgrOvflCnt;
+       __u64 RxP4HdrEgrOvflCnt;
+       __u64 RxP5HdrEgrOvflCnt;
+       __u64 RxP6HdrEgrOvflCnt;
+       __u64 RxP7HdrEgrOvflCnt;
+       __u64 RxP8HdrEgrOvflCnt;
+       __u64 Reserved6;
+       __u64 Reserved7;
+       __u64 IBStatusChangeCnt;
+       __u64 IBLinkErrRecoveryCnt;
+       __u64 IBLinkDownedCnt;
+       __u64 IBSymbolErrCnt;
+};
+
+/*
+ * The next set of defines are for packet headers, and chip register
+ * and memory bits that are visible to and/or used by user-mode software
+ * The other bits that are used only by the driver or diags are in
+ * ipath_registers.h
+ */
+
+/* RcvHdrFlags bits */
+#define INFINIPATH_RHF_LENGTH_MASK 0x7FF
+#define INFINIPATH_RHF_LENGTH_SHIFT 0
+#define INFINIPATH_RHF_RCVTYPE_MASK 0x7
+#define INFINIPATH_RHF_RCVTYPE_SHIFT 11
+#define INFINIPATH_RHF_EGRINDEX_MASK 0x7FF
+#define INFINIPATH_RHF_EGRINDEX_SHIFT 16
+#define INFINIPATH_RHF_H_ICRCERR   0x80000000
+#define INFINIPATH_RHF_H_VCRCERR   0x40000000
+#define INFINIPATH_RHF_H_PARITYERR 0x20000000
+#define INFINIPATH_RHF_H_LENERR    0x10000000
+#define INFINIPATH_RHF_H_MTUERR    0x08000000
+#define INFINIPATH_RHF_H_IHDRERR   0x04000000
+#define INFINIPATH_RHF_H_TIDERR    0x02000000
+#define INFINIPATH_RHF_H_MKERR     0x01000000
+#define INFINIPATH_RHF_H_IBERR     0x00800000
+#define INFINIPATH_RHF_L_SWA       0x00008000
+#define INFINIPATH_RHF_L_SWB       0x00004000
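A minimal sketch (not part of this patch) of how the RcvHdrFlags fields above would be pulled out of the first 32-bit flags word with the usual shift-and-mask; the helper names and the __u32 argument are assumptions for illustration.

static inline __u32 example_rhf_length(__u32 rhf)
{
        return (rhf >> INFINIPATH_RHF_LENGTH_SHIFT) & INFINIPATH_RHF_LENGTH_MASK;
}

static inline __u32 example_rhf_egrindex(__u32 rhf)
{
        return (rhf >> INFINIPATH_RHF_EGRINDEX_SHIFT) & INFINIPATH_RHF_EGRINDEX_MASK;
}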
+
+/* infinipath header fields */
+#define INFINIPATH_I_VERS_MASK 0xF
+#define INFINIPATH_I_VERS_SHIFT 28
+#define INFINIPATH_I_PORT_MASK 0xF
+#define INFINIPATH_I_PORT_SHIFT 24
+#define INFINIPATH_I_TID_MASK 0x7FF
+#define INFINIPATH_I_TID_SHIFT 13
+#define INFINIPATH_I_OFFSET_MASK 0x1FFF
+#define INFINIPATH_I_OFFSET_SHIFT 0
+
+/* K_PktFlags bits */
+#define INFINIPATH_KPF_INTR 0x1
+
+/* SendPIO per-buffer control */
+#define INFINIPATH_SP_LENGTHP1_MASK 0x3FF
+#define INFINIPATH_SP_LENGTHP1_SHIFT 0
+#define INFINIPATH_SP_INTR    0x80000000
+#define INFINIPATH_SP_TEST    0x40000000
+#define INFINIPATH_SP_TESTEBP 0x20000000
+
+/* SendPIOAvail bits */
+#define INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT 1
+#define INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT 0
+
+#endif                         /* _IPATH_COMMON_H */
diff --git a/drivers/infiniband/hw/ipath/ipath_cq.c b/drivers/infiniband/hw/ipath/ipath_cq.c
new file mode 100644 (file)
index 0000000..7ece113
--- /dev/null
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_verbs.h"
+
+/**
+ * ipath_cq_enter - add a new entry to the completion queue
+ * @cq: completion queue
+ * @entry: work completion entry to add
+ * @solicited: true if @entry is a solicited entry
+ *
+ * This may be called with one of the qp->s_lock or qp->r_rq.lock held.
+ */
+void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited)
+{
+       unsigned long flags;
+       u32 next;
+
+       spin_lock_irqsave(&cq->lock, flags);
+
+       if (cq->head == cq->ibcq.cqe)
+               next = 0;
+       else
+               next = cq->head + 1;
+       if (unlikely(next == cq->tail)) {
+               spin_unlock_irqrestore(&cq->lock, flags);
+               if (cq->ibcq.event_handler) {
+                       struct ib_event ev;
+
+                       ev.device = cq->ibcq.device;
+                       ev.element.cq = &cq->ibcq;
+                       ev.event = IB_EVENT_CQ_ERR;
+                       cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
+               }
+               return;
+       }
+       cq->queue[cq->head] = *entry;
+       cq->head = next;
+
+       if (cq->notify == IB_CQ_NEXT_COMP ||
+           (cq->notify == IB_CQ_SOLICITED && solicited)) {
+               cq->notify = IB_CQ_NONE;
+               cq->triggered++;
+               /*
+                * This will cause send_complete() to be called in
+                * another thread.
+                */
+               tasklet_hi_schedule(&cq->comptask);
+       }
+
+       spin_unlock_irqrestore(&cq->lock, flags);
+
+       if (entry->status != IB_WC_SUCCESS)
+               to_idev(cq->ibcq.device)->n_wqe_errs++;
+}
+
+/**
+ * ipath_poll_cq - poll for work completion entries
+ * @ibcq: the completion queue to poll
+ * @num_entries: the maximum number of entries to return
+ * @entry: pointer to array where work completions are placed
+ *
+ * Returns the number of completion entries polled.
+ *
+ * This may be called from interrupt context.  Also called by ib_poll_cq()
+ * in the generic verbs code.
+ */
+int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
+{
+       struct ipath_cq *cq = to_icq(ibcq);
+       unsigned long flags;
+       int npolled;
+
+       spin_lock_irqsave(&cq->lock, flags);
+
+       for (npolled = 0; npolled < num_entries; ++npolled, ++entry) {
+               if (cq->tail == cq->head)
+                       break;
+               *entry = cq->queue[cq->tail];
+               if (cq->tail == cq->ibcq.cqe)
+                       cq->tail = 0;
+               else
+                       cq->tail++;
+       }
+
+       spin_unlock_irqrestore(&cq->lock, flags);
+
+       return npolled;
+}
+
+static void send_complete(unsigned long data)
+{
+       struct ipath_cq *cq = (struct ipath_cq *)data;
+
+       /*
+        * The completion handler will most likely rearm the notification
+        * and poll for all pending entries.  If a new completion entry
+        * is added while we are in this routine, tasklet_hi_schedule()
+        * won't call us again until we return so we check triggered to
+        * see if we need to call the handler again.
+        */
+       for (;;) {
+               u8 triggered = cq->triggered;
+
+               cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+
+               if (cq->triggered == triggered)
+                       return;
+       }
+}
+
+/**
+ * ipath_create_cq - create a completion queue
+ * @ibdev: the device this completion queue is attached to
+ * @entries: the minimum size of the completion queue
+ * @context: unused by the InfiniPath driver
+ * @udata: unused by the InfiniPath driver
+ *
+ * Returns a pointer to the completion queue or negative errno values
+ * for failure.
+ *
+ * Called by ib_create_cq() in the generic verbs code.
+ */
+struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries,
+                             struct ib_ucontext *context,
+                             struct ib_udata *udata)
+{
+       struct ipath_cq *cq;
+       struct ib_wc *wc;
+       struct ib_cq *ret;
+
+       /* Allocate the completion queue structure. */
+       cq = kmalloc(sizeof(*cq), GFP_KERNEL);
+       if (!cq) {
+               ret = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+
+       /*
+        * Need to use vmalloc() if we want to support large #s of entries.
+        */
+       wc = vmalloc(sizeof(*wc) * (entries + 1));
+       if (!wc) {
+               kfree(cq);
+               ret = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+       /*
+        * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe.
+        * The number of entries must be >= the number requested, or an
+        * error is returned.
+        */
+       cq->ibcq.cqe = entries;
+       cq->notify = IB_CQ_NONE;
+       cq->triggered = 0;
+       spin_lock_init(&cq->lock);
+       tasklet_init(&cq->comptask, send_complete, (unsigned long)cq);
+       cq->head = 0;
+       cq->tail = 0;
+       cq->queue = wc;
+
+       ret = &cq->ibcq;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_destroy_cq - destroy a completion queue
+ * @ibcq: the completion queue to destroy.
+ *
+ * Returns 0 for success.
+ *
+ * Called by ib_destroy_cq() in the generic verbs code.
+ */
+int ipath_destroy_cq(struct ib_cq *ibcq)
+{
+       struct ipath_cq *cq = to_icq(ibcq);
+
+       tasklet_kill(&cq->comptask);
+       vfree(cq->queue);
+       kfree(cq);
+
+       return 0;
+}
+
+/**
+ * ipath_req_notify_cq - change the notification type for a completion queue
+ * @ibcq: the completion queue
+ * @notify: the type of notification to request
+ *
+ * Returns 0 for success.
+ *
+ * This may be called from interrupt context.  Also called by
+ * ib_req_notify_cq() in the generic verbs code.
+ */
+int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify notify)
+{
+       struct ipath_cq *cq = to_icq(ibcq);
+       unsigned long flags;
+
+       spin_lock_irqsave(&cq->lock, flags);
+       /*
+        * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow
+        * any other transitions.
+        */
+       if (cq->notify != IB_CQ_NEXT_COMP)
+               cq->notify = notify;
+       spin_unlock_irqrestore(&cq->lock, flags);
+       return 0;
+}
+
+int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
+{
+       struct ipath_cq *cq = to_icq(ibcq);
+       struct ib_wc *wc, *old_wc;
+       u32 n;
+       int ret;
+
+       /*
+        * Need to use vmalloc() if we want to support large #s of entries.
+        */
+       wc = vmalloc(sizeof(*wc) * (cqe + 1));
+       if (!wc) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+
+       spin_lock_irq(&cq->lock);
+       if (cq->head < cq->tail)
+               n = cq->ibcq.cqe + 1 + cq->head - cq->tail;
+       else
+               n = cq->head - cq->tail;
+       if (unlikely((u32)cqe < n)) {
+               spin_unlock_irq(&cq->lock);
+               vfree(wc);
+               ret = -EOVERFLOW;
+               goto bail;
+       }
+       for (n = 0; cq->tail != cq->head; n++) {
+               wc[n] = cq->queue[cq->tail];
+               if (cq->tail == cq->ibcq.cqe)
+                       cq->tail = 0;
+               else
+                       cq->tail++;
+       }
+       cq->ibcq.cqe = cqe;
+       cq->head = n;
+       cq->tail = 0;
+       old_wc = cq->queue;
+       cq->queue = wc;
+       spin_unlock_irq(&cq->lock);
+
+       vfree(old_wc);
+
+       ret = 0;
+
+bail:
+       return ret;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_debug.h b/drivers/infiniband/hw/ipath/ipath_debug.h
new file mode 100644 (file)
index 0000000..593e289
--- /dev/null
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPATH_DEBUG_H
+#define _IPATH_DEBUG_H
+
+#ifndef _IPATH_DEBUGGING       /* debugging enabled or not */
+#define _IPATH_DEBUGGING 1
+#endif
+
+#if _IPATH_DEBUGGING
+
+/*
+ * Mask values for debugging.  The scheme allows us to compile out any
+ * of the debug tracing stuff, and if compiled in, to enable or disable
+ * dynamically.  This can be set at modprobe time also:
+ *      modprobe infinipath.ko infinipath_debug=7
+ */
+
+#define __IPATH_INFO        0x1        /* generic low verbosity stuff */
+#define __IPATH_DBG         0x2        /* generic debug */
+#define __IPATH_TRSAMPLE    0x8        /* generate trace buffer sample entries */
+/* leave some low verbosity spots open */
+#define __IPATH_VERBDBG     0x40       /* very verbose debug */
+#define __IPATH_PKTDBG      0x80       /* print packet data */
+/* print process startup (init)/exit messages */
+#define __IPATH_PROCDBG     0x100
+/* print mmap/nopage stuff, not using VDBG any more */
+#define __IPATH_MMDBG       0x200
+#define __IPATH_USER_SEND   0x1000     /* use user mode send */
+#define __IPATH_KERNEL_SEND 0x2000     /* use kernel mode send */
+#define __IPATH_EPKTDBG     0x4000     /* print ethernet packet data */
+#define __IPATH_SMADBG      0x8000     /* sma packet debug */
+#define __IPATH_IPATHDBG    0x10000    /* Ethernet (IPATH) general debug on */
+#define __IPATH_IPATHWARN   0x20000    /* Ethernet (IPATH) warnings on */
+#define __IPATH_IPATHERR    0x40000    /* Ethernet (IPATH) errors on */
+#define __IPATH_IPATHPD     0x80000    /* Ethernet (IPATH) packet dump on */
+#define __IPATH_IPATHTABLE  0x100000   /* Ethernet (IPATH) table dump on */
+
+#else                          /* _IPATH_DEBUGGING */
+
+/*
+ * define all of these even with debugging off, for the few places that do
+ * if(infinipath_debug & _IPATH_xyzzy), but in a way that will make the
+ * compiler eliminate the code
+ */
+
+#define __IPATH_INFO      0x0  /* generic low verbosity stuff */
+#define __IPATH_DBG       0x0  /* generic debug */
+#define __IPATH_TRSAMPLE  0x0  /* generate trace buffer sample entries */
+#define __IPATH_VERBDBG   0x0  /* very verbose debug */
+#define __IPATH_PKTDBG    0x0  /* print packet data */
+#define __IPATH_PROCDBG   0x0  /* print process startup (init)/exit messages */
+/* print mmap/nopage stuff, not using VDBG any more */
+#define __IPATH_MMDBG     0x0
+#define __IPATH_EPKTDBG   0x0  /* print ethernet packet data */
+#define __IPATH_SMADBG    0x0  /* sma packet debug */
+#define __IPATH_IPATHDBG  0x0  /* Ethernet (IPATH) general debug on */
+#define __IPATH_IPATHWARN 0x0  /* Ethernet (IPATH) warnings on   */
+#define __IPATH_IPATHERR  0x0  /* Ethernet (IPATH) errors on   */
+#define __IPATH_IPATHPD   0x0  /* Ethernet (IPATH) packet dump on   */
+#define __IPATH_IPATHTABLE 0x0 /* Ethernet (IPATH) table dump on */
+
+#endif                         /* _IPATH_DEBUGGING */
+
+#define __IPATH_VERBOSEDBG __IPATH_VERBDBG
+
+#endif                         /* _IPATH_DEBUG_H */
diff --git a/drivers/infiniband/hw/ipath/ipath_diag.c b/drivers/infiniband/hw/ipath/ipath_diag.c
new file mode 100644 (file)
index 0000000..cd533cf
--- /dev/null
@@ -0,0 +1,379 @@
+/*
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file contains support for diagnostic functions.  It is accessed by
+ * opening the ipath_diag device, normally minor number 129.  Diagnostic use
+ * of the InfiniPath chip may render the chip or board unusable until the
+ * driver is unloaded, or in some cases, until the system is rebooted.
+ *
+ * Accesses to the chip through this interface are not similar to going
+ * through the /sys/bus/pci resource mmap interface.
+ */
+
+#include <linux/pci.h>
+#include <asm/uaccess.h>
+
+#include "ipath_common.h"
+#include "ipath_kernel.h"
+#include "ips_common.h"
+#include "ipath_layer.h"
+
+int ipath_diag_inuse;
+static int diag_set_link;
+
+static int ipath_diag_open(struct inode *in, struct file *fp);
+static int ipath_diag_release(struct inode *in, struct file *fp);
+static ssize_t ipath_diag_read(struct file *fp, char __user *data,
+                              size_t count, loff_t *off);
+static ssize_t ipath_diag_write(struct file *fp, const char __user *data,
+                               size_t count, loff_t *off);
+
+static struct file_operations diag_file_ops = {
+       .owner = THIS_MODULE,
+       .write = ipath_diag_write,
+       .read = ipath_diag_read,
+       .open = ipath_diag_open,
+       .release = ipath_diag_release
+};
+
+static struct cdev *diag_cdev;
+static struct class_device *diag_class_dev;
+
+int ipath_diag_init(void)
+{
+       return ipath_cdev_init(IPATH_DIAG_MINOR, "ipath_diag",
+                              &diag_file_ops, &diag_cdev, &diag_class_dev);
+}
+
+void ipath_diag_cleanup(void)
+{
+       ipath_cdev_cleanup(&diag_cdev, &diag_class_dev);
+}
+
+/**
+ * ipath_read_umem64 - read a 64-bit quantity from the chip into user space
+ * @dd: the infinipath device
+ * @uaddr: the location to store the data in user memory
+ * @caddr: the source chip address (full pointer, not offset)
+ * @count: number of bytes to copy (multiple of 32 bits)
+ *
+ * This function also localizes all chip memory accesses.
+ * The copy should be written such that we read full cacheline packets
+ * from the chip.  This is usually used for a single qword
+ *
+ * NOTE:  This assumes the chip address is 64-bit aligned.
+ */
+static int ipath_read_umem64(struct ipath_devdata *dd, void __user *uaddr,
+                            const void __iomem *caddr, size_t count)
+{
+       const u64 __iomem *reg_addr = caddr;
+       const u64 __iomem *reg_end = reg_addr + (count / sizeof(u64));
+       int ret;
+
+       /* not very efficient, but it works for now */
+       if (reg_addr < dd->ipath_kregbase ||
+           reg_end > dd->ipath_kregend) {
+               ret = -EINVAL;
+               goto bail;
+       }
+       while (reg_addr < reg_end) {
+               u64 data = readq(reg_addr);
+               if (copy_to_user(uaddr, &data, sizeof(u64))) {
+                       ret = -EFAULT;
+                       goto bail;
+               }
+               reg_addr++;
+               uaddr++;
+       }
+       ret = 0;
+bail:
+       return ret;
+}
+
+/**
+ * ipath_write_umem64 - write a 64-bit quantity to the chip from user space
+ * @dd: the infinipath device
+ * @caddr: the destination chip address (full pointer, not offset)
+ * @uaddr: the source of the data in user memory
+ * @count: the number of bytes to copy (multiple of 32 bits)
+ *
+ * This is usually used for a single qword
+ * NOTE:  This assumes the chip address is 64-bit aligned.
+ */
+
+static int ipath_write_umem64(struct ipath_devdata *dd, void __iomem *caddr,
+                             const void __user *uaddr, size_t count)
+{
+       u64 __iomem *reg_addr = caddr;
+       const u64 __iomem *reg_end = reg_addr + (count / sizeof(u64));
+       int ret;
+
+       /* not very efficient, but it works for now */
+       if (reg_addr < dd->ipath_kregbase ||
+           reg_end > dd->ipath_kregend) {
+               ret = -EINVAL;
+               goto bail;
+       }
+       while (reg_addr < reg_end) {
+               u64 data;
+               if (copy_from_user(&data, uaddr, sizeof(data))) {
+                       ret = -EFAULT;
+                       goto bail;
+               }
+               writeq(data, reg_addr);
+
+               reg_addr++;
+               uaddr++;
+       }
+       ret = 0;
+bail:
+       return ret;
+}
+
+/**
+ * ipath_read_umem32 - read a 32-bit quantity from the chip into user space
+ * @dd: the infinipath device
+ * @uaddr: the location to store the data in user memory
+ * @caddr: the source chip address (full pointer, not offset)
+ * @count: number of bytes to copy
+ *
+ * read 32 bit values, not 64 bit; for memories that only
+ * support 32 bit reads; usually a single dword.
+ */
+static int ipath_read_umem32(struct ipath_devdata *dd, void __user *uaddr,
+                            const void __iomem *caddr, size_t count)
+{
+       const u32 __iomem *reg_addr = caddr;
+       const u32 __iomem *reg_end = reg_addr + (count / sizeof(u32));
+       int ret;
+
+       if (reg_addr < (u32 __iomem *) dd->ipath_kregbase ||
+           reg_end > (u32 __iomem *) dd->ipath_kregend) {
+               ret = -EINVAL;
+               goto bail;
+       }
+       /* not very efficient, but it works for now */
+       while (reg_addr < reg_end) {
+               u32 data = readl(reg_addr);
+               if (copy_to_user(uaddr, &data, sizeof(data))) {
+                       ret = -EFAULT;
+                       goto bail;
+               }
+
+               reg_addr++;
+               uaddr++;
+       }
+       ret = 0;
+bail:
+       return ret;
+}
+
+/**
+ * ipath_write_umem32 - write a 32-bit quantity to the chip from user space
+ * @dd: the infinipath device
+ * @caddr: the destination chip address (full pointer, not offset)
+ * @uaddr: the source of the data in user memory
+ * @count: number of bytes to copy
+ *
+ * write 32 bit values, not 64 bit; for memories that only
+ * support 32 bit write; usually a single dword.
+ */
+
+static int ipath_write_umem32(struct ipath_devdata *dd, void __iomem *caddr,
+                             const void __user *uaddr, size_t count)
+{
+       u32 __iomem *reg_addr = caddr;
+       const u32 __iomem *reg_end = reg_addr + (count / sizeof(u32));
+       int ret;
+
+       if (reg_addr < (u32 __iomem *) dd->ipath_kregbase ||
+           reg_end > (u32 __iomem *) dd->ipath_kregend) {
+               ret = -EINVAL;
+               goto bail;
+       }
+       while (reg_addr < reg_end) {
+               u32 data;
+               if (copy_from_user(&data, uaddr, sizeof(data))) {
+                       ret = -EFAULT;
+                       goto bail;
+               }
+               writel(data, reg_addr);
+
+               reg_addr++;
+               uaddr++;
+       }
+       ret = 0;
+bail:
+       return ret;
+}
+
+static int ipath_diag_open(struct inode *in, struct file *fp)
+{
+       struct ipath_devdata *dd;
+       int unit = 0; /* XXX this is bogus */
+       unsigned long flags;
+       int ret;
+
+       dd = ipath_lookup(unit);
+
+       mutex_lock(&ipath_mutex);
+       spin_lock_irqsave(&ipath_devs_lock, flags);
+
+       if (ipath_diag_inuse) {
+               ret = -EBUSY;
+               goto bail;
+       }
+
+       list_for_each_entry(dd, &ipath_dev_list, ipath_list) {
+               /*
+                * we need at least one infinipath device to be present
+                * (don't use INITTED, because we want to be able to open
+                * even if device is in freeze mode, which cleared INITTED).
+                * There is a small amount of risk to this, which is why we
+                * also verify kregbase is set.
+                */
+
+               if (!(dd->ipath_flags & IPATH_PRESENT) ||
+                   !dd->ipath_kregbase)
+                       continue;
+
+               ipath_diag_inuse = 1;
+               diag_set_link = 0;
+               ret = 0;
+               goto bail;
+       }
+
+       ret = -ENODEV;
+
+bail:
+       spin_unlock_irqrestore(&ipath_devs_lock, flags);
+       mutex_unlock(&ipath_mutex);
+
+       /* Only expose a way to reset the device if we
+          make it into diag mode. */
+       if (ret == 0)
+               ipath_expose_reset(&dd->pcidev->dev);
+
+       return ret;
+}
+
+static int ipath_diag_release(struct inode *i, struct file *f)
+{
+       mutex_lock(&ipath_mutex);
+       ipath_diag_inuse = 0;
+       mutex_unlock(&ipath_mutex);
+       return 0;
+}
+
+static ssize_t ipath_diag_read(struct file *fp, char __user *data,
+                              size_t count, loff_t *off)
+{
+       int unit = 0; /* XXX provide for reads on other units some day */
+       struct ipath_devdata *dd;
+       void __iomem *kreg_base;
+       ssize_t ret;
+
+       dd = ipath_lookup(unit);
+       if (!dd) {
+               ret = -ENODEV;
+               goto bail;
+       }
+
+       kreg_base = dd->ipath_kregbase;
+
+       if (count == 0)
+               ret = 0;
+       else if ((count % 4) || (*off % 4))
+               /* address or length is not 32-bit aligned, hence invalid */
+               ret = -EINVAL;
+       else if ((count % 8) || (*off % 8))
+               /* address or length not 64-bit aligned; do 32-bit reads */
+               ret = ipath_read_umem32(dd, data, kreg_base + *off, count);
+       else
+               ret = ipath_read_umem64(dd, data, kreg_base + *off, count);
+
+       if (ret >= 0) {
+               *off += count;
+               ret = count;
+       }
+
+bail:
+       return ret;
+}
+
+static ssize_t ipath_diag_write(struct file *fp, const char __user *data,
+                               size_t count, loff_t *off)
+{
+       int unit = 0; /* XXX this is bogus */
+       struct ipath_devdata *dd;
+       void __iomem *kreg_base;
+       ssize_t ret;
+
+       dd = ipath_lookup(unit);
+       if (!dd) {
+               ret = -ENODEV;
+               goto bail;
+       }
+       kreg_base = dd->ipath_kregbase;
+
+       if (count == 0)
+               ret = 0;
+       else if ((count % 4) || (*off % 4))
+               /* address or length is not 32-bit aligned, hence invalid */
+               ret = -EINVAL;
+       else if ((count % 8) || (*off % 8))
+               /* address or length not 64-bit aligned; do 32-bit writes */
+               ret = ipath_write_umem32(dd, kreg_base + *off, data, count);
+       else
+               ret = ipath_write_umem64(dd, kreg_base + *off, data, count);
+
+       if (ret >= 0) {
+               *off += count;
+               ret = count;
+       }
+
+bail:
+       return ret;
+}
+
+void ipath_diag_bringup_link(struct ipath_devdata *dd)
+{
+       if (diag_set_link || (dd->ipath_flags & IPATH_LINKACTIVE))
+               return;
+
+       diag_set_link = 1;
+       ipath_cdbg(VERBOSE, "Trying to set link active for "
+                  "diag pkt\n");
+       ipath_layer_set_linkstate(dd, IPATH_IB_LINKARM);
+       ipath_layer_set_linkstate(dd, IPATH_IB_LINKACTIVE);
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c
new file mode 100644 (file)
index 0000000..58a94ef
--- /dev/null
@@ -0,0 +1,1983 @@
+/*
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_kernel.h"
+#include "ips_common.h"
+#include "ipath_layer.h"
+
+static void ipath_update_pio_bufs(struct ipath_devdata *);
+
+const char *ipath_get_unit_name(int unit)
+{
+       static char iname[16];
+       snprintf(iname, sizeof iname, "infinipath%u", unit);
+       return iname;
+}
+
+EXPORT_SYMBOL_GPL(ipath_get_unit_name);
+
+#define DRIVER_LOAD_MSG "PathScale " IPATH_DRV_NAME " loaded: "
+#define PFX IPATH_DRV_NAME ": "
+
+/*
+ * The size has to be longer than this string, so we can append
+ * board/chip information to it in the init code.
+ */
+const char ipath_core_version[] = IPATH_IDSTR "\n";
+
+static struct idr unit_table;
+DEFINE_SPINLOCK(ipath_devs_lock);
+LIST_HEAD(ipath_dev_list);
+
+wait_queue_head_t ipath_sma_state_wait;
+
+unsigned ipath_debug = __IPATH_INFO;
+
+module_param_named(debug, ipath_debug, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(debug, "mask for debug prints");
+EXPORT_SYMBOL_GPL(ipath_debug);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("PathScale <support@pathscale.com>");
+MODULE_DESCRIPTION("Pathscale InfiniPath driver");
+
+const char *ipath_ibcstatus_str[] = {
+       "Disabled",
+       "LinkUp",
+       "PollActive",
+       "PollQuiet",
+       "SleepDelay",
+       "SleepQuiet",
+       "LState6",              /* unused */
+       "LState7",              /* unused */
+       "CfgDebounce",
+       "CfgRcvfCfg",
+       "CfgWaitRmt",
+       "CfgIdle",
+       "RecovRetrain",
+       "LState0xD",            /* unused */
+       "RecovWaitRmt",
+       "RecovIdle",
+};
+
+/*
+ * These variables are initialized in the chip-specific files
+ * but are defined here.
+ */
+u16 ipath_gpio_sda_num, ipath_gpio_scl_num;
+u64 ipath_gpio_sda, ipath_gpio_scl;
+u64 infinipath_i_bitsextant;
+ipath_err_t infinipath_e_bitsextant, infinipath_hwe_bitsextant;
+u32 infinipath_i_rcvavail_mask, infinipath_i_rcvurg_mask;
+
+static void __devexit ipath_remove_one(struct pci_dev *);
+static int __devinit ipath_init_one(struct pci_dev *,
+                                   const struct pci_device_id *);
+
+/* Only needed for registration, nothing else needs this info */
+#define PCI_VENDOR_ID_PATHSCALE 0x1fc1
+#define PCI_DEVICE_ID_INFINIPATH_HT 0xd
+#define PCI_DEVICE_ID_INFINIPATH_PE800 0x10
+
+static const struct pci_device_id ipath_pci_tbl[] = {
+       {PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE,
+                   PCI_DEVICE_ID_INFINIPATH_HT)},
+       {PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE,
+                   PCI_DEVICE_ID_INFINIPATH_PE800)},
+};
+
+MODULE_DEVICE_TABLE(pci, ipath_pci_tbl);
+
+static struct pci_driver ipath_driver = {
+       .name = IPATH_DRV_NAME,
+       .probe = ipath_init_one,
+       .remove = __devexit_p(ipath_remove_one),
+       .id_table = ipath_pci_tbl,
+};
+
+/*
+ * This is where port 0's rcvhdrtail register is written back; we also
+ * want nothing else sharing the cache line, so make it a cache line
+ * in size.  Used for all units.
+ */
+volatile __le64 *ipath_port0_rcvhdrtail;
+dma_addr_t ipath_port0_rcvhdrtail_dma;
+static int port0_rcvhdrtail_refs;
+
+static inline void read_bars(struct ipath_devdata *dd, struct pci_dev *dev,
+                            u32 *bar0, u32 *bar1)
+{
+       int ret;
+
+       ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_0, bar0);
+       if (ret)
+               ipath_dev_err(dd, "failed to read bar0 before enable: "
+                             "error %d\n", -ret);
+
+       ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_1, bar1);
+       if (ret)
+               ipath_dev_err(dd, "failed to read bar1 before enable: "
+                             "error %d\n", -ret);
+
+       ipath_dbg("Read bar0 %x bar1 %x\n", *bar0, *bar1);
+}
+
+static void ipath_free_devdata(struct pci_dev *pdev,
+                              struct ipath_devdata *dd)
+{
+       unsigned long flags;
+
+       pci_set_drvdata(pdev, NULL);
+
+       if (dd->ipath_unit != -1) {
+               spin_lock_irqsave(&ipath_devs_lock, flags);
+               idr_remove(&unit_table, dd->ipath_unit);
+               list_del(&dd->ipath_list);
+               spin_unlock_irqrestore(&ipath_devs_lock, flags);
+       }
+       dma_free_coherent(&pdev->dev, sizeof(*dd), dd, dd->ipath_dma_addr);
+}
+
+static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev)
+{
+       unsigned long flags;
+       struct ipath_devdata *dd;
+       dma_addr_t dma_addr;
+       int ret;
+
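+       /*
+        * idr_pre_get() preallocates with GFP_KERNEL (and so may sleep)
+        * here, outside the spinlock, so that the idr_get_new() call
+        * below, made while holding ipath_devs_lock, does not itself
+        * need to allocate.
+        */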
+       if (!idr_pre_get(&unit_table, GFP_KERNEL)) {
+               dd = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+
+       dd = dma_alloc_coherent(&pdev->dev, sizeof(*dd), &dma_addr,
+                               GFP_KERNEL);
+
+       if (!dd) {
+               dd = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+
+       dd->ipath_dma_addr = dma_addr;
+       dd->ipath_unit = -1;
+
+       spin_lock_irqsave(&ipath_devs_lock, flags);
+
+       ret = idr_get_new(&unit_table, dd, &dd->ipath_unit);
+       if (ret < 0) {
+               printk(KERN_ERR IPATH_DRV_NAME
+                      ": Could not allocate unit ID: error %d\n", -ret);
+               ipath_free_devdata(pdev, dd);
+               dd = ERR_PTR(ret);
+               goto bail_unlock;
+       }
+
+       dd->pcidev = pdev;
+       pci_set_drvdata(pdev, dd);
+
+       list_add(&dd->ipath_list, &ipath_dev_list);
+
+bail_unlock:
+       spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+bail:
+       return dd;
+}
+
+static inline struct ipath_devdata *__ipath_lookup(int unit)
+{
+       return idr_find(&unit_table, unit);
+}
+
+struct ipath_devdata *ipath_lookup(int unit)
+{
+       struct ipath_devdata *dd;
+       unsigned long flags;
+
+       spin_lock_irqsave(&ipath_devs_lock, flags);
+       dd = __ipath_lookup(unit);
+       spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+       return dd;
+}
+
+int ipath_count_units(int *npresentp, int *nupp, u32 *maxportsp)
+{
+       int nunits, npresent, nup;
+       struct ipath_devdata *dd;
+       unsigned long flags;
+       u32 maxports;
+
+       nunits = npresent = nup = maxports = 0;
+
+       spin_lock_irqsave(&ipath_devs_lock, flags);
+
+       list_for_each_entry(dd, &ipath_dev_list, ipath_list) {
+               nunits++;
+               if ((dd->ipath_flags & IPATH_PRESENT) && dd->ipath_kregbase)
+                       npresent++;
+               if (dd->ipath_lid &&
+                   !(dd->ipath_flags & (IPATH_DISABLED | IPATH_LINKDOWN
+                                        | IPATH_LINKUNK)))
+                       nup++;
+               if (dd->ipath_cfgports > maxports)
+                       maxports = dd->ipath_cfgports;
+       }
+
+       spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+       if (npresentp)
+               *npresentp = npresent;
+       if (nupp)
+               *nupp = nup;
+       if (maxportsp)
+               *maxportsp = maxports;
+
+       return nunits;
+}
+
+static int init_port0_rcvhdrtail(struct pci_dev *pdev)
+{
+       int ret;
+
+       mutex_lock(&ipath_mutex);
+
+       if (!ipath_port0_rcvhdrtail) {
+               ipath_port0_rcvhdrtail =
+                       dma_alloc_coherent(&pdev->dev,
+                                          IPATH_PORT0_RCVHDRTAIL_SIZE,
+                                          &ipath_port0_rcvhdrtail_dma,
+                                          GFP_KERNEL);
+
+               if (!ipath_port0_rcvhdrtail) {
+                       ret = -ENOMEM;
+                       goto bail;
+               }
+       }
+       port0_rcvhdrtail_refs++;
+       ret = 0;
+
+bail:
+       mutex_unlock(&ipath_mutex);
+
+       return ret;
+}
+
+static void cleanup_port0_rcvhdrtail(struct pci_dev *pdev)
+{
+       mutex_lock(&ipath_mutex);
+
+       if (!--port0_rcvhdrtail_refs) {
+               dma_free_coherent(&pdev->dev, IPATH_PORT0_RCVHDRTAIL_SIZE,
+                                 (void *) ipath_port0_rcvhdrtail,
+                                 ipath_port0_rcvhdrtail_dma);
+               ipath_port0_rcvhdrtail = NULL;
+       }
+
+       mutex_unlock(&ipath_mutex);
+}
+
+/*
+ * These next two routines are placeholders in case we don't have per-arch
+ * code for controlling write combining.  If explicit control of write
+ * combining is not available, performance will probably be awful.
+ */
+
+int __attribute__((weak)) ipath_enable_wc(struct ipath_devdata *dd)
+{
+       return -EOPNOTSUPP;
+}
+
+void __attribute__((weak)) ipath_disable_wc(struct ipath_devdata *dd)
+{
+}
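+
+/*
+ * Architectures with explicit write-combining control override these
+ * weak stubs with strong definitions (e.g. the x86_64 variant in
+ * ipath_wc_x86_64.c in this driver), so the linker picks the
+ * arch-specific versions instead of the -EOPNOTSUPP placeholders above.
+ */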
+
+static int __devinit ipath_init_one(struct pci_dev *pdev,
+                                   const struct pci_device_id *ent)
+{
+       int ret, len, j;
+       struct ipath_devdata *dd;
+       unsigned long long addr;
+       u32 bar0 = 0, bar1 = 0;
+       u8 rev;
+
+       ret = init_port0_rcvhdrtail(pdev);
+       if (ret < 0) {
+               printk(KERN_ERR IPATH_DRV_NAME
+                      ": Could not allocate port0_rcvhdrtail: error %d\n",
+                      -ret);
+               goto bail;
+       }
+
+       dd = ipath_alloc_devdata(pdev);
+       if (IS_ERR(dd)) {
+               ret = PTR_ERR(dd);
+               printk(KERN_ERR IPATH_DRV_NAME
+                      ": Could not allocate devdata: error %d\n", -ret);
+               goto bail_rcvhdrtail;
+       }
+
+       ipath_cdbg(VERBOSE, "initializing unit #%u\n", dd->ipath_unit);
+
+       read_bars(dd, pdev, &bar0, &bar1);
+
+       ret = pci_enable_device(pdev);
+       if (ret) {
+               /* This can happen iff:
+                *
+                * We did a chip reset, and then failed to reprogram the
+                * BAR, or the chip reset due to an internal error.  We then
+                * unloaded the driver and reloaded it.
+                *
+                * Both reset cases set the BAR back to initial state.  For
+                * the latter case, the AER sticky error bit at offset 0x718
+                * should be set, but the Linux kernel doesn't yet know
+                * about that, it appears.  If the original BAR was retained
+                * in the kernel data structures, this may be OK.
+                */
+               ipath_dev_err(dd, "enable unit %d failed: error %d\n",
+                             dd->ipath_unit, -ret);
+               goto bail_devdata;
+       }
+       addr = pci_resource_start(pdev, 0);
+       len = pci_resource_len(pdev, 0);
+       ipath_cdbg(VERBOSE, "regbase (0) %llx len %d irq %x, vend %x/%x "
+                  "driver_data %lx\n", addr, len, pdev->irq, ent->vendor,
+                  ent->device, ent->driver_data);
+
+       read_bars(dd, pdev, &bar0, &bar1);
+
+       if (!bar1 && !(bar0 & ~0xf)) {
+               if (addr) {
+                       dev_info(&pdev->dev, "BAR is 0 (probable RESET), "
+                                "rewriting as %llx\n", addr);
+                       ret = pci_write_config_dword(
+                               pdev, PCI_BASE_ADDRESS_0, addr);
+                       if (ret) {
+                               ipath_dev_err(dd, "rewrite of BAR0 "
+                                             "failed: err %d\n", -ret);
+                               goto bail_disable;
+                       }
+                       ret = pci_write_config_dword(
+                               pdev, PCI_BASE_ADDRESS_1, addr >> 32);
+                       if (ret) {
+                               ipath_dev_err(dd, "rewrite of BAR1 "
+                                             "failed: err %d\n", -ret);
+                               goto bail_disable;
+                       }
+               } else {
+                       ipath_dev_err(dd, "BAR is 0 (probable RESET), "
+                                     "not usable until reboot\n");
+                       ret = -ENODEV;
+                       goto bail_disable;
+               }
+       }
+
+       ret = pci_request_regions(pdev, IPATH_DRV_NAME);
+       if (ret) {
+               dev_info(&pdev->dev, "pci_request_regions unit %u fails: "
+                        "err %d\n", dd->ipath_unit, -ret);
+               goto bail_disable;
+       }
+
+       ret = pci_set_dma_mask(pdev, DMA_64BIT_MASK);
+       if (ret) {
+               dev_info(&pdev->dev, "pci_set_dma_mask unit %u "
+                        "fails: %d\n", dd->ipath_unit, ret);
+               goto bail_regions;
+       }
+
+       pci_set_master(pdev);
+
+       /*
+        * Save BARs to rewrite after device reset.  Save all 64 bits of
+        * BAR, just in case.
+        */
+       dd->ipath_pcibar0 = addr;
+       dd->ipath_pcibar1 = addr >> 32;
+       dd->ipath_deviceid = ent->device;       /* save for later use */
+       dd->ipath_vendorid = ent->vendor;
+
+       /* set up the chip-specific functions, as early as possible. */
+       switch (ent->device) {
+       case PCI_DEVICE_ID_INFINIPATH_HT:
+               ipath_init_ht400_funcs(dd);
+               break;
+       case PCI_DEVICE_ID_INFINIPATH_PE800:
+               ipath_init_pe800_funcs(dd);
+               break;
+       default:
+               ipath_dev_err(dd, "Found unknown PathScale deviceid 0x%x, "
+                             "failing\n", ent->device);
+               return -ENODEV;
+       }
+
+       for (j = 0; j < 6; j++) {
+               if (!pdev->resource[j].start)
+                       continue;
+               ipath_cdbg(VERBOSE, "BAR %d start %lx, end %lx, len %lx\n",
+                          j, pdev->resource[j].start,
+                          pdev->resource[j].end,
+                          pci_resource_len(pdev, j));
+       }
+
+       if (!addr) {
+               ipath_dev_err(dd, "No valid address in BAR 0!\n");
+               ret = -ENODEV;
+               goto bail_regions;
+       }
+
+       dd->ipath_deviceid = ent->device;       /* save for later use */
+       dd->ipath_vendorid = ent->vendor;
+
+       ret = pci_read_config_byte(pdev, PCI_REVISION_ID, &rev);
+       if (ret) {
+               ipath_dev_err(dd, "Failed to read PCI revision ID unit "
+                             "%u: err %d\n", dd->ipath_unit, -ret);
+               goto bail_regions;      /* shouldn't ever happen */
+       }
+       dd->ipath_pcirev = rev;
+
+       dd->ipath_kregbase = ioremap_nocache(addr, len);
+
+       if (!dd->ipath_kregbase) {
+               ipath_dbg("Unable to map io addr %llx to kvirt, failing\n",
+                         addr);
+               ret = -ENOMEM;
+               goto bail_iounmap;
+       }
+       dd->ipath_kregend = (u64 __iomem *)
+               ((void __iomem *)dd->ipath_kregbase + len);
+       dd->ipath_physaddr = addr;      /* used for io_remap, etc. */
+       /* for user mmap */
+       dd->ipath_kregvirt = (u64 __iomem *) phys_to_virt(addr);
+       ipath_cdbg(VERBOSE, "mapped io addr %llx to kregbase %p "
+                  "kregvirt %p\n", addr, dd->ipath_kregbase,
+                  dd->ipath_kregvirt);
+
+       /*
+        * clear ipath_flags here instead of in ipath_init_chip as it is set
+        * by ipath_setup_htconfig.
+        */
+       dd->ipath_flags = 0;
+
+       if (dd->ipath_f_bus(dd, pdev))
+               ipath_dev_err(dd, "Failed to setup config space; "
+                             "continuing anyway\n");
+
+       /*
+        * Set up our interrupt handler; SA_SHIRQ is probably not needed,
+        * since MSI interrupts shouldn't be shared, but it won't hurt for
+        * now.  Check for a zero irq only after we return from the
+        * chip-specific bus setup, since that setup can affect it.
+        */
+       if (!pdev->irq)
+               ipath_dev_err(dd, "irq is 0, BIOS error?  Interrupts won't "
+                             "work\n");
+       else {
+               ret = request_irq(pdev->irq, ipath_intr, SA_SHIRQ,
+                                 IPATH_DRV_NAME, dd);
+               if (ret) {
+                       ipath_dev_err(dd, "Couldn't setup irq handler, "
+                                     "irq=%u: %d\n", pdev->irq, ret);
+                       goto bail_iounmap;
+               }
+       }
+
+       ret = ipath_init_chip(dd, 0);   /* do the chip-specific init */
+       if (ret)
+               goto bail_iounmap;
+
+       ret = ipath_enable_wc(dd);
+
+       if (ret) {
+               ipath_dev_err(dd, "Write combining not enabled "
+                             "(err %d): performance may be poor\n",
+                             -ret);
+               ret = 0;
+       }
+
+       ipath_device_create_group(&pdev->dev, dd);
+       ipathfs_add_device(dd);
+       ipath_user_add(dd);
+       ipath_layer_add(dd);
+
+       goto bail;
+
+bail_iounmap:
+       iounmap((volatile void __iomem *) dd->ipath_kregbase);
+
+bail_regions:
+       pci_release_regions(pdev);
+
+bail_disable:
+       pci_disable_device(pdev);
+
+bail_devdata:
+       ipath_free_devdata(pdev, dd);
+
+bail_rcvhdrtail:
+       cleanup_port0_rcvhdrtail(pdev);
+
+bail:
+       return ret;
+}
+
+static void __devexit ipath_remove_one(struct pci_dev *pdev)
+{
+       struct ipath_devdata *dd;
+
+       ipath_cdbg(VERBOSE, "removing, pdev=%p\n", pdev);
+       if (!pdev)
+               return;
+
+       dd = pci_get_drvdata(pdev);
+       ipath_layer_del(dd);
+       ipath_user_del(dd);
+       ipathfs_remove_device(dd);
+       ipath_device_remove_group(&pdev->dev, dd);
+       ipath_cdbg(VERBOSE, "Releasing pci memory regions, dd %p, "
+                  "unit %u\n", dd, (u32) dd->ipath_unit);
+       if (dd->ipath_kregbase) {
+               ipath_cdbg(VERBOSE, "Unmapping kregbase %p\n",
+                          dd->ipath_kregbase);
+               iounmap((volatile void __iomem *) dd->ipath_kregbase);
+               dd->ipath_kregbase = NULL;
+       }
+       pci_release_regions(pdev);
+       ipath_cdbg(VERBOSE, "calling pci_disable_device\n");
+       pci_disable_device(pdev);
+
+       ipath_free_devdata(pdev, dd);
+       cleanup_port0_rcvhdrtail(pdev);
+}
+
+/* general driver use */
+DEFINE_MUTEX(ipath_mutex);
+
+static DEFINE_SPINLOCK(ipath_pioavail_lock);
+
+/**
+ * ipath_disarm_piobufs - cancel a range of PIO buffers
+ * @dd: the infinipath device
+ * @first: the first PIO buffer to cancel
+ * @cnt: the number of PIO buffers to cancel
+ *
+ * Cancel a range of PIO buffers, used when they might be armed, but
+ * not triggered.  Used at init to ensure buffer state, at user process
+ * close in case the process died while writing to a PIO buffer, and
+ * also after errors.
+ */
+void ipath_disarm_piobufs(struct ipath_devdata *dd, unsigned first,
+                         unsigned cnt)
+{
+       unsigned i, last = first + cnt;
+       u64 sendctrl, sendorig;
+
+       ipath_cdbg(PKT, "disarm %u PIObufs first=%u\n", cnt, first);
+       sendorig = dd->ipath_sendctrl | INFINIPATH_S_DISARM;
+       for (i = first; i < last; i++) {
+               sendctrl = sendorig |
+                       (i << INFINIPATH_S_DISARMPIOBUF_SHIFT);
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+                                sendctrl);
+       }
+
+       /*
+        * Write it again with current value, in case ipath_sendctrl changed
+        * while we were looping; no critical bits that would require
+        * locking.
+        *
+        * Write a 0, and then the original value, reading scratch in
+        * between.  This seems to avoid a chip timing race that causes
+        * pioavail updates to memory to stop.
+        */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+                        0);
+       sendorig = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+                        dd->ipath_sendctrl);
+}
+
+/**
+ * ipath_wait_linkstate - wait for an IB link state change to occur
+ * @dd: the infinipath device
+ * @state: the state to wait for
+ * @msecs: the number of milliseconds to wait
+ *
+ * Wait up to msecs milliseconds for an IB link state change to occur.
+ * For now, take the easy polling route.  Currently used only by
+ * ipath_layer_set_linkstate.  Returns 0 if the state is reached,
+ * otherwise -ETIMEDOUT.  @state can have multiple states set, for any
+ * of several transitions.
+ */
+int ipath_wait_linkstate(struct ipath_devdata *dd, u32 state, int msecs)
+{
+       dd->ipath_sma_state_wanted = state;
+       wait_event_interruptible_timeout(ipath_sma_state_wait,
+                                        (dd->ipath_flags & state),
+                                        msecs_to_jiffies(msecs));
+       dd->ipath_sma_state_wanted = 0;
+
+       if (!(dd->ipath_flags & state)) {
+               u64 val;
+               ipath_cdbg(SMA, "Didn't reach linkstate %s within %u ms\n",
+                          /* test INIT ahead of DOWN, both can be set */
+                          (state & IPATH_LINKINIT) ? "INIT" :
+                          ((state & IPATH_LINKDOWN) ? "DOWN" :
+                           ((state & IPATH_LINKARMED) ? "ARM" : "ACTIVE")),
+                          msecs);
+               val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
+               ipath_cdbg(VERBOSE, "ibcc=%llx ibcstatus=%llx (%s)\n",
+                          (unsigned long long) ipath_read_kreg64(
+                                  dd, dd->ipath_kregs->kr_ibcctrl),
+                          (unsigned long long) val,
+                          ipath_ibcstatus_str[val & 0xf]);
+       }
+       return (dd->ipath_flags & state) ? 0 : -ETIMEDOUT;
+}
+
+void ipath_decode_err(char *buf, size_t blen, ipath_err_t err)
+{
+       *buf = '\0';
+       if (err & INFINIPATH_E_RHDRLEN)
+               strlcat(buf, "rhdrlen ", blen);
+       if (err & INFINIPATH_E_RBADTID)
+               strlcat(buf, "rbadtid ", blen);
+       if (err & INFINIPATH_E_RBADVERSION)
+               strlcat(buf, "rbadversion ", blen);
+       if (err & INFINIPATH_E_RHDR)
+               strlcat(buf, "rhdr ", blen);
+       if (err & INFINIPATH_E_RLONGPKTLEN)
+               strlcat(buf, "rlongpktlen ", blen);
+       if (err & INFINIPATH_E_RSHORTPKTLEN)
+               strlcat(buf, "rshortpktlen ", blen);
+       if (err & INFINIPATH_E_RMAXPKTLEN)
+               strlcat(buf, "rmaxpktlen ", blen);
+       if (err & INFINIPATH_E_RMINPKTLEN)
+               strlcat(buf, "rminpktlen ", blen);
+       if (err & INFINIPATH_E_RFORMATERR)
+               strlcat(buf, "rformaterr ", blen);
+       if (err & INFINIPATH_E_RUNSUPVL)
+               strlcat(buf, "runsupvl ", blen);
+       if (err & INFINIPATH_E_RUNEXPCHAR)
+               strlcat(buf, "runexpchar ", blen);
+       if (err & INFINIPATH_E_RIBFLOW)
+               strlcat(buf, "ribflow ", blen);
+       if (err & INFINIPATH_E_REBP)
+               strlcat(buf, "EBP ", blen);
+       if (err & INFINIPATH_E_SUNDERRUN)
+               strlcat(buf, "sunderrun ", blen);
+       if (err & INFINIPATH_E_SPIOARMLAUNCH)
+               strlcat(buf, "spioarmlaunch ", blen);
+       if (err & INFINIPATH_E_SUNEXPERRPKTNUM)
+               strlcat(buf, "sunexperrpktnum ", blen);
+       if (err & INFINIPATH_E_SDROPPEDDATAPKT)
+               strlcat(buf, "sdroppeddatapkt ", blen);
+       if (err & INFINIPATH_E_SDROPPEDSMPPKT)
+               strlcat(buf, "sdroppedsmppkt ", blen);
+       if (err & INFINIPATH_E_SMAXPKTLEN)
+               strlcat(buf, "smaxpktlen ", blen);
+       if (err & INFINIPATH_E_SMINPKTLEN)
+               strlcat(buf, "sminpktlen ", blen);
+       if (err & INFINIPATH_E_SUNSUPVL)
+               strlcat(buf, "sunsupVL ", blen);
+       if (err & INFINIPATH_E_SPKTLEN)
+               strlcat(buf, "spktlen ", blen);
+       if (err & INFINIPATH_E_INVALIDADDR)
+               strlcat(buf, "invalidaddr ", blen);
+       if (err & INFINIPATH_E_RICRC)
+               strlcat(buf, "CRC ", blen);
+       if (err & INFINIPATH_E_RVCRC)
+               strlcat(buf, "VCRC ", blen);
+       if (err & INFINIPATH_E_RRCVEGRFULL)
+               strlcat(buf, "rcvegrfull ", blen);
+       if (err & INFINIPATH_E_RRCVHDRFULL)
+               strlcat(buf, "rcvhdrfull ", blen);
+       if (err & INFINIPATH_E_IBSTATUSCHANGED)
+               strlcat(buf, "ibcstatuschg ", blen);
+       if (err & INFINIPATH_E_RIBLOSTLINK)
+               strlcat(buf, "riblostlink ", blen);
+       if (err & INFINIPATH_E_HARDWARE)
+               strlcat(buf, "hardware ", blen);
+       if (err & INFINIPATH_E_RESET)
+               strlcat(buf, "reset ", blen);
+}
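+
+/*
+ * Typical (illustrative) use is to decode an accumulated error mask
+ * into a buffer before logging it, e.g.:
+ *
+ *     char msg[128];
+ *     ipath_decode_err(msg, sizeof msg, errs);
+ *     ipath_dev_err(dd, "error interrupt: %s\n", msg);
+ *
+ * where errs stands for whatever error mask the caller has collected.
+ */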
+
+/**
+ * get_rhf_errstring - decode RHF errors
+ * @err: the err number
+ * @msg: the output buffer
+ * @len: the length of the output buffer
+ *
+ * only used in one place now; may want more later
+ */
+static void get_rhf_errstring(u32 err, char *msg, size_t len)
+{
+       /* start empty so we can append without checking which error comes first */
+       *msg = '\0';
+
+       if (err & INFINIPATH_RHF_H_ICRCERR)
+               strlcat(msg, "icrcerr ", len);
+       if (err & INFINIPATH_RHF_H_VCRCERR)
+               strlcat(msg, "vcrcerr ", len);
+       if (err & INFINIPATH_RHF_H_PARITYERR)
+               strlcat(msg, "parityerr ", len);
+       if (err & INFINIPATH_RHF_H_LENERR)
+               strlcat(msg, "lenerr ", len);
+       if (err & INFINIPATH_RHF_H_MTUERR)
+               strlcat(msg, "mtuerr ", len);
+       if (err & INFINIPATH_RHF_H_IHDRERR)
+               /* infinipath hdr checksum error */
+               strlcat(msg, "ipathhdrerr ", len);
+       if (err & INFINIPATH_RHF_H_TIDERR)
+               strlcat(msg, "tiderr ", len);
+       if (err & INFINIPATH_RHF_H_MKERR)
+               /* bad port, offset, etc. */
+               strlcat(msg, "invalid ipathhdr ", len);
+       if (err & INFINIPATH_RHF_H_IBERR)
+               strlcat(msg, "iberr ", len);
+       if (err & INFINIPATH_RHF_L_SWA)
+               strlcat(msg, "swA ", len);
+       if (err & INFINIPATH_RHF_L_SWB)
+               strlcat(msg, "swB ", len);
+}
+
+/**
+ * ipath_get_egrbuf - get an eager buffer
+ * @dd: the infinipath device
+ * @bufnum: the eager buffer to get
+ * @err: unused
+ *
+ * must only be called if ipath_pd[port] is known to be allocated
+ */
+static inline void *ipath_get_egrbuf(struct ipath_devdata *dd, u32 bufnum,
+                                    int err)
+{
+       return dd->ipath_port0_skbs ?
+               (void *)dd->ipath_port0_skbs[bufnum]->data : NULL;
+}
+
+/**
+ * ipath_alloc_skb - allocate an skb and buffer with possible constraints
+ * @dd: the infinipath device
+ * @gfp_mask: the sk_buff GFP mask
+ */
+struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd,
+                               gfp_t gfp_mask)
+{
+       struct sk_buff *skb;
+       u32 len;
+
+       /*
+        * The only fully supported way to handle this is to allocate lots
+        * of extra space, align as needed, and then do skb_reserve().  That
+        * wastes a lot of memory...  I'll have to hack this into
+        * infinipath_copy also.
+        */
+
+       /*
+        * We need 4 extra bytes for unaligned transfer copying
+        */
+       if (dd->ipath_flags & IPATH_4BYTE_TID) {
+               /* we need a 4KB multiple alignment, and there is no way
+                * to do it except to allocate extra and then skb_reserve
+                * enough to bring it up to the right alignment.
+                */
+               len = dd->ipath_ibmaxlen + 4 + (1 << 11) - 1;
+       }
+       else
+               len = dd->ipath_ibmaxlen + 4;
+       skb = __dev_alloc_skb(len, gfp_mask);
+       if (!skb) {
+               ipath_dev_err(dd, "Failed to allocate skbuff, length %u\n",
+                             len);
+               goto bail;
+       }
+       if (dd->ipath_flags & IPATH_4BYTE_TID) {
+               u32 una = ((1 << 11) - 1) & (unsigned long)(skb->data + 4);
+               if (una)
+                       skb_reserve(skb, 4 + (1 << 11) - una);
+               else
+                       skb_reserve(skb, 4);
+       } else
+               skb_reserve(skb, 4);
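+       /*
+        * Worked example of the IPATH_4BYTE_TID reservation above: if
+        * skb->data + 4 falls 0x678 bytes into a (1 << 11)-byte block,
+        * then una == 0x678 and we reserve 4 + 0x800 - 0x678 bytes, so
+        * the post-reserve skb->data sits exactly on a (1 << 11)-byte
+        * boundary, with at least 4 bytes of headroom before it for
+        * unaligned transfer copying.
+        */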
+
+bail:
+       return skb;
+}
+
+/**
+ * ipath_rcv_layer - receive a packet for the layered (ethernet) driver
+ * @dd: the infinipath device
+ * @etail: the sk_buff number
+ * @tlen: the total packet length
+ * @hdr: the ethernet header
+ *
+ * Separate routine for better overall optimization
+ */
+static void ipath_rcv_layer(struct ipath_devdata *dd, u32 etail,
+                           u32 tlen, struct ether_header *hdr)
+{
+       u32 elen;
+       u8 pad, *bthbytes;
+       struct sk_buff *skb, *nskb;
+
+       if (dd->ipath_port0_skbs && hdr->sub_opcode == OPCODE_ENCAP) {
+               /*
+                * Allocate a new sk_buff to replace the one we give
+                * to the network stack.
+                */
+               nskb = ipath_alloc_skb(dd, GFP_ATOMIC);
+               if (!nskb) {
+                       /* count OK packets that we drop */
+                       ipath_stats.sps_krdrops++;
+                       return;
+               }
+
+               bthbytes = (u8 *) hdr->bth;
+               pad = (bthbytes[1] >> 4) & 3;
+               /* +CRC32 */
+               elen = tlen - (sizeof(*hdr) + pad + sizeof(u32));
+
+               skb = dd->ipath_port0_skbs[etail];
+               dd->ipath_port0_skbs[etail] = nskb;
+               skb_put(skb, elen);
+
+               dd->ipath_f_put_tid(dd, etail + (u64 __iomem *)
+                                   ((char __iomem *) dd->ipath_kregbase
+                                    + dd->ipath_rcvegrbase), 0,
+                                   virt_to_phys(nskb->data));
+
+               __ipath_layer_rcv(dd, hdr, skb);
+
+               /* another ether packet received */
+               ipath_stats.sps_ether_rpkts++;
+       }
+       else if (hdr->sub_opcode == OPCODE_LID_ARP)
+               __ipath_layer_rcv_lid(dd, hdr);
+}
+
+/*
+ * ipath_kreceive - receive a packet
+ * @dd: the infinipath device
+ *
+ * called from interrupt handler for errors or receive interrupt
+ */
+void ipath_kreceive(struct ipath_devdata *dd)
+{
+       u64 *rc;
+       void *ebuf;
+       const u32 rsize = dd->ipath_rcvhdrentsize;      /* words */
+       const u32 maxcnt = dd->ipath_rcvhdrcnt * rsize; /* words */
+       u32 etail = -1, l, hdrqtail;
+       struct ips_message_header *hdr;
+       u32 eflags, i, etype, tlen, pkttot = 0;
+       static u64 totcalls;    /* stats, may eventually remove */
+       char emsg[128];
+
+       if (!dd->ipath_hdrqtailptr) {
+               ipath_dev_err(dd,
+                             "hdrqtailptr not set, can't do receives\n");
+               goto bail;
+       }
+
+       /* There is already a thread processing this queue. */
+       if (test_and_set_bit(0, &dd->ipath_rcv_pending))
+               goto bail;
+
+       if (dd->ipath_port0head ==
+           (u32)le64_to_cpu(*dd->ipath_hdrqtailptr))
+               goto done;
+
+gotmore:
+       /*
+        * read only once at start.  If in flood situation, this helps
+        * performance slightly.  If more arrive while we are processing,
+        * we'll come back here and do them
+        */
+       hdrqtail = (u32)le64_to_cpu(*dd->ipath_hdrqtailptr);
+
+       for (i = 0, l = dd->ipath_port0head; l != hdrqtail; i++) {
+               u32 qp;
+               u8 *bthbytes;
+
+               rc = (u64 *) (dd->ipath_pd[0]->port_rcvhdrq + (l << 2));
+               hdr = (struct ips_message_header *)&rc[1];
+               /*
+                * could make a network order version of IPATH_KD_QP, and
+                * do the obvious shift before masking to speed this up.
+                */
+               qp = ntohl(hdr->bth[1]) & 0xffffff;
+               bthbytes = (u8 *) hdr->bth;
+
+               eflags = ips_get_hdr_err_flags((__le32 *) rc);
+               etype = ips_get_rcv_type((__le32 *) rc);
+               /* total length */
+               tlen = ips_get_length_in_bytes((__le32 *) rc);
+               ebuf = NULL;
+               if (etype != RCVHQ_RCV_TYPE_EXPECTED) {
+                       /*
+                        * it turns out that the chip uses an eager buffer
+                        * for all non-expected packets, whether it "needs"
+                        * one or not.  So always get the index, but don't
+                        * set ebuf (so we try to copy data) unless the
+                        * length requires it.
+                        */
+                       etail = ips_get_index((__le32 *) rc);
+                       if (tlen > sizeof(*hdr) ||
+                           etype == RCVHQ_RCV_TYPE_NON_KD)
+                               ebuf = ipath_get_egrbuf(dd, etail, 0);
+               }
+
+               /*
+                * both tiderr and ipathhdrerr are set for all plain IB
+                * packets; only ipathhdrerr should be set.
+                */
+
+               if (etype != RCVHQ_RCV_TYPE_NON_KD && etype !=
+                   RCVHQ_RCV_TYPE_ERROR && ips_get_ipath_ver(
+                           hdr->iph.ver_port_tid_offset) !=
+                   IPS_PROTO_VERSION) {
+                       ipath_cdbg(PKT, "Bad InfiniPath protocol version "
+                                  "%x\n", etype);
+               }
+
+               if (eflags & ~(INFINIPATH_RHF_H_TIDERR |
+                              INFINIPATH_RHF_H_IHDRERR)) {
+                       get_rhf_errstring(eflags, emsg, sizeof emsg);
+                       ipath_cdbg(PKT, "RHFerrs %x hdrqtail=%x typ=%u "
+                                  "tlen=%x opcode=%x egridx=%x: %s\n",
+                                  eflags, l, etype, tlen, bthbytes[0],
+                                  ips_get_index((__le32 *) rc), emsg);
+               } else if (etype == RCVHQ_RCV_TYPE_NON_KD) {
+                               int ret = __ipath_verbs_rcv(dd, rc + 1,
+                                                           ebuf, tlen);
+                               if (ret == -ENODEV)
+                                       ipath_cdbg(VERBOSE,
+                                                  "received IB packet, "
+                                                  "not SMA (QP=%x)\n", qp);
+               } else if (etype == RCVHQ_RCV_TYPE_EAGER) {
+                       if (qp == IPATH_KD_QP &&
+                           bthbytes[0] == ipath_layer_rcv_opcode &&
+                           ebuf)
+                               ipath_rcv_layer(dd, etail, tlen,
+                                               (struct ether_header *)hdr);
+                       else
+                               ipath_cdbg(PKT, "typ %x, opcode %x (eager, "
+                                          "qp=%x), len %x; ignored\n",
+                                          etype, bthbytes[0], qp, tlen);
+               }
+               else if (etype == RCVHQ_RCV_TYPE_EXPECTED)
+                       ipath_dbg("Bug: Expected TID, opcode %x; ignored\n",
+                                 be32_to_cpu(hdr->bth[0]) & 0xff);
+               else if (eflags & (INFINIPATH_RHF_H_TIDERR |
+                                  INFINIPATH_RHF_H_IHDRERR)) {
+                       /*
+                        * This is a type 3 packet, only the LRH is in the
+                        * rcvhdrq, the rest of the header is in the eager
+                        * buffer.
+                        */
+                       u8 opcode;
+                       if (ebuf) {
+                               bthbytes = (u8 *) ebuf;
+                               opcode = *bthbytes;
+                       }
+                       else
+                               opcode = 0;
+                       get_rhf_errstring(eflags, emsg, sizeof emsg);
+                       ipath_dbg("Err %x (%s), opcode %x, egrbuf %x, "
+                                 "len %x\n", eflags, emsg, opcode, etail,
+                                 tlen);
+               } else {
+                       /*
+                        * error packet, type of error unknown.
+                        * Probably type 3, but we don't know, so don't
+                        * even try to print the opcode, etc.
+                        */
+                       ipath_dbg("Error Pkt, but no eflags! egrbuf %x, "
+                                 "len %x\nhdrq@%lx;hdrq+%x rhf: %llx; "
+                                 "hdr %llx %llx %llx %llx %llx\n",
+                                 etail, tlen, (unsigned long) rc, l,
+                                 (unsigned long long) rc[0],
+                                 (unsigned long long) rc[1],
+                                 (unsigned long long) rc[2],
+                                 (unsigned long long) rc[3],
+                                 (unsigned long long) rc[4],
+                                 (unsigned long long) rc[5]);
+               }
+               l += rsize;
+               if (l >= maxcnt)
+                       l = 0;
+               /*
+                * update for each packet, to help prevent overflows if we
+                * have lots of packets.
+                */
+               (void)ipath_write_ureg(dd, ur_rcvhdrhead,
+                                      dd->ipath_rhdrhead_intr_off | l, 0);
+               if (etype != RCVHQ_RCV_TYPE_EXPECTED)
+                       (void)ipath_write_ureg(dd, ur_rcvegrindexhead,
+                                              etail, 0);
+       }
+
+       pkttot += i;
+
+       dd->ipath_port0head = l;
+
+       if (hdrqtail != (u32)le64_to_cpu(*dd->ipath_hdrqtailptr))
+               /* more arrived while we handled first batch */
+               goto gotmore;
+
+       if (pkttot > ipath_stats.sps_maxpkts_call)
+               ipath_stats.sps_maxpkts_call = pkttot;
+       ipath_stats.sps_port0pkts += pkttot;
+       ipath_stats.sps_avgpkts_call =
+               ipath_stats.sps_port0pkts / ++totcalls;
+
+done:
+       clear_bit(0, &dd->ipath_rcv_pending);
+       smp_mb__after_clear_bit();
+
+bail:;
+}
+
+/**
+ * ipath_update_pio_bufs - update shadow copy of the PIO availability map
+ * @dd: the infinipath device
+ *
+ * Called whenever our local copy indicates we have run out of send buffers.
+ * NOTE: This can be called from interrupt context by some code
+ * and from non-interrupt context by ipath_getpiobuf().
+ */
+
+static void ipath_update_pio_bufs(struct ipath_devdata *dd)
+{
+       unsigned long flags;
+       int i;
+       const unsigned piobregs = (unsigned)dd->ipath_pioavregs;
+
+       /* If the generation (check) bits have changed, then we update the
+        * busy bit for the corresponding PIO buffer.  This algorithm will
+        * modify positions to the value they already have in some cases
+        * (i.e., no change), but it's faster than changing only the bits
+        * that have changed.
+        *
+        * We would like to do this atomically, to avoid spinlocks in the
+        * critical send path, but that's not really possible, given the
+        * type of changes, and that this routine could be called on
+        * multiple CPUs simultaneously, so we lock in this routine only,
+        * to avoid conflicting updates; all we change is the shadow, and
+        * it's a single 64-bit memory location, so by definition the update
+        * is atomic in terms of what other CPUs can see in testing the
+        * bits.  The spin_lock overhead isn't too bad, since it only
+        * happens when all buffers are in use, so only CPU overhead, not
+        * latency or bandwidth, is affected.
+        */
+#define _IPATH_ALL_CHECKBITS 0x5555555555555555ULL
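+       /*
+        * Each PIO buffer owns two adjacent bits in the shadow: the
+        * generation (check) bit at position 2*n, selected by the mask
+        * above, and the busy bit at position 2*n + 1 (see the
+        * INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT use below and the
+        * 2 * i / 2 * i + 1 bit handling in ipath_getpiobuf()).
+        */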
+       if (!dd->ipath_pioavailregs_dma) {
+               ipath_dbg("Update shadow pioavail, but regs_dma NULL!\n");
+               return;
+       }
+       if (ipath_debug & __IPATH_VERBDBG) {
+               /* only if packet debug and verbose */
+               volatile __le64 *dma = dd->ipath_pioavailregs_dma;
+               unsigned long *shadow = dd->ipath_pioavailshadow;
+
+               ipath_cdbg(PKT, "Refill avail, dma0=%llx shad0=%lx, "
+                          "d1=%llx s1=%lx, d2=%llx s2=%lx, d3=%llx "
+                          "s3=%lx\n",
+                          (unsigned long long) le64_to_cpu(dma[0]),
+                          shadow[0],
+                          (unsigned long long) le64_to_cpu(dma[1]),
+                          shadow[1],
+                          (unsigned long long) le64_to_cpu(dma[2]),
+                          shadow[2],
+                          (unsigned long long) le64_to_cpu(dma[3]),
+                          shadow[3]);
+               if (piobregs > 4)
+                       ipath_cdbg(
+                               PKT, "2nd group, dma4=%llx shad4=%lx, "
+                               "d5=%llx s5=%lx, d6=%llx s6=%lx, "
+                               "d7=%llx s7=%lx\n",
+                               (unsigned long long) le64_to_cpu(dma[4]),
+                               shadow[4],
+                               (unsigned long long) le64_to_cpu(dma[5]),
+                               shadow[5],
+                               (unsigned long long) le64_to_cpu(dma[6]),
+                               shadow[6],
+                               (unsigned long long) le64_to_cpu(dma[7]),
+                               shadow[7]);
+       }
+       spin_lock_irqsave(&ipath_pioavail_lock, flags);
+       for (i = 0; i < piobregs; i++) {
+               u64 pchbusy, pchg, piov, pnew;
+               /*
+                * Chip Errata: bug 6641; even and odd qwords>3 are swapped
+                */
+               if (i > 3) {
+                       if (i & 1)
+                               piov = le64_to_cpu(
+                                       dd->ipath_pioavailregs_dma[i - 1]);
+                       else
+                               piov = le64_to_cpu(
+                                       dd->ipath_pioavailregs_dma[i + 1]);
+               } else
+                       piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i]);
+               pchg = _IPATH_ALL_CHECKBITS &
+                       ~(dd->ipath_pioavailshadow[i] ^ piov);
+               pchbusy = pchg << INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT;
+               if (pchg && (pchbusy & dd->ipath_pioavailshadow[i])) {
+                       pnew = dd->ipath_pioavailshadow[i] & ~pchbusy;
+                       pnew |= piov & pchbusy;
+                       dd->ipath_pioavailshadow[i] = pnew;
+               }
+       }
+       spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+}
+
+/**
+ * ipath_setrcvhdrsize - set the receive header size
+ * @dd: the infinipath device
+ * @rhdrsize: the receive header size
+ *
+ * called from user init code, and also layered driver init
+ */
+int ipath_setrcvhdrsize(struct ipath_devdata *dd, unsigned rhdrsize)
+{
+       int ret = 0;
+
+       if (dd->ipath_flags & IPATH_RCVHDRSZ_SET) {
+               if (dd->ipath_rcvhdrsize != rhdrsize) {
+                       dev_info(&dd->pcidev->dev,
+                                "Error: can't set protocol header "
+                                "size %u, already %u\n",
+                                rhdrsize, dd->ipath_rcvhdrsize);
+                       ret = -EAGAIN;
+               } else
+                       ipath_cdbg(VERBOSE, "Reuse same protocol header "
+                                  "size %u\n", dd->ipath_rcvhdrsize);
+       } else if (rhdrsize > (dd->ipath_rcvhdrentsize -
+                              (sizeof(u64) / sizeof(u32)))) {
+               ipath_dbg("Error: can't set protocol header size %u "
+                         "(> max %u)\n", rhdrsize,
+                         dd->ipath_rcvhdrentsize -
+                         (u32) (sizeof(u64) / sizeof(u32)));
+               ret = -EOVERFLOW;
+       } else {
+               dd->ipath_flags |= IPATH_RCVHDRSZ_SET;
+               dd->ipath_rcvhdrsize = rhdrsize;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize,
+                                dd->ipath_rcvhdrsize);
+               ipath_cdbg(VERBOSE, "Set protocol header size to %u\n",
+                          dd->ipath_rcvhdrsize);
+       }
+       return ret;
+}
+
+/**
+ * ipath_getpiobuf - find an available pio buffer
+ * @dd: the infinipath device
+ * @pbufnum: the buffer number is placed here
+ *
+ * Do appropriate marking as busy, etc.
+ * Returns a pointer to an available buffer, or NULL if none is available;
+ * the buffer number is returned in @pbufnum.
+ * Used by ipath_sma_send_pkt and ipath_layer_send
+ */
+u32 __iomem *ipath_getpiobuf(struct ipath_devdata *dd, u32 * pbufnum)
+{
+       int i, j, starti, updated = 0;
+       unsigned piobcnt, iter;
+       unsigned long flags;
+       unsigned long *shadow = dd->ipath_pioavailshadow;
+       u32 __iomem *buf;
+
+       piobcnt = (unsigned)(dd->ipath_piobcnt2k
+                            + dd->ipath_piobcnt4k);
+       starti = dd->ipath_lastport_piobuf;
+       iter = piobcnt - starti;
+       if (dd->ipath_upd_pio_shadow) {
+               /*
+                * Minor optimization.  If we had no buffers on the last
+                * call, start out by doing the update; continue and do the
+                * scan even if no buffers were updated, to be paranoid.
+                */
+               ipath_update_pio_bufs(dd);
+               /* we scanned here, don't do it at end of scan */
+               updated = 1;
+               i = starti;
+       } else
+               i = dd->ipath_lastpioindex;
+
+rescan:
+       /*
+        * while test_and_set_bit() is atomic, we do that and then the
+        * change_bit(), and the pair is not.  See if this is the cause
+        * of the remaining armlaunch errors.
+        */
+       spin_lock_irqsave(&ipath_pioavail_lock, flags);
+       for (j = 0; j < iter; j++, i++) {
+               if (i >= piobcnt)
+                       i = starti;
+               /*
+                * To avoid bus lock overhead, we first find a candidate
+                * buffer, then do the test and set, and continue if that
+                * fails.
+                */
+               if (test_bit((2 * i) + 1, shadow) ||
+                   test_and_set_bit((2 * i) + 1, shadow))
+                       continue;
+               /* flip generation bit */
+               change_bit(2 * i, shadow);
+               break;
+       }
+       spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+
+       if (j == iter) {
+               volatile __le64 *dma = dd->ipath_pioavailregs_dma;
+
+               /*
+                * first time through; the shadow is exhausted, but there
+                * may be real buffers available, so go see; if any were
+                * updated, rescan (once)
+                */
+               if (!updated) {
+                       ipath_update_pio_bufs(dd);
+                       updated = 1;
+                       i = starti;
+                       goto rescan;
+               }
+               dd->ipath_upd_pio_shadow = 1;
+               /*
+                * not atomic, but if we lose one once in a while, that's OK
+                */
+               ipath_stats.sps_nopiobufs++;
+               if (!(++dd->ipath_consec_nopiobuf % 100000)) {
+                       ipath_dbg(
+                               "%u pio sends with no bufavail; dmacopy: "
+                               "%llx %llx %llx %llx; shadow:  "
+                               "%lx %lx %lx %lx\n",
+                               dd->ipath_consec_nopiobuf,
+                               (unsigned long long) le64_to_cpu(dma[0]),
+                               (unsigned long long) le64_to_cpu(dma[1]),
+                               (unsigned long long) le64_to_cpu(dma[2]),
+                               (unsigned long long) le64_to_cpu(dma[3]),
+                               shadow[0], shadow[1], shadow[2],
+                               shadow[3]);
+                       /*
+                        * 4 buffers per byte, 4 registers printed above;
+                        * cover the rest below
+                        */
+                       if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) >
+                           (sizeof(shadow[0]) * 4 * 4))
+                               ipath_dbg("2nd group: dmacopy: %llx %llx "
+                                         "%llx %llx; shadow: %lx %lx "
+                                         "%lx %lx\n",
+                                         (unsigned long long)
+                                         le64_to_cpu(dma[4]),
+                                         (unsigned long long)
+                                         le64_to_cpu(dma[5]),
+                                         (unsigned long long)
+                                         le64_to_cpu(dma[6]),
+                                         (unsigned long long)
+                                         le64_to_cpu(dma[7]),
+                                         shadow[4], shadow[5],
+                                         shadow[6], shadow[7]);
+               }
+               buf = NULL;
+               goto bail;
+       }
+
+       if (updated)
+               /*
+                * we had run out of bufs, and now some (at least this one
+                * we just got) are available again, so tell the layered
+                * driver.
+                */
+               __ipath_layer_intr(dd, IPATH_LAYER_INT_SEND_CONTINUE);
+
+       /*
+        * set next starting place.  Since it's just an optimization,
+        * it doesn't matter who wins on this, so no locking
+        */
+       dd->ipath_lastpioindex = i + 1;
+       if (dd->ipath_upd_pio_shadow)
+               dd->ipath_upd_pio_shadow = 0;
+       if (dd->ipath_consec_nopiobuf)
+               dd->ipath_consec_nopiobuf = 0;
+       if (i < dd->ipath_piobcnt2k)
+               buf = (u32 __iomem *) (dd->ipath_pio2kbase +
+                                      i * dd->ipath_palign);
+       else
+               buf = (u32 __iomem *)
+                       (dd->ipath_pio4kbase +
+                        (i - dd->ipath_piobcnt2k) * dd->ipath_4kalign);
+       ipath_cdbg(VERBOSE, "Return piobuf%u %uk @ %p\n",
+                  i, (i < dd->ipath_piobcnt2k) ? 2 : 4, buf);
+       if (pbufnum)
+               *pbufnum = i;
+
+bail:
+       return buf;
+}
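+
+/*
+ * Illustrative call pattern (a sketch, not copied from the callers
+ * named above):
+ *
+ *     u32 pbufn;
+ *     u32 __iomem *piobuf = ipath_getpiobuf(dd, &pbufn);
+ *
+ *     if (!piobuf)
+ *             back off and retry later (or wait for the
+ *             IPATH_LAYER_INT_SEND_CONTINUE notification);
+ *     else
+ *             copy the packet into the chip buffer through piobuf.
+ */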
+
+/**
+ * ipath_create_rcvhdrq - create a receive header queue
+ * @dd: the infinipath device
+ * @pd: the port data
+ *
+ * this *must* be physically contiguous memory, and for now,
+ * that limits it to what kmalloc can do.
+ */
+int ipath_create_rcvhdrq(struct ipath_devdata *dd,
+                        struct ipath_portdata *pd)
+{
+       int ret = 0, amt;
+
+       amt = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize *
+                   sizeof(u32), PAGE_SIZE);
+       if (!pd->port_rcvhdrq) {
+               /*
+                * not using REPEAT isn't viable; at 128KB, we can easily
+                * fail this.  The problem with REPEAT is we can block here
+                * "forever".  There isn't an in-between, unfortunately.  We
+                * could reduce the risk by never freeing the rcvhdrq except
+                * at unload, but even then, the first time a port is used,
+                * we could delay for some time...
+                */
+               gfp_t gfp_flags = GFP_USER | __GFP_COMP;
+
+               pd->port_rcvhdrq = dma_alloc_coherent(
+                       &dd->pcidev->dev, amt, &pd->port_rcvhdrq_phys,
+                       gfp_flags);
+
+               if (!pd->port_rcvhdrq) {
+                       ipath_dev_err(dd, "attempt to allocate %d bytes "
+                                     "for port %u rcvhdrq failed\n",
+                                     amt, pd->port_port);
+                       ret = -ENOMEM;
+                       goto bail;
+               }
+
+               pd->port_rcvhdrq_size = amt;
+
+               ipath_cdbg(VERBOSE, "%d pages at %p (phys %lx) size=%lu "
+                          "for port %u rcvhdr Q\n",
+                          amt >> PAGE_SHIFT, pd->port_rcvhdrq,
+                          (unsigned long) pd->port_rcvhdrq_phys,
+                          (unsigned long) pd->port_rcvhdrq_size,
+                          pd->port_port);
+       } else {
+               /*
+                * clear for security, sanity, and/or debugging, each
+                * time we reuse
+                */
+               memset(pd->port_rcvhdrq, 0, amt);
+       }
+
+       /*
+        * tell chip each time we init it, even if we are re-using previous
+        * memory (we zero it at process close)
+        */
+       ipath_cdbg(VERBOSE, "writing port %d rcvhdraddr as %lx\n",
+                  pd->port_port, (unsigned long) pd->port_rcvhdrq_phys);
+       ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr,
+                             pd->port_port, pd->port_rcvhdrq_phys);
+
+       ret = 0;
+bail:
+       return ret;
+}
+
+int ipath_waitfor_complete(struct ipath_devdata *dd, ipath_kreg reg_id,
+                          u64 bits_to_wait_for, u64 * valp)
+{
+       unsigned long timeout;
+       u64 lastval, val;
+       int ret;
+
+       lastval = ipath_read_kreg64(dd, reg_id);
+       /* wait a ridiculously long time */
+       timeout = jiffies + msecs_to_jiffies(5);
+       do {
+               val = ipath_read_kreg64(dd, reg_id);
+               /* set so they have something, even on failures. */
+               *valp = val;
+               if ((val & bits_to_wait_for) == bits_to_wait_for) {
+                       ret = 0;
+                       break;
+               }
+               if (val != lastval)
+                       ipath_cdbg(VERBOSE, "Changed from %llx to %llx, "
+                                  "waiting for %llx bits\n",
+                                  (unsigned long long) lastval,
+                                  (unsigned long long) val,
+                                  (unsigned long long) bits_to_wait_for);
+               cond_resched();
+               if (time_after(jiffies, timeout)) {
+                       ipath_dbg("Didn't get bits %llx in register 0x%x, "
+                                 "got %llx\n",
+                                 (unsigned long long) bits_to_wait_for,
+                                 reg_id, (unsigned long long) *valp);
+                       ret = -ENODEV;
+                       break;
+               }
+       } while (1);
+
+       return ret;
+}
+
+/**
+ * ipath_waitfor_mdio_cmdready - wait for last command to complete
+ * @dd: the infinipath device
+ *
+ * Like ipath_waitfor_complete(), but we wait for the CMDVALID bit to go
+ * away, indicating the last command has completed.  It doesn't return data.
+ */
+int ipath_waitfor_mdio_cmdready(struct ipath_devdata *dd)
+{
+       unsigned long timeout;
+       u64 val;
+       int ret;
+
+       /* wait a ridiculously long time */
+       timeout = jiffies + msecs_to_jiffies(5);
+       do {
+               val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_mdio);
+               if (!(val & IPATH_MDIO_CMDVALID)) {
+                       ret = 0;
+                       break;
+               }
+               cond_resched();
+               if (time_after(jiffies, timeout)) {
+                       ipath_dbg("CMDVALID stuck in mdio reg? (%llx)\n",
+                                 (unsigned long long) val);
+                       ret = -ENODEV;
+                       break;
+               }
+       } while (1);
+
+       return ret;
+}
+
+void ipath_set_ib_lstate(struct ipath_devdata *dd, int which)
+{
+       static const char *what[4] = {
+               [0] = "DOWN",
+               [INFINIPATH_IBCC_LINKCMD_INIT] = "INIT",
+               [INFINIPATH_IBCC_LINKCMD_ARMED] = "ARMED",
+               [INFINIPATH_IBCC_LINKCMD_ACTIVE] = "ACTIVE"
+       };
+       ipath_cdbg(SMA, "Trying to move unit %u to %s, current ltstate "
+                  "is %s\n", dd->ipath_unit,
+                  what[(which >> INFINIPATH_IBCC_LINKCMD_SHIFT) &
+                       INFINIPATH_IBCC_LINKCMD_MASK],
+                  ipath_ibcstatus_str[
+                          (ipath_read_kreg64
+                           (dd, dd->ipath_kregs->kr_ibcstatus) >>
+                           INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) &
+                          INFINIPATH_IBCS_LINKTRAININGSTATE_MASK]);
+
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+                        dd->ipath_ibcctrl | which);
+}
+
+/**
+ * ipath_read_kreg64_port - read a device's per-port 64-bit kernel register
+ * @dd: the infinipath device
+ * @regno: the register number to read
+ * @port: the port containing the register
+ *
+ * Registers that are replicated per port (whose count is a chip
+ * implementation constant) use this routine.
+ */
+u64 ipath_read_kreg64_port(const struct ipath_devdata *dd, ipath_kreg regno,
+                          unsigned port)
+{
+       u16 where;
+
+       if (port < dd->ipath_portcnt &&
+           (regno == dd->ipath_kregs->kr_rcvhdraddr ||
+            regno == dd->ipath_kregs->kr_rcvhdrtailaddr))
+               where = regno + port;
+       else
+               where = -1;
+
+       return ipath_read_kreg64(dd, where);
+}
+
+/**
+ * ipath_write_kreg_port - write a device's per-port 64-bit kernel register
+ * @dd: the infinipath device
+ * @regno: the register number to write
+ * @port: the port containing the register
+ * @value: the value to write
+ *
+ * Registers that are replicated per port (whose count is a chip
+ * implementation constant) use this routine.
+ */
+void ipath_write_kreg_port(const struct ipath_devdata *dd, ipath_kreg regno,
+                         unsigned port, u64 value)
+{
+       u16 where;
+
+       if (port < dd->ipath_portcnt &&
+           (regno == dd->ipath_kregs->kr_rcvhdraddr ||
+            regno == dd->ipath_kregs->kr_rcvhdrtailaddr))
+               where = regno + port;
+       else
+               where = -1;
+
+       ipath_write_kreg(dd, where, value);
+}
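+/*
+ * A worked example of the per-port indexing used by the two routines
+ * above (an illustration, not driver code): if kr_rcvhdraddr happened to
+ * be register number 200 on a given chip, port 0's copy would be register
+ * 200, port 1's 201, and so on, so
+ *
+ *	ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr, 2, phys);
+ *
+ * would write register 202.  Any regno/port combination outside the
+ * per-port set falls through to where = -1, i.e. an out-of-range index.
+ */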
+
+/**
+ * ipath_shutdown_device - shut down a device
+ * @dd: the infinipath device
+ *
+ * This is called to make the device quiet when we are about to
+ * unload the driver, and also when the device is administratively
+ * disabled.   It does not free any data structures.
+ * Everything it does has to be set up again by ipath_init_chip(dd, 1).
+ */
+void ipath_shutdown_device(struct ipath_devdata *dd)
+{
+       u64 val;
+
+       ipath_dbg("Shutting down the device\n");
+
+       dd->ipath_flags |= IPATH_LINKUNK;
+       dd->ipath_flags &= ~(IPATH_INITTED | IPATH_LINKDOWN |
+                            IPATH_LINKINIT | IPATH_LINKARMED |
+                            IPATH_LINKACTIVE);
+       *dd->ipath_statusp &= ~(IPATH_STATUS_IB_CONF |
+                               IPATH_STATUS_IB_READY);
+
+       /* mask interrupts, but not errors */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
+
+       dd->ipath_rcvctrl = 0;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+                        dd->ipath_rcvctrl);
+
+       /*
+        * gracefully stop all sends allowing any in progress to trickle out
+        * first.
+        */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, 0ULL);
+       /* flush it */
+       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+       /*
+        * enough for anything that's going to trickle out to have actually
+        * done so.
+        */
+       udelay(5);
+
+       /*
+        * abort any armed or launched PIO buffers that didn't go. (self
+        * clearing).  Will cause any packet currently being transmitted to
+        * go out with an EBP, and may also cause a short packet error on
+        * the receiver.
+        */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+                        INFINIPATH_S_ABORT);
+
+       ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKINITCMD_DISABLE <<
+                           INFINIPATH_IBCC_LINKINITCMD_SHIFT);
+
+       /*
+        * we are shutting down, so tell the layered driver.  We don't do
+        * this on just a link state change; much like ethernet, a cable
+        * unplug, etc. doesn't change driver state
+        */
+       ipath_layer_intr(dd, IPATH_LAYER_INT_IF_DOWN);
+
+       /* disable IBC */
+       dd->ipath_control &= ~INFINIPATH_C_LINKENABLE;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+                        dd->ipath_control);
+
+       /*
+        * clear SerdesEnable and turn the LEDs off; do this here because
+        * we are unloading, so don't count on interrupts to move along.
+        * Turn the LEDs off explicitly for the same reason.
+        */
+       dd->ipath_f_quiet_serdes(dd);
+       dd->ipath_f_setextled(dd, 0, 0);
+
+       if (dd->ipath_stats_timer_active) {
+               del_timer_sync(&dd->ipath_stats_timer);
+               dd->ipath_stats_timer_active = 0;
+       }
+
+       /*
+        * clear all interrupts and errors, so that the next time the driver
+        * is loaded or device is enabled, we know that whatever is set
+        * happened while we were unloaded
+        */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+                        ~0ULL & ~INFINIPATH_HWE_MEMBISTFAILED);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL);
+}
+
+/**
+ * ipath_free_pddata - free a port's allocated data
+ * @dd: the infinipath device
+ * @port: the port
+ * @freehdrq: free the port data structure if true
+ *
+ * when closing, free up any allocated data for a port, if the
+ * reference count goes to zero.
+ * Note: this also optionally frees the portdata itself!
+ * Any changes here have to be matched up with the reinit case
+ * of ipath_init_chip(), which calls this routine on reinit after reset.
+ */
+void ipath_free_pddata(struct ipath_devdata *dd, u32 port, int freehdrq)
+{
+       struct ipath_portdata *pd = dd->ipath_pd[port];
+
+       if (!pd)
+               return;
+       if (freehdrq)
+               /*
+                * only clear and free portdata if we are going to also
+                * release the hdrq, otherwise we leak the hdrq on each
+                * open/close cycle
+                */
+               dd->ipath_pd[port] = NULL;
+       if (freehdrq && pd->port_rcvhdrq) {
+               ipath_cdbg(VERBOSE, "free closed port %d rcvhdrq @ %p "
+                          "(size=%lu)\n", pd->port_port, pd->port_rcvhdrq,
+                          (unsigned long) pd->port_rcvhdrq_size);
+               dma_free_coherent(&dd->pcidev->dev, pd->port_rcvhdrq_size,
+                                 pd->port_rcvhdrq, pd->port_rcvhdrq_phys);
+               pd->port_rcvhdrq = NULL;
+       }
+       if (port && pd->port_rcvegrbuf) {
+               /* always free this */
+               if (pd->port_rcvegrbuf) {
+                       unsigned e;
+
+                       for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) {
+                               void *base = pd->port_rcvegrbuf[e];
+                               size_t size = pd->port_rcvegrbuf_size;
+
+                               ipath_cdbg(VERBOSE, "egrbuf free(%p, %lu), "
+                                          "chunk %u/%u\n", base,
+                                          (unsigned long) size,
+                                          e, pd->port_rcvegrbuf_chunks);
+                               dma_free_coherent(
+                                       &dd->pcidev->dev, size, base,
+                                       pd->port_rcvegrbuf_phys[e]);
+                       }
+                       vfree(pd->port_rcvegrbuf);
+                       pd->port_rcvegrbuf = NULL;
+                       vfree(pd->port_rcvegrbuf_phys);
+                       pd->port_rcvegrbuf_phys = NULL;
+               }
+               pd->port_rcvegrbuf_chunks = 0;
+       } else if (port == 0 && dd->ipath_port0_skbs) {
+               unsigned e;
+               struct sk_buff **skbs = dd->ipath_port0_skbs;
+
+               dd->ipath_port0_skbs = NULL;
+               ipath_cdbg(VERBOSE, "free closed port %d ipath_port0_skbs "
+                          "@ %p\n", pd->port_port, skbs);
+               for (e = 0; e < dd->ipath_rcvegrcnt; e++)
+                       if (skbs[e])
+                               dev_kfree_skb(skbs[e]);
+               vfree(skbs);
+       }
+       if (freehdrq) {
+               kfree(pd->port_tid_pg_list);
+               kfree(pd);
+       }
+}
+
+int __init infinipath_init(void)
+{
+       int ret;
+
+       ipath_dbg(KERN_INFO DRIVER_LOAD_MSG "%s", ipath_core_version);
+
+       /*
+        * These must be called before the driver is registered with
+        * the PCI subsystem.
+        */
+       idr_init(&unit_table);
+       if (!idr_pre_get(&unit_table, GFP_KERNEL)) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+
+       ret = pci_register_driver(&ipath_driver);
+       if (ret < 0) {
+               printk(KERN_ERR IPATH_DRV_NAME
+                      ": Unable to register driver: error %d\n", -ret);
+               goto bail_unit;
+       }
+
+       ret = ipath_driver_create_group(&ipath_driver.driver);
+       if (ret < 0) {
+               printk(KERN_ERR IPATH_DRV_NAME ": Unable to create driver "
+                      "sysfs entries: error %d\n", -ret);
+               goto bail_pci;
+       }
+
+       ret = ipath_init_ipathfs();
+       if (ret < 0) {
+               printk(KERN_ERR IPATH_DRV_NAME ": Unable to create "
+                      "ipathfs: error %d\n", -ret);
+               goto bail_group;
+       }
+
+       goto bail;
+
+bail_group:
+       ipath_driver_remove_group(&ipath_driver.driver);
+
+bail_pci:
+       pci_unregister_driver(&ipath_driver);
+
+bail_unit:
+       idr_destroy(&unit_table);
+
+bail:
+       return ret;
+}
+
+static void cleanup_device(struct ipath_devdata *dd)
+{
+       int port;
+
+       ipath_shutdown_device(dd);
+
+       if (*dd->ipath_statusp & IPATH_STATUS_CHIP_PRESENT) {
+               /* can't do anything more with chip; needs re-init */
+               *dd->ipath_statusp &= ~IPATH_STATUS_CHIP_PRESENT;
+               if (dd->ipath_kregbase) {
+                       /*
+                        * if we haven't already cleaned up, clear these now
+                        * to ensure any register reads/writes "fail" until
+                        * re-init
+                        */
+                       dd->ipath_kregbase = NULL;
+                       dd->ipath_kregvirt = NULL;
+                       dd->ipath_uregbase = 0;
+                       dd->ipath_sregbase = 0;
+                       dd->ipath_cregbase = 0;
+                       dd->ipath_kregsize = 0;
+               }
+               ipath_disable_wc(dd);
+       }
+
+       if (dd->ipath_pioavailregs_dma) {
+               dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+                                 (void *) dd->ipath_pioavailregs_dma,
+                                 dd->ipath_pioavailregs_phys);
+               dd->ipath_pioavailregs_dma = NULL;
+       }
+
+       if (dd->ipath_pageshadow) {
+               struct page **tmpp = dd->ipath_pageshadow;
+               int i, cnt = 0;
+
+               ipath_cdbg(VERBOSE, "Unlocking any expTID pages still "
+                          "locked\n");
+               for (port = 0; port < dd->ipath_cfgports; port++) {
+                       int port_tidbase = port * dd->ipath_rcvtidcnt;
+                       int maxtid = port_tidbase + dd->ipath_rcvtidcnt;
+                       for (i = port_tidbase; i < maxtid; i++) {
+                               if (!tmpp[i])
+                                       continue;
+                               ipath_release_user_pages(&tmpp[i], 1);
+                               tmpp[i] = NULL;
+                               cnt++;
+                       }
+               }
+               if (cnt) {
+                       ipath_stats.sps_pageunlocks += cnt;
+                       ipath_cdbg(VERBOSE, "There were still %u expTID "
+                                  "entries locked\n", cnt);
+               }
+               if (ipath_stats.sps_pagelocks ||
+                   ipath_stats.sps_pageunlocks)
+                       ipath_cdbg(VERBOSE, "%llu pages locked, %llu "
+                                  "unlocked via ipath_m{un}lock\n",
+                                  (unsigned long long)
+                                  ipath_stats.sps_pagelocks,
+                                  (unsigned long long)
+                                  ipath_stats.sps_pageunlocks);
+
+               ipath_cdbg(VERBOSE, "Free shadow page tid array at %p\n",
+                          dd->ipath_pageshadow);
+               vfree(dd->ipath_pageshadow);
+               dd->ipath_pageshadow = NULL;
+       }
+
+       /*
+        * free any resources still in use (usually just kernel ports)
+        * at unload
+        */
+       for (port = 0; port < dd->ipath_cfgports; port++)
+               ipath_free_pddata(dd, port, 1);
+       kfree(dd->ipath_pd);
+       /*
+        * debuggability, in case some cleanup path tries to use it
+        * after this
+        */
+       dd->ipath_pd = NULL;
+}
+
+static void __exit infinipath_cleanup(void)
+{
+       struct ipath_devdata *dd, *tmp;
+       unsigned long flags;
+
+       ipath_exit_ipathfs();
+
+       ipath_driver_remove_group(&ipath_driver.driver);
+
+       spin_lock_irqsave(&ipath_devs_lock, flags);
+
+       /*
+        * turn off rcv, send, and interrupts for all ports, all drivers
+        * should also hard reset the chip here?
+        * free up port 0 (kernel) rcvhdr, egr bufs, and eventually tid bufs
+        * for all versions of the driver, if they were allocated
+        */
+       list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) {
+               spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+               if (dd->ipath_kregbase)
+                       cleanup_device(dd);
+
+               if (dd->pcidev) {
+                       if (dd->pcidev->irq) {
+                               ipath_cdbg(VERBOSE,
+                                          "unit %u free_irq of irq %x\n",
+                                          dd->ipath_unit, dd->pcidev->irq);
+                               free_irq(dd->pcidev->irq, dd);
+                       } else
+                               ipath_dbg("irq is 0, not doing free_irq "
+                                         "for unit %u\n", dd->ipath_unit);
+                       dd->pcidev = NULL;
+               }
+
+               /*
+                * we check for NULL here, because it's outside the kregbase
+                * check, and we need to call it after the free_irq.  Thus
+                * it's possible that the function pointers were never
+                * initialized.
+                */
+               if (dd->ipath_f_cleanup)
+                       /* clean up chip-specific stuff */
+                       dd->ipath_f_cleanup(dd);
+
+               spin_lock_irqsave(&ipath_devs_lock, flags);
+       }
+
+       spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+       ipath_cdbg(VERBOSE, "Unregistering pci driver\n");
+       pci_unregister_driver(&ipath_driver);
+
+       idr_destroy(&unit_table);
+}
+
+/**
+ * ipath_reset_device - reset the chip if possible
+ * @unit: the device to reset
+ *
+ * Whether or not reset is successful, we attempt to re-initialize the chip
+ * (that is, much like a driver unload/reload).  We clear the INITTED flag
+ * so that the various entry points will fail until we reinitialize.  For
+ * now, we only allow this if no user ports are open that use chip resources.
+ */
+int ipath_reset_device(int unit)
+{
+       int ret, i;
+       struct ipath_devdata *dd = ipath_lookup(unit);
+
+       if (!dd) {
+               ret = -ENODEV;
+               goto bail;
+       }
+
+       dev_info(&dd->pcidev->dev, "Reset on unit %u requested\n", unit);
+
+       if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) {
+               dev_info(&dd->pcidev->dev, "Invalid unit number %u or "
+                        "not initialized or not present\n", unit);
+               ret = -ENXIO;
+               goto bail;
+       }
+
+       if (dd->ipath_pd)
+               for (i = 1; i < dd->ipath_portcnt; i++) {
+                       if (dd->ipath_pd[i] && dd->ipath_pd[i]->port_cnt) {
+                               ipath_dbg("unit %u port %d is in use "
+                                         "(PID %u cmd %s), can't reset\n",
+                                         unit, i,
+                                         dd->ipath_pd[i]->port_pid,
+                                         dd->ipath_pd[i]->port_comm);
+                               ret = -EBUSY;
+                               goto bail;
+                       }
+               }
+
+       dd->ipath_flags &= ~IPATH_INITTED;
+       ret = dd->ipath_f_reset(dd);
+       if (ret != 1)
+               ipath_dbg("reset was not successful\n");
+       ipath_dbg("Trying to reinitialize unit %u after reset attempt\n",
+                 unit);
+       ret = ipath_init_chip(dd, 1);
+       if (ret)
+               ipath_dev_err(dd, "Reinitialize unit %u after "
+                             "reset failed with %d\n", unit, ret);
+       else
+               dev_info(&dd->pcidev->dev, "Reinitialized unit %u after "
+                        "resetting\n", unit);
+
+bail:
+       return ret;
+}
+
+module_init(infinipath_init);
+module_exit(infinipath_cleanup);
diff --git a/drivers/infiniband/hw/ipath/ipath_eeprom.c b/drivers/infiniband/hw/ipath/ipath_eeprom.c
new file mode 100644 (file)
index 0000000..f11a900
--- /dev/null
@@ -0,0 +1,613 @@
+/*
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_kernel.h"
+
+/*
+ * InfiniPath I2C driver for a serial eeprom.  This is not a generic
+ * I2C interface.  For a start, the device we're using (Atmel AT24C11)
+ * doesn't work like a regular I2C device.  It looks like one
+ * electrically, but not logically.  Normal I2C devices have a single
+ * 7-bit or 10-bit I2C address that they respond to.  Valid 7-bit
+ * addresses range from 0x03 to 0x77.  Addresses 0x00 to 0x02 and 0x78
+ * to 0x7F are special reserved addresses (e.g. 0x00 is the "general
+ * call" address.)  The Atmel device, on the other hand, responds to ALL
+ * 7-bit addresses.  It's designed to be the only device on a given I2C
+ * bus.  A 7-bit address corresponds to the memory address within the
+ * Atmel device itself.
+ *
+ * Also, the timing requirements mean more than simple software
+ * bitbanging, with readbacks from chip to ensure timing (simple udelay
+ * is not enough).
+ *
+ * This all means that accessing the device is specialized enough
+ * that using the standard kernel I2C bitbanging interface would be
+ * impossible.  For example, the core I2C eeprom driver expects to find
+ * a device at one or more of a limited set of addresses only.  It doesn't
+ * allow writing to an eeprom.  It also doesn't provide any means of
+ * accessing eeprom contents from within the kernel, only via sysfs.
+ */
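+/*
+ * To make the addressing quirk above concrete (an illustration only): the
+ * byte clocked out right after the start condition is not a fixed device
+ * address but the eeprom memory offset, shifted up one bit with the
+ * read/write flag in bit 0.  A read starting at offset 0x20 therefore
+ * begins with
+ *
+ *	i2c_startcmd(dd, (0x20 << 1) | READ_CMD);
+ *
+ * which is exactly how ipath_eeprom_read() below builds its command byte.
+ */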
+
+enum i2c_type {
+       i2c_line_scl = 0,
+       i2c_line_sda
+};
+
+enum i2c_state {
+       i2c_line_low = 0,
+       i2c_line_high
+};
+
+#define READ_CMD 1
+#define WRITE_CMD 0
+
+static int eeprom_init;
+
+/*
+ * The gpioval manipulation really should be protected by spinlocks
+ * or be converted to use atomic operations.
+ */
+
+/**
+ * i2c_gpio_set - set a GPIO line
+ * @dd: the infinipath device
+ * @line: the line to set
+ * @new_line_state: the state to set
+ *
+ * Returns 0 if the line was set to the new state successfully, non-zero
+ * on error.
+ */
+static int i2c_gpio_set(struct ipath_devdata *dd,
+                       enum i2c_type line,
+                       enum i2c_state new_line_state)
+{
+       u64 read_val, write_val, mask, *gpioval;
+
+       gpioval = &dd->ipath_gpio_out;
+       read_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extctrl);
+       if (line == i2c_line_scl)
+               mask = ipath_gpio_scl;
+       else
+               mask = ipath_gpio_sda;
+
+       if (new_line_state == i2c_line_high)
+               /* tri-state the output rather than force high */
+               write_val = read_val & ~mask;
+       else
+               /* config line to be an output */
+               write_val = read_val | mask;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, write_val);
+
+       /* set high and verify */
+       if (new_line_state == i2c_line_high)
+               write_val = 0x1UL;
+       else
+               write_val = 0x0UL;
+
+       if (line == i2c_line_scl) {
+               write_val <<= ipath_gpio_scl_num;
+               *gpioval = *gpioval & ~(1UL << ipath_gpio_scl_num);
+               *gpioval |= write_val;
+       } else {
+               write_val <<= ipath_gpio_sda_num;
+               *gpioval = *gpioval & ~(1UL << ipath_gpio_sda_num);
+               *gpioval |= write_val;
+       }
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_out, *gpioval);
+
+       return 0;
+}
+
+/**
+ * i2c_gpio_get - get a GPIO line state
+ * @dd: the infinipath device
+ * @line: the line to get
+ * @curr_statep: where to put the line state
+ *
+ * Returns 0 if the line was set to the new state successfully, non-zero
+ * on error.  curr_state is not set on error.
+ */
+static int i2c_gpio_get(struct ipath_devdata *dd,
+                       enum i2c_type line,
+                       enum i2c_state *curr_statep)
+{
+       u64 read_val, write_val, mask;
+       int ret;
+
+       /* check args */
+       if (curr_statep == NULL) {
+               ret = 1;
+               goto bail;
+       }
+
+       read_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extctrl);
+       /* config line to be an input */
+       if (line == i2c_line_scl)
+               mask = ipath_gpio_scl;
+       else
+               mask = ipath_gpio_sda;
+       write_val = read_val & ~mask;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, write_val);
+       read_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus);
+
+       if (read_val & mask)
+               *curr_statep = i2c_line_high;
+       else
+               *curr_statep = i2c_line_low;
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/**
+ * i2c_wait_for_writes - wait for a write
+ * @dd: the infinipath device
+ *
+ * We use this instead of udelay directly, so we can make sure
+ * that previous register writes have been flushed all the way
+ * to the chip.  Since we are delaying anyway, the cost doesn't
+ * hurt, and it makes the bit twiddling more regular.
+ */
+static void i2c_wait_for_writes(struct ipath_devdata *dd)
+{
+       (void)ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch);
+}
+
+static void scl_out(struct ipath_devdata *dd, u8 bit)
+{
+       i2c_gpio_set(dd, i2c_line_scl, bit ? i2c_line_high : i2c_line_low);
+
+       i2c_wait_for_writes(dd);
+}
+
+static void sda_out(struct ipath_devdata *dd, u8 bit)
+{
+       i2c_gpio_set(dd, i2c_line_sda, bit ? i2c_line_high : i2c_line_low);
+
+       i2c_wait_for_writes(dd);
+}
+
+static u8 sda_in(struct ipath_devdata *dd, int wait)
+{
+       enum i2c_state bit;
+
+       if (i2c_gpio_get(dd, i2c_line_sda, &bit))
+               ipath_dbg("get bit failed!\n");
+
+       if (wait)
+               i2c_wait_for_writes(dd);
+
+       return bit == i2c_line_high ? 1U : 0;
+}
+
+/**
+ * i2c_ackrcv - see if ack following write is true
+ * @dd: the infinipath device
+ */
+static int i2c_ackrcv(struct ipath_devdata *dd)
+{
+       u8 ack_received;
+
+       /* AT ENTRY SCL = LOW */
+       /* change direction, ignore data */
+       ack_received = sda_in(dd, 1);
+       scl_out(dd, i2c_line_high);
+       ack_received = sda_in(dd, 1) == 0;
+       scl_out(dd, i2c_line_low);
+       return ack_received;
+}
+
+/**
+ * wr_byte - write a byte, one bit at a time
+ * @dd: the infinipath device
+ * @data: the byte to write
+ *
+ * Returns 0 if we got the following ack, otherwise 1
+ */
+static int wr_byte(struct ipath_devdata *dd, u8 data)
+{
+       int bit_cntr;
+       u8 bit;
+
+       for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) {
+               bit = (data >> bit_cntr) & 1;
+               sda_out(dd, bit);
+               scl_out(dd, i2c_line_high);
+               scl_out(dd, i2c_line_low);
+       }
+       return (!i2c_ackrcv(dd)) ? 1 : 0;
+}
+
+static void send_ack(struct ipath_devdata *dd)
+{
+       sda_out(dd, i2c_line_low);
+       scl_out(dd, i2c_line_high);
+       scl_out(dd, i2c_line_low);
+       sda_out(dd, i2c_line_high);
+}
+
+/**
+ * i2c_startcmd - transmit the start condition, followed by address/cmd
+ * @dd: the infinipath device
+ * @offset_dir: direction byte
+ *
+ *      (both clock/data high, clock high, data low while clock is high)
+ */
+static int i2c_startcmd(struct ipath_devdata *dd, u8 offset_dir)
+{
+       int res;
+
+       /* issue start sequence */
+       sda_out(dd, i2c_line_high);
+       scl_out(dd, i2c_line_high);
+       sda_out(dd, i2c_line_low);
+       scl_out(dd, i2c_line_low);
+
+       /* issue length and direction byte */
+       res = wr_byte(dd, offset_dir);
+
+       if (res)
+               ipath_cdbg(VERBOSE, "No ack to complete start\n");
+
+       return res;
+}
+
+/**
+ * stop_cmd - transmit the stop condition
+ * @dd: the infinipath device
+ *
+ * (both clock/data low, clock high, data high while clock is high)
+ */
+static void stop_cmd(struct ipath_devdata *dd)
+{
+       scl_out(dd, i2c_line_low);
+       sda_out(dd, i2c_line_low);
+       scl_out(dd, i2c_line_high);
+       sda_out(dd, i2c_line_high);
+       udelay(2);
+}
+
+/**
+ * eeprom_reset - reset I2C communication
+ * @dd: the infinipath device
+ */
+
+static int eeprom_reset(struct ipath_devdata *dd)
+{
+       int clock_cycles_left = 9;
+       u64 *gpioval = &dd->ipath_gpio_out;
+       int ret;
+
+       eeprom_init = 1;
+       *gpioval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_out);
+       ipath_cdbg(VERBOSE, "Resetting i2c eeprom; initial gpioout reg "
+                  "is %llx\n", (unsigned long long) *gpioval);
+
+       /*
+        * This is to get the i2c into a known state, by first going low,
+        * then tristate sda (and then tristate scl as first thing
+        * in loop)
+        */
+       scl_out(dd, i2c_line_low);
+       sda_out(dd, i2c_line_high);
+
+       while (clock_cycles_left--) {
+               scl_out(dd, i2c_line_high);
+
+               if (sda_in(dd, 0)) {
+                       sda_out(dd, i2c_line_low);
+                       scl_out(dd, i2c_line_low);
+                       ret = 0;
+                       goto bail;
+               }
+
+               scl_out(dd, i2c_line_low);
+       }
+
+       ret = 1;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_eeprom_read - receives bytes from the eeprom via I2C
+ * @dd: the infinipath device
+ * @eeprom_offset: address to read from
+ * @buffer: where to store result
+ * @len: number of bytes to receive
+ */
+
+int ipath_eeprom_read(struct ipath_devdata *dd, u8 eeprom_offset,
+                     void *buffer, int len)
+{
+       /* compiler complains unless initialized */
+       u8 single_byte = 0;
+       int bit_cntr;
+       int ret;
+
+       if (!eeprom_init)
+               eeprom_reset(dd);
+
+       eeprom_offset = (eeprom_offset << 1) | READ_CMD;
+
+       if (i2c_startcmd(dd, eeprom_offset)) {
+               ipath_dbg("Failed startcmd\n");
+               stop_cmd(dd);
+               ret = 1;
+               goto bail;
+       }
+
+       /*
+        * eeprom keeps clocking data out as long as we ack, automatically
+        * incrementing the address.
+        */
+       while (len-- > 0) {
+               /* get data */
+               single_byte = 0;
+               for (bit_cntr = 8; bit_cntr; bit_cntr--) {
+                       u8 bit;
+                       scl_out(dd, i2c_line_high);
+                       bit = sda_in(dd, 0);
+                       single_byte |= bit << (bit_cntr - 1);
+                       scl_out(dd, i2c_line_low);
+               }
+
+               /* send ack if not the last byte */
+               if (len)
+                       send_ack(dd);
+
+               *((u8 *) buffer) = single_byte;
+               buffer++;
+       }
+
+       stop_cmd(dd);
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_eeprom_write - writes data to the eeprom via I2C
+ * @dd: the infinipath device
+ * @eeprom_offset: where to place data
+ * @buffer: data to write
+ * @len: number of bytes to write
+ */
+int ipath_eeprom_write(struct ipath_devdata *dd, u8 eeprom_offset,
+                      const void *buffer, int len)
+{
+       u8 single_byte;
+       int sub_len;
+       const u8 *bp = buffer;
+       int max_wait_time, i;
+       int ret;
+
+       if (!eeprom_init)
+               eeprom_reset(dd);
+
+       while (len > 0) {
+               if (i2c_startcmd(dd, (eeprom_offset << 1) | WRITE_CMD)) {
+                       ipath_dbg("Failed to start cmd offset %u\n",
+                                 eeprom_offset);
+                       goto failed_write;
+               }
+
+               sub_len = min(len, 4);
+               eeprom_offset += sub_len;
+               len -= sub_len;
+
+               for (i = 0; i < sub_len; i++) {
+                       if (wr_byte(dd, *bp++)) {
+                               ipath_dbg("no ack after byte %u/%u (%u "
+                                         "total remain)\n", i, sub_len,
+                                         len + sub_len - i);
+                               goto failed_write;
+                       }
+               }
+
+               stop_cmd(dd);
+
+               /*
+                * wait for the write to complete by waiting for a
+                * successful read (the chip replies with a zero after
+                * the write cmd completes, and before it writes to the
+                * eeprom).  The startcmd for the read will fail the ack
+                * until the writes have completed.  We do this inline to
+                * avoid the debug prints that are in the real read
+                * routine if the startcmd fails.
+                */
+               max_wait_time = 100;
+               while (i2c_startcmd(dd, READ_CMD)) {
+                       stop_cmd(dd);
+                       if (!--max_wait_time) {
+                               ipath_dbg("Did not get successful read to "
+                                         "complete write\n");
+                               goto failed_write;
+                       }
+               }
+               /* now read the zero byte */
+               for (i = single_byte = 0; i < 8; i++) {
+                       u8 bit;
+                       scl_out(dd, i2c_line_high);
+                       bit = sda_in(dd, 0);
+                       scl_out(dd, i2c_line_low);
+                       single_byte <<= 1;
+                       single_byte |= bit;
+               }
+               stop_cmd(dd);
+       }
+
+       ret = 0;
+       goto bail;
+
+failed_write:
+       stop_cmd(dd);
+       ret = 1;
+
+bail:
+       return ret;
+}
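+/*
+ * Illustrative use of the eeprom entry points (a sketch, not driver code):
+ * a hypothetical updater would read the whole flash image (with ifp
+ * pointing at buf), modify the field of interest, refresh the checksum
+ * via flash_csum() below with adjust set, and write the image back:
+ *
+ *	ipath_eeprom_read(dd, 0, buf, len);
+ *	ifp->if_serial[0] = 'X';
+ *	flash_csum(ifp, 1);
+ *	ipath_eeprom_write(dd, 0, buf, len);
+ */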
+
+static u8 flash_csum(struct ipath_flash *ifp, int adjust)
+{
+       u8 *ip = (u8 *) ifp;
+       u8 csum = 0, len;
+
+       for (len = 0; len < ifp->if_length; len++)
+               csum += *ip++;
+       csum -= ifp->if_csum;
+       csum = ~csum;
+       if (adjust)
+               ifp->if_csum = csum;
+
+       return csum;
+}
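+/*
+ * A worked example of the checksum above (contrived values, for
+ * illustration): for an image with if_length = 4 whose four bytes sum
+ * to 0xff and whose stored if_csum is 0x9f, the loop computes 0xff,
+ * subtracting if_csum leaves 0x60, and the one's complement 0x9f
+ * matches the stored value, so ipath_get_guid() below would accept
+ * the image.
+ */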
+
+/**
+ * ipath_get_guid - get the GUID from the i2c device
+ * @dd: the infinipath device
+ *
+ * When we add the multi-chip support, we will probably have to add
+ * the ability to use the number of guids field, and get the guid from
+ * the first chip's flash, to use for all of them.
+ */
+void ipath_get_guid(struct ipath_devdata *dd)
+{
+       void *buf;
+       struct ipath_flash *ifp;
+       __be64 guid;
+       int len;
+       u8 csum, *bguid;
+       int t = dd->ipath_unit;
+       struct ipath_devdata *dd0 = ipath_lookup(0);
+
+       if (t && dd0->ipath_nguid > 1 && t <= dd0->ipath_nguid) {
+               u8 *bguid, oguid;
+               dd->ipath_guid = dd0->ipath_guid;
+               bguid = (u8 *) & dd->ipath_guid;
+
+               oguid = bguid[7];
+               bguid[7] += t;
+               if (oguid > bguid[7]) {
+                       if (bguid[6] == 0xff) {
+                               if (bguid[5] == 0xff) {
+                                       ipath_dev_err(
+                                               dd,
+                                               "Can't set %s GUID from "
+                                               "base, wraps to OUI!\n",
+                                               ipath_get_unit_name(t));
+                                       dd->ipath_guid = 0;
+                                       goto bail;
+                               }
+                               bguid[5]++;
+                       }
+                       bguid[6]++;
+               }
+               dd->ipath_nguid = 1;
+
+               ipath_dbg("nguid %u, so adding %u to device 0 guid, "
+                         "for %llx\n",
+                         dd0->ipath_nguid, t,
+                         (unsigned long long) be64_to_cpu(dd->ipath_guid));
+               goto bail;
+       }
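+       /*
+        * Carry example for the octet arithmetic above (an invented base
+        * GUID, for illustration only): with a device-0 GUID ending in
+        * ...:12:ff:fe and t = 3, the low octet wraps from 0xfe to 0x01,
+        * so the middle octet (already 0xff) wraps to 0x00 and the next
+        * octet up is bumped from 0x12 to 0x13, giving ...:13:00:01.
+        */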
+
+       len = offsetof(struct ipath_flash, if_future);
+       buf = vmalloc(len);
+       if (!buf) {
+               ipath_dev_err(dd, "Couldn't allocate memory to read %u "
+                             "bytes from eeprom for GUID\n", len);
+               goto bail;
+       }
+
+       if (ipath_eeprom_read(dd, 0, buf, len)) {
+               ipath_dev_err(dd, "Failed reading GUID from eeprom\n");
+               goto done;
+       }
+       ifp = (struct ipath_flash *)buf;
+
+       csum = flash_csum(ifp, 0);
+       if (csum != ifp->if_csum) {
+               dev_info(&dd->pcidev->dev, "Bad I2C flash checksum: "
+                        "0x%x, not 0x%x\n", csum, ifp->if_csum);
+               goto done;
+       }
+       if (*(__be64 *) ifp->if_guid == 0ULL ||
+           *(__be64 *) ifp->if_guid == __constant_cpu_to_be64(-1LL)) {
+               ipath_dev_err(dd, "Invalid GUID %llx from flash; "
+                             "ignoring\n",
+                             *(unsigned long long *) ifp->if_guid);
+               /* don't allow GUID if all 0 or all 1's */
+               goto done;
+       }
+
+       /* complain, but allow it */
+       if (*(u64 *) ifp->if_guid == 0x100007511000000ULL)
+               dev_info(&dd->pcidev->dev, "Warning, GUID %llx is "
+                        "default, probably not correct!\n",
+                        *(unsigned long long *) ifp->if_guid);
+
+       bguid = ifp->if_guid;
+       if (!bguid[0] && !bguid[1] && !bguid[2]) {
+               /* original incorrect GUID format in flash; fix in
+                * core copy, by shifting up 2 octets; don't need to
+                * change top octet, since both it and shifted are
+                * 0.. */
+               bguid[1] = bguid[3];
+               bguid[2] = bguid[4];
+               bguid[3] = bguid[4] = 0;
+               guid = *(__be64 *) ifp->if_guid;
+               ipath_cdbg(VERBOSE, "Old GUID format in flash, top 3 zero, "
+                          "shifting 2 octets\n");
+       } else
+               guid = *(__be64 *) ifp->if_guid;
+       dd->ipath_guid = guid;
+       dd->ipath_nguid = ifp->if_numguid;
+       memcpy(dd->ipath_serial, ifp->if_serial,
+              sizeof(ifp->if_serial));
+       ipath_cdbg(VERBOSE, "Initted GUID to %llx from eeprom\n",
+                  (unsigned long long) be64_to_cpu(dd->ipath_guid));
+
+done:
+       vfree(buf);
+
+bail:;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c
new file mode 100644 (file)
index 0000000..c347191
--- /dev/null
@@ -0,0 +1,1910 @@
+/*
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/poll.h>
+#include <linux/cdev.h>
+#include <linux/swap.h>
+#include <linux/vmalloc.h>
+#include <asm/pgtable.h>
+
+#include "ipath_kernel.h"
+#include "ips_common.h"
+#include "ipath_layer.h"
+
+static int ipath_open(struct inode *, struct file *);
+static int ipath_close(struct inode *, struct file *);
+static ssize_t ipath_write(struct file *, const char __user *, size_t,
+                          loff_t *);
+static unsigned int ipath_poll(struct file *, struct poll_table_struct *);
+static int ipath_mmap(struct file *, struct vm_area_struct *);
+
+static struct file_operations ipath_file_ops = {
+       .owner = THIS_MODULE,
+       .write = ipath_write,
+       .open = ipath_open,
+       .release = ipath_close,
+       .poll = ipath_poll,
+       .mmap = ipath_mmap
+};
+
+static int ipath_get_base_info(struct ipath_portdata *pd,
+                              void __user *ubase, size_t ubase_size)
+{
+       int ret = 0;
+       struct ipath_base_info *kinfo = NULL;
+       struct ipath_devdata *dd = pd->port_dd;
+
+       if (ubase_size < sizeof(*kinfo)) {
+               ipath_cdbg(PROC,
+                          "Base size %lu, need %lu (version mismatch?)\n",
+                          (unsigned long) ubase_size,
+                          (unsigned long) sizeof(*kinfo));
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       kinfo = kzalloc(sizeof(*kinfo), GFP_KERNEL);
+       if (kinfo == NULL) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+
+       ret = dd->ipath_f_get_base_info(pd, kinfo);
+       if (ret < 0)
+               goto bail;
+
+       kinfo->spi_rcvhdr_cnt = dd->ipath_rcvhdrcnt;
+       kinfo->spi_rcvhdrent_size = dd->ipath_rcvhdrentsize;
+       kinfo->spi_tidegrcnt = dd->ipath_rcvegrcnt;
+       kinfo->spi_rcv_egrbufsize = dd->ipath_rcvegrbufsize;
+       /*
+        * have to mmap whole thing
+        */
+       kinfo->spi_rcv_egrbuftotlen =
+               pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size;
+       kinfo->spi_rcv_egrperchunk = pd->port_rcvegrbufs_perchunk;
+       kinfo->spi_rcv_egrchunksize = kinfo->spi_rcv_egrbuftotlen /
+               pd->port_rcvegrbuf_chunks;
+       kinfo->spi_tidcnt = dd->ipath_rcvtidcnt;
+       /*
+        * for this use, may be ipath_cfgports summed over all chips that
+        * are configured and present
+        */
+       kinfo->spi_nports = dd->ipath_cfgports;
+       /* unit (chip/board) our port is on */
+       kinfo->spi_unit = dd->ipath_unit;
+       /* for now, only a single page */
+       kinfo->spi_tid_maxsize = PAGE_SIZE;
+
+       /*
+        * Doing this per port, and based on the skip value, etc.  This has
+        * to be the actual buffer size, since the protocol code treats it
+        * as an array.
+        *
+        * These have to be set to user addresses in the user code via mmap.
+        * These values are used on return to user code for the mmap target
+        * addresses only.  For 32 bit, same 44 bit address problem, so use
+        * the physical address, not virtual.  Before 2.6.11, using the
+        * page_address() macro worked, but in 2.6.11, even that returns the
+        * full 64 bit address (upper bits all 1's).  So far, using the
+        * physical addresses (or chip offsets, for chip mapping) works, but
+        * no doubt some future kernel release will change that, and we'll be
+        * on to yet another method of dealing with this
+        */
+       kinfo->spi_rcvhdr_base = (u64) pd->port_rcvhdrq_phys;
+       kinfo->spi_rcv_egrbufs = (u64) pd->port_rcvegr_phys;
+       kinfo->spi_pioavailaddr = (u64) dd->ipath_pioavailregs_phys;
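+       /*
+        * The arithmetic below relies on dd->ipath_statusp pointing into
+        * the same DMA page as the pioavail registers: the difference of
+        * the two kernel virtual addresses is the byte offset of the
+        * status words within that page, which is then added to the
+        * page's physical address to give a user-mappable value.
+        */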
+       kinfo->spi_status = (u64) kinfo->spi_pioavailaddr +
+               (void *) dd->ipath_statusp -
+               (void *) dd->ipath_pioavailregs_dma;
+       kinfo->spi_piobufbase = (u64) pd->port_piobufs;
+       kinfo->__spi_uregbase =
+               dd->ipath_uregbase + dd->ipath_palign * pd->port_port;
+
+       kinfo->spi_pioindex = dd->ipath_pbufsport * (pd->port_port - 1);
+       kinfo->spi_piocnt = dd->ipath_pbufsport;
+       kinfo->spi_pioalign = dd->ipath_palign;
+
+       kinfo->spi_qpair = IPATH_KD_QP;
+       kinfo->spi_piosize = dd->ipath_ibmaxlen;
+       kinfo->spi_mtu = dd->ipath_ibmaxlen;    /* maxlen, not ibmtu */
+       kinfo->spi_port = pd->port_port;
+       kinfo->spi_sw_version = IPATH_USER_SWVERSION;
+       kinfo->spi_hw_version = dd->ipath_revision;
+
+       if (copy_to_user(ubase, kinfo, sizeof(*kinfo)))
+               ret = -EFAULT;
+
+bail:
+       kfree(kinfo);
+       return ret;
+}
+
+/**
+ * ipath_tid_update - update a port TID
+ * @pd: the port
+ * @ti: the TID information
+ *
+ * The new implementation as of Oct 2004 is that the driver assigns
+ * the tid and returns it to the caller.   To make it easier to
+ * catch bugs, and to reduce search time, we keep a cursor for
+ * each port, walking the shadow tid array to find one that's not
+ * in use.
+ *
+ * For now, if we can't allocate the full list, we fail, although
+ * in the long run, we'll allocate as many as we can, and the
+ * caller will deal with that by trying the remaining pages later.
+ * That means that when we fail, we have to mark the tids as not in
+ * use again, in our shadow copy.
+ *
+ * It's up to the caller to free the tids when they are done.
+ * We'll unlock the pages as they free them.
+ *
+ * Also, right now we are locking one page at a time, but since
+ * the intended use of this routine is for a single group of
+ * virtually contiguous pages, that should change to improve
+ * performance.
+ */
+static int ipath_tid_update(struct ipath_portdata *pd,
+                           const struct ipath_tid_info *ti)
+{
+       int ret = 0, ntids;
+       u32 tid, porttid, cnt, i, tidcnt;
+       u16 *tidlist;
+       struct ipath_devdata *dd = pd->port_dd;
+       u64 physaddr;
+       unsigned long vaddr;
+       u64 __iomem *tidbase;
+       unsigned long tidmap[8];
+       struct page **pagep = NULL;
+
+       if (!dd->ipath_pageshadow) {
+               ret = -ENOMEM;
+               goto done;
+       }
+
+       cnt = ti->tidcnt;
+       if (!cnt) {
+               ipath_dbg("After copyin, tidcnt 0, tidlist %llx\n",
+                         (unsigned long long) ti->tidlist);
+               /*
+                * Should we treat as success?  likely a bug
+                */
+               ret = -EFAULT;
+               goto done;
+       }
+       tidcnt = dd->ipath_rcvtidcnt;
+       if (cnt >= tidcnt) {
+               /* make sure it all fits in port_tid_pg_list */
+               dev_info(&dd->pcidev->dev, "Process tried to allocate %u "
+                        "TIDs, only trying max (%u)\n", cnt, tidcnt);
+               cnt = tidcnt;
+       }
+       pagep = (struct page **)pd->port_tid_pg_list;
+       tidlist = (u16 *) (&pagep[cnt]);
+
+       memset(tidmap, 0, sizeof(tidmap));
+       tid = pd->port_tidcursor;
+       /* before decrement; chip actual # */
+       porttid = pd->port_port * tidcnt;
+       ntids = tidcnt;
+       tidbase = (u64 __iomem *) (((char __iomem *) dd->ipath_kregbase) +
+                                  dd->ipath_rcvtidbase +
+                                  porttid * sizeof(*tidbase));
+
+       ipath_cdbg(VERBOSE, "Port%u %u tids, cursor %u, tidbase %p\n",
+                  pd->port_port, cnt, tid, tidbase);
+
+       /* virtual address of first page in transfer */
+       vaddr = ti->tidvaddr;
+       if (!access_ok(VERIFY_WRITE, (void __user *) vaddr,
+                      cnt * PAGE_SIZE)) {
+               ipath_dbg("Fail vaddr %p, %u pages, !access_ok\n",
+                         (void *)vaddr, cnt);
+               ret = -EFAULT;
+               goto done;
+       }
+       ret = ipath_get_user_pages(vaddr, cnt, pagep);
+       if (ret) {
+               if (ret == -EBUSY) {
+                       ipath_dbg("Failed to lock addr %p, %u pages "
+                                 "(already locked)\n",
+                                 (void *) vaddr, cnt);
+                       /*
+                        * for now, continue, and see what happens but with
+                        * the new implementation, this should never happen,
+                        * unless perhaps the user has mpin'ed the pages
+                        * themselves (something we need to test)
+                        */
+                       ret = 0;
+               } else {
+                       dev_info(&dd->pcidev->dev,
+                                "Failed to lock addr %p, %u pages: "
+                                "errno %d\n", (void *) vaddr, cnt, -ret);
+                       goto done;
+               }
+       }
+       for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) {
+               for (; ntids--; tid++) {
+                       if (tid == tidcnt)
+                               tid = 0;
+                       if (!dd->ipath_pageshadow[porttid + tid])
+                               break;
+               }
+               if (ntids < 0) {
+                       /*
+                        * oops, wrapped all the way through their TIDs,
+                        * and didn't have enough free; see comments at
+                        * start of routine
+                        */
+                       ipath_dbg("Not enough free TIDs for %u pages "
+                                 "(index %d), failing\n", cnt, i);
+                       i--;    /* last tidlist[i] not filled in */
+                       ret = -ENOMEM;
+                       break;
+               }
+               tidlist[i] = tid;
+               ipath_cdbg(VERBOSE, "Updating idx %u to TID %u, "
+                          "vaddr %lx\n", i, tid, vaddr);
+               /* we "know" system pages and TID pages are same size */
+               dd->ipath_pageshadow[porttid + tid] = pagep[i];
+               /*
+                * don't need atomic or it's overhead
+                */
+               __set_bit(tid, tidmap);
+               physaddr = page_to_phys(pagep[i]);
+               ipath_stats.sps_pagelocks++;
+               ipath_cdbg(VERBOSE,
+                          "TID %u, vaddr %lx, physaddr %llx pgp %p\n",
+                          tid, vaddr, (unsigned long long) physaddr,
+                          pagep[i]);
+               dd->ipath_f_put_tid(dd, &tidbase[tid], 1, physaddr);
+               /*
+                * don't check this tid in ipath_portshadow, since we
+                * just filled it in; start with the next one.
+                */
+               tid++;
+       }
+
+       if (ret) {
+               u32 limit;
+       cleanup:
+               /* jump here if copy out of updated info failed... */
+               ipath_dbg("After failure (ret=%d), undo %d of %d entries\n",
+                         -ret, i, cnt);
+               /* same code that's in ipath_free_tid() */
+               limit = sizeof(tidmap) * BITS_PER_BYTE;
+               if (limit > tidcnt)
+                       /* just in case size changes in future */
+                       limit = tidcnt;
+               tid = find_first_bit((const unsigned long *)tidmap, limit);
+               for (; tid < limit; tid++) {
+                       if (!test_bit(tid, tidmap))
+                               continue;
+                       if (dd->ipath_pageshadow[porttid + tid]) {
+                               ipath_cdbg(VERBOSE, "Freeing TID %u\n",
+                                          tid);
+                               dd->ipath_f_put_tid(dd, &tidbase[tid], 1,
+                                                   dd->ipath_tidinvalid);
+                               dd->ipath_pageshadow[porttid + tid] = NULL;
+                               ipath_stats.sps_pageunlocks++;
+                       }
+               }
+               ipath_release_user_pages(pagep, cnt);
+       } else {
+               /*
+                * Copy the updated array, with ipath_tid's filled in, back
+                * to user.  Since we did the copy in already, this "should
+                * never fail".  If it does, we have to clean up...
+                */
+               if (copy_to_user((void __user *)
+                                (unsigned long) ti->tidlist,
+                                tidlist, cnt * sizeof(*tidlist))) {
+                       ret = -EFAULT;
+                       goto cleanup;
+               }
+               if (copy_to_user((void __user *) (unsigned long) ti->tidmap,
+                                tidmap, sizeof tidmap)) {
+                       ret = -EFAULT;
+                       goto cleanup;
+               }
+               if (tid == tidcnt)
+                       tid = 0;
+               pd->port_tidcursor = tid;
+       }
+
+done:
+       if (ret)
+               ipath_dbg("Failed to map %u TID pages, failing with %d\n",
+                         ti->tidcnt, -ret);
+       return ret;
+}
+
+/**
+ * ipath_tid_free - free a port TID
+ * @pd: the port
+ * @ti: the TID info
+ *
+ * right now we are unlocking one page at a time, but since
+ * the intended use of this routine is for a single group of
+ * virtually contiguous pages, that should change to improve
+ * performance.  We check that the TID is in range for this port
+ * but otherwise don't check validity; if user has an error and
+ * frees the wrong tid, it's only their own data that can thereby
+ * be corrupted.  We do check that the TID was in use, for sanity.
+ * We always use our idea of the saved address, not the address that
+ * they pass in to us.
+ */
+
+static int ipath_tid_free(struct ipath_portdata *pd,
+                         const struct ipath_tid_info *ti)
+{
+       int ret = 0;
+       u32 tid, porttid, cnt, limit, tidcnt;
+       struct ipath_devdata *dd = pd->port_dd;
+       u64 __iomem *tidbase;
+       unsigned long tidmap[8];
+
+       if (!dd->ipath_pageshadow) {
+               ret = -ENOMEM;
+               goto done;
+       }
+
+       if (copy_from_user(tidmap, (void __user *)(unsigned long)ti->tidmap,
+                          sizeof tidmap)) {
+               ret = -EFAULT;
+               goto done;
+       }
+
+       porttid = pd->port_port * dd->ipath_rcvtidcnt;
+       tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
+                                  dd->ipath_rcvtidbase +
+                                  porttid * sizeof(*tidbase));
+
+       tidcnt = dd->ipath_rcvtidcnt;
+       limit = sizeof(tidmap) * BITS_PER_BYTE;
+       if (limit > tidcnt)
+               /* just in case size changes in future */
+               limit = tidcnt;
+       tid = find_first_bit(tidmap, limit);
+       ipath_cdbg(VERBOSE, "Port%u free %u tids; first bit (max=%d) "
+                  "set is %d, porttid %u\n", pd->port_port, ti->tidcnt,
+                  limit, tid, porttid);
+       for (cnt = 0; tid < limit; tid++) {
+               /*
+                * small optimization; if we detect a run of 3 or so without
+                * any set, use find_first_bit again.  That's mainly to
+                * accelerate the case where we wrapped, so we have some at
+                * the beginning, and some at the end, and a big gap
+                * in the middle.
+                */
+               if (!test_bit(tid, tidmap))
+                       continue;
+               cnt++;
+               if (dd->ipath_pageshadow[porttid + tid]) {
+                       ipath_cdbg(VERBOSE, "PID %u freeing TID %u\n",
+                                  pd->port_pid, tid);
+                       dd->ipath_f_put_tid(dd, &tidbase[tid], 1,
+                                           dd->ipath_tidinvalid);
+                       ipath_release_user_pages(
+                               &dd->ipath_pageshadow[porttid + tid], 1);
+                       dd->ipath_pageshadow[porttid + tid] = NULL;
+                       ipath_stats.sps_pageunlocks++;
+               } else
+                       ipath_dbg("Unused tid %u, ignoring\n", tid);
+       }
+       if (cnt != ti->tidcnt)
+               ipath_dbg("passed in tidcnt %d, only %d bits set in map\n",
+                         ti->tidcnt, cnt);
+done:
+       if (ret)
+               ipath_dbg("Failed to unmap %u TID pages, failing with %d\n",
+                         ti->tidcnt, -ret);
+       return ret;
+}
+
+/**
+ * ipath_set_part_key - set a partition key
+ * @pd: the port
+ * @key: the key
+ *
+ * We can have up to 4 active at a time (other than the default, which is
+ * always allowed).  This is somewhat tricky, since multiple ports may set
+ * the same key, so we reference count them, and clean up at exit.  All 4
+ * partition keys are packed into a single infinipath register.  It's an
+ * error for a process to set the same pkey multiple times.  We provide no
+ * mechanism to de-allocate a pkey at this time, we may eventually need to
+ * do that.  I've used the atomic operations, and no locking, and only make
+ * a single pass through what's available.  This should be more than
+ * adequate for some time. I'll think about spinlocks or the like if and as
+ * it's necessary.
+ */
+static int ipath_set_part_key(struct ipath_portdata *pd, u16 key)
+{
+       struct ipath_devdata *dd = pd->port_dd;
+       int i, any = 0, pidx = -1;
+       u16 lkey = key & 0x7FFF;
+       int ret;
+
+       if (lkey == (IPS_DEFAULT_P_KEY & 0x7FFF)) {
+               /* nothing to do; this key always valid */
+               ret = 0;
+               goto bail;
+       }
+
+       ipath_cdbg(VERBOSE, "p%u try to set pkey %hx, current keys "
+                  "%hx:%x %hx:%x %hx:%x %hx:%x\n",
+                  pd->port_port, key, dd->ipath_pkeys[0],
+                  atomic_read(&dd->ipath_pkeyrefs[0]), dd->ipath_pkeys[1],
+                  atomic_read(&dd->ipath_pkeyrefs[1]), dd->ipath_pkeys[2],
+                  atomic_read(&dd->ipath_pkeyrefs[2]), dd->ipath_pkeys[3],
+                  atomic_read(&dd->ipath_pkeyrefs[3]));
+
+       if (!lkey) {
+               ipath_cdbg(PROC, "p%u tries to set key 0, not allowed\n",
+                          pd->port_port);
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       /*
+        * Set the full membership bit, because it has to be
+        * set in the register or the packet, and it seems
+        * cleaner to set in the register than to force all
+        * callers to set it. (see bug 4331)
+        */
+       key |= 0x8000;
+
+       for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
+               if (!pd->port_pkeys[i] && pidx == -1)
+                       pidx = i;
+               if (pd->port_pkeys[i] == key) {
+                       ipath_cdbg(VERBOSE, "p%u tries to set same pkey "
+                                  "(%x) more than once\n",
+                                  pd->port_port, key);
+                       ret = -EEXIST;
+                       goto bail;
+               }
+       }
+       if (pidx == -1) {
+               ipath_dbg("All pkeys for port %u already in use, "
+                         "can't set %x\n", pd->port_port, key);
+               ret = -EBUSY;
+               goto bail;
+       }
+       for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+               if (!dd->ipath_pkeys[i]) {
+                       any++;
+                       continue;
+               }
+               if (dd->ipath_pkeys[i] == key) {
+                       atomic_t *pkrefs = &dd->ipath_pkeyrefs[i];
+
+                       if (atomic_inc_return(pkrefs) > 1) {
+                               pd->port_pkeys[pidx] = key;
+                               ipath_cdbg(VERBOSE, "p%u set key %x "
+                                          "matches #%d, count now %d\n",
+                                          pd->port_port, key, i,
+                                          atomic_read(pkrefs));
+                               ret = 0;
+                               goto bail;
+                       } else {
+                               /*
+                                * lost race, decrement count, catch below
+                                */
+                               atomic_dec(pkrefs);
+                               ipath_cdbg(VERBOSE, "Lost race, count was "
+                                          "0, after dec, it's %d\n",
+                                          atomic_read(pkrefs));
+                               any++;
+                       }
+               }
+               if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) {
+                       /*
+                        * It makes no sense to have both the limited and
+                        * full membership PKEY set at the same time since
+                        * the unlimited one will disable the limited one.
+                        */
+                       ret = -EEXIST;
+                       goto bail;
+               }
+       }
+       if (!any) {
+               ipath_dbg("port %u, all pkeys already in use, "
+                         "can't set %x\n", pd->port_port, key);
+               ret = -EBUSY;
+               goto bail;
+       }
+       for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+               if (!dd->ipath_pkeys[i] &&
+                   atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) {
+                       u64 pkey;
+
+                       /* for ipathstats, etc. */
+                       ipath_stats.sps_pkeys[i] = lkey;
+                       pd->port_pkeys[pidx] = dd->ipath_pkeys[i] = key;
+                       pkey =
+                               (u64) dd->ipath_pkeys[0] |
+                               ((u64) dd->ipath_pkeys[1] << 16) |
+                               ((u64) dd->ipath_pkeys[2] << 32) |
+                               ((u64) dd->ipath_pkeys[3] << 48);
+                       ipath_cdbg(PROC, "p%u set key %x in #%d, "
+                                  "portidx %d, new pkey reg %llx\n",
+                                  pd->port_port, key, i, pidx,
+                                  (unsigned long long) pkey);
+                       ipath_write_kreg(
+                               dd, dd->ipath_kregs->kr_partitionkey, pkey);
+
+                       ret = 0;
+                       goto bail;
+               }
+       }
+       ipath_dbg("port %u, all pkeys already in use 2nd pass, "
+                 "can't set %x\n", pd->port_port, key);
+       ret = -EBUSY;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_manage_rcvq - manage a port's receive queue
+ * @pd: the port
+ * @start_stop: action to carry out
+ *
+ * start_stop == 0 disables receive on the port, for use in queue
+ * overflow conditions.  start_stop==1 re-enables, to be used to
+ * re-init the software copy of the head register
+ */
+static int ipath_manage_rcvq(struct ipath_portdata *pd, int start_stop)
+{
+       struct ipath_devdata *dd = pd->port_dd;
+       u64 tval;
+
+       ipath_cdbg(PROC, "%sabling rcv for unit %u port %u\n",
+                  start_stop ? "en" : "dis", dd->ipath_unit,
+                  pd->port_port);
+       /* atomically set or clear the receive enable bit for this port */
+       if (start_stop) {
+               /*
+                * On enable, force in-memory copy of the tail register to
+                * 0, so that protocol code doesn't have to worry about
+                * whether or not the chip has yet updated the in-memory
+                * copy or not on return from the system call. The chip
+                * always resets its tail register back to 0 on a
+                * transition from disabled to enabled.  This could cause a
+                * problem if software was broken, and did the enable w/o
+                * the disable, but eventually the in-memory copy will be
+                * updated and correct itself, even in the face of software
+                * bugs.
+                */
+               *pd->port_rcvhdrtail_kvaddr = 0;
+               set_bit(INFINIPATH_R_PORTENABLE_SHIFT + pd->port_port,
+                       &dd->ipath_rcvctrl);
+       } else
+               clear_bit(INFINIPATH_R_PORTENABLE_SHIFT + pd->port_port,
+                         &dd->ipath_rcvctrl);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+                        dd->ipath_rcvctrl);
+       /* now be sure chip saw it before we return */
+       tval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+       if (start_stop) {
+               /*
+                * And try to be sure that tail reg update has happened too.
+                * This should in theory interlock with the RXE changes to
+                * the tail register.  Don't assign it to the tail register
+                * in memory copy, since we could overwrite an update by the
+                * chip if we did.
+                */
+               tval = ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port);
+       }
+       /* always; new head should be equal to new tail; see above */
+       return 0;
+}
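+
+/*
+ * Illustrative sketch only (not part of the driver): userspace reaches
+ * ipath_manage_rcvq() through the write() command interface further down
+ * in this file.  A process recovering from a receive queue overflow might
+ * issue something like the following, where fd is the open device file
+ * and the exact field widths are defined in ipath_common.h:
+ *
+ *     struct ipath_cmd c;
+ *     c.type = IPATH_CMD_RECV_CTRL;
+ *     c.cmd.recv_ctrl = 1;            (1 re-enables, 0 disables)
+ *     write(fd, &c, sizeof(c));
+ */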
+
+static void ipath_clean_part_key(struct ipath_portdata *pd,
+                                struct ipath_devdata *dd)
+{
+       int i, j, pchanged = 0;
+       u64 oldpkey;
+
+       /* for debugging only */
+       oldpkey = (u64) dd->ipath_pkeys[0] |
+               ((u64) dd->ipath_pkeys[1] << 16) |
+               ((u64) dd->ipath_pkeys[2] << 32) |
+               ((u64) dd->ipath_pkeys[3] << 48);
+
+       for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
+               if (!pd->port_pkeys[i])
+                       continue;
+               ipath_cdbg(VERBOSE, "look for key[%d] %hx in pkeys\n", i,
+                          pd->port_pkeys[i]);
+               for (j = 0; j < ARRAY_SIZE(dd->ipath_pkeys); j++) {
+                       /* check for match independent of the global bit */
+                       if ((dd->ipath_pkeys[j] & 0x7fff) !=
+                           (pd->port_pkeys[i] & 0x7fff))
+                               continue;
+                       if (atomic_dec_and_test(&dd->ipath_pkeyrefs[j])) {
+                               ipath_cdbg(VERBOSE, "p%u clear key "
+                                          "%x matches #%d\n",
+                                          pd->port_port,
+                                          pd->port_pkeys[i], j);
+                               ipath_stats.sps_pkeys[j] =
+                                       dd->ipath_pkeys[j] = 0;
+                               pchanged++;
+                       } else
+                               ipath_cdbg(VERBOSE, "p%u key %x matches #%d, "
+                                          "but ref still %d\n", pd->port_port,
+                                          pd->port_pkeys[i], j,
+                                          atomic_read(&dd->ipath_pkeyrefs[j]));
+                       break;
+               }
+               pd->port_pkeys[i] = 0;
+       }
+       if (pchanged) {
+               u64 pkey = (u64) dd->ipath_pkeys[0] |
+                       ((u64) dd->ipath_pkeys[1] << 16) |
+                       ((u64) dd->ipath_pkeys[2] << 32) |
+                       ((u64) dd->ipath_pkeys[3] << 48);
+               ipath_cdbg(VERBOSE, "p%u old pkey reg %llx, "
+                          "new pkey reg %llx\n", pd->port_port,
+                          (unsigned long long) oldpkey,
+                          (unsigned long long) pkey);
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey,
+                                pkey);
+       }
+}
+
+/**
+ * ipath_create_user_egr - allocate eager TID buffers
+ * @pd: the port to allocate TID buffers for
+ *
+ * This routine is now quite different for user and kernel, because
+ * the kernel uses skb's for accelerated network performance.
+ * This is the user port version.
+ *
+ * Allocate the eager TID buffers and program them into infinipath.
+ * They are no longer completely contiguous; we do multiple
+ * allocation calls.
+ */
+static int ipath_create_user_egr(struct ipath_portdata *pd)
+{
+       struct ipath_devdata *dd = pd->port_dd;
+       unsigned e, egrcnt, alloced, egrperchunk, chunk, egrsize, egroff;
+       size_t size;
+       int ret;
+
+       egrcnt = dd->ipath_rcvegrcnt;
+       /* TID number offset for this port */
+       egroff = pd->port_port * egrcnt;
+       egrsize = dd->ipath_rcvegrbufsize;
+       ipath_cdbg(VERBOSE, "Allocating %d egr buffers, at egrtid "
+                  "offset %x, egrsize %u\n", egrcnt, egroff, egrsize);
+
+       /*
+        * to avoid wasting a lot of memory, we allocate 32KB chunks of
+        * physically contiguous memory, advance through it until used up
+        * and then allocate more.  Of course, we need memory to store those
+        * extra pointers, now.  Started out with 256KB, but under heavy
+        * memory pressure (creating large files and then copying them over
+        * NFS while doing lots of MPI jobs), we hit some allocation
+        * failures, even though we can sleep...  (2.6.10) Still get
+        * failures at 64K.  32K is the lowest we can go without waiting
+        * more memory again.  It seems likely that the coalescing in
+        * free_pages, etc. still has issues (as it has had previously
+        * during 2.6.x development).
+        */
+       size = 0x8000;
+       alloced = ALIGN(egrsize * egrcnt, size);
+       egrperchunk = size / egrsize;
+       chunk = (egrcnt + egrperchunk - 1) / egrperchunk;
+       pd->port_rcvegrbuf_chunks = chunk;
+       pd->port_rcvegrbufs_perchunk = egrperchunk;
+       pd->port_rcvegrbuf_size = size;
+       pd->port_rcvegrbuf = vmalloc(chunk * sizeof(pd->port_rcvegrbuf[0]));
+       if (!pd->port_rcvegrbuf) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+       pd->port_rcvegrbuf_phys =
+               vmalloc(chunk * sizeof(pd->port_rcvegrbuf_phys[0]));
+       if (!pd->port_rcvegrbuf_phys) {
+               ret = -ENOMEM;
+               goto bail_rcvegrbuf;
+       }
+       for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) {
+               /*
+                * GFP_USER, but without GFP_FS, so buffer cache can be
+                * coalesced (we hope); otherwise, even at order 4,
+                * heavy filesystem activity makes these fail
+                */
+               gfp_t gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP;
+
+               pd->port_rcvegrbuf[e] = dma_alloc_coherent(
+                       &dd->pcidev->dev, size, &pd->port_rcvegrbuf_phys[e],
+                       gfp_flags);
+
+               if (!pd->port_rcvegrbuf[e]) {
+                       ret = -ENOMEM;
+                       goto bail_rcvegrbuf_phys;
+               }
+       }
+
+       pd->port_rcvegr_phys = pd->port_rcvegrbuf_phys[0];
+
+       for (e = chunk = 0; chunk < pd->port_rcvegrbuf_chunks; chunk++) {
+               dma_addr_t pa = pd->port_rcvegrbuf_phys[chunk];
+               unsigned i;
+
+               for (i = 0; e < egrcnt && i < egrperchunk; e++, i++) {
+                       dd->ipath_f_put_tid(dd, e + egroff +
+                                           (u64 __iomem *)
+                                           ((char __iomem *)
+                                            dd->ipath_kregbase +
+                                            dd->ipath_rcvegrbase), 0, pa);
+                       pa += egrsize;
+               }
+               cond_resched(); /* don't hog the cpu */
+       }
+
+       ret = 0;
+       goto bail;
+
+bail_rcvegrbuf_phys:
+       for (e = 0; e < pd->port_rcvegrbuf_chunks &&
+                    pd->port_rcvegrbuf[e]; e++)
+               dma_free_coherent(&dd->pcidev->dev, size,
+                                 pd->port_rcvegrbuf[e],
+                                 pd->port_rcvegrbuf_phys[e]);
+
+       vfree(pd->port_rcvegrbuf_phys);
+       pd->port_rcvegrbuf_phys = NULL;
+bail_rcvegrbuf:
+       vfree(pd->port_rcvegrbuf);
+       pd->port_rcvegrbuf = NULL;
+bail:
+       return ret;
+}
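+
+/*
+ * Purely illustrative numbers for the chunking arithmetic above (the real
+ * values come from the chip-specific setup code): if ipath_rcvegrbufsize
+ * were 2048 bytes and ipath_rcvegrcnt were 2048 buffers, the 32KB (0x8000)
+ * chunk size would give egrperchunk = 0x8000 / 2048 = 16 buffers per chunk
+ * and chunk = (2048 + 15) / 16 = 128 dma_alloc_coherent() calls per port.
+ */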
+
+static int ipath_do_user_init(struct ipath_portdata *pd,
+                             const struct ipath_user_info *uinfo)
+{
+       int ret = 0;
+       struct ipath_devdata *dd = pd->port_dd;
+       u64 physaddr, uaddr, off, atmp;
+       struct page *pagep;
+       u32 head32;
+       u64 head;
+
+       /* for now, if major version is different, bail */
+       if ((uinfo->spu_userversion >> 16) != IPATH_USER_SWMAJOR) {
+               dev_info(&dd->pcidev->dev,
+                        "User major version %d not same as driver "
+                        "major %d\n", uinfo->spu_userversion >> 16,
+                        IPATH_USER_SWMAJOR);
+               ret = -ENODEV;
+               goto done;
+       }
+
+       if ((uinfo->spu_userversion & 0xffff) != IPATH_USER_SWMINOR)
+               ipath_dbg("User minor version %d not same as driver "
+                         "minor %d\n", uinfo->spu_userversion & 0xffff,
+                         IPATH_USER_SWMINOR);
+
+       if (uinfo->spu_rcvhdrsize) {
+               ret = ipath_setrcvhdrsize(dd, uinfo->spu_rcvhdrsize);
+               if (ret)
+                       goto done;
+       }
+
+       /* for now we do nothing with rcvhdrcnt: uinfo->spu_rcvhdrcnt */
+
+       /* set up for the rcvhdr Q tail register writeback to user memory */
+       if (!uinfo->spu_rcvhdraddr ||
+           !access_ok(VERIFY_WRITE, (u64 __user *) (unsigned long)
+                      uinfo->spu_rcvhdraddr, sizeof(u64))) {
+               ipath_dbg("Port %d rcvhdrtail addr %llx not valid\n",
+                         pd->port_port,
+                         (unsigned long long) uinfo->spu_rcvhdraddr);
+               ret = -EINVAL;
+               goto done;
+       }
+
+       off = offset_in_page(uinfo->spu_rcvhdraddr);
+       uaddr = PAGE_MASK & (unsigned long) uinfo->spu_rcvhdraddr;
+       ret = ipath_get_user_pages_nocopy(uaddr, &pagep);
+       if (ret) {
+               dev_info(&dd->pcidev->dev, "Failed to lookup and lock "
+                        "address %llx for rcvhdrtail: errno %d\n",
+                        (unsigned long long) uinfo->spu_rcvhdraddr, -ret);
+               goto done;
+       }
+       ipath_stats.sps_pagelocks++;
+       pd->port_rcvhdrtail_uaddr = uaddr;
+       pd->port_rcvhdrtail_pagep = pagep;
+       pd->port_rcvhdrtail_kvaddr =
+               page_address(pagep);
+       pd->port_rcvhdrtail_kvaddr += off;
+       physaddr = page_to_phys(pagep) + off;
+       ipath_cdbg(VERBOSE, "port %d user addr %llx hdrtailaddr, %llx "
+                  "physical (off=%llx)\n",
+                  pd->port_port,
+                  (unsigned long long) uinfo->spu_rcvhdraddr,
+                  (unsigned long long) physaddr, (unsigned long long) off);
+       ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdrtailaddr,
+                             pd->port_port, physaddr);
+       atmp = ipath_read_kreg64_port(dd,
+                                     dd->ipath_kregs->kr_rcvhdrtailaddr,
+                                     pd->port_port);
+       if (physaddr != atmp) {
+               ipath_dev_err(dd,
+                             "Catastrophic software error, "
+                             "RcvHdrTailAddr%u written as %llx, "
+                             "read back as %llx\n", pd->port_port,
+                             (unsigned long long) physaddr,
+                             (unsigned long long) atmp);
+               ret = -EINVAL;
+               goto done;
+       }
+
+       /* for right now, kernel piobufs are at end, so port 1 is at 0 */
+       pd->port_piobufs = dd->ipath_piobufbase +
+               dd->ipath_pbufsport * (pd->port_port -
+                                      1) * dd->ipath_palign;
+       ipath_cdbg(VERBOSE, "Set base of piobufs for port %u to 0x%x\n",
+                  pd->port_port, pd->port_piobufs);
+
+       /*
+        * Now allocate the rcvhdr Q and eager TIDs; skip the TID
+        * array for the time being.  If pd->port_port > chip-supported,
+        * we will someday need extra code here to handle overflow
+        * through port 0.
+        */
+       ret = ipath_create_rcvhdrq(dd, pd);
+       if (!ret)
+               ret = ipath_create_user_egr(pd);
+       if (ret)
+               goto done;
+       /* enable receives now */
+       /* atomically set enable bit for this port */
+       set_bit(INFINIPATH_R_PORTENABLE_SHIFT + pd->port_port,
+               &dd->ipath_rcvctrl);
+
+       /*
+        * set the head registers for this port to the current values
+        * of the tail pointers, since we don't know if they were
+        * updated on last use of the port.
+        */
+       head32 = ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port);
+       head = (u64) head32;
+       ipath_write_ureg(dd, ur_rcvhdrhead, head, pd->port_port);
+       head32 = ipath_read_ureg32(dd, ur_rcvegrindextail, pd->port_port);
+       ipath_write_ureg(dd, ur_rcvegrindexhead, head32, pd->port_port);
+       dd->ipath_lastegrheads[pd->port_port] = -1;
+       dd->ipath_lastrcvhdrqtails[pd->port_port] = -1;
+       ipath_cdbg(VERBOSE, "Wrote port%d head %llx, egrhead %x from "
+                  "tail regs\n", pd->port_port,
+                  (unsigned long long) head, head32);
+       pd->port_tidcursor = 0; /* start at beginning after open */
+       /*
+        * now enable the port; the tail registers will be written to memory
+        * by the chip as soon as it sees the write to
+        * dd->ipath_kregs->kr_rcvctrl.  The update only happens on
+        * transition from 0 to 1, so clear it first, then set it as part of
+        * enabling the port.  This will (very briefly) affect any other
+        * open ports, but it shouldn't be long enough to be an issue.
+        */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+                        dd->ipath_rcvctrl & ~INFINIPATH_R_TAILUPD);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+                        dd->ipath_rcvctrl);
+
+done:
+       return ret;
+}
+
+static int mmap_ureg(struct vm_area_struct *vma, struct ipath_devdata *dd,
+                    u64 ureg)
+{
+       unsigned long phys;
+       int ret;
+
+       /* it's the real hardware, so io_remap works */
+
+       if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) {
+               dev_info(&dd->pcidev->dev, "FAIL mmap userreg: reqlen "
+                        "%lx > PAGE\n", vma->vm_end - vma->vm_start);
+               ret = -EFAULT;
+       } else {
+               phys = dd->ipath_physaddr + ureg;
+               vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+               vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+               ret = io_remap_pfn_range(vma, vma->vm_start,
+                                        phys >> PAGE_SHIFT,
+                                        vma->vm_end - vma->vm_start,
+                                        vma->vm_page_prot);
+       }
+       return ret;
+}
+
+static int mmap_piobufs(struct vm_area_struct *vma,
+                       struct ipath_devdata *dd,
+                       struct ipath_portdata *pd)
+{
+       unsigned long phys;
+       int ret;
+
+       /*
+        * When we map the PIO buffers, we want to map them write-only; no
+        * read possible.
+        */
+
+       if ((vma->vm_end - vma->vm_start) >
+           (dd->ipath_pbufsport * dd->ipath_palign)) {
+               dev_info(&dd->pcidev->dev, "FAIL mmap piobufs: "
+                        "reqlen %lx > PAGE\n",
+                        vma->vm_end - vma->vm_start);
+               ret = -EFAULT;
+               goto bail;
+       }
+
+       phys = dd->ipath_physaddr + pd->port_piobufs;
+       /*
+        * Do *NOT* mark this as non-cached (PWT bit), or we don't get the
+        * write combining behavior we want on the PIO buffers!
+        * vma->vm_page_prot =
+        *        pgprot_noncached(vma->vm_page_prot);
+        */
+
+       if (vma->vm_flags & VM_READ) {
+               dev_info(&dd->pcidev->dev,
+                        "Can't map piobufs as readable (flags=%lx)\n",
+                        vma->vm_flags);
+               ret = -EPERM;
+               goto bail;
+       }
+
+       /* don't allow them to later change to readable with mprotect */
+
+       vma->vm_flags &= ~VM_MAYWRITE;
+       vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+
+       ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT,
+                                vma->vm_end - vma->vm_start,
+                                vma->vm_page_prot);
+bail:
+       return ret;
+}
+
+static int mmap_rcvegrbufs(struct vm_area_struct *vma,
+                          struct ipath_portdata *pd)
+{
+       struct ipath_devdata *dd = pd->port_dd;
+       unsigned long start, size;
+       size_t total_size, i;
+       dma_addr_t *phys;
+       int ret;
+
+       if (!pd->port_rcvegrbuf) {
+               ret = -EFAULT;
+               goto bail;
+       }
+
+       size = pd->port_rcvegrbuf_size;
+       total_size = pd->port_rcvegrbuf_chunks * size;
+       if ((vma->vm_end - vma->vm_start) > total_size) {
+               dev_info(&dd->pcidev->dev, "FAIL on egr bufs: "
+                        "reqlen %lx > actual %lx\n",
+                        vma->vm_end - vma->vm_start,
+                        (unsigned long) total_size);
+               ret = -EFAULT;
+               goto bail;
+       }
+
+       if (vma->vm_flags & VM_WRITE) {
+               dev_info(&dd->pcidev->dev, "Can't map eager buffers as "
+                        "writable (flags=%lx)\n", vma->vm_flags);
+               ret = -EPERM;
+               goto bail;
+       }
+
+       start = vma->vm_start;
+       phys = pd->port_rcvegrbuf_phys;
+
+       /* don't allow them to later change to writeable with mprotect */
+       vma->vm_flags &= ~VM_MAYWRITE;
+
+       for (i = 0; i < pd->port_rcvegrbuf_chunks; i++, start += size) {
+               ret = remap_pfn_range(vma, start, phys[i] >> PAGE_SHIFT,
+                                     size, vma->vm_page_prot);
+               if (ret < 0)
+                       goto bail;
+       }
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+static int mmap_rcvhdrq(struct vm_area_struct *vma,
+                       struct ipath_portdata *pd)
+{
+       struct ipath_devdata *dd = pd->port_dd;
+       size_t total_size;
+       int ret;
+
+       /*
+        * kmalloc'ed memory, physically contiguous; this is from
+        * spi_rcvhdr_base; we allow user to map read-write so they can
+        * write hdrq entries to allow protocol code to directly poll
+        * whether a hdrq entry has been written.
+        */
+       total_size = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize *
+                          sizeof(u32), PAGE_SIZE);
+       if ((vma->vm_end - vma->vm_start) > total_size) {
+               dev_info(&dd->pcidev->dev,
+                        "FAIL on rcvhdrq: reqlen %lx > actual %lx\n",
+                        vma->vm_end - vma->vm_start,
+                        (unsigned long) total_size);
+               ret = -EFAULT;
+               goto bail;
+       }
+
+       ret = remap_pfn_range(vma, vma->vm_start,
+                             pd->port_rcvhdrq_phys >> PAGE_SHIFT,
+                             vma->vm_end - vma->vm_start,
+                             vma->vm_page_prot);
+bail:
+       return ret;
+}
+
+static int mmap_pioavailregs(struct vm_area_struct *vma,
+                            struct ipath_portdata *pd)
+{
+       struct ipath_devdata *dd = pd->port_dd;
+       int ret;
+
+       /*
+        * when we map the PIO bufferavail registers, we want to map them
+        * read-only; no write possible.
+        *
+        * kmalloc'ed memory, physically contiguous, one page only, readonly
+        */
+
+       if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) {
+               dev_info(&dd->pcidev->dev, "FAIL on pioavailregs_dma: "
+                        "reqlen %lx > actual %lx\n",
+                        vma->vm_end - vma->vm_start,
+                        (unsigned long) PAGE_SIZE);
+               ret = -EFAULT;
+               goto bail;
+       }
+
+       if (vma->vm_flags & VM_WRITE) {
+               dev_info(&dd->pcidev->dev,
+                        "Can't map pioavailregs as writable (flags=%lx)\n",
+                        vma->vm_flags);
+               ret = -EPERM;
+               goto bail;
+       }
+
+       /* don't allow them to later change with mprotect */
+       vma->vm_flags &= ~VM_MAYWRITE;
+
+       ret = remap_pfn_range(vma, vma->vm_start,
+                             dd->ipath_pioavailregs_phys >> PAGE_SHIFT,
+                             PAGE_SIZE, vma->vm_page_prot);
+bail:
+       return ret;
+}
+
+/**
+ * ipath_mmap - mmap various structures into user space
+ * @fp: the file pointer
+ * @vma: the VM area
+ *
+ * We use this to have a shared buffer between the kernel and the user code
+ * for the rcvhdr queue, egr buffers, and the per-port user regs and pio
+ * buffers in the chip.  We have the open and close entries so we can bump
+ * the ref count and keep the driver from being unloaded while still mapped.
+ */
+static int ipath_mmap(struct file *fp, struct vm_area_struct *vma)
+{
+       struct ipath_portdata *pd;
+       struct ipath_devdata *dd;
+       u64 pgaddr, ureg;
+       int ret;
+
+       pd = port_fp(fp);
+       dd = pd->port_dd;
+       /*
+        * This maps the shared buffers set up by ipath_do_user_init()
+        * into the user process. The address referred to by vm_pgoff is the
+        * virtual, not physical, address; we only do one mmap for each
+        * space mapped.
+        */
+       pgaddr = vma->vm_pgoff << PAGE_SHIFT;
+
+       /*
+        * note that ureg does *NOT* have the kregvirt as part of it, to be
+        * sure that for 32 bit programs, we don't end up trying to map a
+        * > 44-bit address.  Has to match ipath_get_base_info() code that sets
+        * __spi_uregbase
+        */
+
+       ureg = dd->ipath_uregbase + dd->ipath_palign * pd->port_port;
+
+       ipath_cdbg(MM, "ushare: pgaddr %llx vm_start=%lx, vmlen %lx\n",
+                  (unsigned long long) pgaddr, vma->vm_start,
+                  vma->vm_end - vma->vm_start);
+
+       if (pgaddr == ureg)
+               ret = mmap_ureg(vma, dd, ureg);
+       else if (pgaddr == pd->port_piobufs)
+               ret = mmap_piobufs(vma, dd, pd);
+       else if (pgaddr == (u64) pd->port_rcvegr_phys)
+               ret = mmap_rcvegrbufs(vma, pd);
+       else if (pgaddr == (u64) pd->port_rcvhdrq_phys)
+               ret = mmap_rcvhdrq(vma, pd);
+       else if (pgaddr == dd->ipath_pioavailregs_phys)
+               ret = mmap_pioavailregs(vma, pd);
+       else
+               ret = -EINVAL;
+
+       vma->vm_private_data = NULL;
+
+       if (ret < 0)
+               dev_info(&dd->pcidev->dev,
+                        "Failure %d on addr %lx, off %lx\n",
+                        -ret, vma->vm_start, vma->vm_pgoff);
+
+       return ret;
+}
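+
+/*
+ * Illustrative sketch only (not part of the driver): userspace learns the
+ * offsets accepted by ipath_mmap() above from the base info structure
+ * filled in by ipath_get_base_info(), and maps each region with a plain
+ * mmap() whose file offset is the kernel-supplied address.  For the PIO
+ * buffers (which must not be mapped readable, per mmap_piobufs()), that
+ * would look roughly like this; the spi_piobufbase field name is an
+ * assumption based on that usage:
+ *
+ *     void *pio = mmap(NULL, piolen, PROT_WRITE, MAP_SHARED,
+ *                      fd, (off_t) binfo.spi_piobufbase);
+ */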
+
+static unsigned int ipath_poll(struct file *fp,
+                              struct poll_table_struct *pt)
+{
+       struct ipath_portdata *pd;
+       u32 head, tail;
+       int bit;
+       struct ipath_devdata *dd;
+
+       pd = port_fp(fp);
+       dd = pd->port_dd;
+
+       bit = pd->port_port + INFINIPATH_R_INTRAVAIL_SHIFT;
+       set_bit(bit, &dd->ipath_rcvctrl);
+
+       /*
+        * Before blocking, make sure that head is still == tail,
+        * reading from the chip, so we can be sure the interrupt
+        * enable has made it to the chip.  If not equal, disable
+        * interrupt again and return immediately.  This avoids races,
+        * and the overhead of the chip read doesn't matter much at
+        * this point, since we are waiting for something anyway.
+        */
+
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+                        dd->ipath_rcvctrl);
+
+       head = ipath_read_ureg32(dd, ur_rcvhdrhead, pd->port_port);
+       tail = ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port);
+
+       if (tail == head) {
+               set_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag);
+               poll_wait(fp, &pd->port_wait, pt);
+
+               if (test_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag)) {
+                       /* timed out, no packets received */
+                       clear_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag);
+                       pd->port_rcvwait_to++;
+               }
+       } else {
+               /* it's already happened; don't do wait_event overhead */
+               pd->port_rcvnowait++;
+       }
+
+       clear_bit(bit, &dd->ipath_rcvctrl);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+                        dd->ipath_rcvctrl);
+
+       return 0;
+}
+
+static int try_alloc_port(struct ipath_devdata *dd, int port,
+                         struct file *fp)
+{
+       int ret;
+
+       if (!dd->ipath_pd[port]) {
+               void *p, *ptmp;
+
+               p = kzalloc(sizeof(struct ipath_portdata), GFP_KERNEL);
+
+               /*
+                * Allocate memory for use in ipath_tid_update() just once
+                * at open, not per call.  Reduces cost of expected send
+                * setup.
+                */
+               ptmp = kmalloc(dd->ipath_rcvtidcnt * sizeof(u16) +
+                              dd->ipath_rcvtidcnt * sizeof(struct page **),
+                              GFP_KERNEL);
+               if (!p || !ptmp) {
+                       ipath_dev_err(dd, "Unable to allocate portdata "
+                                     "memory, failing open\n");
+                       ret = -ENOMEM;
+                       kfree(p);
+                       kfree(ptmp);
+                       goto bail;
+               }
+               dd->ipath_pd[port] = p;
+               dd->ipath_pd[port]->port_port = port;
+               dd->ipath_pd[port]->port_dd = dd;
+               dd->ipath_pd[port]->port_tid_pg_list = ptmp;
+               init_waitqueue_head(&dd->ipath_pd[port]->port_wait);
+       }
+       if (!dd->ipath_pd[port]->port_cnt) {
+               dd->ipath_pd[port]->port_cnt = 1;
+               fp->private_data = (void *) dd->ipath_pd[port];
+               ipath_cdbg(PROC, "%s[%u] opened unit:port %u:%u\n",
+                          current->comm, current->pid, dd->ipath_unit,
+                          port);
+               dd->ipath_pd[port]->port_pid = current->pid;
+               strncpy(dd->ipath_pd[port]->port_comm, current->comm,
+                       sizeof(dd->ipath_pd[port]->port_comm));
+               ipath_stats.sps_ports++;
+               ret = 0;
+               goto bail;
+       }
+       ret = -EBUSY;
+
+bail:
+       return ret;
+}
+
+static inline int usable(struct ipath_devdata *dd)
+{
+       return dd &&
+               (dd->ipath_flags & IPATH_PRESENT) &&
+               dd->ipath_kregbase &&
+               dd->ipath_lid &&
+               !(dd->ipath_flags & (IPATH_LINKDOWN | IPATH_DISABLED
+                                    | IPATH_LINKUNK));
+}
+
+static int find_free_port(int unit, struct file *fp)
+{
+       struct ipath_devdata *dd = ipath_lookup(unit);
+       int ret, i;
+
+       if (!dd) {
+               ret = -ENODEV;
+               goto bail;
+       }
+
+       if (!usable(dd)) {
+               ret = -ENETDOWN;
+               goto bail;
+       }
+
+       for (i = 0; i < dd->ipath_cfgports; i++) {
+               ret = try_alloc_port(dd, i, fp);
+               if (ret != -EBUSY)
+                       goto bail;
+       }
+       ret = -EBUSY;
+
+bail:
+       return ret;
+}
+
+static int find_best_unit(struct file *fp)
+{
+       int ret = 0, i, prefunit = -1, devmax;
+       int maxofallports, npresent, nup;
+       int ndev;
+
+       (void) ipath_count_units(&npresent, &nup, &maxofallports);
+
+       /*
+        * This code is present to allow a knowledgeable person to
+        * specify the layout of processes to processors before opening
+        * this driver, and then we'll assign the process to the "closest"
+        * HT-400 to that processor (we assume reasonable connectivity,
+        * for now).  This code assumes that if affinity has been set
+        * before this point, that at most one cpu is set; for now this
+        * is reasonable.  I check for both cpus_empty() and cpus_full(),
+        * in case some kernel variant sets none of the bits when no
+        * affinity is set.  2.6.11 and 12 kernels have all present
+        * cpus set.  Some day we'll have to fix it up further to handle
+        * a cpu subset.  This algorithm fails for two HT-400's connected
+        * in tunnel fashion.  Eventually this needs real topology
+        * information.  There may be some issues with dual core numbering
+        * as well.  This needs more work prior to release.
+        */
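+       /*
+        * Illustrative numbers only: with 8 online cpus, 2 boards present
+        * and a process bound to cpu 5, the code below computes
+        * prefunit = 5 / (8 / 2) = 1, so the port search starts at unit 1
+        * and falls back to scanning all units only if that board has no
+        * free port.
+        */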
+       if (!cpus_empty(current->cpus_allowed) &&
+           !cpus_full(current->cpus_allowed)) {
+               int ncpus = num_online_cpus(), curcpu = -1;
+               for (i = 0; i < ncpus; i++)
+                       if (cpu_isset(i, current->cpus_allowed)) {
+                               ipath_cdbg(PROC, "%s[%u] affinity set for "
+                                          "cpu %d\n", current->comm,
+                                          current->pid, i);
+                               curcpu = i;
+                       }
+               if (curcpu != -1) {
+                       if (npresent) {
+                               prefunit = curcpu / (ncpus / npresent);
+                               ipath_dbg("%s[%u] %d chips, %d cpus, "
+                                         "%d cpus/chip, select unit %d\n",
+                                         current->comm, current->pid,
+                                         npresent, ncpus, ncpus / npresent,
+                                         prefunit);
+                       }
+               }
+       }
+
+       /*
+        * user ports start at 1, kernel port is 0
+        * For now, we do round-robin access across all chips
+        */
+
+       if (prefunit != -1)
+               devmax = prefunit + 1;
+       else
+               devmax = ipath_count_units(NULL, NULL, NULL);
+recheck:
+       for (i = 1; i < maxofallports; i++) {
+               for (ndev = prefunit != -1 ? prefunit : 0; ndev < devmax;
+                    ndev++) {
+                       struct ipath_devdata *dd = ipath_lookup(ndev);
+
+                       if (!usable(dd))
+                               continue; /* can't use this unit */
+                       if (i >= dd->ipath_cfgports)
+                               /*
+                                * Maxed out on users of this unit. Try
+                                * next.
+                                */
+                               continue;
+                       ret = try_alloc_port(dd, i, fp);
+                       if (!ret)
+                               goto done;
+               }
+       }
+
+       if (npresent) {
+               if (nup == 0) {
+                       ret = -ENETDOWN;
+                       ipath_dbg("No ports available (none initialized "
+                                 "and ready)\n");
+               } else {
+                       if (prefunit > 0) {
+                               /* if started above 0, retry from 0 */
+                               ipath_cdbg(PROC,
+                                          "%s[%u] no ports on prefunit "
+                                          "%d, clear and re-check\n",
+                                          current->comm, current->pid,
+                                          prefunit);
+                               devmax = ipath_count_units(NULL, NULL,
+                                                          NULL);
+                               prefunit = -1;
+                               goto recheck;
+                       }
+                       ret = -EBUSY;
+                       ipath_dbg("No ports available\n");
+               }
+       } else {
+               ret = -ENXIO;
+               ipath_dbg("No boards found\n");
+       }
+
+done:
+       return ret;
+}
+
+static int ipath_open(struct inode *in, struct file *fp)
+{
+       int ret, minor;
+
+       mutex_lock(&ipath_mutex);
+
+       minor = iminor(in);
+       ipath_cdbg(VERBOSE, "open on dev %lx (minor %d)\n",
+                  (long)in->i_rdev, minor);
+
+       if (minor)
+               ret = find_free_port(minor - 1, fp);
+       else
+               ret = find_best_unit(fp);
+
+       mutex_unlock(&ipath_mutex);
+       return ret;
+}
+
+/**
+ * unlock_expected_tids - unlock any expected TID entries port still had in use
+ * @pd: port
+ *
+ * We don't actually update the chip here, because we do a bulk update
+ * below, using ipath_f_clear_tids.
+ */
+static void unlock_expected_tids(struct ipath_portdata *pd)
+{
+       struct ipath_devdata *dd = pd->port_dd;
+       int port_tidbase = pd->port_port * dd->ipath_rcvtidcnt;
+       int i, cnt = 0, maxtid = port_tidbase + dd->ipath_rcvtidcnt;
+
+       ipath_cdbg(VERBOSE, "Port %u unlocking any locked expTID pages\n",
+                  pd->port_port);
+       for (i = port_tidbase; i < maxtid; i++) {
+               if (!dd->ipath_pageshadow[i])
+                       continue;
+
+               ipath_release_user_pages_on_close(&dd->ipath_pageshadow[i],
+                                                 1);
+               dd->ipath_pageshadow[i] = NULL;
+               cnt++;
+               ipath_stats.sps_pageunlocks++;
+       }
+       if (cnt)
+               ipath_cdbg(VERBOSE, "Port %u locked %u expTID entries\n",
+                          pd->port_port, cnt);
+
+       if (ipath_stats.sps_pagelocks || ipath_stats.sps_pageunlocks)
+               ipath_cdbg(VERBOSE, "%llu pages locked, %llu unlocked\n",
+                          (unsigned long long) ipath_stats.sps_pagelocks,
+                          (unsigned long long)
+                          ipath_stats.sps_pageunlocks);
+}
+
+static int ipath_close(struct inode *in, struct file *fp)
+{
+       int ret = 0;
+       struct ipath_portdata *pd;
+       struct ipath_devdata *dd;
+       unsigned port;
+
+       ipath_cdbg(VERBOSE, "close on dev %lx, private data %p\n",
+                  (long)in->i_rdev, fp->private_data);
+
+       mutex_lock(&ipath_mutex);
+
+       pd = port_fp(fp);
+       port = pd->port_port;
+       fp->private_data = NULL;
+       dd = pd->port_dd;
+
+       if (pd->port_hdrqfull) {
+               ipath_cdbg(PROC, "%s[%u] had %u rcvhdrqfull errors "
+                          "during run\n", pd->port_comm, pd->port_pid,
+                          pd->port_hdrqfull);
+               pd->port_hdrqfull = 0;
+       }
+
+       if (pd->port_rcvwait_to || pd->port_piowait_to
+           || pd->port_rcvnowait || pd->port_pionowait) {
+               ipath_cdbg(VERBOSE, "port%u, %u rcv, %u pio wait timeo; "
+                          "%u rcv, %u pio already\n",
+                          pd->port_port, pd->port_rcvwait_to,
+                          pd->port_piowait_to, pd->port_rcvnowait,
+                          pd->port_pionowait);
+               pd->port_rcvwait_to = pd->port_piowait_to =
+                       pd->port_rcvnowait = pd->port_pionowait = 0;
+       }
+       if (pd->port_flag) {
+               ipath_dbg("port %u port_flag still set to 0x%lx\n",
+                         pd->port_port, pd->port_flag);
+               pd->port_flag = 0;
+       }
+
+       if (dd->ipath_kregbase) {
+               if (pd->port_rcvhdrtail_uaddr) {
+                       pd->port_rcvhdrtail_uaddr = 0;
+                       pd->port_rcvhdrtail_kvaddr = NULL;
+                       ipath_release_user_pages_on_close(
+                               &pd->port_rcvhdrtail_pagep, 1);
+                       pd->port_rcvhdrtail_pagep = NULL;
+                       ipath_stats.sps_pageunlocks++;
+               }
+               ipath_write_kreg_port(
+                       dd, dd->ipath_kregs->kr_rcvhdrtailaddr,
+                       port, 0ULL);
+               ipath_write_kreg_port(
+                       dd, dd->ipath_kregs->kr_rcvhdraddr,
+                       pd->port_port, 0);
+
+               /* clean up the pkeys for this port user */
+               ipath_clean_part_key(pd, dd);
+
+               if (port < dd->ipath_cfgports) {
+                       int i = dd->ipath_pbufsport * (port - 1);
+                       ipath_disarm_piobufs(dd, i, dd->ipath_pbufsport);
+
+                       /* atomically clear receive enable port. */
+                       clear_bit(INFINIPATH_R_PORTENABLE_SHIFT + port,
+                                 &dd->ipath_rcvctrl);
+                       ipath_write_kreg(
+                               dd,
+                               dd->ipath_kregs->kr_rcvctrl,
+                               dd->ipath_rcvctrl);
+
+                       if (dd->ipath_pageshadow)
+                               unlock_expected_tids(pd);
+                       ipath_stats.sps_ports--;
+                       ipath_cdbg(PROC, "%s[%u] closed port %u:%u\n",
+                                  pd->port_comm, pd->port_pid,
+                                  dd->ipath_unit, port);
+               }
+       }
+
+       pd->port_cnt = 0;
+       pd->port_pid = 0;
+
+       dd->ipath_f_clear_tids(dd, pd->port_port);
+
+       ipath_free_pddata(dd, pd->port_port, 0);
+
+       mutex_unlock(&ipath_mutex);
+
+       return ret;
+}
+
+static int ipath_port_info(struct ipath_portdata *pd,
+                          struct ipath_port_info __user *uinfo)
+{
+       struct ipath_port_info info;
+       int nup;
+       int ret;
+
+       (void) ipath_count_units(NULL, &nup, NULL);
+       info.num_active = nup;
+       info.unit = pd->port_dd->ipath_unit;
+       info.port = pd->port_port;
+
+       if (copy_to_user(uinfo, &info, sizeof(info))) {
+               ret = -EFAULT;
+               goto bail;
+       }
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+static ssize_t ipath_write(struct file *fp, const char __user *data,
+                          size_t count, loff_t *off)
+{
+       const struct ipath_cmd __user *ucmd;
+       struct ipath_portdata *pd;
+       const void __user *src;
+       size_t consumed, copy;
+       struct ipath_cmd cmd;
+       ssize_t ret = 0;
+       void *dest;
+
+       if (count < sizeof(cmd.type)) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       ucmd = (const struct ipath_cmd __user *) data;
+
+       if (copy_from_user(&cmd.type, &ucmd->type, sizeof(cmd.type))) {
+               ret = -EFAULT;
+               goto bail;
+       }
+
+       consumed = sizeof(cmd.type);
+
+       switch (cmd.type) {
+       case IPATH_CMD_USER_INIT:
+               copy = sizeof(cmd.cmd.user_info);
+               dest = &cmd.cmd.user_info;
+               src = &ucmd->cmd.user_info;
+               break;
+       case IPATH_CMD_RECV_CTRL:
+               copy = sizeof(cmd.cmd.recv_ctrl);
+               dest = &cmd.cmd.recv_ctrl;
+               src = &ucmd->cmd.recv_ctrl;
+               break;
+       case IPATH_CMD_PORT_INFO:
+               copy = sizeof(cmd.cmd.port_info);
+               dest = &cmd.cmd.port_info;
+               src = &ucmd->cmd.port_info;
+               break;
+       case IPATH_CMD_TID_UPDATE:
+       case IPATH_CMD_TID_FREE:
+               copy = sizeof(cmd.cmd.tid_info);
+               dest = &cmd.cmd.tid_info;
+               src = &ucmd->cmd.tid_info;
+               break;
+       case IPATH_CMD_SET_PART_KEY:
+               copy = sizeof(cmd.cmd.part_key);
+               dest = &cmd.cmd.part_key;
+               src = &ucmd->cmd.part_key;
+               break;
+       default:
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       if ((count - consumed) < copy) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       if (copy_from_user(dest, src, copy)) {
+               ret = -EFAULT;
+               goto bail;
+       }
+
+       consumed += copy;
+       pd = port_fp(fp);
+
+       switch (cmd.type) {
+       case IPATH_CMD_USER_INIT:
+               ret = ipath_do_user_init(pd, &cmd.cmd.user_info);
+               if (ret < 0)
+                       goto bail;
+               ret = ipath_get_base_info(
+                       pd, (void __user *) (unsigned long)
+                       cmd.cmd.user_info.spu_base_info,
+                       cmd.cmd.user_info.spu_base_info_size);
+               break;
+       case IPATH_CMD_RECV_CTRL:
+               ret = ipath_manage_rcvq(pd, cmd.cmd.recv_ctrl);
+               break;
+       case IPATH_CMD_PORT_INFO:
+               ret = ipath_port_info(pd,
+                                     (struct ipath_port_info __user *)
+                                     (unsigned long) cmd.cmd.port_info);
+               break;
+       case IPATH_CMD_TID_UPDATE:
+               ret = ipath_tid_update(pd, &cmd.cmd.tid_info);
+               break;
+       case IPATH_CMD_TID_FREE:
+               ret = ipath_tid_free(pd, &cmd.cmd.tid_info);
+               break;
+       case IPATH_CMD_SET_PART_KEY:
+               ret = ipath_set_part_key(pd, cmd.cmd.part_key);
+               break;
+       }
+
+       if (ret >= 0)
+               ret = consumed;
+
+bail:
+       return ret;
+}
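+
+/*
+ * Illustrative sketch only (not part of the driver): every control
+ * operation above is driven by a plain write() of a struct ipath_cmd.
+ * Querying the port a process was assigned might look roughly like this,
+ * with the struct layout as used by the dispatch code above and exact
+ * field types as defined in ipath_common.h:
+ *
+ *     struct ipath_port_info info;
+ *     struct ipath_cmd c;
+ *     c.type = IPATH_CMD_PORT_INFO;
+ *     c.cmd.port_info = (__u64) (unsigned long) &info;
+ *     if (write(fd, &c, sizeof(c)) > 0)
+ *             printf("unit %u port %u\n", info.unit, info.port);
+ */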
+
+static struct class *ipath_class;
+
+static int init_cdev(int minor, char *name, struct file_operations *fops,
+                    struct cdev **cdevp, struct class_device **class_devp)
+{
+       const dev_t dev = MKDEV(IPATH_MAJOR, minor);
+       struct cdev *cdev = NULL;
+       struct class_device *class_dev = NULL;
+       int ret;
+
+       cdev = cdev_alloc();
+       if (!cdev) {
+               printk(KERN_ERR IPATH_DRV_NAME
+                      ": Could not allocate cdev for minor %d, %s\n",
+                      minor, name);
+               ret = -ENOMEM;
+               goto done;
+       }
+
+       cdev->owner = THIS_MODULE;
+       cdev->ops = fops;
+       kobject_set_name(&cdev->kobj, name);
+
+       ret = cdev_add(cdev, dev, 1);
+       if (ret < 0) {
+               printk(KERN_ERR IPATH_DRV_NAME
+                      ": Could not add cdev for minor %d, %s (err %d)\n",
+                      minor, name, -ret);
+               goto err_cdev;
+       }
+
+       class_dev = class_device_create(ipath_class, NULL, dev, NULL, name);
+
+       if (IS_ERR(class_dev)) {
+               ret = PTR_ERR(class_dev);
+               printk(KERN_ERR IPATH_DRV_NAME ": Could not create "
+                      "class_dev for minor %d, %s (err %d)\n",
+                      minor, name, -ret);
+               goto err_cdev;
+       }
+
+       goto done;
+
+err_cdev:
+       cdev_del(cdev);
+       cdev = NULL;
+
+done:
+       if (ret >= 0) {
+               *cdevp = cdev;
+               *class_devp = class_dev;
+       } else {
+               *cdevp = NULL;
+               *class_devp = NULL;
+       }
+
+       return ret;
+}
+
+int ipath_cdev_init(int minor, char *name, struct file_operations *fops,
+                   struct cdev **cdevp, struct class_device **class_devp)
+{
+       return init_cdev(minor, name, fops, cdevp, class_devp);
+}
+
+static void cleanup_cdev(struct cdev **cdevp,
+                        struct class_device **class_devp)
+{
+       struct class_device *class_dev = *class_devp;
+
+       if (class_dev) {
+               class_device_unregister(class_dev);
+               *class_devp = NULL;
+       }
+
+       if (*cdevp) {
+               cdev_del(*cdevp);
+               *cdevp = NULL;
+       }
+}
+
+void ipath_cdev_cleanup(struct cdev **cdevp,
+                       struct class_device **class_devp)
+{
+       cleanup_cdev(cdevp, class_devp);
+}
+
+static struct cdev *wildcard_cdev;
+static struct class_device *wildcard_class_dev;
+
+static const dev_t dev = MKDEV(IPATH_MAJOR, 0);
+
+static int user_init(void)
+{
+       int ret;
+
+       ret = register_chrdev_region(dev, IPATH_NMINORS, IPATH_DRV_NAME);
+       if (ret < 0) {
+               printk(KERN_ERR IPATH_DRV_NAME ": Could not register "
+                      "chrdev region (err %d)\n", -ret);
+               goto done;
+       }
+
+       ipath_class = class_create(THIS_MODULE, IPATH_DRV_NAME);
+
+       if (IS_ERR(ipath_class)) {
+               ret = PTR_ERR(ipath_class);
+               printk(KERN_ERR IPATH_DRV_NAME ": Could not create "
+                      "device class (err %d)\n", -ret);
+               goto bail;
+       }
+
+       goto done;
+bail:
+       unregister_chrdev_region(dev, IPATH_NMINORS);
+done:
+       return ret;
+}
+
+static void user_cleanup(void)
+{
+       if (ipath_class) {
+               class_destroy(ipath_class);
+               ipath_class = NULL;
+       }
+
+       unregister_chrdev_region(dev, IPATH_NMINORS);
+}
+
+static atomic_t user_count = ATOMIC_INIT(0);
+static atomic_t user_setup = ATOMIC_INIT(0);
+
+int ipath_user_add(struct ipath_devdata *dd)
+{
+       char name[10];
+       int ret;
+
+       if (atomic_inc_return(&user_count) == 1) {
+               ret = user_init();
+               if (ret < 0) {
+                       ipath_dev_err(dd, "Unable to set up user support: "
+                                     "error %d\n", -ret);
+                       goto bail;
+               }
+               ret = ipath_diag_init();
+               if (ret < 0) {
+                       ipath_dev_err(dd, "Unable to set up diag support: "
+                                     "error %d\n", -ret);
+                       goto bail_sma;
+               }
+
+               ret = init_cdev(0, "ipath", &ipath_file_ops, &wildcard_cdev,
+                               &wildcard_class_dev);
+               if (ret < 0) {
+                       ipath_dev_err(dd, "Could not create wildcard "
+                                     "minor: error %d\n", -ret);
+                       goto bail_diag;
+               }
+
+               atomic_set(&user_setup, 1);
+       }
+
+       snprintf(name, sizeof(name), "ipath%d", dd->ipath_unit);
+
+       ret = init_cdev(dd->ipath_unit + 1, name, &ipath_file_ops,
+                       &dd->cdev, &dd->class_dev);
+       if (ret < 0)
+               ipath_dev_err(dd, "Could not create user minor %d, %s\n",
+                             dd->ipath_unit + 1, name);
+
+       goto bail;
+
+bail_diag:
+       ipath_diag_cleanup();
+bail_sma:
+       user_cleanup();
+bail:
+       return ret;
+}
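+
+/*
+ * To summarize the naming above: minor 0 is the wildcard "ipath" node
+ * (any unit, handled by find_best_unit()), and each unit N gets an
+ * "ipathN" node at minor N + 1, which ipath_open() routes to
+ * find_free_port(N).  The /dev entries themselves are typically created
+ * by udev from the class devices registered here.
+ */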
+
+void ipath_user_del(struct ipath_devdata *dd)
+{
+       cleanup_cdev(&dd->cdev, &dd->class_dev);
+
+       if (atomic_dec_return(&user_count) == 0) {
+               if (atomic_read(&user_setup) == 0)
+                       goto bail;
+
+               cleanup_cdev(&wildcard_cdev, &wildcard_class_dev);
+               ipath_diag_cleanup();
+               user_cleanup();
+
+               atomic_set(&user_setup, 0);
+       }
+bail:
+       return;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c
new file mode 100644 (file)
index 0000000..e274120
--- /dev/null
@@ -0,0 +1,605 @@
+/*
+ * Copyright (c) 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/version.h>
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/pci.h>
+
+#include "ipath_kernel.h"
+
+#define IPATHFS_MAGIC 0x726a77
+
+static struct super_block *ipath_super;
+
+static int ipathfs_mknod(struct inode *dir, struct dentry *dentry,
+                        int mode, struct file_operations *fops,
+                        void *data)
+{
+       int error;
+       struct inode *inode = new_inode(dir->i_sb);
+
+       if (!inode) {
+               error = -EPERM;
+               goto bail;
+       }
+
+       inode->i_mode = mode;
+       inode->i_uid = 0;
+       inode->i_gid = 0;
+       inode->i_blksize = PAGE_CACHE_SIZE;
+       inode->i_blocks = 0;
+       inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+       inode->u.generic_ip = data;
+       if ((mode & S_IFMT) == S_IFDIR) {
+               inode->i_op = &simple_dir_inode_operations;
+               inode->i_nlink++;
+               dir->i_nlink++;
+       }
+
+       inode->i_fop = fops;
+
+       d_instantiate(dentry, inode);
+       error = 0;
+
+bail:
+       return error;
+}
+
+static int create_file(const char *name, mode_t mode,
+                      struct dentry *parent, struct dentry **dentry,
+                      struct file_operations *fops, void *data)
+{
+       int error;
+
+       *dentry = NULL;
+       mutex_lock(&parent->d_inode->i_mutex);
+       *dentry = lookup_one_len(name, parent, strlen(name));
+       if (!IS_ERR(*dentry))
+               error = ipathfs_mknod(parent->d_inode, *dentry,
+                                     mode, fops, data);
+       else
+               error = PTR_ERR(*dentry);
+       mutex_unlock(&parent->d_inode->i_mutex);
+
+       return error;
+}
+
+static ssize_t atomic_stats_read(struct file *file, char __user *buf,
+                                size_t count, loff_t *ppos)
+{
+       return simple_read_from_buffer(buf, count, ppos, &ipath_stats,
+                                      sizeof ipath_stats);
+}
+
+static struct file_operations atomic_stats_ops = {
+       .read = atomic_stats_read,
+};
+
+#define NUM_COUNTERS (sizeof(struct infinipath_counters) / sizeof(u64))
+
+static ssize_t atomic_counters_read(struct file *file, char __user *buf,
+                                   size_t count, loff_t *ppos)
+{
+       u64 counters[NUM_COUNTERS];
+       u16 i;
+       struct ipath_devdata *dd;
+
+       dd = file->f_dentry->d_inode->u.generic_ip;
+
+       for (i = 0; i < NUM_COUNTERS; i++)
+               counters[i] = ipath_snap_cntr(dd, i);
+
+       return simple_read_from_buffer(buf, count, ppos, counters,
+                                      sizeof counters);
+}
+
+static struct file_operations atomic_counters_ops = {
+       .read = atomic_counters_read,
+};
+
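+/*
+ * Editorial sketch, not part of the original submission: the per-unit
+ * "atomic_counters" file above exposes a flat array of 64-bit counters,
+ * one per ipath_snap_cntr() index.  Assuming ipathfs were mounted at a
+ * hypothetical /ipathfs, a userspace reader might look like:
+ *
+ *     u64 c[NUM_COUNTERS];
+ *     int fd = open("/ipathfs/01/atomic_counters", O_RDONLY);
+ *
+ *     if (fd >= 0 && read(fd, c, sizeof(c)) == sizeof(c))
+ *             printf("counter 0: %llu\n", (unsigned long long) c[0]);
+ */
+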
+static ssize_t atomic_node_info_read(struct file *file, char __user *buf,
+                                    size_t count, loff_t *ppos)
+{
+       u32 nodeinfo[10];
+       struct ipath_devdata *dd;
+       u64 guid;
+
+       dd = file->f_dentry->d_inode->u.generic_ip;
+
+       guid = be64_to_cpu(dd->ipath_guid);
+
+       nodeinfo[0] =                   /* BaseVersion is SMA */
+               /* ClassVersion is SMA */
+               (1 << 8)                /* NodeType  */
+               | (1 << 0);             /* NumPorts */
+       nodeinfo[1] = (u32) (guid >> 32);
+       nodeinfo[2] = (u32) (guid & 0xffffffff);
+       /* PortGUID == SystemImageGUID for us */
+       nodeinfo[3] = nodeinfo[1];
+       /* PortGUID == SystemImageGUID for us */
+       nodeinfo[4] = nodeinfo[2];
+       /* PortGUID == NodeGUID for us */
+       nodeinfo[5] = nodeinfo[3];
+       /* PortGUID == NodeGUID for us */
+       nodeinfo[6] = nodeinfo[4];
+       nodeinfo[7] = (4 << 16) /* we support 4 pkeys */
+               | (dd->ipath_deviceid << 0);
+       /* our chip version as 16 bits major, 16 bits minor */
+       nodeinfo[8] = dd->ipath_minrev | (dd->ipath_majrev << 16);
+       nodeinfo[9] = (dd->ipath_unit << 24) | (dd->ipath_vendorid << 0);
+
+       return simple_read_from_buffer(buf, count, ppos, nodeinfo,
+                                      sizeof nodeinfo);
+}
+
+static struct file_operations atomic_node_info_ops = {
+       .read = atomic_node_info_read,
+};
+
+static ssize_t atomic_port_info_read(struct file *file, char __user *buf,
+                                    size_t count, loff_t *ppos)
+{
+       u32 portinfo[13];
+       u32 tmp, tmp2;
+       struct ipath_devdata *dd;
+
+       dd = file->f_dentry->d_inode->u.generic_ip;
+
+       /* zero everything so we only have to set the non-zero fields. */
+       memset(portinfo, 0, sizeof portinfo);
+
+       /*
+        * Notimpl yet M_Key (64)
+        * Notimpl yet GID (64)
+        */
+
+       portinfo[4] = (dd->ipath_lid << 16);
+
+       /*
+        * Notimpl yet SMLID (should we store this in the driver, in case
+        * SMA dies?)  CapabilityMask is 0, we don't support any of these
+        * DiagCode is 0; we don't store any diag info for now Notimpl yet
+        * M_KeyLeasePeriod (we don't support M_Key)
+        */
+
+       /* LocalPortNum is whichever port number they ask for */
+       portinfo[7] = (dd->ipath_unit << 24)
+               /* LinkWidthEnabled */
+               | (2 << 16)
+               /* LinkWidthSupported (really 2, but not IB valid) */
+               | (3 << 8)
+               /* LinkWidthActive */
+               | (2 << 0);
+       tmp = dd->ipath_lastibcstat & IPATH_IBSTATE_MASK;
+       tmp2 = 5;
+       if (tmp == IPATH_IBSTATE_INIT)
+               tmp = 2;
+       else if (tmp == IPATH_IBSTATE_ARM)
+               tmp = 3;
+       else if (tmp == IPATH_IBSTATE_ACTIVE)
+               tmp = 4;
+       else {
+               tmp = 0;        /* down */
+               tmp2 = tmp & 0xf;
+       }
+
+       portinfo[8] = (1 << 28) /* LinkSpeedSupported */
+               | (tmp << 24)   /* PortState */
+               | (tmp2 << 20)  /* PortPhysicalState */
+               | (2 << 16)
+
+               /* LinkDownDefaultState */
+               /* M_KeyProtectBits == 0 */
+               /* NotImpl yet LMC == 0 (we can support all values) */
+               | (1 << 4)      /* LinkSpeedActive */
+               | (1 << 0);     /* LinkSpeedEnabled */
+       switch (dd->ipath_ibmtu) {
+       case 4096:
+               tmp = 5;
+               break;
+       case 2048:
+               tmp = 4;
+               break;
+       case 1024:
+               tmp = 3;
+               break;
+       case 512:
+               tmp = 2;
+               break;
+       case 256:
+               tmp = 1;
+               break;
+       default:                /* oops, something is wrong */
+               ipath_dbg("Problem, ipath_ibmtu 0x%x not a valid IB MTU, "
+                         "treat as 2048\n", dd->ipath_ibmtu);
+               tmp = 4;
+               break;
+       }
+       portinfo[9] = (tmp << 28)
+               /* NeighborMTU */
+               /* Notimpl MasterSMSL */
+               | (1 << 20)
+
+               /* VLCap */
+               /* Notimpl InitType (actually, an SMA decision) */
+               /* VLHighLimit is 0 (only one VL) */
+               ; /* VLArbitrationHighCap is 0 (only one VL) */
+       portinfo[10] =  /* VLArbitrationLowCap is 0 (only one VL) */
+               /* InitTypeReply is SMA decision */
+               (5 << 16)       /* MTUCap 4096 */
+               | (7 << 13)     /* VLStallCount */
+               | (0x1f << 8)   /* HOQLife */
+               | (1 << 4)
+
+               /* OperationalVLs 0 */
+               /* PartitionEnforcementInbound */
+               /* PartitionEnforcementOutbound not enforced */
+               /* FilterRawinbound not enforced */
+               ;               /* FilterRawOutbound not enforced */
+       /* M_KeyViolations are not counted by hardware, SMA can count */
+       tmp = ipath_read_creg32(dd, dd->ipath_cregs->cr_errpkey);
+       /* P_KeyViolations are counted by hardware. */
+       portinfo[11] = ((tmp & 0xffff) << 0);
+       portinfo[12] =
+               /* Q_KeyViolations are not counted by hardware */
+               (1 << 8)
+
+               /* GUIDCap */
+               /* SubnetTimeOut handled by SMA */
+               /* RespTimeValue handled by SMA */
+               ;
+       /* LocalPhyErrors are programmed to max */
+       portinfo[12] |= (0xf << 20)
+               | (0xf << 16)   /* OverRunErrors are programmed to max */
+               ;
+
+       return simple_read_from_buffer(buf, count, ppos, portinfo,
+                                      sizeof portinfo);
+}
+
+static struct file_operations atomic_port_info_ops = {
+       .read = atomic_port_info_read,
+};
+
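+/*
+ * Editorial note, not from the original patch: given the packing used in
+ * atomic_port_info_read() above, a reader of the "port_info" file can
+ * recover the LID from word 4 and the link state from word 8, e.g.:
+ *
+ *     lid             = portinfo[4] >> 16;
+ *     port_state      = (portinfo[8] >> 24) & 0xf;
+ *     port_phys_state = (portinfo[8] >> 20) & 0xf;
+ */
+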
+static ssize_t flash_read(struct file *file, char __user *buf,
+                         size_t count, loff_t *ppos)
+{
+       struct ipath_devdata *dd;
+       ssize_t ret;
+       loff_t pos;
+       char *tmp;
+
+       pos = *ppos;
+
+       if (pos < 0) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       if (pos >= sizeof(struct ipath_flash)) {
+               ret = 0;
+               goto bail;
+       }
+
+       if (count > sizeof(struct ipath_flash) - pos)
+               count = sizeof(struct ipath_flash) - pos;
+
+       tmp = kmalloc(count, GFP_KERNEL);
+       if (!tmp) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+
+       dd = file->f_dentry->d_inode->u.generic_ip;
+       if (ipath_eeprom_read(dd, pos, tmp, count)) {
+               ipath_dev_err(dd, "failed to read from flash\n");
+               ret = -ENXIO;
+               goto bail_tmp;
+       }
+
+       if (copy_to_user(buf, tmp, count)) {
+               ret = -EFAULT;
+               goto bail_tmp;
+       }
+
+       *ppos = pos + count;
+       ret = count;
+
+bail_tmp:
+       kfree(tmp);
+
+bail:
+       return ret;
+}
+
+static ssize_t flash_write(struct file *file, const char __user *buf,
+                          size_t count, loff_t *ppos)
+{
+       struct ipath_devdata *dd;
+       ssize_t ret;
+       loff_t pos;
+       char *tmp;
+
+       pos = *ppos;
+
+       if (pos < 0) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       if (pos >= sizeof(struct ipath_flash)) {
+               ret = 0;
+               goto bail;
+       }
+
+       if (count > sizeof(struct ipath_flash) - pos)
+               count = sizeof(struct ipath_flash) - pos;
+
+       tmp = kmalloc(count, GFP_KERNEL);
+       if (!tmp) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+
+       if (copy_from_user(tmp, buf, count)) {
+               ret = -EFAULT;
+               goto bail_tmp;
+       }
+
+       dd = file->f_dentry->d_inode->u.generic_ip;
+       if (ipath_eeprom_write(dd, pos, tmp, count)) {
+               ret = -ENXIO;
+               ipath_dev_err(dd, "failed to write to flash\n");
+               goto bail_tmp;
+       }
+
+       *ppos = pos + count;
+       ret = count;
+
+bail_tmp:
+       kfree(tmp);
+
+bail:
+       return ret;
+}
+
+static struct file_operations flash_ops = {
+       .read = flash_read,
+       .write = flash_write,
+};
+
+static int create_device_files(struct super_block *sb,
+                              struct ipath_devdata *dd)
+{
+       struct dentry *dir, *tmp;
+       char unit[10];
+       int ret;
+
+       snprintf(unit, sizeof unit, "%02d", dd->ipath_unit);
+       ret = create_file(unit, S_IFDIR|S_IRUGO|S_IXUGO, sb->s_root, &dir,
+                         (struct file_operations *) &simple_dir_operations,
+                         dd);
+       if (ret) {
+               printk(KERN_ERR "create_file(%s) failed: %d\n", unit, ret);
+               goto bail;
+       }
+
+       ret = create_file("atomic_counters", S_IFREG|S_IRUGO, dir, &tmp,
+                         &atomic_counters_ops, dd);
+       if (ret) {
+               printk(KERN_ERR "create_file(%s/atomic_counters) "
+                      "failed: %d\n", unit, ret);
+               goto bail;
+       }
+
+       ret = create_file("node_info", S_IFREG|S_IRUGO, dir, &tmp,
+                         &atomic_node_info_ops, dd);
+       if (ret) {
+               printk(KERN_ERR "create_file(%s/node_info) "
+                      "failed: %d\n", unit, ret);
+               goto bail;
+       }
+
+       ret = create_file("port_info", S_IFREG|S_IRUGO, dir, &tmp,
+                         &atomic_port_info_ops, dd);
+       if (ret) {
+               printk(KERN_ERR "create_file(%s/port_info) "
+                      "failed: %d\n", unit, ret);
+               goto bail;
+       }
+
+       ret = create_file("flash", S_IFREG|S_IWUSR|S_IRUGO, dir, &tmp,
+                         &flash_ops, dd);
+       if (ret) {
+               printk(KERN_ERR "create_file(%s/flash) "
+                      "failed: %d\n", unit, ret);
+               goto bail;
+       }
+
+bail:
+       return ret;
+}
+
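+/*
+ * Editorial note, not part of the original submission: after
+ * create_device_files() runs for a unit, the ipathfs tree contains a
+ * per-unit directory named with the two-digit unit number, e.g.
+ *
+ *     <mountpoint>/atomic_stats              (created in fill_super below)
+ *     <mountpoint>/01/atomic_counters
+ *     <mountpoint>/01/node_info
+ *     <mountpoint>/01/port_info
+ *     <mountpoint>/01/flash
+ */
+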
+static void remove_file(struct dentry *parent, char *name)
+{
+       struct dentry *tmp;
+
+       tmp = lookup_one_len(name, parent, strlen(name));
+
+       spin_lock(&dcache_lock);
+       spin_lock(&tmp->d_lock);
+       if (!(d_unhashed(tmp) && tmp->d_inode)) {
+               dget_locked(tmp);
+               __d_drop(tmp);
+               spin_unlock(&tmp->d_lock);
+               spin_unlock(&dcache_lock);
+               simple_unlink(parent->d_inode, tmp);
+       } else {
+               spin_unlock(&tmp->d_lock);
+               spin_unlock(&dcache_lock);
+       }
+}
+
+static int remove_device_files(struct super_block *sb,
+                              struct ipath_devdata *dd)
+{
+       struct dentry *dir, *root;
+       char unit[10];
+       int ret;
+
+       root = dget(sb->s_root);
+       mutex_lock(&root->d_inode->i_mutex);
+       snprintf(unit, sizeof unit, "%02d", dd->ipath_unit);
+       dir = lookup_one_len(unit, root, strlen(unit));
+
+       if (IS_ERR(dir)) {
+               ret = PTR_ERR(dir);
+               printk(KERN_ERR "Lookup of %s failed\n", unit);
+               goto bail;
+       }
+
+       remove_file(dir, "flash");
+       remove_file(dir, "port_info");
+       remove_file(dir, "node_info");
+       remove_file(dir, "atomic_counters");
+       d_delete(dir);
+       ret = simple_rmdir(root->d_inode, dir);
+
+bail:
+       mutex_unlock(&root->d_inode->i_mutex);
+       dput(root);
+       return ret;
+}
+
+static int ipathfs_fill_super(struct super_block *sb, void *data,
+                             int silent)
+{
+       struct ipath_devdata *dd, *tmp;
+       unsigned long flags;
+       int ret;
+
+       static struct tree_descr files[] = {
+               [1] = {"atomic_stats", &atomic_stats_ops, S_IRUGO},
+               {""},
+       };
+
+       ret = simple_fill_super(sb, IPATHFS_MAGIC, files);
+       if (ret) {
+               printk(KERN_ERR "simple_fill_super failed: %d\n", ret);
+               goto bail;
+       }
+
+       spin_lock_irqsave(&ipath_devs_lock, flags);
+
+       list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) {
+               spin_unlock_irqrestore(&ipath_devs_lock, flags);
+               ret = create_device_files(sb, dd);
+               if (ret) {
+                       deactivate_super(sb);
+                       goto bail;
+               }
+               spin_lock_irqsave(&ipath_devs_lock, flags);
+       }
+
+       spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+bail:
+       return ret;
+}
+
+static struct super_block *ipathfs_get_sb(struct file_system_type *fs_type,
+                                       int flags, const char *dev_name,
+                                       void *data)
+{
+       ipath_super = get_sb_single(fs_type, flags, data,
+                                   ipathfs_fill_super);
+       return ipath_super;
+}
+
+static void ipathfs_kill_super(struct super_block *s)
+{
+       kill_litter_super(s);
+       ipath_super = NULL;
+}
+
+int ipathfs_add_device(struct ipath_devdata *dd)
+{
+       int ret;
+
+       if (ipath_super == NULL) {
+               ret = 0;
+               goto bail;
+       }
+
+       ret = create_device_files(ipath_super, dd);
+
+bail:
+       return ret;
+}
+
+int ipathfs_remove_device(struct ipath_devdata *dd)
+{
+       int ret;
+
+       if (ipath_super == NULL) {
+               ret = 0;
+               goto bail;
+       }
+
+       ret = remove_device_files(ipath_super, dd);
+
+bail:
+       return ret;
+}
+
+static struct file_system_type ipathfs_fs_type = {
+       .owner =        THIS_MODULE,
+       .name =         "ipathfs",
+       .get_sb =       ipathfs_get_sb,
+       .kill_sb =      ipathfs_kill_super,
+};
+
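+/*
+ * Editorial note, not from the original patch: once ipath_init_ipathfs()
+ * below registers the filesystem, it can be mounted in the usual way,
+ * e.g. (the mount point is arbitrary):
+ *
+ *     mount -t ipathfs none /ipathfs
+ */
+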
+int __init ipath_init_ipathfs(void)
+{
+       return register_filesystem(&ipathfs_fs_type);
+}
+
+void __exit ipath_exit_ipathfs(void)
+{
+       unregister_filesystem(&ipathfs_fs_type);
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_ht400.c b/drivers/infiniband/hw/ipath/ipath_ht400.c
new file mode 100644 (file)
index 0000000..4652435
--- /dev/null
@@ -0,0 +1,1586 @@
+/*
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file contains all of the code that is specific to the InfiniPath
+ * HT-400 chip.
+ */
+
+#include <linux/pci.h>
+#include <linux/delay.h>
+
+#include "ipath_kernel.h"
+#include "ipath_registers.h"
+
+/*
+ * This lists the InfiniPath HT400 registers, in the actual chip layout.
+ * This structure should never be directly accessed.
+ *
+ * The names are in InterCap form because they're taken straight from
+ * the chip specification.  Since they're only used in this file, they
+ * don't pollute the rest of the source.
+ */
+
+struct _infinipath_do_not_use_kernel_regs {
+       unsigned long long Revision;
+       unsigned long long Control;
+       unsigned long long PageAlign;
+       unsigned long long PortCnt;
+       unsigned long long DebugPortSelect;
+       unsigned long long DebugPort;
+       unsigned long long SendRegBase;
+       unsigned long long UserRegBase;
+       unsigned long long CounterRegBase;
+       unsigned long long Scratch;
+       unsigned long long ReservedMisc1;
+       unsigned long long InterruptConfig;
+       unsigned long long IntBlocked;
+       unsigned long long IntMask;
+       unsigned long long IntStatus;
+       unsigned long long IntClear;
+       unsigned long long ErrorMask;
+       unsigned long long ErrorStatus;
+       unsigned long long ErrorClear;
+       unsigned long long HwErrMask;
+       unsigned long long HwErrStatus;
+       unsigned long long HwErrClear;
+       unsigned long long HwDiagCtrl;
+       unsigned long long MDIO;
+       unsigned long long IBCStatus;
+       unsigned long long IBCCtrl;
+       unsigned long long ExtStatus;
+       unsigned long long ExtCtrl;
+       unsigned long long GPIOOut;
+       unsigned long long GPIOMask;
+       unsigned long long GPIOStatus;
+       unsigned long long GPIOClear;
+       unsigned long long RcvCtrl;
+       unsigned long long RcvBTHQP;
+       unsigned long long RcvHdrSize;
+       unsigned long long RcvHdrCnt;
+       unsigned long long RcvHdrEntSize;
+       unsigned long long RcvTIDBase;
+       unsigned long long RcvTIDCnt;
+       unsigned long long RcvEgrBase;
+       unsigned long long RcvEgrCnt;
+       unsigned long long RcvBufBase;
+       unsigned long long RcvBufSize;
+       unsigned long long RxIntMemBase;
+       unsigned long long RxIntMemSize;
+       unsigned long long RcvPartitionKey;
+       unsigned long long ReservedRcv[10];
+       unsigned long long SendCtrl;
+       unsigned long long SendPIOBufBase;
+       unsigned long long SendPIOSize;
+       unsigned long long SendPIOBufCnt;
+       unsigned long long SendPIOAvailAddr;
+       unsigned long long TxIntMemBase;
+       unsigned long long TxIntMemSize;
+       unsigned long long ReservedSend[9];
+       unsigned long long SendBufferError;
+       unsigned long long SendBufferErrorCONT1;
+       unsigned long long SendBufferErrorCONT2;
+       unsigned long long SendBufferErrorCONT3;
+       unsigned long long ReservedSBE[4];
+       unsigned long long RcvHdrAddr0;
+       unsigned long long RcvHdrAddr1;
+       unsigned long long RcvHdrAddr2;
+       unsigned long long RcvHdrAddr3;
+       unsigned long long RcvHdrAddr4;
+       unsigned long long RcvHdrAddr5;
+       unsigned long long RcvHdrAddr6;
+       unsigned long long RcvHdrAddr7;
+       unsigned long long RcvHdrAddr8;
+       unsigned long long ReservedRHA[7];
+       unsigned long long RcvHdrTailAddr0;
+       unsigned long long RcvHdrTailAddr1;
+       unsigned long long RcvHdrTailAddr2;
+       unsigned long long RcvHdrTailAddr3;
+       unsigned long long RcvHdrTailAddr4;
+       unsigned long long RcvHdrTailAddr5;
+       unsigned long long RcvHdrTailAddr6;
+       unsigned long long RcvHdrTailAddr7;
+       unsigned long long RcvHdrTailAddr8;
+       unsigned long long ReservedRHTA[7];
+       unsigned long long Sync;        /* Software only */
+       unsigned long long Dump;        /* Software only */
+       unsigned long long SimVer;      /* Software only */
+       unsigned long long ReservedSW[5];
+       unsigned long long SerdesConfig0;
+       unsigned long long SerdesConfig1;
+       unsigned long long SerdesStatus;
+       unsigned long long XGXSConfig;
+       unsigned long long ReservedSW2[4];
+};
+
+#define IPATH_KREG_OFFSET(field) (offsetof(struct \
+    _infinipath_do_not_use_kernel_regs, field) / sizeof(u64))
+#define IPATH_CREG_OFFSET(field) (offsetof( \
+    struct infinipath_counters, field) / sizeof(u64))
+
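+/*
+ * Editorial example, not part of the original submission: the macros above
+ * turn a field's position in the layout struct into a 64-bit word index
+ * into the chip's register space.  For instance, Scratch is the tenth
+ * qword in the layout, so IPATH_KREG_OFFSET(Scratch) evaluates to 9; a
+ * compile-time check could assert this with
+ * BUILD_BUG_ON(IPATH_KREG_OFFSET(Scratch) != 9).
+ */
+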
+static const struct ipath_kregs ipath_ht_kregs = {
+       .kr_control = IPATH_KREG_OFFSET(Control),
+       .kr_counterregbase = IPATH_KREG_OFFSET(CounterRegBase),
+       .kr_debugport = IPATH_KREG_OFFSET(DebugPort),
+       .kr_debugportselect = IPATH_KREG_OFFSET(DebugPortSelect),
+       .kr_errorclear = IPATH_KREG_OFFSET(ErrorClear),
+       .kr_errormask = IPATH_KREG_OFFSET(ErrorMask),
+       .kr_errorstatus = IPATH_KREG_OFFSET(ErrorStatus),
+       .kr_extctrl = IPATH_KREG_OFFSET(ExtCtrl),
+       .kr_extstatus = IPATH_KREG_OFFSET(ExtStatus),
+       .kr_gpio_clear = IPATH_KREG_OFFSET(GPIOClear),
+       .kr_gpio_mask = IPATH_KREG_OFFSET(GPIOMask),
+       .kr_gpio_out = IPATH_KREG_OFFSET(GPIOOut),
+       .kr_gpio_status = IPATH_KREG_OFFSET(GPIOStatus),
+       .kr_hwdiagctrl = IPATH_KREG_OFFSET(HwDiagCtrl),
+       .kr_hwerrclear = IPATH_KREG_OFFSET(HwErrClear),
+       .kr_hwerrmask = IPATH_KREG_OFFSET(HwErrMask),
+       .kr_hwerrstatus = IPATH_KREG_OFFSET(HwErrStatus),
+       .kr_ibcctrl = IPATH_KREG_OFFSET(IBCCtrl),
+       .kr_ibcstatus = IPATH_KREG_OFFSET(IBCStatus),
+       .kr_intblocked = IPATH_KREG_OFFSET(IntBlocked),
+       .kr_intclear = IPATH_KREG_OFFSET(IntClear),
+       .kr_interruptconfig = IPATH_KREG_OFFSET(InterruptConfig),
+       .kr_intmask = IPATH_KREG_OFFSET(IntMask),
+       .kr_intstatus = IPATH_KREG_OFFSET(IntStatus),
+       .kr_mdio = IPATH_KREG_OFFSET(MDIO),
+       .kr_pagealign = IPATH_KREG_OFFSET(PageAlign),
+       .kr_partitionkey = IPATH_KREG_OFFSET(RcvPartitionKey),
+       .kr_portcnt = IPATH_KREG_OFFSET(PortCnt),
+       .kr_rcvbthqp = IPATH_KREG_OFFSET(RcvBTHQP),
+       .kr_rcvbufbase = IPATH_KREG_OFFSET(RcvBufBase),
+       .kr_rcvbufsize = IPATH_KREG_OFFSET(RcvBufSize),
+       .kr_rcvctrl = IPATH_KREG_OFFSET(RcvCtrl),
+       .kr_rcvegrbase = IPATH_KREG_OFFSET(RcvEgrBase),
+       .kr_rcvegrcnt = IPATH_KREG_OFFSET(RcvEgrCnt),
+       .kr_rcvhdrcnt = IPATH_KREG_OFFSET(RcvHdrCnt),
+       .kr_rcvhdrentsize = IPATH_KREG_OFFSET(RcvHdrEntSize),
+       .kr_rcvhdrsize = IPATH_KREG_OFFSET(RcvHdrSize),
+       .kr_rcvintmembase = IPATH_KREG_OFFSET(RxIntMemBase),
+       .kr_rcvintmemsize = IPATH_KREG_OFFSET(RxIntMemSize),
+       .kr_rcvtidbase = IPATH_KREG_OFFSET(RcvTIDBase),
+       .kr_rcvtidcnt = IPATH_KREG_OFFSET(RcvTIDCnt),
+       .kr_revision = IPATH_KREG_OFFSET(Revision),
+       .kr_scratch = IPATH_KREG_OFFSET(Scratch),
+       .kr_sendbuffererror = IPATH_KREG_OFFSET(SendBufferError),
+       .kr_sendctrl = IPATH_KREG_OFFSET(SendCtrl),
+       .kr_sendpioavailaddr = IPATH_KREG_OFFSET(SendPIOAvailAddr),
+       .kr_sendpiobufbase = IPATH_KREG_OFFSET(SendPIOBufBase),
+       .kr_sendpiobufcnt = IPATH_KREG_OFFSET(SendPIOBufCnt),
+       .kr_sendpiosize = IPATH_KREG_OFFSET(SendPIOSize),
+       .kr_sendregbase = IPATH_KREG_OFFSET(SendRegBase),
+       .kr_txintmembase = IPATH_KREG_OFFSET(TxIntMemBase),
+       .kr_txintmemsize = IPATH_KREG_OFFSET(TxIntMemSize),
+       .kr_userregbase = IPATH_KREG_OFFSET(UserRegBase),
+       .kr_serdesconfig0 = IPATH_KREG_OFFSET(SerdesConfig0),
+       .kr_serdesconfig1 = IPATH_KREG_OFFSET(SerdesConfig1),
+       .kr_serdesstatus = IPATH_KREG_OFFSET(SerdesStatus),
+       .kr_xgxsconfig = IPATH_KREG_OFFSET(XGXSConfig),
+       /*
+        * These should not be used directly via ipath_read_kreg64();
+        * use them with ipath_read_kreg64_port().
+        */
+       .kr_rcvhdraddr = IPATH_KREG_OFFSET(RcvHdrAddr0),
+       .kr_rcvhdrtailaddr = IPATH_KREG_OFFSET(RcvHdrTailAddr0)
+};
+
+static const struct ipath_cregs ipath_ht_cregs = {
+       .cr_badformatcnt = IPATH_CREG_OFFSET(RxBadFormatCnt),
+       .cr_erricrccnt = IPATH_CREG_OFFSET(RxICRCErrCnt),
+       .cr_errlinkcnt = IPATH_CREG_OFFSET(RxLinkProblemCnt),
+       .cr_errlpcrccnt = IPATH_CREG_OFFSET(RxLPCRCErrCnt),
+       .cr_errpkey = IPATH_CREG_OFFSET(RxPKeyMismatchCnt),
+       .cr_errrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowCtrlErrCnt),
+       .cr_err_rlencnt = IPATH_CREG_OFFSET(RxLenErrCnt),
+       .cr_errslencnt = IPATH_CREG_OFFSET(TxLenErrCnt),
+       .cr_errtidfull = IPATH_CREG_OFFSET(RxTIDFullErrCnt),
+       .cr_errtidvalid = IPATH_CREG_OFFSET(RxTIDValidErrCnt),
+       .cr_errvcrccnt = IPATH_CREG_OFFSET(RxVCRCErrCnt),
+       .cr_ibstatuschange = IPATH_CREG_OFFSET(IBStatusChangeCnt),
+       /* calc from Reg_CounterRegBase + offset */
+       .cr_intcnt = IPATH_CREG_OFFSET(LBIntCnt),
+       .cr_invalidrlencnt = IPATH_CREG_OFFSET(RxMaxMinLenErrCnt),
+       .cr_invalidslencnt = IPATH_CREG_OFFSET(TxMaxMinLenErrCnt),
+       .cr_lbflowstallcnt = IPATH_CREG_OFFSET(LBFlowStallCnt),
+       .cr_pktrcvcnt = IPATH_CREG_OFFSET(RxDataPktCnt),
+       .cr_pktrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowPktCnt),
+       .cr_pktsendcnt = IPATH_CREG_OFFSET(TxDataPktCnt),
+       .cr_pktsendflowcnt = IPATH_CREG_OFFSET(TxFlowPktCnt),
+       .cr_portovflcnt = IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt),
+       .cr_rcvebpcnt = IPATH_CREG_OFFSET(RxEBPCnt),
+       .cr_rcvovflcnt = IPATH_CREG_OFFSET(RxBufOvflCnt),
+       .cr_senddropped = IPATH_CREG_OFFSET(TxDroppedPktCnt),
+       .cr_sendstallcnt = IPATH_CREG_OFFSET(TxFlowStallCnt),
+       .cr_sendunderruncnt = IPATH_CREG_OFFSET(TxUnderrunCnt),
+       .cr_wordrcvcnt = IPATH_CREG_OFFSET(RxDwordCnt),
+       .cr_wordsendcnt = IPATH_CREG_OFFSET(TxDwordCnt),
+       .cr_unsupvlcnt = IPATH_CREG_OFFSET(TxUnsupVLErrCnt),
+       .cr_rxdroppktcnt = IPATH_CREG_OFFSET(RxDroppedPktCnt),
+       .cr_iblinkerrrecovcnt = IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt),
+       .cr_iblinkdowncnt = IPATH_CREG_OFFSET(IBLinkDownedCnt),
+       .cr_ibsymbolerrcnt = IPATH_CREG_OFFSET(IBSymbolErrCnt)
+};
+
+/* kr_intstatus, kr_intclear, kr_intmask bits */
+#define INFINIPATH_I_RCVURG_MASK 0x1FF
+#define INFINIPATH_I_RCVAVAIL_MASK 0x1FF
+
+/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */
+#define INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT 0
+#define INFINIPATH_HWE_HTCMEMPARITYERR_MASK 0x3FFFFFULL
+#define INFINIPATH_HWE_HTCLNKABYTE0CRCERR   0x0000000000800000ULL
+#define INFINIPATH_HWE_HTCLNKABYTE1CRCERR   0x0000000001000000ULL
+#define INFINIPATH_HWE_HTCLNKBBYTE0CRCERR   0x0000000002000000ULL
+#define INFINIPATH_HWE_HTCLNKBBYTE1CRCERR   0x0000000004000000ULL
+#define INFINIPATH_HWE_HTCMISCERR4          0x0000000008000000ULL
+#define INFINIPATH_HWE_HTCMISCERR5          0x0000000010000000ULL
+#define INFINIPATH_HWE_HTCMISCERR6          0x0000000020000000ULL
+#define INFINIPATH_HWE_HTCMISCERR7          0x0000000040000000ULL
+#define INFINIPATH_HWE_HTCBUSTREQPARITYERR  0x0000000080000000ULL
+#define INFINIPATH_HWE_HTCBUSTRESPPARITYERR 0x0000000100000000ULL
+#define INFINIPATH_HWE_HTCBUSIREQPARITYERR  0x0000000200000000ULL
+#define INFINIPATH_HWE_COREPLL_FBSLIP       0x0080000000000000ULL
+#define INFINIPATH_HWE_COREPLL_RFSLIP       0x0100000000000000ULL
+#define INFINIPATH_HWE_HTBPLL_FBSLIP        0x0200000000000000ULL
+#define INFINIPATH_HWE_HTBPLL_RFSLIP        0x0400000000000000ULL
+#define INFINIPATH_HWE_HTAPLL_FBSLIP        0x0800000000000000ULL
+#define INFINIPATH_HWE_HTAPLL_RFSLIP        0x1000000000000000ULL
+#define INFINIPATH_HWE_SERDESPLLFAILED      0x2000000000000000ULL
+
+/* kr_extstatus bits */
+#define INFINIPATH_EXTS_FREQSEL 0x2
+#define INFINIPATH_EXTS_SERDESSEL 0x4
+#define INFINIPATH_EXTS_MEMBIST_ENDTEST     0x0000000000004000
+#define INFINIPATH_EXTS_MEMBIST_CORRECT     0x0000000000008000
+
+/*
+ * masks and bits that are different in different chips, or present only
+ * in one
+ */
+static const ipath_err_t infinipath_hwe_htcmemparityerr_mask =
+    INFINIPATH_HWE_HTCMEMPARITYERR_MASK;
+static const ipath_err_t infinipath_hwe_htcmemparityerr_shift =
+    INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT;
+
+static const ipath_err_t infinipath_hwe_htclnkabyte0crcerr =
+    INFINIPATH_HWE_HTCLNKABYTE0CRCERR;
+static const ipath_err_t infinipath_hwe_htclnkabyte1crcerr =
+    INFINIPATH_HWE_HTCLNKABYTE1CRCERR;
+static const ipath_err_t infinipath_hwe_htclnkbbyte0crcerr =
+    INFINIPATH_HWE_HTCLNKBBYTE0CRCERR;
+static const ipath_err_t infinipath_hwe_htclnkbbyte1crcerr =
+    INFINIPATH_HWE_HTCLNKBBYTE1CRCERR;
+
+#define _IPATH_GPIO_SDA_NUM 1
+#define _IPATH_GPIO_SCL_NUM 0
+
+#define IPATH_GPIO_SDA \
+       (1ULL << (_IPATH_GPIO_SDA_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT))
+#define IPATH_GPIO_SCL \
+       (1ULL << (_IPATH_GPIO_SCL_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT))
+
+/* keep the code below somewhat more readable; not used elsewhere */
+#define _IPATH_HTLINK0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr |    \
+                               infinipath_hwe_htclnkabyte1crcerr)
+#define _IPATH_HTLINK1_CRCBITS (infinipath_hwe_htclnkbbyte0crcerr |    \
+                               infinipath_hwe_htclnkbbyte1crcerr)
+#define _IPATH_HTLANE0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr |    \
+                               infinipath_hwe_htclnkbbyte0crcerr)
+#define _IPATH_HTLANE1_CRCBITS (infinipath_hwe_htclnkabyte1crcerr |    \
+                               infinipath_hwe_htclnkbbyte1crcerr)
+
+static void hwerr_crcbits(struct ipath_devdata *dd, ipath_err_t hwerrs,
+                         char *msg, size_t msgl)
+{
+       char bitsmsg[64];
+       ipath_err_t crcbits = hwerrs &
+               (_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS);
+       /* don't check if 8bit HT */
+       if (dd->ipath_flags & IPATH_8BIT_IN_HT0)
+               crcbits &= ~infinipath_hwe_htclnkabyte1crcerr;
+       /* don't check if 8bit HT */
+       if (dd->ipath_flags & IPATH_8BIT_IN_HT1)
+               crcbits &= ~infinipath_hwe_htclnkbbyte1crcerr;
+       /*
+        * we'll want to ignore link errors on a link that is
+        * not in use, if any.  For now, complain about both.
+        */
+       if (crcbits) {
+               u16 ctrl0, ctrl1;
+               snprintf(bitsmsg, sizeof bitsmsg,
+                        "[HT%s lane %s CRC (%llx); ignore till reload]",
+                        !(crcbits & _IPATH_HTLINK1_CRCBITS) ?
+                        "0 (A)" : (!(crcbits & _IPATH_HTLINK0_CRCBITS)
+                                   ? "1 (B)" : "0+1 (A+B)"),
+                        !(crcbits & _IPATH_HTLANE1_CRCBITS) ? "0"
+                        : (!(crcbits & _IPATH_HTLANE0_CRCBITS) ? "1" :
+                           "0+1"), (unsigned long long) crcbits);
+               strlcat(msg, bitsmsg, msgl);
+
+               /*
+                * print extra info for debugging.  slave/primary
+                * config word 4, 8 (link control 0, 1)
+                */
+
+               if (pci_read_config_word(dd->pcidev,
+                                        dd->ipath_ht_slave_off + 0x4,
+                                        &ctrl0))
+                       dev_info(&dd->pcidev->dev, "Couldn't read "
+                                "linkctrl0 of slave/primary "
+                                "config block\n");
+               else if (!(ctrl0 & 1 << 6))
+                       /* not if EOC bit set */
+                       ipath_dbg("HT linkctrl0 0x%x%s%s\n", ctrl0,
+                                 ((ctrl0 >> 8) & 7) ? " CRC" : "",
+                                 ((ctrl0 >> 4) & 1) ? "linkfail" :
+                                 "");
+               if (pci_read_config_word(dd->pcidev,
+                                        dd->ipath_ht_slave_off + 0x8,
+                                        &ctrl1))
+                       dev_info(&dd->pcidev->dev, "Couldn't read "
+                                "linkctrl1 of slave/primary "
+                                "config block\n");
+               else if (!(ctrl1 & 1 << 6))
+                       /* not if EOC bit set */
+                       ipath_dbg("HT linkctrl1 0x%x%s%s\n", ctrl1,
+                                 ((ctrl1 >> 8) & 7) ? " CRC" : "",
+                                 ((ctrl1 >> 4) & 1) ? "linkfail" :
+                                 "");
+
+               /* disable until driver reloaded */
+               dd->ipath_hwerrmask &= ~crcbits;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+                                dd->ipath_hwerrmask);
+               ipath_dbg("HT crc errs: %s\n", msg);
+       } else
+               ipath_dbg("ignoring HT crc errors 0x%llx, "
+                         "not in use\n", (unsigned long long)
+                         (hwerrs & (_IPATH_HTLINK0_CRCBITS |
+                                    _IPATH_HTLINK1_CRCBITS)));
+}
+
+/**
+ * ipath_ht_handle_hwerrors - display hardware errors
+ * @dd: the infinipath device
+ * @msg: the output buffer
+ * @msgl: the size of the output buffer
+ *
+ * Most hardware errors are catastrophic, but for right now we'll print
+ * them and continue.  We reuse the same message buffer as
+ * ipath_handle_errors() to avoid excessive stack usage.
+ */
+static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
+                                    size_t msgl)
+{
+       ipath_err_t hwerrs;
+       u32 bits, ctrl;
+       int isfatal = 0;
+       char bitsmsg[64];
+
+       hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus);
+
+       if (!hwerrs) {
+               ipath_cdbg(VERBOSE, "Called but no hardware errors set\n");
+               /*
+                * better than printing confusing messages.
+                * This seems to be related to clearing the crc error, or
+                * the pll error during init.
+                */
+               goto bail;
+       } else if (hwerrs == -1LL) {
+               ipath_dev_err(dd, "Read of hardware error status failed "
+                             "(all bits set); ignoring\n");
+               goto bail;
+       }
+       ipath_stats.sps_hwerrs++;
+
+       /* Always clear the error status register, except MEMBISTFAIL,
+        * regardless of whether we continue or stop using the chip.
+        * We want that set so we know it failed, even across driver reload.
+        * We'll still ignore it in the hwerrmask.  We do this partly for
+        * diagnostics, but also for support */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+                        hwerrs & ~INFINIPATH_HWE_MEMBISTFAILED);
+
+       hwerrs &= dd->ipath_hwerrmask;
+
+       /*
+        * make sure we get this much out, unless told to be quiet,
+        * or it's occurred within the last 5 seconds
+        */
+       if ((hwerrs & ~dd->ipath_lasthwerror) ||
+           (ipath_debug & __IPATH_VERBDBG))
+               dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx "
+                        "(cleared)\n", (unsigned long long) hwerrs);
+       dd->ipath_lasthwerror |= hwerrs;
+
+       if (hwerrs & ~infinipath_hwe_bitsextant)
+               ipath_dev_err(dd, "hwerror interrupt with unknown errors "
+                             "%llx set\n", (unsigned long long)
+                             (hwerrs & ~infinipath_hwe_bitsextant));
+
+       ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
+       if (ctrl & INFINIPATH_C_FREEZEMODE) {
+               if (hwerrs) {
+                       /*
+                        * if any set that we aren't ignoring; only
+                        * make the complaint once, in case it's stuck
+                        * or recurring, and we get here multiple
+                        * times.
+                        */
+                       if (dd->ipath_flags & IPATH_INITTED) {
+                               ipath_dev_err(dd, "Fatal Error (freeze "
+                                             "mode), no longer usable\n");
+                               isfatal = 1;
+                       }
+                       *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
+                       /* mark as having had error */
+                       *dd->ipath_statusp |= IPATH_STATUS_HWERROR;
+                       /*
+                        * mark as not usable, at a minimum until driver
+                        * is reloaded, probably until reboot, since no
+                        * other reset is possible.
+                        */
+                       dd->ipath_flags &= ~IPATH_INITTED;
+               } else {
+                       ipath_dbg("Clearing freezemode on ignored hardware "
+                                 "error\n");
+                       ctrl &= ~INFINIPATH_C_FREEZEMODE;
+                       ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+                                        ctrl);
+               }
+       }
+
+       *msg = '\0';
+
+       /*
+        * may someday want to decode into which bits are which
+        * functional area for parity errors, etc.
+        */
+       if (hwerrs & (infinipath_hwe_htcmemparityerr_mask
+                     << INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT)) {
+               bits = (u32) ((hwerrs >>
+                              INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) &
+                             INFINIPATH_HWE_HTCMEMPARITYERR_MASK);
+               snprintf(bitsmsg, sizeof bitsmsg, "[HTC Parity Errs %x] ",
+                        bits);
+               strlcat(msg, bitsmsg, msgl);
+       }
+       if (hwerrs & (INFINIPATH_HWE_RXEMEMPARITYERR_MASK
+                     << INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT)) {
+               bits = (u32) ((hwerrs >>
+                              INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) &
+                             INFINIPATH_HWE_RXEMEMPARITYERR_MASK);
+               snprintf(bitsmsg, sizeof bitsmsg, "[RXE Parity Errs %x] ",
+                        bits);
+               strlcat(msg, bitsmsg, msgl);
+       }
+       if (hwerrs & (INFINIPATH_HWE_TXEMEMPARITYERR_MASK
+                     << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) {
+               bits = (u32) ((hwerrs >>
+                              INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) &
+                             INFINIPATH_HWE_TXEMEMPARITYERR_MASK);
+               snprintf(bitsmsg, sizeof bitsmsg, "[TXE Parity Errs %x] ",
+                        bits);
+               strlcat(msg, bitsmsg, msgl);
+       }
+       if (hwerrs & INFINIPATH_HWE_IBCBUSTOSPCPARITYERR)
+               strlcat(msg, "[IB2IPATH Parity]", msgl);
+       if (hwerrs & INFINIPATH_HWE_IBCBUSFRSPCPARITYERR)
+               strlcat(msg, "[IPATH2IB Parity]", msgl);
+       if (hwerrs & INFINIPATH_HWE_HTCBUSIREQPARITYERR)
+               strlcat(msg, "[HTC Ireq Parity]", msgl);
+       if (hwerrs & INFINIPATH_HWE_HTCBUSTREQPARITYERR)
+               strlcat(msg, "[HTC Treq Parity]", msgl);
+       if (hwerrs & INFINIPATH_HWE_HTCBUSTRESPPARITYERR)
+               strlcat(msg, "[HTC Tresp Parity]", msgl);
+
+       if (hwerrs & (_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS))
+               hwerr_crcbits(dd, hwerrs, msg, msgl);
+
+       if (hwerrs & INFINIPATH_HWE_HTCMISCERR5)
+               strlcat(msg, "[HT core Misc5]", msgl);
+       if (hwerrs & INFINIPATH_HWE_HTCMISCERR6)
+               strlcat(msg, "[HT core Misc6]", msgl);
+       if (hwerrs & INFINIPATH_HWE_HTCMISCERR7)
+               strlcat(msg, "[HT core Misc7]", msgl);
+       if (hwerrs & INFINIPATH_HWE_MEMBISTFAILED) {
+               strlcat(msg, "[Memory BIST test failed, HT-400 unusable]",
+                       msgl);
+               /* ignore from now on, so disable until driver reloaded */
+               dd->ipath_hwerrmask &= ~INFINIPATH_HWE_MEMBISTFAILED;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+                                dd->ipath_hwerrmask);
+       }
+#define _IPATH_PLL_FAIL (INFINIPATH_HWE_COREPLL_FBSLIP |       \
+                        INFINIPATH_HWE_COREPLL_RFSLIP |        \
+                        INFINIPATH_HWE_HTBPLL_FBSLIP |         \
+                        INFINIPATH_HWE_HTBPLL_RFSLIP |         \
+                        INFINIPATH_HWE_HTAPLL_FBSLIP |         \
+                        INFINIPATH_HWE_HTAPLL_RFSLIP)
+
+       if (hwerrs & _IPATH_PLL_FAIL) {
+               snprintf(bitsmsg, sizeof bitsmsg,
+                        "[PLL failed (%llx), HT-400 unusable]",
+                        (unsigned long long) (hwerrs & _IPATH_PLL_FAIL));
+               strlcat(msg, bitsmsg, msgl);
+               /* ignore from now on, so disable until driver reloaded */
+               dd->ipath_hwerrmask &= ~(hwerrs & _IPATH_PLL_FAIL);
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+                                dd->ipath_hwerrmask);
+       }
+
+       if (hwerrs & INFINIPATH_HWE_SERDESPLLFAILED) {
+               /*
+                * If it occurs, it is left masked since the external
+                * interface is unused.
+                */
+               dd->ipath_hwerrmask &= ~INFINIPATH_HWE_SERDESPLLFAILED;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+                                dd->ipath_hwerrmask);
+       }
+
+       if (hwerrs & INFINIPATH_HWE_RXDSYNCMEMPARITYERR)
+               strlcat(msg, "[Rx Dsync]", msgl);
+       if (hwerrs & INFINIPATH_HWE_SERDESPLLFAILED)
+               strlcat(msg, "[SerDes PLL]", msgl);
+
+       ipath_dev_err(dd, "%s hardware error\n", msg);
+       if (isfatal && !ipath_diag_inuse && dd->ipath_freezemsg)
+               /*
+                * for status file; if no trailing brace is copied,
+                * we'll know it was truncated.
+                */
+               snprintf(dd->ipath_freezemsg,
+                        dd->ipath_freezelen, "{%s}", msg);
+
+bail:;
+}
+
+/**
+ * ipath_ht_boardname - fill in the board name
+ * @dd: the infinipath device
+ * @name: the output buffer
+ * @namelen: the size of the output buffer
+ *
+ * fill in the board name, based on the board revision register
+ */
+static int ipath_ht_boardname(struct ipath_devdata *dd, char *name,
+                             size_t namelen)
+{
+       char *n = NULL;
+       u8 boardrev = dd->ipath_boardrev;
+       int ret;
+
+       switch (boardrev) {
+       case 4:         /* Ponderosa is one of the bringup boards */
+               n = "Ponderosa";
+               break;
+       case 5:         /* HT-460 original production board */
+               n = "InfiniPath_HT-460";
+               break;
+       case 6:
+               n = "OEM_Board_3";
+               break;
+       case 7:
+               /* HT-460 small form factor production board */
+               n = "InfiniPath_HT-465";
+               break;
+       case 8:
+               n = "LS/X-1";
+               break;
+       case 9:         /* Comstock bringup test board */
+               n = "Comstock";
+               break;
+       case 10:
+               n = "OEM_Board_2";
+               break;
+       case 11:
+               n = "InfiniPath_HT-470";
+               break;
+       case 12:
+               n = "OEM_Board_4";
+               break;
+       default:                /* don't know, just print the number */
+               ipath_dev_err(dd, "Don't yet know about board "
+                             "with ID %u\n", boardrev);
+               snprintf(name, namelen, "Unknown_InfiniPath_HT-4xx_%u",
+                        boardrev);
+               break;
+       }
+       if (n)
+               snprintf(name, namelen, "%s", n);
+
+       if (dd->ipath_majrev != 3 || dd->ipath_minrev != 2) {
+               /*
+                * This version of the driver only supports the HT-400
+                * Rev 3.2
+                */
+               ipath_dev_err(dd,
+                             "Unsupported HT-400 revision %u.%u!\n",
+                             dd->ipath_majrev, dd->ipath_minrev);
+               ret = 1;
+               goto bail;
+       }
+       /*
+        * pkt/word counters are 32 bit, and therefore wrap fast enough
+        * that we snapshot them from a timer, and maintain 64 bit shadow
+        * copies
+        */
+       dd->ipath_flags |= IPATH_32BITCOUNTERS;
+       if (dd->ipath_htspeed != 800)
+               ipath_dev_err(dd,
+                             "Incorrectly configured for HT @ %uMHz\n",
+                             dd->ipath_htspeed);
+       if (dd->ipath_boardrev == 7 || dd->ipath_boardrev == 11 ||
+           dd->ipath_boardrev == 6)
+               dd->ipath_flags |= IPATH_GPIO_INTR;
+       else
+               dd->ipath_flags |= IPATH_POLL_RX_INTR;
+       if (dd->ipath_boardrev == 8) {  /* LS/X-1 */
+               u64 val;
+               val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus);
+               if (val & INFINIPATH_EXTS_SERDESSEL) {
+                       /*
+                        * hardware disabled
+                        *
+                        * This means that the chip is hardware disabled,
+                        * and will not be able to bring up the link,
+                        * in any case.  We special case this and abort
+                        * early, to avoid later messages.  We also set
+                        * the DISABLED status bit
+                        */
+                       ipath_dbg("Unit %u is hardware-disabled\n",
+                                 dd->ipath_unit);
+                       *dd->ipath_statusp |= IPATH_STATUS_DISABLED;
+                       /* this value is handled differently */
+                       ret = 2;
+                       goto bail;
+               }
+       }
+       ret = 0;
+
+bail:
+       return ret;
+}
+
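+/*
+ * Editorial sketch, not from the original patch, of the 64-bit shadow
+ * technique implied by the IPATH_32BITCOUNTERS flag set above; the name
+ * and placement are hypothetical:
+ *
+ *     static u64 example_snap_counter32(u64 *shadow, u32 hwval)
+ *     {
+ *             u32 last = (u32) *shadow;
+ *
+ *             *shadow += (u32) (hwval - last); // unsigned math absorbs
+ *             return *shadow;                  // a single counter wrap
+ *     }
+ */
+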
+static void ipath_check_htlink(struct ipath_devdata *dd)
+{
+       u8 linkerr, link_off, i;
+
+       for (i = 0; i < 2; i++) {
+               link_off = dd->ipath_ht_slave_off + i * 4 + 0xd;
+               if (pci_read_config_byte(dd->pcidev, link_off, &linkerr))
+                       dev_info(&dd->pcidev->dev, "Couldn't read "
+                                "linkerror%d of HT slave/primary block\n",
+                                i);
+               else if (linkerr & 0xf0) {
+                       ipath_cdbg(VERBOSE, "HT linkerr%d bits 0x%x set, "
+                                  "clearing\n", i, linkerr >> 4);
+                       /*
+                        * writing the linkerr bits that are set should
+                        * clear them
+                        */
+                       if (pci_write_config_byte(dd->pcidev, link_off,
+                                                 linkerr))
+                               ipath_dbg("Failed write to clear HT "
+                                         "linkerror%d\n", i);
+                       if (pci_read_config_byte(dd->pcidev, link_off,
+                                                &linkerr))
+                               dev_info(&dd->pcidev->dev,
+                                        "Couldn't reread linkerror%d of "
+                                        "HT slave/primary block\n", i);
+                       else if (linkerr & 0xf0)
+                               dev_info(&dd->pcidev->dev,
+                                        "HT linkerror%d bits 0x%x "
+                                        "couldn't be cleared\n",
+                                        i, linkerr >> 4);
+               }
+       }
+}
+
+static int ipath_setup_ht_reset(struct ipath_devdata *dd)
+{
+       ipath_dbg("No reset possible for HT-400\n");
+       return 0;
+}
+
+#define HT_CAPABILITY_ID   0x08        /* HT capabilities not defined in kernel */
+#define HT_INTR_DISC_CONFIG  0x80      /* HT interrupt and discovery cap */
+#define HT_INTR_REG_INDEX    2 /* intconfig requires indirect accesses */
+
+/*
+ * Bits 13-15 of command==0 is slave/primary block.  Clear any HT CRC
+ * errors.  We only bother to do this at load time, because it's OK if
+ * it happened before we were loaded (first time after boot/reset),
+ * but any time after that, it's fatal anyway.  Also need to not check
+ * for upper byte errors if we are in 8 bit mode, so figure out
+ * our width.  For now, at least, also complain if it's 8 bit.
+ */
+static void slave_or_pri_blk(struct ipath_devdata *dd, struct pci_dev *pdev,
+                            int pos, u8 cap_type)
+{
+       u8 linkwidth = 0, linkerr, link_a_b_off, link_off;
+       u16 linkctrl = 0;
+       int i;
+
+       dd->ipath_ht_slave_off = pos;
+       /* command word, master_host bit */
+       /* master host || slave */
+       if ((cap_type >> 2) & 1)
+               link_a_b_off = 4;
+       else
+               link_a_b_off = 0;
+       ipath_cdbg(VERBOSE, "HT%u (Link %c) connected to processor\n",
+                  link_a_b_off ? 1 : 0,
+                  link_a_b_off ? 'B' : 'A');
+
+       link_a_b_off += pos;
+
+       /*
+        * check both link control registers; clear both HT CRC sets if
+        * necessary.
+        */
+       for (i = 0; i < 2; i++) {
+               link_off = pos + i * 4 + 0x4;
+               if (pci_read_config_word(pdev, link_off, &linkctrl))
+                       ipath_dev_err(dd, "Couldn't read HT link control%d "
+                                     "register\n", i);
+               else if (linkctrl & (0xf << 8)) {
+                       ipath_cdbg(VERBOSE, "Clear linkctrl%d CRC Error "
+                                  "bits %x\n", i, linkctrl & (0xf << 8));
+                       /*
+                        * now write them back to clear the error.
+                        */
+                       pci_write_config_word(pdev, link_off,
+                                             linkctrl & (0xf << 8));
+               }
+       }
+
+       /*
+        * As with HT CRC bits, same for protocol errors that might occur
+        * during boot.
+        */
+       for (i = 0; i < 2; i++) {
+               link_off = pos + i * 4 + 0xd;
+               if (pci_read_config_byte(pdev, link_off, &linkerr))
+                       dev_info(&pdev->dev, "Couldn't read linkerror%d "
+                                "of HT slave/primary block\n", i);
+               else if (linkerr & 0xf0) {
+                       ipath_cdbg(VERBOSE, "HT linkerr%d bits 0x%x set, "
+                                  "clearing\n", i, linkerr >> 4);
+                       /*
+                        * writing the linkerr bits that are set will clear
+                        * them
+                        */
+                       if (pci_write_config_byte
+                           (pdev, link_off, linkerr))
+                               ipath_dbg("Failed write to clear HT "
+                                         "linkerror%d\n", i);
+                       if (pci_read_config_byte(pdev, link_off, &linkerr))
+                               dev_info(&pdev->dev, "Couldn't reread "
+                                        "linkerror%d of HT slave/primary "
+                                        "block\n", i);
+                       else if (linkerr & 0xf0)
+                               dev_info(&pdev->dev, "HT linkerror%d bits "
+                                        "0x%x couldn't be cleared\n",
+                                        i, linkerr >> 4);
+               }
+       }
+
+       /*
+        * this is just for our link to the host, not devices connected
+        * through tunnel.
+        */
+
+       if (pci_read_config_byte(pdev, link_a_b_off + 7, &linkwidth))
+               ipath_dev_err(dd, "Couldn't read HT link width "
+                             "config register\n");
+       else {
+               u32 width;
+               switch (linkwidth & 7) {
+               case 5:
+                       width = 4;
+                       break;
+               case 4:
+                       width = 2;
+                       break;
+               case 3:
+                       width = 32;
+                       break;
+               case 1:
+                       width = 16;
+                       break;
+               case 0:
+               default:        /* if wrong, assume 8 bit */
+                       width = 8;
+                       break;
+               }
+
+               dd->ipath_htwidth = width;
+
+               if (linkwidth != 0x11) {
+                       ipath_dev_err(dd, "Not configured for 16 bit HT "
+                                     "(%x)\n", linkwidth);
+                       if (!(linkwidth & 0xf)) {
+                               ipath_dbg("Will ignore HT lane1 errors\n");
+                               dd->ipath_flags |= IPATH_8BIT_IN_HT0;
+                       }
+               }
+       }
+
+       /*
+        * this is just for our link to the host, not devices connected
+        * through tunnel.
+        */
+       if (pci_read_config_byte(pdev, link_a_b_off + 0xd, &linkwidth))
+               ipath_dev_err(dd, "Couldn't read HT link frequency "
+                             "config register\n");
+       else {
+               u32 speed;
+               switch (linkwidth & 0xf) {
+               case 6:
+                       speed = 1000;
+                       break;
+               case 5:
+                       speed = 800;
+                       break;
+               case 4:
+                       speed = 600;
+                       break;
+               case 3:
+                       speed = 500;
+                       break;
+               case 2:
+                       speed = 400;
+                       break;
+               case 1:
+                       speed = 300;
+                       break;
+               default:
+                       /*
+                        * assume reserved and vendor-specific are 200...
+                        */
+               case 0:
+                       speed = 200;
+                       break;
+               }
+               dd->ipath_htspeed = speed;
+       }
+}
+
+static int set_int_handler(struct ipath_devdata *dd, struct pci_dev *pdev,
+                           int pos)
+{
+       u32 int_handler_addr_lower;
+       u32 int_handler_addr_upper;
+       u64 ihandler;
+       u32 intvec;
+
+       /* use indirection register to get the intr handler */
+       pci_write_config_byte(pdev, pos + HT_INTR_REG_INDEX, 0x10);
+       pci_read_config_dword(pdev, pos + 4, &int_handler_addr_lower);
+       pci_write_config_byte(pdev, pos + HT_INTR_REG_INDEX, 0x11);
+       pci_read_config_dword(pdev, pos + 4, &int_handler_addr_upper);
+
+       ihandler = (u64) int_handler_addr_lower |
+               ((u64) int_handler_addr_upper << 32);
+
+       /*
+        * kernels with CONFIG_PCI_MSI set the vector in the irq field of
+        * struct pci_device, so we use that to program the HT-400 internal
+        * interrupt register (not config space) with that value. The BIOS
+        * must still have done the basic MSI setup.
+        */
+       intvec = pdev->irq;
+       /*
+        * clear any vector bits there; normally not set but we'll overload
+        * this for some debug purposes (setting the HTC debug register
+        * value from software, rather than GPIOs), so it might be set on a
+        * driver reload.
+        */
+       ihandler &= ~0xff0000;
+       /* x86 vector goes in intrinfo[23:16] */
+       ihandler |= intvec << 16;
+       ipath_cdbg(VERBOSE, "ihandler lower %x, upper %x, intvec %x, "
+                  "interruptconfig %llx\n", int_handler_addr_lower,
+                  int_handler_addr_upper, intvec,
+                  (unsigned long long) ihandler);
+
+       /* can't program yet, so save for interrupt setup */
+       dd->ipath_intconfig = ihandler;
+       /* keep going, so we find link control stuff also */
+
+       return ihandler != 0;
+}
+
+/**
+ * ipath_setup_ht_config - setup the interruptconfig register
+ * @dd: the infinipath device
+ * @pdev: the PCI device
+ *
+ * setup the interruptconfig register from the HT config info.
+ * Also clear CRC errors in HT linkcontrol, if necessary.
+ * This is done only for the real hardware.  It is done before
+ * chip address space is initted, so can't touch infinipath registers
+ */
+static int ipath_setup_ht_config(struct ipath_devdata *dd,
+                                struct pci_dev *pdev)
+{
+       int pos, ret = 0;
+       int ihandler = 0;
+
+       /*
+        * Read the capability info to find the interrupt info, and also
+        * handle clearing CRC errors in linkctrl register if necessary.  We
+        * do this early, before we ever enable errors or hardware errors,
+        * mostly to avoid causing the chip to enter freeze mode.
+        */
+       pos = pci_find_capability(pdev, HT_CAPABILITY_ID);
+       if (!pos) {
+               ipath_dev_err(dd, "Couldn't find HyperTransport "
+                             "capability; no interrupts\n");
+               ret = -ENODEV;
+               goto bail;
+       }
+       do {
+               u8 cap_type;
+
+               /* the HT capability type byte is 3 bytes after the
+                * capability byte.
+                */
+               if (pci_read_config_byte(pdev, pos + 3, &cap_type)) {
+                       dev_info(&pdev->dev, "Couldn't read config "
+                                "command @ %d\n", pos);
+                       continue;
+               }
+               if (!(cap_type & 0xE0))
+                       slave_or_pri_blk(dd, pdev, pos, cap_type);
+               else if (cap_type == HT_INTR_DISC_CONFIG)
+                       ihandler = set_int_handler(dd, pdev, pos);
+       } while ((pos = pci_find_next_capability(pdev, pos,
+                                                HT_CAPABILITY_ID)));
+
+       if (!ihandler) {
+               ipath_dev_err(dd, "Couldn't find interrupt handler in "
+                             "config space\n");
+               ret = -ENODEV;
+       }
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_setup_ht_cleanup - clean up any chip-specific stuff
+ * @dd: the infinipath device
+ *
+ * Called during driver unload.
+ * This is currently a nop for the HT-400, though not for all chips
+ */
+static void ipath_setup_ht_cleanup(struct ipath_devdata *dd)
+{
+}
+
+/**
+ * ipath_setup_ht_setextled - set the state of the two external LEDs
+ * @dd: the infinipath device
+ * @lst: the L state
+ * @ltst: the LT state
+ *
+ * Set the state of the two external LEDs, to indicate physical and
+ * logical state of IB link.   For this chip (at least with recommended
+ * board pinouts), LED1 is Green (physical state), and LED2 is Yellow
+ * (logical state)
+ *
+ * Note:  We try to match the Mellanox HCA LED behavior as best
+ * we can.  Green indicates physical link state is OK (something is
+ * plugged in, and we can train).
+ * Amber indicates the link is logically up (ACTIVE).
+ * Mellanox further blinks the amber LED to indicate data packet
+ * activity, but we have no hardware support for that, so it would
+ * require waking up every 10-20 msecs and checking the counters
+ * on the chip, and then turning the LED off if appropriate.  That's
+ * visible overhead, so not something we will do.
+ *
+ */
+static void ipath_setup_ht_setextled(struct ipath_devdata *dd,
+                                    u64 lst, u64 ltst)
+{
+       u64 extctl;
+
+       /* the diags use the LED to indicate diag info, so we leave
+        * the external LED alone when the diags are running */
+       if (ipath_diag_inuse)
+               return;
+
+       /*
+        * start by setting both LED control bits to off, then turn
+        * on the appropriate bit(s).
+        */
+       if (dd->ipath_boardrev == 8) { /* LS/X-1 uses different pins */
+               /*
+                * major difference is that INFINIPATH_EXTC_LEDGBLERR_OFF
+                * is inverted, because it is normally used to indicate
+                * a hardware fault at reset, if there were errors
+                */
+               extctl = (dd->ipath_extctrl & ~INFINIPATH_EXTC_LEDGBLOK_ON)
+                       | INFINIPATH_EXTC_LEDGBLERR_OFF;
+               if (ltst == INFINIPATH_IBCS_LT_STATE_LINKUP)
+                       extctl &= ~INFINIPATH_EXTC_LEDGBLERR_OFF;
+               if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE)
+                       extctl |= INFINIPATH_EXTC_LEDGBLOK_ON;
+       } else {
+               extctl = dd->ipath_extctrl &
+                       ~(INFINIPATH_EXTC_LED1PRIPORT_ON |
+                         INFINIPATH_EXTC_LED2PRIPORT_ON);
+               if (ltst == INFINIPATH_IBCS_LT_STATE_LINKUP)
+                       extctl |= INFINIPATH_EXTC_LED1PRIPORT_ON;
+               if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE)
+                       extctl |= INFINIPATH_EXTC_LED2PRIPORT_ON;
+       }
+       dd->ipath_extctrl = extctl;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, extctl);
+}
+
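+/*
+ * Set the chip-specific GPIO pin assignments (SDA/SCL), and the masks of
+ * the interrupt, error and hardware-error bits that actually exist on
+ * this chip, for use by the rest of the driver.
+ */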
+static void ipath_init_ht_variables(void)
+{
+       ipath_gpio_sda_num = _IPATH_GPIO_SDA_NUM;
+       ipath_gpio_scl_num = _IPATH_GPIO_SCL_NUM;
+       ipath_gpio_sda = IPATH_GPIO_SDA;
+       ipath_gpio_scl = IPATH_GPIO_SCL;
+
+       infinipath_i_bitsextant =
+               (INFINIPATH_I_RCVURG_MASK << INFINIPATH_I_RCVURG_SHIFT) |
+               (INFINIPATH_I_RCVAVAIL_MASK <<
+                INFINIPATH_I_RCVAVAIL_SHIFT) |
+               INFINIPATH_I_ERROR | INFINIPATH_I_SPIOSENT |
+               INFINIPATH_I_SPIOBUFAVAIL | INFINIPATH_I_GPIO;
+
+       infinipath_e_bitsextant =
+               INFINIPATH_E_RFORMATERR | INFINIPATH_E_RVCRC |
+               INFINIPATH_E_RICRC | INFINIPATH_E_RMINPKTLEN |
+               INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RLONGPKTLEN |
+               INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RUNEXPCHAR |
+               INFINIPATH_E_RUNSUPVL | INFINIPATH_E_REBP |
+               INFINIPATH_E_RIBFLOW | INFINIPATH_E_RBADVERSION |
+               INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
+               INFINIPATH_E_RBADTID | INFINIPATH_E_RHDRLEN |
+               INFINIPATH_E_RHDR | INFINIPATH_E_RIBLOSTLINK |
+               INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SMAXPKTLEN |
+               INFINIPATH_E_SUNDERRUN | INFINIPATH_E_SPKTLEN |
+               INFINIPATH_E_SDROPPEDSMPPKT | INFINIPATH_E_SDROPPEDDATAPKT |
+               INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM |
+               INFINIPATH_E_SUNSUPVL | INFINIPATH_E_IBSTATUSCHANGED |
+               INFINIPATH_E_INVALIDADDR | INFINIPATH_E_RESET |
+               INFINIPATH_E_HARDWARE;
+
+       infinipath_hwe_bitsextant =
+               (INFINIPATH_HWE_HTCMEMPARITYERR_MASK <<
+                INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) |
+               (INFINIPATH_HWE_TXEMEMPARITYERR_MASK <<
+                INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) |
+               (INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
+                INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) |
+               INFINIPATH_HWE_HTCLNKABYTE0CRCERR |
+               INFINIPATH_HWE_HTCLNKABYTE1CRCERR |
+               INFINIPATH_HWE_HTCLNKBBYTE0CRCERR |
+               INFINIPATH_HWE_HTCLNKBBYTE1CRCERR |
+               INFINIPATH_HWE_HTCMISCERR4 |
+               INFINIPATH_HWE_HTCMISCERR5 | INFINIPATH_HWE_HTCMISCERR6 |
+               INFINIPATH_HWE_HTCMISCERR7 |
+               INFINIPATH_HWE_HTCBUSTREQPARITYERR |
+               INFINIPATH_HWE_HTCBUSTRESPPARITYERR |
+               INFINIPATH_HWE_HTCBUSIREQPARITYERR |
+               INFINIPATH_HWE_RXDSYNCMEMPARITYERR |
+               INFINIPATH_HWE_MEMBISTFAILED |
+               INFINIPATH_HWE_COREPLL_FBSLIP |
+               INFINIPATH_HWE_COREPLL_RFSLIP |
+               INFINIPATH_HWE_HTBPLL_FBSLIP |
+               INFINIPATH_HWE_HTBPLL_RFSLIP |
+               INFINIPATH_HWE_HTAPLL_FBSLIP |
+               INFINIPATH_HWE_HTAPLL_RFSLIP |
+               INFINIPATH_HWE_SERDESPLLFAILED |
+               INFINIPATH_HWE_IBCBUSTOSPCPARITYERR |
+               INFINIPATH_HWE_IBCBUSFRSPCPARITYERR;
+
+       infinipath_i_rcvavail_mask = INFINIPATH_I_RCVAVAIL_MASK;
+       infinipath_i_rcvurg_mask = INFINIPATH_I_RCVURG_MASK;
+}
+
+/**
+ * ipath_ht_init_hwerrors - enable hardware errors
+ * @dd: the infinipath device
+ *
+ * now that we have finished initializing everything that might reasonably
 * cause a hardware error, and cleared those error bits as they occur,
+ * we can enable hardware errors in the mask (potentially enabling
+ * freeze mode), and enable hardware errors as errors (along with
+ * everything else) in errormask
+ */
+static void ipath_ht_init_hwerrors(struct ipath_devdata *dd)
+{
+       ipath_err_t val;
+       u64 extsval;
+
+       extsval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus);
+
+       if (!(extsval & INFINIPATH_EXTS_MEMBIST_ENDTEST))
+               ipath_dev_err(dd, "MemBIST did not complete!\n");
+
+       ipath_check_htlink(dd);
+
+       /* barring bugs, all hwerrors become interrupts, which can */
+       val = -1LL;
+       /* don't look at crc lane1 if 8 bit */
+       if (dd->ipath_flags & IPATH_8BIT_IN_HT0)
+               val &= ~infinipath_hwe_htclnkabyte1crcerr;
+       /* don't look at crc lane1 if 8 bit */
+       if (dd->ipath_flags & IPATH_8BIT_IN_HT1)
+               val &= ~infinipath_hwe_htclnkbbyte1crcerr;
+
+       /*
+        * disable RXDSYNCMEMPARITY because external serdes is unused,
+        * and therefore the logic will never be used or initialized,
+        * and uninitialized state will normally result in this error
+        * being asserted.  Similarly for the external serdes pll
+        * lock signal.
+        */
+       val &= ~(INFINIPATH_HWE_SERDESPLLFAILED |
+                INFINIPATH_HWE_RXDSYNCMEMPARITYERR);
+
+       /*
+        * Disable MISCERR4 because of an inversion in the HT core
+        * logic checking for errors that cause this bit to be set.
+        * The errata can also cause the protocol error bit to be set
+        * in the HT config space linkerror register(s).
+        */
+       val &= ~INFINIPATH_HWE_HTCMISCERR4;
+
+       /*
+        * PLL ignored because MDIO interface has a logic problem
+        * for reads, on Comstock and Ponderosa.  BRINGUP
+        */
+       if (dd->ipath_boardrev == 4 || dd->ipath_boardrev == 9)
+               val &= ~INFINIPATH_HWE_SERDESPLLFAILED;
+       dd->ipath_hwerrmask = val;
+}
+
+/**
+ * ipath_ht_bringup_serdes - bring up the serdes
+ * @dd: the infinipath device
+ */
+static int ipath_ht_bringup_serdes(struct ipath_devdata *dd)
+{
+       u64 val, config1;
+       int ret = 0, change = 0;
+
+       ipath_dbg("Trying to bringup serdes\n");
+
+       if (ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus) &
+           INFINIPATH_HWE_SERDESPLLFAILED) {
+               ipath_dbg("At start, serdes PLL failed bit set in "
+                         "hwerrstatus, clearing and continuing\n");
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+                                INFINIPATH_HWE_SERDESPLLFAILED);
+       }
+
+       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
+       config1 = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig1);
+
+       ipath_cdbg(VERBOSE, "Initial serdes status is config0=%llx "
+                  "config1=%llx, sstatus=%llx xgxs %llx\n",
+                  (unsigned long long) val, (unsigned long long) config1,
+                  (unsigned long long)
+                  ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus),
+                  (unsigned long long)
+                  ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig));
+
+       /* force reset on */
+       val |= INFINIPATH_SERDC0_RESET_PLL
+               /* | INFINIPATH_SERDC0_RESET_MASK */
+               ;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val);
+       udelay(15);             /* need pll reset set at least for a bit */
+
+       if (val & INFINIPATH_SERDC0_RESET_PLL) {
+               u64 val2 = val &= ~INFINIPATH_SERDC0_RESET_PLL;
+               /* set lane resets, and tx idle, during pll reset */
+               val2 |= INFINIPATH_SERDC0_RESET_MASK |
+                       INFINIPATH_SERDC0_TXIDLE;
+               ipath_cdbg(VERBOSE, "Clearing serdes PLL reset (writing "
+                          "%llx)\n", (unsigned long long) val2);
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0,
+                                val2);
+               /*
+                * be sure chip saw it
+                */
+               val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+               /*
+                * need pll reset clear at least 11 usec before lane
+                * resets cleared; give it a few more
+                */
+               udelay(15);
+               val = val2;     /* for check below */
+       }
+
+       if (val & (INFINIPATH_SERDC0_RESET_PLL |
+                  INFINIPATH_SERDC0_RESET_MASK |
+                  INFINIPATH_SERDC0_TXIDLE)) {
+               val &= ~(INFINIPATH_SERDC0_RESET_PLL |
+                        INFINIPATH_SERDC0_RESET_MASK |
+                        INFINIPATH_SERDC0_TXIDLE);
+               /* clear them */
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0,
+                                val);
+       }
+
+       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
+       if (((val >> INFINIPATH_XGXS_MDIOADDR_SHIFT) &
+            INFINIPATH_XGXS_MDIOADDR_MASK) != 3) {
+               val &= ~(INFINIPATH_XGXS_MDIOADDR_MASK <<
+                        INFINIPATH_XGXS_MDIOADDR_SHIFT);
+               /*
+                * we use address 3
+                */
+               val |= 3ULL << INFINIPATH_XGXS_MDIOADDR_SHIFT;
+               change = 1;
+       }
+       if (val & INFINIPATH_XGXS_RESET) {
+               /* normally true after boot */
+               val &= ~INFINIPATH_XGXS_RESET;
+               change = 1;
+       }
+       if (change)
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
+
+       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
+
+       /* clear current and de-emphasis bits */
+       config1 &= ~0x0ffffffff00ULL;
+       /* set current to 20ma */
+       config1 |= 0x00000000000ULL;
+       /* set de-emphasis to -5.68dB */
+       config1 |= 0x0cccc000000ULL;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig1, config1);
+
+       ipath_cdbg(VERBOSE, "After setup: serdes status is config0=%llx "
+                  "config1=%llx, sstatus=%llx xgxs %llx\n",
+                  (unsigned long long) val, (unsigned long long) config1,
+                  (unsigned long long)
+                  ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus),
+                  (unsigned long long)
+                  ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig));
+
+       if (!ipath_waitfor_mdio_cmdready(dd)) {
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_mdio,
+                                ipath_mdio_req(IPATH_MDIO_CMD_READ, 31,
+                                               IPATH_MDIO_CTRL_XGXS_REG_8,
+                                               0));
+               if (ipath_waitfor_complete(dd, dd->ipath_kregs->kr_mdio,
+                                          IPATH_MDIO_DATAVALID, &val))
+                       ipath_dbg("Never got MDIO data for XGXS status "
+                                 "read\n");
+               else
+                       ipath_cdbg(VERBOSE, "MDIO Read reg8, "
+                                  "'bank' 31 %x\n", (u32) val);
+       } else
+               ipath_dbg("Never got MDIO cmdready for XGXS status read\n");
+
+       return ret;             /* for now, say we always succeeded */
+}
+
+/**
+ * ipath_ht_quiet_serdes - set serdes to txidle
+ * @dd: the infinipath device
+ *
+ * Called when the driver is being unloaded
+ */
+static void ipath_ht_quiet_serdes(struct ipath_devdata *dd)
+{
+       u64 val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
+
+       val |= INFINIPATH_SERDC0_TXIDLE;
+       ipath_dbg("Setting TxIdleEn on serdes (config0 = %llx)\n",
+                 (unsigned long long) val);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val);
+}
+
+static int ipath_ht_intconfig(struct ipath_devdata *dd)
+{
+       int ret;
+
+       if (!dd->ipath_intconfig) {
+               ipath_dev_err(dd, "No interrupts enabled, couldn't setup "
+                             "interrupt address\n");
+               ret = 1;
+               goto bail;
+       }
+
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_interruptconfig,
+                        dd->ipath_intconfig);  /* interrupt address */
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_ht_put_tid - write a TID to the chip
+ * @dd: the infinipath device
+ * @tidptr: pointer to the expected TID (in chip) to update
+ * @type: 0 for eager, 1 for expected
+ * @pa: physical address of in memory buffer; ipath_tidinvalid if freeing
+ *
+ * This exists as a separate routine to allow for special locking etc.
+ * It's used for both the full cleanup on exit, as well as the normal
+ * setup and teardown.
+ */
+static void ipath_ht_put_tid(struct ipath_devdata *dd,
+                            u64 __iomem *tidptr, u32 type,
+                            unsigned long pa)
+{
+       if (pa != dd->ipath_tidinvalid) {
+               if (unlikely((pa & ~INFINIPATH_RT_ADDR_MASK))) {
+                       dev_info(&dd->pcidev->dev,
+                                "physaddr %lx has more than "
+                                "40 bits, using only 40!!!\n", pa);
+                       pa &= INFINIPATH_RT_ADDR_MASK;
+               }
+               if (type == 0)
+                       pa |= dd->ipath_tidtemplate;
+               else {
+                       /* in words (fixed, full page).  */
+                       u64 lenvalid = PAGE_SIZE >> 2;
+                       lenvalid <<= INFINIPATH_RT_BUFSIZE_SHIFT;
+                       pa |= lenvalid | INFINIPATH_RT_VALID;
+               }
+       }
+       if (dd->ipath_kregbase)
+               writeq(pa, tidptr);
+}
+
+/**
+ * ipath_ht_clear_tids - clear all TID entries for a port, expected and eager
+ * @dd: the infinipath device
+ * @port: the port
+ *
+ * Used from ipath_close(), and at chip initialization.
+ */
+static void ipath_ht_clear_tids(struct ipath_devdata *dd, unsigned port)
+{
+       u64 __iomem *tidbase;
+       int i;
+
+       if (!dd->ipath_kregbase)
+               return;
+
+       ipath_cdbg(VERBOSE, "Invalidate TIDs for port %u\n", port);
+
+       /*
+        * need to invalidate all of the expected TID entries for this
+        * port, so we don't have valid entries that might somehow get
+        * used (early in next use of this port, or through some bug)
+        */
+       tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
+                                  dd->ipath_rcvtidbase +
+                                  port * dd->ipath_rcvtidcnt *
+                                  sizeof(*tidbase));
+       for (i = 0; i < dd->ipath_rcvtidcnt; i++)
+               ipath_ht_put_tid(dd, &tidbase[i], 1, dd->ipath_tidinvalid);
+
+       tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
+                                  dd->ipath_rcvegrbase +
+                                  port * dd->ipath_rcvegrcnt *
+                                  sizeof(*tidbase));
+
+       for (i = 0; i < dd->ipath_rcvegrcnt; i++)
+               ipath_ht_put_tid(dd, &tidbase[i], 0, dd->ipath_tidinvalid);
+}
+
+/**
+ * ipath_ht_tidtemplate - setup constants for TID updates
+ * @dd: the infinipath device
+ *
+ * We set up values that we use a lot, to avoid recalculating them each time
+ */
+static void ipath_ht_tidtemplate(struct ipath_devdata *dd)
+{
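+       /* buffer size in words, in the TID size field, with the valid bit */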
+       dd->ipath_tidtemplate = dd->ipath_ibmaxlen >> 2;
+       dd->ipath_tidtemplate <<= INFINIPATH_RT_BUFSIZE_SHIFT;
+       dd->ipath_tidtemplate |= INFINIPATH_RT_VALID;
+
+       /*
+        * work around chip errata bug 7358, by marking invalid tids
+        * as having max length
+        */
+       dd->ipath_tidinvalid = (-1LL & INFINIPATH_RT_BUFSIZE_MASK) <<
+               INFINIPATH_RT_BUFSIZE_SHIFT;
+}
+
+static int ipath_ht_early_init(struct ipath_devdata *dd)
+{
+       u32 __iomem *piobuf;
+       u32 pioincr, val32, egrsize;
+       int i;
+
+       /*
+        * one cache line; long IB headers will spill over into received
+        * buffer
+        */
+       dd->ipath_rcvhdrentsize = 16;
+       dd->ipath_rcvhdrsize = IPATH_DFLT_RCVHDRSIZE;
+
+       /*
+        * For HT-400, we allocate a somewhat overly large eager buffer,
+        * such that we can guarantee that we can receive the largest
+        * packet that we can send out.  To truly support a 4KB MTU,
+        * we need to bump this to a larger value.  To date, other than
+        * testing, we have never encountered an HCA that can really
+        * send 4KB MTU packets, so we do not handle that (we'll get
+        * error interrupts if we ever see one).
+        */
+       dd->ipath_rcvegrbufsize = dd->ipath_piosize2k;
+       egrsize = dd->ipath_rcvegrbufsize;
+
+       /*
+        * the min() check here is currently a nop, but it may not
+        * always be, depending on just how we do ipath_rcvegrbufsize
+        */
+       dd->ipath_ibmaxlen = min(dd->ipath_piosize2k,
+                                dd->ipath_rcvegrbufsize);
+       dd->ipath_init_ibmaxlen = dd->ipath_ibmaxlen;
+       ipath_ht_tidtemplate(dd);
+
+       /*
+        * zero all the TID entries at startup.  We do this for sanity,
+        * in case of a previous driver crash of some kind, and also
+        * because the chip powers up with these memories in an unknown
+        * state.  Use portcnt, not cfgports, since this is for the
+        * full chip, not for current (possibly different) configuration
+        * value.
+        * Chip Errata bug 6447
+        */
+       for (val32 = 0; val32 < dd->ipath_portcnt; val32++)
+               ipath_ht_clear_tids(dd, val32);
+
+       /*
+        * write the pbc of each buffer, to be sure it's initialized, then
+        * cancel all the buffers, and also abort any packets that might
+        * have been in flight for some reason (the latter is for driver
+        * unload/reload, but isn't a bad idea at first init).  PIO send
+        * isn't enabled at this point, so there is no danger of sending
+        * these out on the wire.
+        * Chip Errata bug 6610
+        */
+       piobuf = (u32 __iomem *) (((char __iomem *)(dd->ipath_kregbase)) +
+                                 dd->ipath_piobufbase);
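+       /* stride from one PIO buffer to the next, in 32-bit words */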
+       pioincr = dd->ipath_palign / sizeof(*piobuf);
+       for (i = 0; i < dd->ipath_piobcnt2k; i++) {
+               /*
+                * reasonable word count, just to init pbc
+                */
+               writel(16, piobuf);
+               piobuf += pioincr;
+       }
+       /*
+        * self-clearing
+        */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+                        INFINIPATH_S_ABORT);
+       return 0;
+}
+
+/**
+ * ipath_ht_get_base_info - set chip-specific flags for user code
+ * @pd: the infinipath port
+ * @kbase: ipath_base_info pointer
+ *
+ * We set the HT flag because the difference in bandwidth between
+ * HyperTransport and PCI Express can affect some user packet algorithms.
+ */
+static int ipath_ht_get_base_info(struct ipath_portdata *pd, void *kbase)
+{
+       struct ipath_base_info *kinfo = kbase;
+
+       kinfo->spi_runtime_flags |= IPATH_RUNTIME_HT |
+               IPATH_RUNTIME_RCVHDR_COPY;
+
+       return 0;
+}
+
+/**
+ * ipath_init_ht400_funcs - set up the chip-specific function pointers
+ * @dd: the infinipath device
+ *
+ * This is global, and is called directly at init to set up the
+ * chip-specific function pointers for later use.
+ */
+void ipath_init_ht400_funcs(struct ipath_devdata *dd)
+{
+       dd->ipath_f_intrsetup = ipath_ht_intconfig;
+       dd->ipath_f_bus = ipath_setup_ht_config;
+       dd->ipath_f_reset = ipath_setup_ht_reset;
+       dd->ipath_f_get_boardname = ipath_ht_boardname;
+       dd->ipath_f_init_hwerrors = ipath_ht_init_hwerrors;
+       dd->ipath_f_early_init = ipath_ht_early_init;
+       dd->ipath_f_handle_hwerrors = ipath_ht_handle_hwerrors;
+       dd->ipath_f_quiet_serdes = ipath_ht_quiet_serdes;
+       dd->ipath_f_bringup_serdes = ipath_ht_bringup_serdes;
+       dd->ipath_f_clear_tids = ipath_ht_clear_tids;
+       dd->ipath_f_put_tid = ipath_ht_put_tid;
+       dd->ipath_f_cleanup = ipath_setup_ht_cleanup;
+       dd->ipath_f_setextled = ipath_setup_ht_setextled;
+       dd->ipath_f_get_base_info = ipath_ht_get_base_info;
+
+       /*
+        * initialize chip-specific variables
+        */
+       dd->ipath_f_tidtemplate = ipath_ht_tidtemplate;
+
+       /*
+        * setup the register offsets, since they are different for each
+        * chip
+        */
+       dd->ipath_kregs = &ipath_ht_kregs;
+       dd->ipath_cregs = &ipath_ht_cregs;
+
+       /*
+        * do very early init that is needed before ipath_f_bus is
+        * called
+        */
+       ipath_init_ht_variables();
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_init_chip.c b/drivers/infiniband/hw/ipath/ipath_init_chip.c
new file mode 100644 (file)
index 0000000..2823ff9
--- /dev/null
@@ -0,0 +1,951 @@
+/*
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_kernel.h"
+#include "ips_common.h"
+
+/*
+ * minimum number of PIO buffers we want per port, after those
+ * reserved for the driver
+ */
+#define IPATH_MIN_USER_PORT_BUFCNT 8
+
+/*
+ * Number of ports we are configured to use (to allow for more pio
+ * buffers per port, etc.)  Zero means use chip value.
+ */
+static ushort ipath_cfgports;
+
+module_param_named(cfgports, ipath_cfgports, ushort, S_IRUGO);
+MODULE_PARM_DESC(cfgports, "Set max number of ports to use");
+
+/*
+ * Number of buffers reserved for driver (layered drivers and SMA
+ * send).  Reserved at end of buffer list.
+ */
+static ushort ipath_kpiobufs = 32;
+
+static int ipath_set_kpiobufs(const char *val, struct kernel_param *kp);
+
+module_param_call(kpiobufs, ipath_set_kpiobufs, param_get_uint,
+                 &ipath_kpiobufs, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(kpiobufs, "Set number of PIO buffers for driver");
+
+/**
+ * create_port0_egr - allocate the eager TID buffers
+ * @dd: the infinipath device
+ *
+ * This code is now quite different for user and kernel, because
+ * the kernel uses skb's, for the accelerated network performance.
+ * This is the kernel (port0) version.
+ *
+ * Allocate the eager TID buffers and program them into infinipath.
+ * We use the network layer alloc_skb() allocator to allocate the
+ * memory, and either use the buffers as is for things like SMA
+ * packets, or pass the buffers up to the ipath layered driver and
+ * thence the network layer, replacing them as we do so (see
+ * ipath_rcv_layer()).
+ */
+static int create_port0_egr(struct ipath_devdata *dd)
+{
+       unsigned e, egrcnt;
+       struct sk_buff **skbs;
+       int ret;
+
+       egrcnt = dd->ipath_rcvegrcnt;
+
+       skbs = vmalloc(sizeof(*dd->ipath_port0_skbs) * egrcnt);
+       if (skbs == NULL) {
+               ipath_dev_err(dd, "allocation error for eager TID "
+                             "skb array\n");
+               ret = -ENOMEM;
+               goto bail;
+       }
+       for (e = 0; e < egrcnt; e++) {
+               /*
+                * This is a bit tricky in that we allocate extra
+                * space for 2 bytes of the 14 byte ethernet header.
+                * These two bytes are passed in the ipath header so
+                * the rest of the data is word aligned.  We allocate
+                * 4 bytes so that the data buffer stays word aligned.
+                * See ipath_kreceive() for more details.
+                */
+               skbs[e] = ipath_alloc_skb(dd, GFP_KERNEL);
+               if (!skbs[e]) {
+                       ipath_dev_err(dd, "SKB allocation error for "
+                                     "eager TID %u\n", e);
+                       /* free any skbs allocated so far, and the array */
+                       while (e != 0)
+                               dev_kfree_skb(skbs[--e]);
+                       vfree(skbs);
+                       ret = -ENOMEM;
+                       goto bail;
+               }
+       }
+       /*
+        * After loop above, so we can test non-NULL to see if ready
+        * to use at receive, etc.
+        */
+       dd->ipath_port0_skbs = skbs;
+
+       for (e = 0; e < egrcnt; e++) {
+               unsigned long phys =
+                       virt_to_phys(dd->ipath_port0_skbs[e]->data);
+               dd->ipath_f_put_tid(dd, e + (u64 __iomem *)
+                                   ((char __iomem *) dd->ipath_kregbase +
+                                    dd->ipath_rcvegrbase), 0, phys);
+       }
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+static int bringup_link(struct ipath_devdata *dd)
+{
+       u64 val, ibc;
+       int ret = 0;
+
+       /* hold IBC in reset */
+       dd->ipath_control &= ~INFINIPATH_C_LINKENABLE;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+                        dd->ipath_control);
+
+       /*
+        * Note that prior to try 14 or 15 of IB, the credit scaling
+        * wasn't working, because it was swapped for writes with the
+        * 1 bit default linkstate field
+        */
+
+       /* ignore pbc and align word */
+       val = dd->ipath_piosize2k - 2 * sizeof(u32);
+       /*
+        * for ICRC, which we only send in diag test pkt mode, and we
+        * don't need to worry about that for mtu
+        */
+       val += 1;
+       /*
+        * Set the IBC maxpktlength to the size of our pio buffers; the
+        * maxpktlength is in words.  This is *not* the IB data MTU.
+        */
+       ibc = (val / sizeof(u32)) << INFINIPATH_IBCC_MAXPKTLEN_SHIFT;
+       /* in KB */
+       ibc |= 0x5ULL << INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT;
+       /*
+        * How often flowctrl sent.  More or less in usecs; balance against
+        * watermark value, so that in theory senders always get a flow
+        * control update in time to not let the IB link go idle.
+        */
+       ibc |= 0x3ULL << INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT;
+       /* max error tolerance */
+       ibc |= 0xfULL << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT;
+       /* use "real" buffer space for */
+       ibc |= 4ULL << INFINIPATH_IBCC_CREDITSCALE_SHIFT;
+       /* IB credit flow control. */
+       ibc |= 0xfULL << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT;
+       /* initially come up waiting for TS1, without sending anything. */
+       dd->ipath_ibcctrl = ibc;
+       /*
+        * Want to start out with both LINKCMD and LINKINITCMD in NOP
+        * (0 and 0).  Don't put linkinitcmd in ipath_ibcctrl, want that
+        * to stay a NOP
+        */
+       ibc |= INFINIPATH_IBCC_LINKINITCMD_DISABLE <<
+               INFINIPATH_IBCC_LINKINITCMD_SHIFT;
+       ipath_cdbg(VERBOSE, "Writing 0x%llx to ibcctrl\n",
+                  (unsigned long long) ibc);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, ibc);
+
+       /* be sure chip saw it */
+       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+
+       ret = dd->ipath_f_bringup_serdes(dd);
+
+       if (ret)
+               dev_info(&dd->pcidev->dev, "Could not initialize SerDes, "
+                        "not usable\n");
+       else {
+               /* enable IBC */
+               dd->ipath_control |= INFINIPATH_C_LINKENABLE;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+                                dd->ipath_control);
+       }
+
+       return ret;
+}
+
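+/*
+ * First-time (non-reset) initialization: read the port, TID, eager
+ * buffer and PIO buffer geometry from the chip, allocate the per-port
+ * data arrays, and set up port 0 for kernel use.
+ */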
+static int init_chip_first(struct ipath_devdata *dd,
+                          struct ipath_portdata **pdp)
+{
+       struct ipath_portdata *pd = NULL;
+       int ret = 0;
+       u64 val;
+
+       /*
+        * skip cfgports stuff because we are not allocating memory,
+        * and we don't want problems if the portcnt changed due to
+        * cfgports.  We do still check and report a difference, if
+        * not same (should be impossible).
+        */
+       dd->ipath_portcnt =
+               ipath_read_kreg32(dd, dd->ipath_kregs->kr_portcnt);
+       if (!ipath_cfgports)
+               dd->ipath_cfgports = dd->ipath_portcnt;
+       else if (ipath_cfgports <= dd->ipath_portcnt) {
+               dd->ipath_cfgports = ipath_cfgports;
+               ipath_dbg("Configured to use %u ports out of %u in chip\n",
+                         dd->ipath_cfgports, dd->ipath_portcnt);
+       } else {
+               dd->ipath_cfgports = dd->ipath_portcnt;
+               ipath_dbg("Tried to configure to use %u ports; chip "
+                         "only supports %u\n", ipath_cfgports,
+                         dd->ipath_portcnt);
+       }
+       dd->ipath_pd = kzalloc(sizeof(*dd->ipath_pd) * dd->ipath_cfgports,
+                              GFP_KERNEL);
+
+       if (!dd->ipath_pd) {
+               ipath_dev_err(dd, "Unable to allocate portdata array, "
+                             "failing\n");
+               ret = -ENOMEM;
+               goto done;
+       }
+
+       dd->ipath_lastegrheads = kzalloc(sizeof(*dd->ipath_lastegrheads)
+                                        * dd->ipath_cfgports,
+                                        GFP_KERNEL);
+       dd->ipath_lastrcvhdrqtails =
+               kzalloc(sizeof(*dd->ipath_lastrcvhdrqtails)
+                       * dd->ipath_cfgports, GFP_KERNEL);
+
+       if (!dd->ipath_lastegrheads || !dd->ipath_lastrcvhdrqtails) {
+               ipath_dev_err(dd, "Unable to allocate head arrays, "
+                             "failing\n");
+               ret = -ENOMEM;
+               goto done;
+       }
+
+       dd->ipath_pd[0] = kzalloc(sizeof(*pd), GFP_KERNEL);
+
+       if (!dd->ipath_pd[0]) {
+               ipath_dev_err(dd, "Unable to allocate portdata for port "
+                             "0, failing\n");
+               ret = -ENOMEM;
+               goto done;
+       }
+       pd = dd->ipath_pd[0];
+       pd->port_dd = dd;
+       pd->port_port = 0;
+       pd->port_cnt = 1;
+       /* The port 0 pkey table is used by the layer interface. */
+       pd->port_pkeys[0] = IPS_DEFAULT_P_KEY;
+       dd->ipath_rcvtidcnt =
+               ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt);
+       dd->ipath_rcvtidbase =
+               ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidbase);
+       dd->ipath_rcvegrcnt =
+               ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
+       dd->ipath_rcvegrbase =
+               ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrbase);
+       dd->ipath_palign =
+               ipath_read_kreg32(dd, dd->ipath_kregs->kr_pagealign);
+       dd->ipath_piobufbase =
+               ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiobufbase);
+       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiosize);
+       dd->ipath_piosize2k = val & ~0U;
+       dd->ipath_piosize4k = val >> 32;
+       dd->ipath_ibmtu = 4096; /* default to largest legal MTU */
+       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiobufcnt);
+       dd->ipath_piobcnt2k = val & ~0U;
+       dd->ipath_piobcnt4k = val >> 32;
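+       /* piobufbase: low 32 bits are the 2k buffer offset, high 32 the 4k */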
+       dd->ipath_pio2kbase =
+               (u32 __iomem *) (((char __iomem *) dd->ipath_kregbase) +
+                                (dd->ipath_piobufbase & 0xffffffff));
+       if (dd->ipath_piobcnt4k) {
+               dd->ipath_pio4kbase = (u32 __iomem *)
+                       (((char __iomem *) dd->ipath_kregbase) +
+                        (dd->ipath_piobufbase >> 32));
+               /*
+                * 4K buffers take 2 pages; we use roundup just to be
+                * paranoid; we calculate it once here, rather than on
+                * every buf allocate
+                */
+               dd->ipath_4kalign = ALIGN(dd->ipath_piosize4k,
+                                         dd->ipath_palign);
+               ipath_dbg("%u 2k(%x) piobufs @ %p, %u 4k(%x) @ %p "
+                         "(%x aligned)\n",
+                         dd->ipath_piobcnt2k, dd->ipath_piosize2k,
+                         dd->ipath_pio2kbase, dd->ipath_piobcnt4k,
+                         dd->ipath_piosize4k, dd->ipath_pio4kbase,
+                         dd->ipath_4kalign);
+       } else
+               ipath_dbg("%u 2k piobufs @ %p\n",
+                         dd->ipath_piobcnt2k, dd->ipath_pio2kbase);
+
+       spin_lock_init(&dd->ipath_tid_lock);
+
+done:
+       *pdp = pd;
+       return ret;
+}
+
+/**
+ * init_chip_reset - re-initialize after a reset, or enable
+ * @dd: the infinipath device
+ * @pdp: output for port data
+ *
+ * sanity check at least some of the values after reset, and
+ * ensure no receive or transmit (explicitly, in case reset
+ * failed)
+ */
+static int init_chip_reset(struct ipath_devdata *dd,
+                          struct ipath_portdata **pdp)
+{
+       struct ipath_portdata *pd;
+       u32 rtmp;
+
+       *pdp = pd = dd->ipath_pd[0];
+       /* ensure chip does no sends or receives while we re-initialize */
+       dd->ipath_control = dd->ipath_sendctrl = dd->ipath_rcvctrl = 0U;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, 0);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, 0);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0);
+
+       rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_portcnt);
+       if (dd->ipath_portcnt != rtmp)
+               dev_info(&dd->pcidev->dev, "portcnt was %u before "
+                        "reset, now %u, using original\n",
+                        dd->ipath_portcnt, rtmp);
+       rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt);
+       if (rtmp != dd->ipath_rcvtidcnt)
+               dev_info(&dd->pcidev->dev, "tidcnt was %u before "
+                        "reset, now %u, using original\n",
+                        dd->ipath_rcvtidcnt, rtmp);
+       rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidbase);
+       if (rtmp != dd->ipath_rcvtidbase)
+               dev_info(&dd->pcidev->dev, "tidbase was %u before "
+                        "reset, now %u, using original\n",
+                        dd->ipath_rcvtidbase, rtmp);
+       rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
+       if (rtmp != dd->ipath_rcvegrcnt)
+               dev_info(&dd->pcidev->dev, "egrcnt was %u before "
+                        "reset, now %u, using original\n",
+                        dd->ipath_rcvegrcnt, rtmp);
+       rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrbase);
+       if (rtmp != dd->ipath_rcvegrbase)
+               dev_info(&dd->pcidev->dev, "egrbase was %u before "
+                        "reset, now %u, using original\n",
+                        dd->ipath_rcvegrbase, rtmp);
+
+       return 0;
+}
+
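+/*
+ * Allocate the page that the chip DMAs the PIO buffer-available bits
+ * into, and carve the status word and freeze-message buffer (visible
+ * to user programs) out of the same page.
+ */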
+static int init_pioavailregs(struct ipath_devdata *dd)
+{
+       int ret;
+
+       dd->ipath_pioavailregs_dma = dma_alloc_coherent(
+               &dd->pcidev->dev, PAGE_SIZE, &dd->ipath_pioavailregs_phys,
+               GFP_KERNEL);
+       if (!dd->ipath_pioavailregs_dma) {
+               ipath_dev_err(dd, "failed to allocate PIOavail reg area "
+                             "in memory\n");
+               ret = -ENOMEM;
+               goto done;
+       }
+
+       /*
+        * we really want L2 cache aligned, but for current CPUs of
+        * interest, they are the same.
+        */
+       dd->ipath_statusp = (u64 *)
+               ((char *)dd->ipath_pioavailregs_dma +
+                ((2 * L1_CACHE_BYTES +
+                  dd->ipath_pioavregs * sizeof(u64)) & ~L1_CACHE_BYTES));
+       /* copy the current value now that it's really allocated */
+       *dd->ipath_statusp = dd->_ipath_status;
+       /*
+        * setup buffer to hold freeze msg, accessible to apps,
+        * following statusp
+        */
+       dd->ipath_freezemsg = (char *)&dd->ipath_statusp[1];
+       /* and its length */
+       dd->ipath_freezelen = L1_CACHE_BYTES - sizeof(dd->ipath_statusp[0]);
+
+       if (dd->ipath_unit * 64 > (IPATH_PORT0_RCVHDRTAIL_SIZE - 64)) {
+               ipath_dev_err(dd, "unit %u too large for port 0 "
+                             "rcvhdrtail buffer size\n", dd->ipath_unit);
+               ret = -ENODEV;
+       } else
+               ret = 0;
+
+       /* so we can get current tail in ipath_kreceive(), per chip */
+       dd->ipath_hdrqtailptr = &ipath_port0_rcvhdrtail[
+               dd->ipath_unit * (64 / sizeof(*ipath_port0_rcvhdrtail))];
+done:
+       return ret;
+}
+
+/**
+ * init_shadow_tids - allocate the shadow TID array
+ * @dd: the infinipath device
+ *
+ * allocate the shadow TID array, so we can ipath_munlock previous
+ * entries.  It may make more sense to move the pageshadow to the
+ * port data structure, so we only allocate memory for ports actually
+ * in use, since we are now at 8k per port.
+ */
+static void init_shadow_tids(struct ipath_devdata *dd)
+{
+       dd->ipath_pageshadow = (struct page **)
+               vmalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt *
+                       sizeof(struct page *));
+       if (!dd->ipath_pageshadow)
+               ipath_dev_err(dd, "failed to allocate shadow page * "
+                             "array, no expected sends!\n");
+       else
+               memset(dd->ipath_pageshadow, 0,
+                      dd->ipath_cfgports * dd->ipath_rcvtidcnt *
+                      sizeof(struct page *));
+}
+
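+/*
+ * Enable port 0 receive and PIO send, mark the device initialized, and
+ * initialize the shadow copies of the head pointers and of the PIO
+ * buffer-available registers from the DMA'ed values.
+ */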
+static void enable_chip(struct ipath_devdata *dd,
+                       struct ipath_portdata *pd, int reinit)
+{
+       u32 val;
+       int i;
+
+       if (!reinit) {
+               init_waitqueue_head(&ipath_sma_state_wait);
+       }
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+                        dd->ipath_rcvctrl);
+
+       /* Enable PIO send, and update of PIOavail regs to memory. */
+       dd->ipath_sendctrl = INFINIPATH_S_PIOENABLE |
+               INFINIPATH_S_PIOBUFAVAILUPD;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+                        dd->ipath_sendctrl);
+
+       /*
+        * enable port 0 receive, and receive interrupt.  other ports
+        * done as user opens and inits them.
+        */
+       dd->ipath_rcvctrl = INFINIPATH_R_TAILUPD |
+               (1ULL << INFINIPATH_R_PORTENABLE_SHIFT) |
+               (1ULL << INFINIPATH_R_INTRAVAIL_SHIFT);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+                        dd->ipath_rcvctrl);
+
+       /*
+        * now ready for use.  this should be cleared whenever we
+        * detect a reset, or initiate one.
+        */
+       dd->ipath_flags |= IPATH_INITTED;
+
+       /*
+        * init our shadow copies of head from tail values, and write
+        * head values to match.
+        */
+       val = ipath_read_ureg32(dd, ur_rcvegrindextail, 0);
+       (void)ipath_write_ureg(dd, ur_rcvegrindexhead, val, 0);
+       dd->ipath_port0head = ipath_read_ureg32(dd, ur_rcvhdrtail, 0);
+
+       /* Initialize so we interrupt on next packet received */
+       (void)ipath_write_ureg(dd, ur_rcvhdrhead,
+                              dd->ipath_rhdrhead_intr_off |
+                              dd->ipath_port0head, 0);
+
+       /*
+        * by now pioavail updates to memory should have occurred, so
+        * copy them into our working/shadow registers; this is in
+        * case something went wrong with abort, but mostly to get the
+        * initial values of the generation bit correct.
+        */
+       for (i = 0; i < dd->ipath_pioavregs; i++) {
+               __le64 val;
+
+               /*
+                * Chip Errata bug 6641; even and odd qwords>3 are swapped.
+                */
+               if (i > 3) {
+                       if (i & 1)
+                               val = dd->ipath_pioavailregs_dma[i - 1];
+                       else
+                               val = dd->ipath_pioavailregs_dma[i + 1];
+               } else
+                       val = dd->ipath_pioavailregs_dma[i];
+               dd->ipath_pioavailshadow[i] = le64_to_cpu(val);
+       }
+       /* can get counters, stats, etc. */
+       dd->ipath_flags |= IPATH_PRESENT;
+}
+
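+/*
+ * Housekeeping common to first-time init and re-init: clear shadow
+ * register copies, read the chip revision and register base offsets,
+ * verify that the chip's software version is one this driver handles,
+ * and build the board version string.
+ */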
+static int init_housekeeping(struct ipath_devdata *dd,
+                            struct ipath_portdata **pdp, int reinit)
+{
+       char boardn[32];
+       int ret = 0;
+
+       /*
+        * have to clear shadow copies of registers at init that are
+        * not otherwise set here, or all kinds of bizarre things
+        * happen with driver on chip reset
+        */
+       dd->ipath_rcvhdrsize = 0;
+
+       /*
+        * Don't clear ipath_flags as 8bit mode was set before
+        * entering this func. However, we do set the linkstate to
+        * unknown, so we can watch for a transition.
+        */
+       dd->ipath_flags |= IPATH_LINKUNK;
+       dd->ipath_flags &= ~(IPATH_LINKACTIVE | IPATH_LINKARMED |
+                            IPATH_LINKDOWN | IPATH_LINKINIT);
+
+       ipath_cdbg(VERBOSE, "Try to read spc chip revision\n");
+       dd->ipath_revision =
+               ipath_read_kreg64(dd, dd->ipath_kregs->kr_revision);
+
+       /*
+        * set up fundamental info we need to use the chip; we assume
+        * if the revision reg and these regs are OK, we don't need to
+        * special case the rest
+        */
+       dd->ipath_sregbase =
+               ipath_read_kreg32(dd, dd->ipath_kregs->kr_sendregbase);
+       dd->ipath_cregbase =
+               ipath_read_kreg32(dd, dd->ipath_kregs->kr_counterregbase);
+       dd->ipath_uregbase =
+               ipath_read_kreg32(dd, dd->ipath_kregs->kr_userregbase);
+       ipath_cdbg(VERBOSE, "ipath_kregbase %p, sendbase %x usrbase %x, "
+                  "cntrbase %x\n", dd->ipath_kregbase, dd->ipath_sregbase,
+                  dd->ipath_uregbase, dd->ipath_cregbase);
+       if ((dd->ipath_revision & 0xffffffff) == 0xffffffff
+           || (dd->ipath_sregbase & 0xffffffff) == 0xffffffff
+           || (dd->ipath_cregbase & 0xffffffff) == 0xffffffff
+           || (dd->ipath_uregbase & 0xffffffff) == 0xffffffff) {
+               ipath_dev_err(dd, "Register read failures from chip, "
+                             "giving up initialization\n");
+               ret = -ENODEV;
+               goto done;
+       }
+
+       /* clear the initial reset flag, in case first driver load */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
+                        INFINIPATH_E_RESET);
+
+       if (reinit)
+               ret = init_chip_reset(dd, pdp);
+       else
+               ret = init_chip_first(dd, pdp);
+
+       if (ret)
+               goto done;
+
+       ipath_cdbg(VERBOSE, "Revision %llx (PCI %x), %u ports, %u tids, "
+                  "%u egrtids\n", (unsigned long long) dd->ipath_revision,
+                  dd->ipath_pcirev, dd->ipath_portcnt, dd->ipath_rcvtidcnt,
+                  dd->ipath_rcvegrcnt);
+
+       if (((dd->ipath_revision >> INFINIPATH_R_SOFTWARE_SHIFT) &
+            INFINIPATH_R_SOFTWARE_MASK) != IPATH_CHIP_SWVERSION) {
+               ipath_dev_err(dd, "Driver only handles version %d, "
+                             "chip swversion is %d (%llx), failing\n",
+                             IPATH_CHIP_SWVERSION,
+                             (int)(dd->ipath_revision >>
+                                   INFINIPATH_R_SOFTWARE_SHIFT) &
+                             INFINIPATH_R_SOFTWARE_MASK,
+                             (unsigned long long) dd->ipath_revision);
+               ret = -ENOSYS;
+               goto done;
+       }
+       dd->ipath_majrev = (u8) ((dd->ipath_revision >>
+                                 INFINIPATH_R_CHIPREVMAJOR_SHIFT) &
+                                INFINIPATH_R_CHIPREVMAJOR_MASK);
+       dd->ipath_minrev = (u8) ((dd->ipath_revision >>
+                                 INFINIPATH_R_CHIPREVMINOR_SHIFT) &
+                                INFINIPATH_R_CHIPREVMINOR_MASK);
+       dd->ipath_boardrev = (u8) ((dd->ipath_revision >>
+                                   INFINIPATH_R_BOARDID_SHIFT) &
+                                  INFINIPATH_R_BOARDID_MASK);
+
+       ret = dd->ipath_f_get_boardname(dd, boardn, sizeof boardn);
+
+       snprintf(dd->ipath_boardversion, sizeof(dd->ipath_boardversion),
+                "Driver %u.%u, %s, InfiniPath%u %u.%u, PCI %u, "
+                "SW Compat %u\n",
+                IPATH_CHIP_VERS_MAJ, IPATH_CHIP_VERS_MIN, boardn,
+                (unsigned)(dd->ipath_revision >> INFINIPATH_R_ARCH_SHIFT) &
+                INFINIPATH_R_ARCH_MASK,
+                dd->ipath_majrev, dd->ipath_minrev, dd->ipath_pcirev,
+                (unsigned)(dd->ipath_revision >>
+                           INFINIPATH_R_SOFTWARE_SHIFT) &
+                INFINIPATH_R_SOFTWARE_MASK);
+
+       ipath_dbg("%s", dd->ipath_boardversion);
+
+done:
+       return ret;
+}
+
+
+/**
+ * ipath_init_chip - do the actual initialization sequence on the chip
+ * @dd: the infinipath device
+ * @reinit: reinitializing, so don't allocate new memory
+ *
+ * Do the actual initialization sequence on the chip.  This is done
+ * both from the init routine called from the PCI infrastructure, and
+ * when we reset the chip, or detect that it was reset internally,
+ * or it's administratively re-enabled.
+ *
+ * Memory allocation here and in called routines is only done in
+ * the first case (reinit == 0).  We have to be careful, because even
+ * without memory allocation, we need to re-write all the chip registers,
+ * TIDs, etc. after the reset or enable has completed.
+ */
+int ipath_init_chip(struct ipath_devdata *dd, int reinit)
+{
+       int ret = 0, i;
+       u32 val32, kpiobufs;
+       u64 val, atmp;
+       struct ipath_portdata *pd = NULL; /* keep gcc4 happy */
+
+       ret = init_housekeeping(dd, &pd, reinit);
+       if (ret)
+               goto done;
+
+       /*
+        * we ignore most issues after reporting them, but have to specially
+        * handle hardware-disabled chips.
+        */
+       if (ret == 2) {
+               /* unique error, known to ipath_init_one */
+               ret = -EPERM;
+               goto done;
+       }
+
+       /*
+        * We could bump this to allow for full rcvegrcnt + rcvtidcnt,
+        * but then it no longer nicely fits a power of two, and since
+        * we now use routines that backend onto __get_free_pages, the
+        * rest would be wasted.
+        */
+       dd->ipath_rcvhdrcnt = dd->ipath_rcvegrcnt;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrcnt,
+                        dd->ipath_rcvhdrcnt);
+
+       /*
+        * Set up the shadow copies of the piobufavail registers,
+        * which we compare against the chip registers for now, and
+        * the in memory DMA'ed copies of the registers.  This has to
+        * be done early, before we calculate lastport, etc.
+        */
+       val = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
+       /*
+        * calc number of pioavail registers, and save it; we have 2
+        * bits per buffer.
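+        * Each 64-bit register thus covers 32 buffers.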
+        */
+       dd->ipath_pioavregs = ALIGN(val, sizeof(u64) * BITS_PER_BYTE / 2)
+               / (sizeof(u64) * BITS_PER_BYTE / 2);
+       if (!ipath_kpiobufs)    /* have to have at least 1, for SMA */
+               kpiobufs = ipath_kpiobufs = 1;
+       else if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) <
+                (dd->ipath_cfgports * IPATH_MIN_USER_PORT_BUFCNT)) {
+               dev_info(&dd->pcidev->dev, "Too few PIO buffers (%u) "
+                        "for %u ports to have %u each!\n",
+                        dd->ipath_piobcnt2k + dd->ipath_piobcnt4k,
+                        dd->ipath_cfgports, IPATH_MIN_USER_PORT_BUFCNT);
+               kpiobufs = 1;   /* reserve just the minimum for SMA/ether */
+       } else
+               kpiobufs = ipath_kpiobufs;
+
+       if (kpiobufs >
+           (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k -
+            (dd->ipath_cfgports * IPATH_MIN_USER_PORT_BUFCNT))) {
+               i = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k -
+                       (dd->ipath_cfgports * IPATH_MIN_USER_PORT_BUFCNT);
+               if (i < 0)
+                       i = 0;
+               dev_info(&dd->pcidev->dev, "Allocating %d PIO bufs for "
+                        "kernel leaves too few for %d user ports "
+                        "(%d each); using %u\n", kpiobufs,
+                        dd->ipath_cfgports - 1,
+                        IPATH_MIN_USER_PORT_BUFCNT, i);
+               /*
+                * shouldn't change ipath_kpiobufs, because it could be
+                * different for different devices...
+                */
+               kpiobufs = i;
+       }
+       dd->ipath_lastport_piobuf =
+               dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - kpiobufs;
+       dd->ipath_pbufsport = dd->ipath_cfgports > 1
+               ? dd->ipath_lastport_piobuf / (dd->ipath_cfgports - 1)
+               : 0;
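+       /* buffers left over after dividing evenly among the user ports */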
+       val32 = dd->ipath_lastport_piobuf -
+               (dd->ipath_pbufsport * (dd->ipath_cfgports - 1));
+       if (val32 > 0) {
+               ipath_dbg("allocating %u pbufs/port leaves %u unused, "
+                         "add to kernel\n", dd->ipath_pbufsport, val32);
+               dd->ipath_lastport_piobuf -= val32;
+               ipath_dbg("%u pbufs/port leaves %u unused, add to kernel\n",
+                         dd->ipath_pbufsport, val32);
+       }
+       dd->ipath_lastpioindex = dd->ipath_lastport_piobuf;
+       ipath_cdbg(VERBOSE, "%d PIO bufs for kernel out of %d total, %u "
+                  "each for %u user ports\n", kpiobufs,
+                  dd->ipath_piobcnt2k + dd->ipath_piobcnt4k,
+                  dd->ipath_pbufsport, dd->ipath_cfgports - 1);
+
+       dd->ipath_f_early_init(dd);
+
+       /* early_init sets rcvhdrentsize and rcvhdrsize, so this must be
+        * done after early_init */
+       dd->ipath_hdrqlast =
+               dd->ipath_rcvhdrentsize * (dd->ipath_rcvhdrcnt - 1);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrentsize,
+                        dd->ipath_rcvhdrentsize);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize,
+                        dd->ipath_rcvhdrsize);
+
+       if (!reinit) {
+               ret = init_pioavailregs(dd);
+               init_shadow_tids(dd);
+               if (ret)
+                       goto done;
+       }
+
+       (void)ipath_write_kreg(dd, dd->ipath_kregs->kr_sendpioavailaddr,
+                              dd->ipath_pioavailregs_phys);
+       /*
+        * this is to detect s/w errors, which the h/w works around by
+        * ignoring the low 6 bits of address, if it wasn't aligned.
+        */
+       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpioavailaddr);
+       if (val != dd->ipath_pioavailregs_phys) {
+               ipath_dev_err(dd, "Catastrophic software error, "
+                             "SendPIOAvailAddr written as %lx, "
+                             "read back as %llx\n",
+                             (unsigned long) dd->ipath_pioavailregs_phys,
+                             (unsigned long long) val);
+               ret = -EINVAL;
+               goto done;
+       }
+
+       val = ipath_port0_rcvhdrtail_dma + dd->ipath_unit * 64;
+
+       /* verify that the alignment requirement was met */
+       ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdrtailaddr,
+                             0, val);
+       atmp = ipath_read_kreg64_port(
+               dd, dd->ipath_kregs->kr_rcvhdrtailaddr, 0);
+       if (val != atmp) {
+               ipath_dev_err(dd, "Catastrophic software error, "
+                             "RcvHdrTailAddr0 written as %llx, "
+                             "read back as %llx from %x\n",
+                             (unsigned long long) val,
+                             (unsigned long long) atmp,
+                             dd->ipath_kregs->kr_rcvhdrtailaddr);
+               ret = -EINVAL;
+               goto done;
+       }
+
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvbthqp, IPATH_KD_QP);
+
+       /*
+        * make sure we are not in freeze, and PIO send enabled, so
+        * writes to pbc happen
+        */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, 0ULL);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+                        ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+                        INFINIPATH_S_PIOENABLE);
+
+       /*
+        * before error clears, since we expect serdes pll errors during
+        * this, the first time after reset
+        */
+       if (bringup_link(dd)) {
+               dev_info(&dd->pcidev->dev, "Failed to bring up IB link\n");
+               ret = -ENETDOWN;
+               goto done;
+       }
+
+       /*
+        * clear any "expected" hwerrs from reset and/or initialization
+        * clear any that aren't enabled (at least this once), and then
+        * set the enable mask
+        */
+       dd->ipath_f_init_hwerrors(dd);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+                        ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+                        dd->ipath_hwerrmask);
+
+       dd->ipath_maskederrs = dd->ipath_ignorederrs;
+       /* clear all */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL);
+       /* enable errors that are masked, at least this first time. */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+                        ~dd->ipath_maskederrs);
+       /* clear any interrupts up to this point (ints still not enabled) */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL);
+
+       ipath_stats.sps_lid[dd->ipath_unit] = dd->ipath_lid;
+
+       /*
+        * Set up the port 0 (kernel) rcvhdr q and egr TIDs.  If doing
+        * re-init, the simplest way to handle this is to free
+        * existing, and re-allocate.
+        */
+       if (reinit)
+               ipath_free_pddata(dd, 0, 0);
+       dd->ipath_f_tidtemplate(dd);
+       ret = ipath_create_rcvhdrq(dd, pd);
+       if (!ret)
+               ret = create_port0_egr(dd);
+       if (ret)
+               ipath_dev_err(dd, "failed to allocate port 0 (kernel) "
+                             "rcvhdrq and/or egr bufs\n");
+       else
+               enable_chip(dd, pd, reinit);
+
+       /*
+        * cause retrigger of pending interrupts ignored during init,
+        * even if we had errors
+        */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
+
+       if (!dd->ipath_stats_timer_active) {
+               /*
+                * first init, or after an admin disable/enable
+                * set up stats retrieval timer, even if we had errors
+                * in last portion of setup
+                */
+               init_timer(&dd->ipath_stats_timer);
+               dd->ipath_stats_timer.function = ipath_get_faststats;
+               dd->ipath_stats_timer.data = (unsigned long) dd;
+               /* every 5 seconds */
+               dd->ipath_stats_timer.expires = jiffies + 5 * HZ;
+               /* takes ~16 seconds to overflow at full IB 4x bandwidth */
+               add_timer(&dd->ipath_stats_timer);
+               dd->ipath_stats_timer_active = 1;
+       }
+
+done:
+       if (!ret) {
+               ipath_get_guid(dd);
+               *dd->ipath_statusp |= IPATH_STATUS_CHIP_PRESENT;
+               if (!dd->ipath_f_intrsetup(dd)) {
+                       /* now we can enable all interrupts from the chip */
+                       ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask,
+                                        -1LL);
+                       /* force re-interrupt of any pending interrupts. */
+                       ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear,
+                                        0ULL);
+                       /* chip is usable; mark it as initialized */
+                       *dd->ipath_statusp |= IPATH_STATUS_INITTED;
+               } else
+                       ipath_dev_err(dd, "No interrupts enabled, couldn't "
+                                     "setup interrupt address\n");
+
+               if (dd->ipath_cfgports > ipath_stats.sps_nports)
+                       /*
+                        * sps_nports is a global, so, we set it to
+                        * the highest number of ports of any of the
+                        * chips we find; we never decrement it, at
+                        * least for now.  Since this might have changed
+                        * over disable/enable or prior to reset, always
+                        * do the check and potentially adjust.
+                        */
+                       ipath_stats.sps_nports = dd->ipath_cfgports;
+       } else
+               ipath_dbg("Failed (%d) to initialize chip\n", ret);
+
+       /* if ret is non-zero, we probably should do some cleanup
+          here... */
+       return ret;
+}
+
+static int ipath_set_kpiobufs(const char *str, struct kernel_param *kp)
+{
+       struct ipath_devdata *dd;
+       unsigned long flags;
+       unsigned short val;
+       int ret;
+
+       ret = ipath_parse_ushort(str, &val);
+
+       spin_lock_irqsave(&ipath_devs_lock, flags);
+
+       if (ret < 0)
+               goto bail;
+
+       if (val == 0) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       list_for_each_entry(dd, &ipath_dev_list, ipath_list) {
+               if (dd->ipath_kregbase)
+                       continue;
+               if (val > (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k -
+                          (dd->ipath_cfgports *
+                           IPATH_MIN_USER_PORT_BUFCNT)))
+               {
+                       ipath_dev_err(
+                               dd,
+                               "Allocating %d PIO bufs for kernel leaves "
+                               "too few for %d user ports (%d each)\n",
+                               val, dd->ipath_cfgports - 1,
+                               IPATH_MIN_USER_PORT_BUFCNT);
+                       ret = -EINVAL;
+                       goto bail;
+               }
+               dd->ipath_lastport_piobuf =
+                       dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - val;
+       }
+
+       ret = 0;
+bail:
+       spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+       return ret;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c
new file mode 100644 (file)
index 0000000..60f5f41
--- /dev/null
@@ -0,0 +1,841 @@
+/*
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+
+#include "ipath_kernel.h"
+#include "ips_common.h"
+#include "ipath_layer.h"
+
+#define E_SUM_PKTERRS \
+       (INFINIPATH_E_RHDRLEN | INFINIPATH_E_RBADTID | \
+        INFINIPATH_E_RBADVERSION | INFINIPATH_E_RHDR | \
+        INFINIPATH_E_RLONGPKTLEN | INFINIPATH_E_RSHORTPKTLEN | \
+        INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RMINPKTLEN | \
+        INFINIPATH_E_RFORMATERR | INFINIPATH_E_RUNSUPVL | \
+        INFINIPATH_E_RUNEXPCHAR | INFINIPATH_E_REBP)
+
+#define E_SUM_ERRS \
+       (INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM | \
+        INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \
+        INFINIPATH_E_SMAXPKTLEN | INFINIPATH_E_SUNSUPVL | \
+        INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN | \
+        INFINIPATH_E_INVALIDADDR)
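+/*
+ * E_SUM_PKTERRS groups the receive-side packet errors counted in
+ * handle_errors(); E_SUM_ERRS groups the send-side and address errors
+ * handled by handle_e_sum_errs().
+ */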
+
+static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs)
+{
+       unsigned long sbuf[4];
+       u64 ignore_this_time = 0;
+       u32 piobcnt;
+
+       /* it's possible that sendbuffererror could have valid bits set */
+       piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
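+       /*
+        * kr_sendbuffererror is a bitmask with one bit per PIO buffer;
+        * two 64-bit registers cover chips with up to 128 buffers, four
+        * cover up to 256.
+        */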
+       /* read these before writing errorclear */
+       sbuf[0] = ipath_read_kreg64(
+               dd, dd->ipath_kregs->kr_sendbuffererror);
+       sbuf[1] = ipath_read_kreg64(
+               dd, dd->ipath_kregs->kr_sendbuffererror + 1);
+       if (piobcnt > 128) {
+               sbuf[2] = ipath_read_kreg64(
+                       dd, dd->ipath_kregs->kr_sendbuffererror + 2);
+               sbuf[3] = ipath_read_kreg64(
+                       dd, dd->ipath_kregs->kr_sendbuffererror + 3);
+       }
+
+       if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) {
+               int i;
+
+               ipath_cdbg(PKT, "SendbufErrs %lx %lx ", sbuf[0], sbuf[1]);
+               if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128)
+                       printk("%lx %lx ", sbuf[2], sbuf[3]);
+               for (i = 0; i < piobcnt; i++) {
+                       if (test_bit(i, sbuf)) {
+                               u32 __iomem *piobuf;
+                               if (i < dd->ipath_piobcnt2k)
+                                       piobuf = (u32 __iomem *)
+                                               (dd->ipath_pio2kbase +
+                                                i * dd->ipath_palign);
+                               else
+                                       piobuf = (u32 __iomem *)
+                                               (dd->ipath_pio4kbase +
+                                                (i - dd->ipath_piobcnt2k) *
+                                                dd->ipath_4kalign);
+
+                               ipath_cdbg(PKT,
+                                          "PIObuf[%u] @%p pbc is %x; ",
+                                          i, piobuf, readl(piobuf));
+
+                               ipath_disarm_piobufs(dd, i, 1);
+                       }
+               }
+               if (ipath_debug & __IPATH_PKTDBG)
+                       printk("\n");
+       }
+       if ((errs & (INFINIPATH_E_SDROPPEDDATAPKT |
+                    INFINIPATH_E_SDROPPEDSMPPKT |
+                    INFINIPATH_E_SMINPKTLEN)) &&
+           !(dd->ipath_flags & IPATH_LINKACTIVE)) {
+               /*
+                * This can happen when SMA is trying to bring the link
+                * up, but the IB link changes state at the "wrong" time.
+                * The IB logic then complains that the packet isn't
+                * valid.  We don't want to confuse people, so we just
+                * don't print them, except at debug
+                */
+               ipath_dbg("Ignoring pktsend errors %llx, because not "
+                         "yet active\n", (unsigned long long) errs);
+               ignore_this_time = INFINIPATH_E_SDROPPEDDATAPKT |
+                       INFINIPATH_E_SDROPPEDSMPPKT |
+                       INFINIPATH_E_SMINPKTLEN;
+       }
+
+       return ignore_this_time;
+}
+
+/* return the strings for the most common link states */
+static char *ib_linkstate(u32 linkstate)
+{
+       char *ret;
+
+       switch (linkstate) {
+       case IPATH_IBSTATE_INIT:
+               ret = "Init";
+               break;
+       case IPATH_IBSTATE_ARM:
+               ret = "Arm";
+               break;
+       case IPATH_IBSTATE_ACTIVE:
+               ret = "Active";
+               break;
+       default:
+               ret = "Down";
+       }
+
+       return ret;
+}
+
+static void handle_e_ibstatuschanged(struct ipath_devdata *dd,
+                                    ipath_err_t errs, int noprint)
+{
+       u64 val;
+       u32 ltstate, lstate;
+
+       /*
+        * even if diags are enabled, we want to notice LINKINIT, etc.
+        * We just don't want to change the LED state, or
+        * dd->ipath_kregs->kr_ibcctrl
+        */
+       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
+       lstate = val & IPATH_IBSTATE_MASK;
+       if (lstate == IPATH_IBSTATE_INIT || lstate == IPATH_IBSTATE_ARM ||
+           lstate == IPATH_IBSTATE_ACTIVE) {
+               /*
+                * only print at SMA if there is a change, debug if not
+                * (sometimes we want to know that, usually not).
+                */
+               if (lstate == ((unsigned) dd->ipath_lastibcstat
+                              & IPATH_IBSTATE_MASK)) {
+                       ipath_dbg("Status change intr but no change (%s)\n",
+                                 ib_linkstate(lstate));
+               } else
+                       ipath_cdbg(SMA, "Unit %u link state %s, last "
+                                  "was %s\n", dd->ipath_unit,
+                                  ib_linkstate(lstate),
+                                  ib_linkstate((unsigned)
+                                               dd->ipath_lastibcstat
+                                               & IPATH_IBSTATE_MASK));
+       } else {
+               lstate = dd->ipath_lastibcstat & IPATH_IBSTATE_MASK;
+               if (lstate == IPATH_IBSTATE_INIT ||
+                   lstate == IPATH_IBSTATE_ARM ||
+                   lstate == IPATH_IBSTATE_ACTIVE)
+                       ipath_cdbg(SMA, "Unit %u link state down"
+                                  " (state 0x%x), from %s\n",
+                                  dd->ipath_unit,
+                                  (u32)val & IPATH_IBSTATE_MASK,
+                                  ib_linkstate(lstate));
+               else
+                       ipath_cdbg(VERBOSE, "Unit %u link state changed "
+                                  "to 0x%x from down (%x)\n",
+                                  dd->ipath_unit, (u32) val, lstate);
+       }
+       ltstate = (val >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) &
+               INFINIPATH_IBCS_LINKTRAININGSTATE_MASK;
+       lstate = (val >> INFINIPATH_IBCS_LINKSTATE_SHIFT) &
+               INFINIPATH_IBCS_LINKSTATE_MASK;
+
+       if (ltstate == INFINIPATH_IBCS_LT_STATE_POLLACTIVE ||
+           ltstate == INFINIPATH_IBCS_LT_STATE_POLLQUIET) {
+               u32 last_ltstate;
+
+               /*
+                * Ignore cycling back and forth from Polling.Active
+                * to Polling.Quiet while waiting for the other end of
+                * the link to come up. We will cycle back and forth
+                * between them if no cable is plugged in,
+                * the other device is powered off or disabled, etc.
+                */
+               last_ltstate = (dd->ipath_lastibcstat >>
+                               INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT)
+                       & INFINIPATH_IBCS_LINKTRAININGSTATE_MASK;
+               if (last_ltstate == INFINIPATH_IBCS_LT_STATE_POLLACTIVE
+                   || last_ltstate ==
+                   INFINIPATH_IBCS_LT_STATE_POLLQUIET) {
+                       if (dd->ipath_ibpollcnt > 40) {
+                               dd->ipath_flags |= IPATH_NOCABLE;
+                               *dd->ipath_statusp |=
+                                       IPATH_STATUS_IB_NOCABLE;
+                       } else
+                               dd->ipath_ibpollcnt++;
+                       goto skip_ibchange;
+               }
+       }
+       dd->ipath_ibpollcnt = 0;        /* some state other than 2 or 3 */
+       ipath_stats.sps_iblink++;
+       if (ltstate != INFINIPATH_IBCS_LT_STATE_LINKUP) {
+               dd->ipath_flags |= IPATH_LINKDOWN;
+               dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
+                                    | IPATH_LINKACTIVE |
+                                    IPATH_LINKARMED);
+               *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
+               if (!noprint) {
+                       if (((dd->ipath_lastibcstat >>
+                             INFINIPATH_IBCS_LINKSTATE_SHIFT) &
+                            INFINIPATH_IBCS_LINKSTATE_MASK)
+                           == INFINIPATH_IBCS_L_STATE_ACTIVE)
+                               /* if from up to down be more vocal */
+                               ipath_cdbg(SMA,
+                                          "Unit %u link now down (%s)\n",
+                                          dd->ipath_unit,
+                                          ipath_ibcstatus_str[ltstate]);
+                       else
+                               ipath_cdbg(VERBOSE, "Unit %u link is "
+                                          "down (%s)\n", dd->ipath_unit,
+                                          ipath_ibcstatus_str[ltstate]);
+               }
+
+               dd->ipath_f_setextled(dd, lstate, ltstate);
+       } else if ((val & IPATH_IBSTATE_MASK) == IPATH_IBSTATE_ACTIVE) {
+               dd->ipath_flags |= IPATH_LINKACTIVE;
+               dd->ipath_flags &=
+                       ~(IPATH_LINKUNK | IPATH_LINKINIT | IPATH_LINKDOWN |
+                         IPATH_LINKARMED | IPATH_NOCABLE);
+               *dd->ipath_statusp &= ~IPATH_STATUS_IB_NOCABLE;
+               *dd->ipath_statusp |=
+                       IPATH_STATUS_IB_READY | IPATH_STATUS_IB_CONF;
+               dd->ipath_f_setextled(dd, lstate, ltstate);
+
+               __ipath_layer_intr(dd, IPATH_LAYER_INT_IF_UP);
+       } else if ((val & IPATH_IBSTATE_MASK) == IPATH_IBSTATE_INIT) {
+               /*
+                * set INIT and DOWN.  Down is checked by most of the other
+                * code, but INIT is useful to know in a few places.
+                */
+               dd->ipath_flags |= IPATH_LINKINIT | IPATH_LINKDOWN;
+               dd->ipath_flags &=
+                       ~(IPATH_LINKUNK | IPATH_LINKACTIVE | IPATH_LINKARMED
+                         | IPATH_NOCABLE);
+               *dd->ipath_statusp &= ~(IPATH_STATUS_IB_NOCABLE
+                                       | IPATH_STATUS_IB_READY);
+               dd->ipath_f_setextled(dd, lstate, ltstate);
+       } else if ((val & IPATH_IBSTATE_MASK) == IPATH_IBSTATE_ARM) {
+               dd->ipath_flags |= IPATH_LINKARMED;
+               dd->ipath_flags &=
+                       ~(IPATH_LINKUNK | IPATH_LINKDOWN | IPATH_LINKINIT |
+                         IPATH_LINKACTIVE | IPATH_NOCABLE);
+               *dd->ipath_statusp &= ~(IPATH_STATUS_IB_NOCABLE
+                                       | IPATH_STATUS_IB_READY);
+               dd->ipath_f_setextled(dd, lstate, ltstate);
+       } else {
+               if (!noprint)
+                       ipath_dbg("IBstatuschange unit %u: %s (%x)\n",
+                                 dd->ipath_unit,
+                                 ipath_ibcstatus_str[ltstate], ltstate);
+       }
+skip_ibchange:
+       dd->ipath_lastibcstat = val;
+}
+
+static void handle_supp_msgs(struct ipath_devdata *dd,
+                            unsigned supp_msgs, char msg[512])
+{
+       /*
+        * Print the message unless it's ibc status change only, which
+        * happens so often we never want to count it.
+        */
+       if (dd->ipath_lasterror & ~INFINIPATH_E_IBSTATUSCHANGED) {
+               /* msg is a pointer parameter here, so pass its real size */
+               ipath_decode_err(msg, 512, dd->ipath_lasterror &
+                                ~INFINIPATH_E_IBSTATUSCHANGED);
+               if (dd->ipath_lasterror &
+                   ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL))
+                       ipath_dev_err(dd, "Suppressed %u messages for "
+                                     "fast-repeating errors (%s) (%llx)\n",
+                                     supp_msgs, msg,
+                                     (unsigned long long)
+                                     dd->ipath_lasterror);
+               else {
+                       /*
+                        * rcvegrfull and rcvhdrqfull are "normal", for some
+                        * types of processes (mostly benchmarks) that send
+                        * huge numbers of messages, while not processing
+                        * them. So only complain about these at debug
+                        * level.
+                        */
+                       ipath_dbg("Suppressed %u messages for %s\n",
+                                 supp_msgs, msg);
+               }
+       }
+}
+
+static unsigned handle_frequent_errors(struct ipath_devdata *dd,
+                                      ipath_err_t errs, char msg[512],
+                                      int *noprint)
+{
+       unsigned long nc;
+       static unsigned long nextmsg_time;
+       static unsigned nmsgs, supp_msgs;
+
+       /*
+        * Throttle back "fast" messages to no more than 10 per 5 seconds.
+        * This isn't perfect, but it's a reasonable heuristic. If we get
+        * more than 10, give a 6x longer delay.
+        */
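+       /*
+        * concretely: while 10 or fewer messages have been seen, they are
+        * printed and nextmsg_time is pushed HZ/2 ahead on the first one
+        * or whenever the old window has expired; from the 11th message
+        * on, anything arriving before nextmsg_time is suppressed, the
+        * first suppression stretches the window to HZ * 3, and the
+        * suppressed count is reported once a message lands after it.
+        */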
+       nc = jiffies;
+       if (nmsgs > 10) {
+               if (time_before(nc, nextmsg_time)) {
+                       *noprint = 1;
+                       if (!supp_msgs++)
+                               nextmsg_time = nc + HZ * 3;
+               } else if (supp_msgs) {
+                       handle_supp_msgs(dd, supp_msgs, msg);
+                       supp_msgs = 0;
+                       nmsgs = 0;
+               }
+       } else if (!nmsgs++ || time_after(nc, nextmsg_time))
+               nextmsg_time = nc + HZ / 2;
+
+       return supp_msgs;
+}
+
+static void handle_errors(struct ipath_devdata *dd, ipath_err_t errs)
+{
+       char msg[512];
+       u64 ignore_this_time = 0;
+       int i;
+       int chkerrpkts = 0, noprint = 0;
+       unsigned supp_msgs;
+
+       supp_msgs = handle_frequent_errors(dd, errs, msg, &noprint);
+
+       /*
+        * don't report errors that are masked (includes those always
+        * ignored)
+        */
+       errs &= ~dd->ipath_maskederrs;
+
+       /* do these first, they are most important */
+       if (errs & INFINIPATH_E_HARDWARE) {
+               /* reuse same msg buf */
+               dd->ipath_f_handle_hwerrors(dd, msg, sizeof msg);
+       }
+
+       if (!noprint && (errs & ~infinipath_e_bitsextant))
+               ipath_dev_err(dd, "error interrupt with unknown errors "
+                             "%llx set\n", (unsigned long long)
+                             (errs & ~infinipath_e_bitsextant));
+
+       if (errs & E_SUM_ERRS)
+               ignore_this_time = handle_e_sum_errs(dd, errs);
+
+       if (supp_msgs == 250000) {
+               /*
+                * It's not entirely reasonable to assume that the errors set
+                * in the last clear period are all responsible for the
+                * problem, but the alternative is to assume it's only the
+                * ones on this particular interrupt, which also isn't great
+                */
+               dd->ipath_maskederrs |= dd->ipath_lasterror | errs;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+                                ~dd->ipath_maskederrs);
+               ipath_decode_err(msg, sizeof msg,
+                                (dd->ipath_maskederrs & ~dd->
+                                 ipath_ignorederrs));
+
+               if ((dd->ipath_maskederrs & ~dd->ipath_ignorederrs) &
+                   ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL))
+                       ipath_dev_err(dd, "Disabling error(s) %llx because "
+                                     "occurring too frequently (%s)\n",
+                                     (unsigned long long)
+                                     (dd->ipath_maskederrs &
+                                      ~dd->ipath_ignorederrs), msg);
+               else {
+                       /*
+                        * rcvegrfull and rcvhdrqfull are "normal",
+                        * for some types of processes (mostly benchmarks)
+                        * that send huge numbers of messages, while not
+                        * processing them.  So only complain about
+                        * these at debug level.
+                        */
+                       ipath_dbg("Disabling frequent queue full errors "
+                                 "(%s)\n", msg);
+               }
+
+               /*
+                * Re-enable the masked errors after around 3 minutes, in
+                * ipath_get_faststats().  If we have a series of fast
+                * repeating but different errors, the interval will keep
+                * stretching out, but that's OK, as that's pretty
+                * catastrophic.
+                */
+               dd->ipath_unmasktime = jiffies + HZ * 180;
+       }
+
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, errs);
+       if (ignore_this_time)
+               errs &= ~ignore_this_time;
+       if (errs & ~dd->ipath_lasterror) {
+               errs &= ~dd->ipath_lasterror;
+               /* never suppress duplicate hwerrors or ibstatuschange */
+               dd->ipath_lasterror |= errs &
+                       ~(INFINIPATH_E_HARDWARE |
+                         INFINIPATH_E_IBSTATUSCHANGED);
+       }
+       if (!errs)
+               return;
+
+       if (!noprint)
+               /*
+                * the ones we mask off are handled specially below or above
+                */
+               ipath_decode_err(msg, sizeof msg,
+                                errs & ~(INFINIPATH_E_IBSTATUSCHANGED |
+                                         INFINIPATH_E_RRCVEGRFULL |
+                                         INFINIPATH_E_RRCVHDRFULL |
+                                         INFINIPATH_E_HARDWARE));
+       else
+               /* so we don't need if (!noprint) at strlcat's below */
+               *msg = 0;
+
+       if (errs & E_SUM_PKTERRS) {
+               ipath_stats.sps_pkterrs++;
+               chkerrpkts = 1;
+       }
+       if (errs & E_SUM_ERRS)
+               ipath_stats.sps_errs++;
+
+       if (errs & (INFINIPATH_E_RICRC | INFINIPATH_E_RVCRC)) {
+               ipath_stats.sps_crcerrs++;
+               chkerrpkts = 1;
+       }
+
+       /*
+        * We don't want to print these two as they happen, or we can make
+        * the situation even worse, because it takes so long to print
+        * messages to serial consoles.  Kernel ports get printed from
+        * fast_stats, no more than every 5 seconds, user ports get printed
+        * on close
+        */
+       if (errs & INFINIPATH_E_RRCVHDRFULL) {
+               int any;
+               u32 hd, tl;
+               ipath_stats.sps_hdrqfull++;
+               for (any = i = 0; i < dd->ipath_cfgports; i++) {
+                       struct ipath_portdata *pd = dd->ipath_pd[i];
+                       if (i == 0) {
+                               hd = dd->ipath_port0head;
+                               tl = (u32) le64_to_cpu(
+                                       *dd->ipath_hdrqtailptr);
+                       } else if (pd && pd->port_cnt &&
+                                  pd->port_rcvhdrtail_kvaddr) {
+                               /*
+                                * don't report same point multiple times,
+                                * except kernel
+                                */
+                               tl = (u32) *pd->port_rcvhdrtail_kvaddr;
+                               if (tl == dd->ipath_lastrcvhdrqtails[i])
+                                       continue;
+                               hd = ipath_read_ureg32(dd, ur_rcvhdrhead,
+                                                      i);
+                       } else
+                               continue;
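+                       /*
+                        * the queue is full when the tail (written by the
+                        * chip) has caught up to just behind the head:
+                        * either hd == tl + 1, or hd has wrapped to 0
+                        * with the tail on the last entry.
+                        */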
+                       if (hd == (tl + 1) ||
+                           (!hd && tl == dd->ipath_hdrqlast)) {
+                               dd->ipath_lastrcvhdrqtails[i] = tl;
+                               pd->port_hdrqfull++;
+                               if (i == 0)
+                                       chkerrpkts = 1;
+                       }
+               }
+       }
+       if (errs & INFINIPATH_E_RRCVEGRFULL) {
+               /*
+                * since this is of less importance and not likely to
+                * happen without also getting hdrfull, only count
+                * occurrences; don't check each port (or even the kernel
+                * vs user)
+                */
+               ipath_stats.sps_etidfull++;
+               if (dd->ipath_port0head !=
+                   (u32) le64_to_cpu(*dd->ipath_hdrqtailptr))
+                       chkerrpkts = 1;
+       }
+
+       /*
+        * do this before IBSTATUSCHANGED, in case both bits set in a single
+        * interrupt; we want the STATUSCHANGE to "win", so we update our
+        * internal copy of the state machine correctly
+        */
+       if (errs & INFINIPATH_E_RIBLOSTLINK) {
+               /*
+                * force through block below
+                */
+               errs |= INFINIPATH_E_IBSTATUSCHANGED;
+               ipath_stats.sps_iblink++;
+               dd->ipath_flags |= IPATH_LINKDOWN;
+               dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
+                                    | IPATH_LINKARMED | IPATH_LINKACTIVE);
+               *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
+               if (!noprint) {
+                       u64 st = ipath_read_kreg64(
+                               dd, dd->ipath_kregs->kr_ibcstatus);
+
+                       ipath_dbg("Lost link, link now down (%s)\n",
+                                 ipath_ibcstatus_str[st & 0xf]);
+               }
+       }
+       if (errs & INFINIPATH_E_IBSTATUSCHANGED)
+               handle_e_ibstatuschanged(dd, errs, noprint);
+
+       if (errs & INFINIPATH_E_RESET) {
+               if (!noprint)
+                       ipath_dev_err(dd, "Got reset, requires re-init "
+                                     "(unload and reload driver)\n");
+               dd->ipath_flags &= ~IPATH_INITTED;      /* needs re-init */
+               /* mark as having had error */
+               *dd->ipath_statusp |= IPATH_STATUS_HWERROR;
+               *dd->ipath_statusp &= ~IPATH_STATUS_IB_CONF;
+       }
+
+       if (!noprint && *msg)
+               ipath_dev_err(dd, "%s error\n", msg);
+       if (dd->ipath_sma_state_wanted & dd->ipath_flags) {
+               ipath_cdbg(VERBOSE, "sma wanted state %x, iflags now %x, "
+                          "waking\n", dd->ipath_sma_state_wanted,
+                          dd->ipath_flags);
+               wake_up_interruptible(&ipath_sma_state_wait);
+       }
+
+       if (chkerrpkts)
+               /* process possible error packets in hdrq */
+               ipath_kreceive(dd);
+}
+
+/* this is separate to allow for better optimization of ipath_intr() */
+
+static void ipath_bad_intr(struct ipath_devdata *dd, u32 * unexpectp)
+{
+       /*
+        * these sometimes happen during driver init and unload; we don't
+        * want to process any interrupts at that point
+        */
+
+       /* this is just a bandaid, not a fix, if something goes badly
+        * wrong */
+       if (++*unexpectp > 100) {
+               if (++*unexpectp > 105) {
+                       /*
+                        * ok, we must be taking somebody else's interrupts,
+                        * due to a messed up mptable and/or PIRQ table, so
+                        * unregister the interrupt.  We've seen this during
+                        * linuxbios development work, and it may happen in
+                        * the future again.
+                        */
+                       if (dd->pcidev && dd->pcidev->irq) {
+                               ipath_dev_err(dd, "Now %u unexpected "
+                                             "interrupts, unregistering "
+                                             "interrupt handler\n",
+                                             *unexpectp);
+                               ipath_dbg("free_irq of irq %x\n",
+                                         dd->pcidev->irq);
+                               free_irq(dd->pcidev->irq, dd);
+                       }
+               }
+               if (ipath_read_kreg32(dd, dd->ipath_kregs->kr_intmask)) {
+                       ipath_dev_err(dd, "%u unexpected interrupts, "
+                                     "disabling interrupts completely\n",
+                                     *unexpectp);
+                       /*
+                        * disable all interrupts, something is very wrong
+                        */
+                       ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask,
+                                        0ULL);
+               }
+       } else if (*unexpectp > 1)
+               ipath_dbg("Interrupt when not ready, should not happen, "
+                         "ignoring\n");
+}
+
+static void ipath_bad_regread(struct ipath_devdata *dd)
+{
+       static int allbits;
+
+       /* separate routine, for better optimization of ipath_intr() */
+
+       /*
+        * We print the message and disable interrupts, in hope of
+        * having a better chance of debugging the problem.
+        */
+       ipath_dev_err(dd,
+                     "Read of interrupt status failed (all bits set)\n");
+       if (allbits++) {
+               /* disable all interrupts, something is very wrong */
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
+               if (allbits == 2) {
+                       ipath_dev_err(dd, "Still bad interrupt status, "
+                                     "unregistering interrupt\n");
+                       free_irq(dd->pcidev->irq, dd);
+               } else if (allbits > 2) {
+                       if ((allbits % 10000) == 0)
+                               printk(".");
+               } else
+                       ipath_dev_err(dd, "Disabling interrupts, "
+                                     "multiple errors\n");
+       }
+}
+
+static void handle_port_pioavail(struct ipath_devdata *dd)
+{
+       u32 i;
+       /*
+        * start from port 1, since for now port 0 never uses
+        * wait_event for PIO
+        */
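+       /*
+        * ipath_portpiowait has one bit per port; clear each port's bit
+        * and wake its waiter as it is serviced.
+        */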
+       for (i = 1; dd->ipath_portpiowait && i < dd->ipath_cfgports; i++) {
+               struct ipath_portdata *pd = dd->ipath_pd[i];
+
+               if (pd && pd->port_cnt &&
+                   dd->ipath_portpiowait & (1U << i)) {
+                       clear_bit(i, &dd->ipath_portpiowait);
+                       if (test_bit(IPATH_PORT_WAITING_PIO,
+                                    &pd->port_flag)) {
+                               clear_bit(IPATH_PORT_WAITING_PIO,
+                                         &pd->port_flag);
+                               wake_up_interruptible(&pd->port_wait);
+                       }
+               }
+       }
+}
+
+static void handle_layer_pioavail(struct ipath_devdata *dd)
+{
+       int ret;
+
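+       /*
+        * if either the layered network driver or the verbs layer still
+        * wants buffers (a positive return), re-arm the PIO buffer
+        * available interrupt below.
+        */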
+       ret = __ipath_layer_intr(dd, IPATH_LAYER_INT_SEND_CONTINUE);
+       if (ret > 0)
+               goto clear;
+
+       ret = __ipath_verbs_piobufavail(dd);
+       if (ret > 0)
+               goto clear;
+
+       return;
+clear:
+       set_bit(IPATH_S_PIOINTBUFAVAIL, &dd->ipath_sendctrl);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+                        dd->ipath_sendctrl);
+}
+
+static void handle_rcv(struct ipath_devdata *dd, u32 istat)
+{
+       u64 portr;
+       int i;
+       int rcvdint = 0;
+
+       portr = ((istat >> INFINIPATH_I_RCVAVAIL_SHIFT) &
+                infinipath_i_rcvavail_mask)
+               | ((istat >> INFINIPATH_I_RCVURG_SHIFT) &
+                  infinipath_i_rcvurg_mask);
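+       /*
+        * portr now has one bit per port, set if that port's receive
+        * queue went non-empty or received an urgent packet.
+        */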
+       for (i = 0; i < dd->ipath_cfgports; i++) {
+               struct ipath_portdata *pd = dd->ipath_pd[i];
+               if (portr & (1 << i) && pd &&
+                   pd->port_cnt) {
+                       if (i == 0)
+                               ipath_kreceive(dd);
+                       else if (test_bit(IPATH_PORT_WAITING_RCV,
+                                         &pd->port_flag)) {
+                               int rcbit;
+                               clear_bit(IPATH_PORT_WAITING_RCV,
+                                         &pd->port_flag);
+                               rcbit = i + INFINIPATH_R_INTRAVAIL_SHIFT;
+                               clear_bit(rcbit, &dd->ipath_rcvctrl);
+                               wake_up_interruptible(&pd->port_wait);
+                               rcvdint = 1;
+                       }
+               }
+       }
+       if (rcvdint) {
+               /* only want to take one interrupt, so turn off the rcv
+                * interrupt for all the ports that we did the wakeup on
+                * (but never for kernel port)
+                */
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+                                dd->ipath_rcvctrl);
+       }
+}
+
+irqreturn_t ipath_intr(int irq, void *data, struct pt_regs *regs)
+{
+       struct ipath_devdata *dd = data;
+       u32 istat = ipath_read_kreg32(dd, dd->ipath_kregs->kr_intstatus);
+       ipath_err_t estat = 0;
+       static unsigned unexpected = 0;
+       irqreturn_t ret;
+
+       if (unlikely(!istat)) {
+               ipath_stats.sps_nullintr++;
+               ret = IRQ_NONE; /* not our interrupt, or already handled */
+               goto bail;
+       }
+       if (unlikely(istat == -1)) {
+               ipath_bad_regread(dd);
+               /* don't know if it was our interrupt or not */
+               ret = IRQ_NONE;
+               goto bail;
+       }
+
+       ipath_stats.sps_ints++;
+
+       /*
+        * this needs to be flags&initted, not statusp, so we keep
+        * taking interrupts even after link goes down, etc.
+        * Also, we *must* clear the interrupt at some point, or we won't
+        * take it again, which can be real bad for errors, etc...
+        */
+
+       if (!(dd->ipath_flags & IPATH_INITTED)) {
+               ipath_bad_intr(dd, &unexpected);
+               ret = IRQ_NONE;
+               goto bail;
+       }
+       if (unexpected)
+               unexpected = 0;
+
+       ipath_cdbg(VERBOSE, "intr stat=0x%x\n", istat);
+
+       if (istat & ~infinipath_i_bitsextant)
+               ipath_dev_err(dd,
+                             "interrupt with unknown interrupts %x set\n",
+                             istat & (u32) ~infinipath_i_bitsextant);
+
+       if (istat & INFINIPATH_I_ERROR) {
+               ipath_stats.sps_errints++;
+               estat = ipath_read_kreg64(dd,
+                                         dd->ipath_kregs->kr_errorstatus);
+               if (!estat)
+                       dev_info(&dd->pcidev->dev, "error interrupt (%x), "
+                                "but no error bits set!\n", istat);
+               else if (estat == -1LL)
+                       /*
+                        * should we try clearing all, or hope next read
+                        * works?
+                        */
+                       ipath_dev_err(dd, "Read of error status failed "
+                                     "(all bits set); ignoring\n");
+               else
+                       handle_errors(dd, estat);
+       }
+
+       if (istat & INFINIPATH_I_GPIO) {
+               if (unlikely(!(dd->ipath_flags & IPATH_GPIO_INTR))) {
+                       u32 gpiostatus;
+                       gpiostatus = ipath_read_kreg32(
+                               dd, dd->ipath_kregs->kr_gpio_status);
+                       ipath_dbg("Unexpected GPIO interrupt bits %x\n",
+                                 gpiostatus);
+                       ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_clear,
+                                        gpiostatus);
+               } else {
+                       /* Clear GPIO status bit 2 */
+                       ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_clear,
+                                        (u64) (1 << 2));
+
+                       /*
+                        * Packets are available in the port 0 rcv queue.
+                        * Eventually this needs to be generalized to check
+                        * IPATH_GPIO_INTR, and the specific GPIO bit, if
+                        * GPIO interrupts are used for anything else.
+                        */
+                       ipath_kreceive(dd);
+               }
+       }
+
+       /*
+        * clear the ones we will deal with on this round
+        * We clear it early, mostly for receive interrupts, so we
+        * know the chip will have seen this by the time we process
+        * the queue, and will re-interrupt if necessary.  The processor
+        * itself won't take the interrupt again until we return.
+        */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, istat);
+
+       if (istat & INFINIPATH_I_SPIOBUFAVAIL) {
+               clear_bit(IPATH_S_PIOINTBUFAVAIL, &dd->ipath_sendctrl);
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+                                dd->ipath_sendctrl);
+
+               if (dd->ipath_portpiowait)
+                       handle_port_pioavail(dd);
+
+               handle_layer_pioavail(dd);
+       }
+
+       /*
+        * we check for both transition from empty to non-empty, and urgent
+        * packets (those with the interrupt bit set in the header)
+        */
+
+       if (istat & ((infinipath_i_rcvavail_mask <<
+                     INFINIPATH_I_RCVAVAIL_SHIFT)
+                    | (infinipath_i_rcvurg_mask <<
+                       INFINIPATH_I_RCVURG_SHIFT)))
+               handle_rcv(dd, istat);
+
+       ret = IRQ_HANDLED;
+
+bail:
+       return ret;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h
new file mode 100644 (file)
index 0000000..159d0ae
--- /dev/null
@@ -0,0 +1,884 @@
+#ifndef _IPATH_KERNEL_H
+#define _IPATH_KERNEL_H
+/*
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This header file is the base header file for infinipath kernel code;
+ * ipath_user.h serves a similar purpose for user code.
+ */
+
+#include <linux/interrupt.h>
+#include <asm/io.h>
+
+#include "ipath_common.h"
+#include "ipath_debug.h"
+#include "ipath_registers.h"
+
+/* only s/w major version of InfiniPath we can handle */
+#define IPATH_CHIP_VERS_MAJ 2U
+
+/* don't care about this except printing */
+#define IPATH_CHIP_VERS_MIN 0U
+
+/* temporary, maybe always */
+extern struct infinipath_stats ipath_stats;
+
+#define IPATH_CHIP_SWVERSION IPATH_CHIP_VERS_MAJ
+
+struct ipath_portdata {
+       void **port_rcvegrbuf;
+       dma_addr_t *port_rcvegrbuf_phys;
+       /* rcvhdrq base, needs mmap before useful */
+       void *port_rcvhdrq;
+       /* kernel virtual address where hdrqtail is updated */
+       u64 *port_rcvhdrtail_kvaddr;
+       /* page * used for uaddr */
+       struct page *port_rcvhdrtail_pagep;
+       /*
+        * temp buffer for expected send setup, allocated at open, instead
+        * of each setup call
+        */
+       void *port_tid_pg_list;
+       /* when waiting for rcv or pioavail */
+       wait_queue_head_t port_wait;
+       /*
+        * rcvegr bufs base, physical, must fit in 44 bits (so 32 bit
+        * programs' mmap64 of 44 bit addresses works)
+        */
+       dma_addr_t port_rcvegr_phys;
+       /* mmap of hdrq, must fit in 44 bits */
+       dma_addr_t port_rcvhdrq_phys;
+       /*
+        * the actual user address that we ipath_mlock'ed, so we can
+        * ipath_munlock it at close
+        */
+       unsigned long port_rcvhdrtail_uaddr;
+       /*
+        * number of opens on this instance (0 or 1; ignoring forks, dup,
+        * etc. for now)
+        */
+       int port_cnt;
+       /* port number of this port, saved here instead of recalculating it */
+       unsigned port_port;
+       /* chip offset of PIO buffers for this port */
+       u32 port_piobufs;
+       /* how many alloc_pages() chunks in port_rcvegrbuf_pages */
+       u32 port_rcvegrbuf_chunks;
+       /* how many egrbufs per chunk */
+       u32 port_rcvegrbufs_perchunk;
+       /* order for port_rcvegrbuf_pages */
+       size_t port_rcvegrbuf_size;
+       /* rcvhdrq size (for freeing) */
+       size_t port_rcvhdrq_size;
+       /* next expected TID to check when looking for free */
+       u32 port_tidcursor;
+       /* IPATH_PORT_WAITING_RCV, IPATH_PORT_WAITING_PIO, etc. */
+       unsigned long port_flag;
+       /* WAIT_RCV that timed out, no interrupt */
+       u32 port_rcvwait_to;
+       /* WAIT_PIO that timed out, no interrupt */
+       u32 port_piowait_to;
+       /* WAIT_RCV already happened, no wait */
+       u32 port_rcvnowait;
+       /* WAIT_PIO already happened, no wait */
+       u32 port_pionowait;
+       /* total number of rcvhdrqfull errors */
+       u32 port_hdrqfull;
+       /* pid of process using this port */
+       pid_t port_pid;
+       /* same size as task_struct .comm[] */
+       char port_comm[16];
+       /* pkeys set by this use of this port */
+       u16 port_pkeys[4];
+       /* so file ops can get at unit */
+       struct ipath_devdata *port_dd;
+};
+
+struct sk_buff;
+
+/*
+ * control information for layered drivers
+ */
+struct _ipath_layer {
+       void *l_arg;
+};
+
+/* Verbs layer interface */
+struct _verbs_layer {
+       void *l_arg;
+       struct timer_list l_timer;
+};
+
+struct ipath_devdata {
+       struct list_head ipath_list;
+
+       struct ipath_kregs const *ipath_kregs;
+       struct ipath_cregs const *ipath_cregs;
+
+       /* mem-mapped pointer to base of chip regs */
+       u64 __iomem *ipath_kregbase;
+       /* end of mem-mapped chip space; range checking */
+       u64 __iomem *ipath_kregend;
+       /* physical address of chip for io_remap, etc. */
+       unsigned long ipath_physaddr;
+       /* base of memory alloced for ipath_kregbase, for free */
+       u64 *ipath_kregalloc;
+       /*
+        * version of kregbase that doesn't have high bits set (for 32 bit
+        * programs, so mmap64 44 bit works)
+        */
+       u64 __iomem *ipath_kregvirt;
+       /*
+        * virtual address where port0 rcvhdrqtail updated for this unit.
+        * only written to by the chip, not the driver.
+        */
+       volatile __le64 *ipath_hdrqtailptr;
+       dma_addr_t ipath_dma_addr;
+       /* ipath_cfgports pointers */
+       struct ipath_portdata **ipath_pd;
+       /* sk_buffs used by port 0 eager receive queue */
+       struct sk_buff **ipath_port0_skbs;
+       /* kvirt address of 1st 2k pio buffer */
+       void __iomem *ipath_pio2kbase;
+       /* kvirt address of 1st 4k pio buffer */
+       void __iomem *ipath_pio4kbase;
+       /*
+        * points to area where PIOavail registers will be DMA'ed.
+        * Has to be on a page of its own, because the page will be
+        * mapped into user program space.  This copy is *ONLY* ever
+        * written by DMA, not by the driver!  Need a copy per device
+        * when we get to multiple devices
+        */
+       volatile __le64 *ipath_pioavailregs_dma;
+       /* physical address where updates occur */
+       dma_addr_t ipath_pioavailregs_phys;
+       struct _ipath_layer ipath_layer;
+       /* setup intr */
+       int (*ipath_f_intrsetup)(struct ipath_devdata *);
+       /* setup on-chip bus config */
+       int (*ipath_f_bus)(struct ipath_devdata *, struct pci_dev *);
+       /* hard reset chip */
+       int (*ipath_f_reset)(struct ipath_devdata *);
+       int (*ipath_f_get_boardname)(struct ipath_devdata *, char *,
+                                    size_t);
+       void (*ipath_f_init_hwerrors)(struct ipath_devdata *);
+       void (*ipath_f_handle_hwerrors)(struct ipath_devdata *, char *,
+                                       size_t);
+       void (*ipath_f_quiet_serdes)(struct ipath_devdata *);
+       int (*ipath_f_bringup_serdes)(struct ipath_devdata *);
+       int (*ipath_f_early_init)(struct ipath_devdata *);
+       void (*ipath_f_clear_tids)(struct ipath_devdata *, unsigned);
+       void (*ipath_f_put_tid)(struct ipath_devdata *, u64 __iomem*,
+                               u32, unsigned long);
+       void (*ipath_f_tidtemplate)(struct ipath_devdata *);
+       void (*ipath_f_cleanup)(struct ipath_devdata *);
+       void (*ipath_f_setextled)(struct ipath_devdata *, u64, u64);
+       /* fill out chip-specific fields */
+       int (*ipath_f_get_base_info)(struct ipath_portdata *, void *);
+       struct _verbs_layer verbs_layer;
+       /* total dwords sent (summed from counter) */
+       u64 ipath_sword;
+       /* total dwords rcvd (summed from counter) */
+       u64 ipath_rword;
+       /* total packets sent (summed from counter) */
+       u64 ipath_spkts;
+       /* total packets rcvd (summed from counter) */
+       u64 ipath_rpkts;
+       /* ipath_statusp initially points to this. */
+       u64 _ipath_status;
+       /* GUID for this interface, in network order */
+       __be64 ipath_guid;
+       /*
+        * aggregate of error bits reported since last cleared, for
+        * limiting of error reporting
+        */
+       ipath_err_t ipath_lasterror;
+       /*
+        * aggregate of error bits reported since last cleared, for
+        * limiting of hwerror reporting
+        */
+       ipath_err_t ipath_lasthwerror;
+       /*
+        * errors masked because they occur too fast, also includes errors
+        * that are always ignored (ipath_ignorederrs)
+        */
+       ipath_err_t ipath_maskederrs;
+       /* time in jiffies at which to re-enable maskederrs */
+       unsigned long ipath_unmasktime;
+       /*
+        * errors always ignored (masked), at least for a given
+        * chip/device, because they are wrong or not useful
+        */
+       ipath_err_t ipath_ignorederrs;
+       /* count of egrfull errors, combined for all ports */
+       u64 ipath_last_tidfull;
+       /* for ipath_qcheck() */
+       u64 ipath_lastport0rcv_cnt;
+       /* template for writing TIDs  */
+       u64 ipath_tidtemplate;
+       /* value to write to free TIDs */
+       u64 ipath_tidinvalid;
+       /* PE-800 rcv interrupt setup */
+       u64 ipath_rhdrhead_intr_off;
+
+       /* size of memory at ipath_kregbase */
+       u32 ipath_kregsize;
+       /* number of registers used for pioavail */
+       u32 ipath_pioavregs;
+       /* IPATH_POLL, etc. */
+       u32 ipath_flags;
+       /* ipath_flags sma is waiting for */
+       u32 ipath_sma_state_wanted;
+       /* last buffer for user use, first buf for kernel use is this
+        * index. */
+       u32 ipath_lastport_piobuf;
+       /* is a stats timer active */
+       u32 ipath_stats_timer_active;
+       /* dwords sent read from counter */
+       u32 ipath_lastsword;
+       /* dwords received read from counter */
+       u32 ipath_lastrword;
+       /* sent packets read from counter */
+       u32 ipath_lastspkts;
+       /* received packets read from counter */
+       u32 ipath_lastrpkts;
+       /* pio bufs allocated per port */
+       u32 ipath_pbufsport;
+       /*
+        * number of ports configured as max; zero is set to number chip
+        * supports, less gives more pio bufs/port, etc.
+        */
+       u32 ipath_cfgports;
+       /* port0 rcvhdrq head offset */
+       u32 ipath_port0head;
+       /* count of port 0 hdrqfull errors */
+       u32 ipath_p0_hdrqfull;
+
+       /*
+        * (*cfgports) used to suppress multiple instances of same
+        * port staying stuck at same point
+        */
+       u32 *ipath_lastrcvhdrqtails;
+       /*
+        * (*cfgports) used to suppress multiple instances of same
+        * port staying stuck at same point
+        */
+       u32 *ipath_lastegrheads;
+       /*
+        * index of last piobuffer we used.  Speeds up searching, by
+        * starting at this point.  It doesn't matter if multiple CPUs use
+        * and update it; the last update is the only write that matters.
+        * Whenever it wraps, we update shadow copies.  Need a copy per
+        * device when we
+        * get to multiple devices
+        */
+       u32 ipath_lastpioindex;
+       /* max length of freezemsg */
+       u32 ipath_freezelen;
+       /*
+        * consecutive times we wanted a PIO buffer but were unable to
+        * get one
+        */
+       u32 ipath_consec_nopiobuf;
+       /*
+        * hint that we should update ipath_pioavailshadow before
+        * looking for a PIO buffer
+        */
+       u32 ipath_upd_pio_shadow;
+       /* so we can rewrite it after a chip reset */
+       u32 ipath_pcibar0;
+       /* so we can rewrite it after a chip reset */
+       u32 ipath_pcibar1;
+       /* sequential tries for SMA send and no bufs */
+       u32 ipath_nosma_bufs;
+       /* duration (seconds) ipath_nosma_bufs set */
+       u32 ipath_nosma_secs;
+
+       /* HT/PCI Vendor ID (here for NodeInfo) */
+       u16 ipath_vendorid;
+       /* HT/PCI Device ID (here for NodeInfo) */
+       u16 ipath_deviceid;
+       /* offset in HT config space of slave/primary interface block */
+       u8 ipath_ht_slave_off;
+       /* for write combining settings */
+       unsigned long ipath_wc_cookie;
+       /* ref count for each pkey */
+       atomic_t ipath_pkeyrefs[4];
+       /* shadow copy of all exptids physaddr; used only by funcsim */
+       u64 *ipath_tidsimshadow;
+       /* shadow copy of struct page *'s for exp tid pages */
+       struct page **ipath_pageshadow;
+       /* lock to workaround chip bug 9437 */
+       spinlock_t ipath_tid_lock;
+
+       /*
+        * IPATH_STATUS_*,
+        * this address is mapped readonly into user processes so they can
+        * get status cheaply, whenever they want.
+        */
+       u64 *ipath_statusp;
+       /* freeze msg if hw error put chip in freeze */
+       char *ipath_freezemsg;
+       /* pci access data structure */
+       struct pci_dev *pcidev;
+       struct cdev *cdev;
+       struct class_device *class_dev;
+       /* timer used to prevent stats overflow, error throttling, etc. */
+       struct timer_list ipath_stats_timer;
+       /* check for stale messages in rcv queue */
+       /* only allow one intr at a time. */
+       unsigned long ipath_rcv_pending;
+
+       /*
+        * Shadow copies of registers; size indicates read access size.
+        * Most of them are readonly, but some are write-only registers,
+        * where we manipulate the bits in the shadow copy, and then write
+        * the shadow copy to infinipath.
+        *
+        * We deliberately make most of these 32 bits, since they have
+        * restricted range.  For any that we read, we want to generate 32
+        * bit accesses, since Opteron will generate 2 separate 32 bit HT
+        * transactions for a 64 bit read, and we want to avoid unnecessary
+        * HT transactions.
+        */
+
+       /* This is the 64 bit group */
+
+       /*
+        * shadow of pioavail, check to be sure it's large enough at
+        * init time.
+        */
+       unsigned long ipath_pioavailshadow[8];
+       /* shadow of kr_gpio_out, for rmw ops */
+       u64 ipath_gpio_out;
+       /* kr_revision shadow */
+       u64 ipath_revision;
+       /*
+        * shadow of ibcctrl, for interrupt handling of link changes,
+        * etc.
+        */
+       u64 ipath_ibcctrl;
+       /*
+        * last ibcstatus, to suppress "duplicate" status change messages,
+        * mostly from 2 to 3
+        */
+       u64 ipath_lastibcstat;
+       /* hwerrmask shadow */
+       ipath_err_t ipath_hwerrmask;
+       /* interrupt config reg shadow */
+       u64 ipath_intconfig;
+       /* kr_sendpiobufbase value */
+       u64 ipath_piobufbase;
+
+       /* these are the "32 bit" regs */
+
+       /*
+        * number of GUIDs in the flash for this interface; may need some
+        * rethinking for setting on other ifaces
+        */
+       u32 ipath_nguid;
+       /*
+        * the following two are 32-bit bitmasks, but {test,clear,set}_bit
+        * all expect bit fields to be "unsigned long"
+        */
+       /* shadow kr_rcvctrl */
+       unsigned long ipath_rcvctrl;
+       /* shadow kr_sendctrl */
+       unsigned long ipath_sendctrl;
+
+       /* value we put in kr_rcvhdrcnt */
+       u32 ipath_rcvhdrcnt;
+       /* value we put in kr_rcvhdrsize */
+       u32 ipath_rcvhdrsize;
+       /* value we put in kr_rcvhdrentsize */
+       u32 ipath_rcvhdrentsize;
+       /* offset of last entry in rcvhdrq */
+       u32 ipath_hdrqlast;
+       /* kr_portcnt value */
+       u32 ipath_portcnt;
+       /* kr_pagealign value */
+       u32 ipath_palign;
+       /* number of "2KB" PIO buffers */
+       u32 ipath_piobcnt2k;
+       /* size in bytes of "2KB" PIO buffers */
+       u32 ipath_piosize2k;
+       /* number of "4KB" PIO buffers */
+       u32 ipath_piobcnt4k;
+       /* size in bytes of "4KB" PIO buffers */
+       u32 ipath_piosize4k;
+       /* kr_rcvegrbase value */
+       u32 ipath_rcvegrbase;
+       /* kr_rcvegrcnt value */
+       u32 ipath_rcvegrcnt;
+       /* kr_rcvtidbase value */
+       u32 ipath_rcvtidbase;
+       /* kr_rcvtidcnt value */
+       u32 ipath_rcvtidcnt;
+       /* kr_sendregbase */
+       u32 ipath_sregbase;
+       /* kr_userregbase */
+       u32 ipath_uregbase;
+       /* kr_counterregbase */
+       u32 ipath_cregbase;
+       /* shadow the control register contents */
+       u32 ipath_control;
+       /* shadow the gpio output contents */
+       u32 ipath_extctrl;
+       /* PCI revision register (HTC rev on FPGA) */
+       u32 ipath_pcirev;
+
+       /* chip address space used by 4k pio buffers */
+       u32 ipath_4kalign;
+       /* The MTU programmed for this unit */
+       u32 ipath_ibmtu;
+       /*
+        * The max size IB packet, including IB headers, that we can send.
+        * Starts same as ipath_piosize, but is affected when ibmtu is
+        * changed, or by the size of the eager buffers.
+        */
+       u32 ipath_ibmaxlen;
+       /*
+        * ibmaxlen at init time, limited by chip and by receive buffer
+        * size.  Not changed after init.
+        */
+       u32 ipath_init_ibmaxlen;
+       /* size of each rcvegrbuffer */
+       u32 ipath_rcvegrbufsize;
+       /* width (2,4,8,16,32) from HT config reg */
+       u32 ipath_htwidth;
+       /* HT speed (200,400,800,1000) from HT config */
+       u32 ipath_htspeed;
+       /* ports waiting for PIOavail intr */
+       unsigned long ipath_portpiowait;
+       /*
+        * number of sequential ibcstatus change for polling active/quiet
+        * (i.e., link not coming up).
+        */
+       u32 ipath_ibpollcnt;
+       /* low and high portions of MSI capability/vector */
+       u32 ipath_msi_lo;
+       /* saved after PCIe init for restore after reset */
+       u32 ipath_msi_hi;
+       /* MSI data (vector) saved for restore */
+       u16 ipath_msi_data;
+       /* MLID programmed for this instance */
+       u16 ipath_mlid;
+       /* LID programmed for this instance */
+       u16 ipath_lid;
+       /* list of pkeys programmed; 0 if not set */
+       u16 ipath_pkeys[4];
+       /* ASCII serial number, from flash */
+       u8 ipath_serial[12];
+       /* human readable board version */
+       u8 ipath_boardversion[80];
+       /* chip major rev, from ipath_revision */
+       u8 ipath_majrev;
+       /* chip minor rev, from ipath_revision */
+       u8 ipath_minrev;
+       /* board rev, from ipath_revision */
+       u8 ipath_boardrev;
+       /* unit # of this chip, if present */
+       int ipath_unit;
+       /* saved for restore after reset */
+       u8 ipath_pci_cacheline;
+       /* LID mask control */
+       u8 ipath_lmc;
+};
+
+extern volatile __le64 *ipath_port0_rcvhdrtail;
+extern dma_addr_t ipath_port0_rcvhdrtail_dma;
+
+#define IPATH_PORT0_RCVHDRTAIL_SIZE PAGE_SIZE
+
+extern struct list_head ipath_dev_list;
+extern spinlock_t ipath_devs_lock;
+extern struct ipath_devdata *ipath_lookup(int unit);
+
+extern u16 ipath_layer_rcv_opcode;
+extern int ipath_verbs_registered;
+extern int __ipath_layer_intr(struct ipath_devdata *, u32);
+extern int ipath_layer_intr(struct ipath_devdata *, u32);
+extern int __ipath_layer_rcv(struct ipath_devdata *, void *,
+                            struct sk_buff *);
+extern int __ipath_layer_rcv_lid(struct ipath_devdata *, void *);
+extern int __ipath_verbs_piobufavail(struct ipath_devdata *);
+extern int __ipath_verbs_rcv(struct ipath_devdata *, void *, void *, u32);
+
+void ipath_layer_add(struct ipath_devdata *);
+void ipath_layer_del(struct ipath_devdata *);
+
+int ipath_init_chip(struct ipath_devdata *, int);
+int ipath_enable_wc(struct ipath_devdata *dd);
+void ipath_disable_wc(struct ipath_devdata *dd);
+int ipath_count_units(int *npresentp, int *nupp, u32 *maxportsp);
+void ipath_shutdown_device(struct ipath_devdata *);
+
+struct file_operations;
+int ipath_cdev_init(int minor, char *name, struct file_operations *fops,
+                   struct cdev **cdevp, struct class_device **class_devp);
+void ipath_cdev_cleanup(struct cdev **cdevp,
+                       struct class_device **class_devp);
+
+int ipath_diag_init(void);
+void ipath_diag_cleanup(void);
+void ipath_diag_bringup_link(struct ipath_devdata *);
+
+extern wait_queue_head_t ipath_sma_state_wait;
+
+int ipath_user_add(struct ipath_devdata *dd);
+void ipath_user_del(struct ipath_devdata *dd);
+
+struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd, gfp_t);
+
+extern int ipath_diag_inuse;
+
+irqreturn_t ipath_intr(int irq, void *devid, struct pt_regs *regs);
+void ipath_decode_err(char *buf, size_t blen, ipath_err_t err);
+#if __IPATH_INFO || __IPATH_DBG
+extern const char *ipath_ibcstatus_str[];
+#endif
+
+/* clean up any per-chip chip-specific stuff */
+void ipath_chip_cleanup(struct ipath_devdata *);
+/* clean up any chip type-specific stuff */
+void ipath_chip_done(void);
+
+/* check to see if we have to force ordering for write combining */
+int ipath_unordered_wc(void);
+
+void ipath_disarm_piobufs(struct ipath_devdata *, unsigned first,
+                         unsigned cnt);
+
+int ipath_create_rcvhdrq(struct ipath_devdata *, struct ipath_portdata *);
+void ipath_free_pddata(struct ipath_devdata *, u32, int);
+
+int ipath_parse_ushort(const char *str, unsigned short *valp);
+
+int ipath_wait_linkstate(struct ipath_devdata *, u32, int);
+void ipath_set_ib_lstate(struct ipath_devdata *, int);
+void ipath_kreceive(struct ipath_devdata *);
+int ipath_setrcvhdrsize(struct ipath_devdata *, unsigned);
+int ipath_reset_device(int);
+void ipath_get_faststats(unsigned long);
+
+/* for use in system calls, where we want to know device type, etc. */
+#define port_fp(fp) ((struct ipath_portdata *) (fp)->private_data)
+
+/*
+ * values for ipath_flags
+ */
+/* The chip is up and initted */
+#define IPATH_INITTED       0x2
+               /* set if any user code has set kr_rcvhdrsize */
+#define IPATH_RCVHDRSZ_SET  0x4
+               /* The chip is present and valid for accesses */
+#define IPATH_PRESENT       0x8
+               /* HT link0 is only 8 bits wide, ignore upper byte crc
+                * errors, etc. */
+#define IPATH_8BIT_IN_HT0   0x10
+               /* HT link1 is only 8 bits wide, ignore upper byte crc
+                * errors, etc. */
+#define IPATH_8BIT_IN_HT1   0x20
+               /* The link is down */
+#define IPATH_LINKDOWN      0x40
+               /* The link level is up (0x11) */
+#define IPATH_LINKINIT      0x80
+               /* The link is in the armed (0x21) state */
+#define IPATH_LINKARMED     0x100
+               /* The link is in the active (0x31) state */
+#define IPATH_LINKACTIVE    0x200
+               /* link current state is unknown */
+#define IPATH_LINKUNK       0x400
+               /* no IB cable, or no device on IB cable */
+#define IPATH_NOCABLE       0x4000
+               /* Supports port zero per packet receive interrupts via
+                * GPIO */
+#define IPATH_GPIO_INTR     0x8000
+               /* uses the coded 4byte TID, not 8 byte */
+#define IPATH_4BYTE_TID     0x10000
+               /* packet/word counters are 32 bit, else those 4 counters
+                * are 64bit */
+#define IPATH_32BITCOUNTERS 0x20000
+               /* can miss port0 rx interrupts */
+#define IPATH_POLL_RX_INTR  0x40000
+#define IPATH_DISABLED      0x80000 /* administratively disabled */
+
+/* portdata flag bit offsets */
+               /* waiting for a packet to arrive */
+#define IPATH_PORT_WAITING_RCV   2
+               /* waiting for a PIO buffer to be available */
+#define IPATH_PORT_WAITING_PIO   3
+
+/* free up any allocated data at closes */
+void ipath_free_data(struct ipath_portdata *dd);
+int ipath_waitfor_mdio_cmdready(struct ipath_devdata *);
+int ipath_waitfor_complete(struct ipath_devdata *, ipath_kreg, u64, u64 *);
+u32 __iomem *ipath_getpiobuf(struct ipath_devdata *, u32 *);
+/* init PE-800-specific func */
+void ipath_init_pe800_funcs(struct ipath_devdata *);
+/* init HT-400-specific func */
+void ipath_init_ht400_funcs(struct ipath_devdata *);
+void ipath_get_guid(struct ipath_devdata *);
+u64 ipath_snap_cntr(struct ipath_devdata *, ipath_creg);
+
+/*
+ * number of words used for the protocol header if not set by ipath_userinit()
+ */
+#define IPATH_DFLT_RCVHDRSIZE 9
+
+#define IPATH_MDIO_CMD_WRITE   1
+#define IPATH_MDIO_CMD_READ    2
+#define IPATH_MDIO_CLD_DIV     25      /* to get 2.5 MHz MDIO clock */
+#define IPATH_MDIO_CMDVALID    0x40000000      /* bit 30 */
+#define IPATH_MDIO_DATAVALID   0x80000000      /* bit 31 */
+#define IPATH_MDIO_CTRL_STD    0x0
+
+static inline u64 ipath_mdio_req(int cmd, int dev, int reg, int data)
+{
+       return (((u64) IPATH_MDIO_CLD_DIV) << 32) |
+               (cmd << 26) |
+               (dev << 21) |
+               (reg << 16) |
+               (data & 0xFFFF);
+}
+
+               /* signal and fifo status, in bank 31 */
+#define IPATH_MDIO_CTRL_XGXS_REG_8  0x8
+               /* controls loopback, redundancy */
+#define IPATH_MDIO_CTRL_8355_REG_1  0x10
+               /* premph, encdec, etc. */
+#define IPATH_MDIO_CTRL_8355_REG_2  0x11
+               /* Kchars, etc. */
+#define IPATH_MDIO_CTRL_8355_REG_6  0x15
+#define IPATH_MDIO_CTRL_8355_REG_9  0x18
+#define IPATH_MDIO_CTRL_8355_REG_10 0x1D
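
The MDIO request packing above is easy to sanity-check in isolation.  The
stand-alone sketch below is not part of this patch; the constants mirror the
defines above and the example arguments (device 31, register 0x8) are taken
from the bank/register comments, purely for illustration.

	#include <stdint.h>
	#include <stdio.h>

	#define EX_MDIO_CLD_DIV		25	/* divider for a 2.5 MHz MDIO clock */
	#define EX_MDIO_CMD_READ	2

	/* same packing as ipath_mdio_req(): divider in bits 63:32, then cmd,
	 * device address, register address, and 16 bits of data */
	static uint64_t ex_mdio_req(int cmd, int dev, int reg, int data)
	{
		return ((uint64_t) EX_MDIO_CLD_DIV << 32) |
			((uint64_t) cmd << 26) |
			((uint64_t) dev << 21) |
			((uint64_t) reg << 16) |
			((uint64_t) data & 0xFFFF);
	}

	int main(void)
	{
		/* read the XGXS signal/fifo status register (0x8) in bank 31 */
		uint64_t req = ex_mdio_req(EX_MDIO_CMD_READ, 31, 0x8, 0);

		printf("mdio request word: 0x%016llx\n", (unsigned long long) req);
		return 0;
	}
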
+
+int ipath_get_user_pages(unsigned long, size_t, struct page **);
+int ipath_get_user_pages_nocopy(unsigned long, struct page **);
+void ipath_release_user_pages(struct page **, size_t);
+void ipath_release_user_pages_on_close(struct page **, size_t);
+int ipath_eeprom_read(struct ipath_devdata *, u8, void *, int);
+int ipath_eeprom_write(struct ipath_devdata *, u8, const void *, int);
+
+/* these are used for the registers that vary with port */
+void ipath_write_kreg_port(const struct ipath_devdata *, ipath_kreg,
+                          unsigned, u64);
+u64 ipath_read_kreg64_port(const struct ipath_devdata *, ipath_kreg,
+                          unsigned);
+
+/*
+ * We could have a single register get/put routine, that takes a group type,
+ * but this is somewhat clearer and cleaner.  It also gives us some error
+ * checking.  64 bit register reads should always work, but are inefficient
+ * on opteron (the northbridge always generates 2 separate HT 32 bit reads),
+ * so we use kreg32 wherever possible.  User register and counter register
+ * reads are always 32 bit reads, so only one form of those routines.
+ */
+
+/*
+ * At the moment, none of the s-registers are writable, so no
+ * ipath_write_sreg(), and none of the c-registers are writable, so no
+ * ipath_write_creg().
+ */
+
+/**
+ * ipath_read_ureg32 - read 32-bit virtualized per-port register
+ * @dd: device
+ * @regno: register number
+ * @port: port number
+ *
+ * Return the contents of a register that is virtualized to be per port.
+ * Returns 0 if the chip registers are not yet mapped (not distinguishable
+ * from valid contents at runtime; we may add a separate error variable at
+ * some point).
+ *
+ * This is normally not used by the kernel, but may be for debugging, and
+ * has a different implementation than user mode, which is why it's not in
+ * _common.h.
+ */
+static inline u32 ipath_read_ureg32(const struct ipath_devdata *dd,
+                                   ipath_ureg regno, int port)
+{
+       if (!dd->ipath_kregbase)
+               return 0;
+
+       return readl(regno + (u64 __iomem *)
+                    (dd->ipath_uregbase +
+                     (char __iomem *)dd->ipath_kregbase +
+                     dd->ipath_palign * port));
+}
+
+/**
+ * ipath_write_ureg - write 32-bit virtualized per-port register
+ * @dd: device
+ * @regno: register number
+ * @value: value
+ * @port: port
+ *
+ * Write the contents of a register that is virtualized to be per port.
+ */
+static inline void ipath_write_ureg(const struct ipath_devdata *dd,
+                                   ipath_ureg regno, u64 value, int port)
+{
+       u64 __iomem *ubase = (u64 __iomem *)
+               (dd->ipath_uregbase + (char __iomem *) dd->ipath_kregbase +
+                dd->ipath_palign * port);
+       if (dd->ipath_kregbase)
+               writeq(value, &ubase[regno]);
+}
+
+static inline u32 ipath_read_kreg32(const struct ipath_devdata *dd,
+                                   ipath_kreg regno)
+{
+       if (!dd->ipath_kregbase)
+               return -1;
+       return readl((u32 __iomem *) &dd->ipath_kregbase[regno]);
+}
+
+static inline u64 ipath_read_kreg64(const struct ipath_devdata *dd,
+                                   ipath_kreg regno)
+{
+       if (!dd->ipath_kregbase)
+               return -1;
+
+       return readq(&dd->ipath_kregbase[regno]);
+}
+
+static inline void ipath_write_kreg(const struct ipath_devdata *dd,
+                                   ipath_kreg regno, u64 value)
+{
+       if (dd->ipath_kregbase)
+               writeq(value, &dd->ipath_kregbase[regno]);
+}
+
+static inline u64 ipath_read_creg(const struct ipath_devdata *dd,
+                                 ipath_sreg regno)
+{
+       if (!dd->ipath_kregbase)
+               return 0;
+
+       return readq(regno + (u64 __iomem *)
+                    (dd->ipath_cregbase +
+                     (char __iomem *)dd->ipath_kregbase));
+}
+
+static inline u32 ipath_read_creg32(const struct ipath_devdata *dd,
+                                        ipath_sreg regno)
+{
+       if (!dd->ipath_kregbase)
+               return 0;
+       return readl(regno + (u64 __iomem *)
+                    (dd->ipath_cregbase +
+                     (char __iomem *)dd->ipath_kregbase));
+}
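
The write-only registers mentioned in the ipath_devdata comments are driven
entirely from their shadow copies.  A minimal sketch of that read-modify-write
idiom follows; the helper name is hypothetical and simply mirrors what
ipath_layer_set_mtu() does later in this patch with kr_ibcctrl.

	static inline void ex_ibcctrl_set_bits(struct ipath_devdata *dd, u64 bits)
	{
		/* manipulate the bits in the shadow copy ... */
		dd->ipath_ibcctrl |= bits;
		/* ... then write the whole shadow word to the chip */
		ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
				 dd->ipath_ibcctrl);
	}
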
+
+/*
+ * sysfs interface.
+ */
+
+struct device_driver;
+
+extern const char ipath_core_version[];
+
+int ipath_driver_create_group(struct device_driver *);
+void ipath_driver_remove_group(struct device_driver *);
+
+int ipath_device_create_group(struct device *, struct ipath_devdata *);
+void ipath_device_remove_group(struct device *, struct ipath_devdata *);
+int ipath_expose_reset(struct device *);
+
+int ipath_init_ipathfs(void);
+void ipath_exit_ipathfs(void);
+int ipathfs_add_device(struct ipath_devdata *);
+int ipathfs_remove_device(struct ipath_devdata *);
+
+/*
+ * Flush write combining store buffers (if present) and perform a write
+ * barrier.
+ */
+#if defined(CONFIG_X86_64)
+#define ipath_flush_wc() asm volatile("sfence" ::: "memory")
+#else
+#define ipath_flush_wc() wmb()
+#endif
+
+extern unsigned ipath_debug; /* debugging bit mask */
+
+const char *ipath_get_unit_name(int unit);
+
+extern struct mutex ipath_mutex;
+
+#define IPATH_DRV_NAME         "ipath_core"
+#define IPATH_MAJOR            233
+#define IPATH_SMA_MINOR                128
+#define IPATH_DIAG_MINOR       129
+#define IPATH_NMINORS          130
+
+#define ipath_dev_err(dd,fmt,...) \
+       do { \
+               const struct ipath_devdata *__dd = (dd); \
+               if (__dd->pcidev) \
+                       dev_err(&__dd->pcidev->dev, "%s: " fmt, \
+                               ipath_get_unit_name(__dd->ipath_unit), \
+                               ##__VA_ARGS__); \
+               else \
+                       printk(KERN_ERR IPATH_DRV_NAME ": %s: " fmt, \
+                              ipath_get_unit_name(__dd->ipath_unit), \
+                              ##__VA_ARGS__); \
+       } while (0)
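
A hypothetical call site for this macro (the function, the message text, and
the variable names are invented for illustration only) would look like:

	static void ex_report_rcvhdrq_failure(struct ipath_devdata *dd,
					      unsigned port)
	{
		/* goes to dev_err() when the PCI device is known, else printk */
		ipath_dev_err(dd, "failed to allocate rcvhdrq for port %u\n",
			      port);
	}
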
+
+#if _IPATH_DEBUGGING
+
+# define __IPATH_DBG_WHICH(which,fmt,...) \
+       do { \
+               if(unlikely(ipath_debug&(which))) \
+                       printk(KERN_DEBUG IPATH_DRV_NAME ": %s: " fmt, \
+                              __func__,##__VA_ARGS__); \
+       } while(0)
+
+# define ipath_dbg(fmt,...) \
+       __IPATH_DBG_WHICH(__IPATH_DBG,fmt,##__VA_ARGS__)
+# define ipath_cdbg(which,fmt,...) \
+       __IPATH_DBG_WHICH(__IPATH_##which##DBG,fmt,##__VA_ARGS__)
+
+#else /* ! _IPATH_DEBUGGING */
+
+# define ipath_dbg(fmt,...)
+# define ipath_cdbg(which,fmt,...)
+
+#endif /* _IPATH_DEBUGGING */
+
+#endif                         /* _IPATH_KERNEL_H */
diff --git a/drivers/infiniband/hw/ipath/ipath_keys.c b/drivers/infiniband/hw/ipath/ipath_keys.c
new file mode 100644 (file)
index 0000000..aa33b0e
--- /dev/null
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <asm/io.h>
+
+#include "ipath_verbs.h"
+
+/**
+ * ipath_alloc_lkey - allocate an lkey
+ * @rkt: lkey table in which to allocate the lkey
+ * @mr: memory region that this lkey protects
+ *
+ * Returns 1 if successful, otherwise returns 0.
+ */
+
+int ipath_alloc_lkey(struct ipath_lkey_table *rkt, struct ipath_mregion *mr)
+{
+       unsigned long flags;
+       u32 r;
+       u32 n;
+       int ret;
+
+       spin_lock_irqsave(&rkt->lock, flags);
+
+       /* Find the next available LKEY */
+       r = n = rkt->next;
+       for (;;) {
+               if (rkt->table[r] == NULL)
+                       break;
+               r = (r + 1) & (rkt->max - 1);
+               if (r == n) {
+                       spin_unlock_irqrestore(&rkt->lock, flags);
+                       _VERBS_INFO("LKEY table full\n");
+                       ret = 0;
+                       goto bail;
+               }
+       }
+       rkt->next = (r + 1) & (rkt->max - 1);
+       /*
+        * Make sure lkey is never zero, which is reserved to indicate an
+        * unrestricted LKEY.
+        */
+       rkt->gen++;
+       mr->lkey = (r << (32 - ib_ipath_lkey_table_size)) |
+               ((((1 << (24 - ib_ipath_lkey_table_size)) - 1) & rkt->gen)
+                << 8);
+       if (mr->lkey == 0) {
+               mr->lkey |= 1 << 8;
+               rkt->gen++;
+       }
+       rkt->table[r] = mr;
+       spin_unlock_irqrestore(&rkt->lock, flags);
+
+       ret = 1;
+
+bail:
+       return ret;
+}
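
The LKEY layout built here puts the table index in the top
ib_ipath_lkey_table_size bits and a generation count above the always-clear
low byte.  The stand-alone sketch below checks that layout; the 10-bit table
size is only an assumed example value, not the driver's configuration.

	#include <stdint.h>
	#include <stdio.h>

	#define EX_LKEY_TABLE_SIZE 10	/* assumed ib_ipath_lkey_table_size */

	static uint32_t ex_make_lkey(uint32_t index, uint32_t gen)
	{
		return (index << (32 - EX_LKEY_TABLE_SIZE)) |
		       ((((1u << (24 - EX_LKEY_TABLE_SIZE)) - 1) & gen) << 8);
	}

	int main(void)
	{
		uint32_t lkey = ex_make_lkey(5, 3);

		/* recover the table index the same way ipath_free_lkey() does */
		printf("lkey 0x%08x -> index %u\n",
		       lkey, lkey >> (32 - EX_LKEY_TABLE_SIZE));
		return 0;
	}
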
+
+/**
+ * ipath_free_lkey - free an lkey
+ * @rkt: table from which to free the lkey
+ * @lkey: lkey id to free
+ */
+void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey)
+{
+       unsigned long flags;
+       u32 r;
+
+       if (lkey == 0)
+               return;
+       r = lkey >> (32 - ib_ipath_lkey_table_size);
+       spin_lock_irqsave(&rkt->lock, flags);
+       rkt->table[r] = NULL;
+       spin_unlock_irqrestore(&rkt->lock, flags);
+}
+
+/**
+ * ipath_lkey_ok - check IB SGE for validity and initialize
+ * @rkt: table containing lkey to check SGE against
+ * @isge: outgoing internal SGE
+ * @sge: SGE to check
+ * @acc: access flags
+ *
+ * Return 1 if valid and successful, otherwise returns 0.
+ *
+ * Check the IB SGE for validity and initialize our internal version
+ * of it.
+ */
+int ipath_lkey_ok(struct ipath_lkey_table *rkt, struct ipath_sge *isge,
+                 struct ib_sge *sge, int acc)
+{
+       struct ipath_mregion *mr;
+       size_t off;
+       int ret;
+
+       /*
+        * We use LKEY == zero to mean a physical kmalloc() address.
+        * This is a bit of a hack since we rely on dma_map_single()
+        * being reversible by calling bus_to_virt().
+        */
+       if (sge->lkey == 0) {
+               isge->mr = NULL;
+               isge->vaddr = bus_to_virt(sge->addr);
+               isge->length = sge->length;
+               isge->sge_length = sge->length;
+               ret = 1;
+               goto bail;
+       }
+       spin_lock(&rkt->lock);
+       mr = rkt->table[(sge->lkey >> (32 - ib_ipath_lkey_table_size))];
+       spin_unlock(&rkt->lock);
+       if (unlikely(mr == NULL || mr->lkey != sge->lkey)) {
+               ret = 0;
+               goto bail;
+       }
+
+       off = sge->addr - mr->user_base;
+       if (unlikely(sge->addr < mr->user_base ||
+                    off + sge->length > mr->length ||
+                    (mr->access_flags & acc) != acc)) {
+               ret = 0;
+               goto bail;
+       }
+
+       off += mr->offset;
+       isge->mr = mr;
+       isge->m = 0;
+       isge->n = 0;
+       while (off >= mr->map[isge->m]->segs[isge->n].length) {
+               off -= mr->map[isge->m]->segs[isge->n].length;
+               isge->n++;
+               if (isge->n >= IPATH_SEGSZ) {
+                       isge->m++;
+                       isge->n = 0;
+               }
+       }
+       isge->vaddr = mr->map[isge->m]->segs[isge->n].vaddr + off;
+       isge->length = mr->map[isge->m]->segs[isge->n].length - off;
+       isge->sge_length = sge->length;
+
+       ret = 1;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_rkey_ok - check the IB virtual address, length, and RKEY
+ * @dev: infiniband device
+ * @ss: SGE state
+ * @len: length of data
+ * @vaddr: virtual address to place data
+ * @rkey: rkey to check
+ * @acc: access flags
+ *
+ * Return 1 if successful, otherwise 0.
+ *
+ * The QP r_rq.lock should be held.
+ */
+int ipath_rkey_ok(struct ipath_ibdev *dev, struct ipath_sge_state *ss,
+                 u32 len, u64 vaddr, u32 rkey, int acc)
+{
+       struct ipath_lkey_table *rkt = &dev->lk_table;
+       struct ipath_sge *sge = &ss->sge;
+       struct ipath_mregion *mr;
+       size_t off;
+       int ret;
+
+       spin_lock(&rkt->lock);
+       mr = rkt->table[(rkey >> (32 - ib_ipath_lkey_table_size))];
+       spin_unlock(&rkt->lock);
+       if (unlikely(mr == NULL || mr->lkey != rkey)) {
+               ret = 0;
+               goto bail;
+       }
+
+       off = vaddr - mr->iova;
+       if (unlikely(vaddr < mr->iova || off + len > mr->length ||
+                    (mr->access_flags & acc) == 0)) {
+               ret = 0;
+               goto bail;
+       }
+
+       off += mr->offset;
+       sge->mr = mr;
+       sge->m = 0;
+       sge->n = 0;
+       while (off >= mr->map[sge->m]->segs[sge->n].length) {
+               off -= mr->map[sge->m]->segs[sge->n].length;
+               sge->n++;
+               if (sge->n >= IPATH_SEGSZ) {
+                       sge->m++;
+                       sge->n = 0;
+               }
+       }
+       sge->vaddr = mr->map[sge->m]->segs[sge->n].vaddr + off;
+       sge->length = mr->map[sge->m]->segs[sge->n].length - off;
+       sge->sge_length = len;
+       ss->sg_list = NULL;
+       ss->num_sge = 1;
+
+       ret = 1;
+
+bail:
+       return ret;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_layer.c b/drivers/infiniband/hw/ipath/ipath_layer.c
new file mode 100644 (file)
index 0000000..2cabf63
--- /dev/null
@@ -0,0 +1,1515 @@
+/*
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * These are the routines used by layered drivers, currently just the
+ * layered ethernet driver and verbs layer.
+ */
+
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <asm/byteorder.h>
+
+#include "ipath_kernel.h"
+#include "ips_common.h"
+#include "ipath_layer.h"
+
+/* Acquire before ipath_devs_lock. */
+static DEFINE_MUTEX(ipath_layer_mutex);
+
+u16 ipath_layer_rcv_opcode;
+static int (*layer_intr)(void *, u32);
+static int (*layer_rcv)(void *, void *, struct sk_buff *);
+static int (*layer_rcv_lid)(void *, void *);
+static int (*verbs_piobufavail)(void *);
+static void (*verbs_rcv)(void *, void *, void *, u32);
+int ipath_verbs_registered;
+
+static void *(*layer_add_one)(int, struct ipath_devdata *);
+static void (*layer_remove_one)(void *);
+static void *(*verbs_add_one)(int, struct ipath_devdata *);
+static void (*verbs_remove_one)(void *);
+static void (*verbs_timer_cb)(void *);
+
+int __ipath_layer_intr(struct ipath_devdata *dd, u32 arg)
+{
+       int ret = -ENODEV;
+
+       if (dd->ipath_layer.l_arg && layer_intr)
+               ret = layer_intr(dd->ipath_layer.l_arg, arg);
+
+       return ret;
+}
+
+int ipath_layer_intr(struct ipath_devdata *dd, u32 arg)
+{
+       int ret;
+
+       mutex_lock(&ipath_layer_mutex);
+
+       ret = __ipath_layer_intr(dd, arg);
+
+       mutex_unlock(&ipath_layer_mutex);
+
+       return ret;
+}
+
+int __ipath_layer_rcv(struct ipath_devdata *dd, void *hdr,
+                     struct sk_buff *skb)
+{
+       int ret = -ENODEV;
+
+       if (dd->ipath_layer.l_arg && layer_rcv)
+               ret = layer_rcv(dd->ipath_layer.l_arg, hdr, skb);
+
+       return ret;
+}
+
+int __ipath_layer_rcv_lid(struct ipath_devdata *dd, void *hdr)
+{
+       int ret = -ENODEV;
+
+       if (dd->ipath_layer.l_arg && layer_rcv_lid)
+               ret = layer_rcv_lid(dd->ipath_layer.l_arg, hdr);
+
+       return ret;
+}
+
+int __ipath_verbs_piobufavail(struct ipath_devdata *dd)
+{
+       int ret = -ENODEV;
+
+       if (dd->verbs_layer.l_arg && verbs_piobufavail)
+               ret = verbs_piobufavail(dd->verbs_layer.l_arg);
+
+       return ret;
+}
+
+int __ipath_verbs_rcv(struct ipath_devdata *dd, void *rc, void *ebuf,
+                     u32 tlen)
+{
+       int ret = -ENODEV;
+
+       if (dd->verbs_layer.l_arg && verbs_rcv) {
+               verbs_rcv(dd->verbs_layer.l_arg, rc, ebuf, tlen);
+               ret = 0;
+       }
+
+       return ret;
+}
+
+int ipath_layer_set_linkstate(struct ipath_devdata *dd, u8 newstate)
+{
+       u32 lstate;
+       int ret;
+
+       switch (newstate) {
+       case IPATH_IB_LINKDOWN:
+               ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKINITCMD_POLL <<
+                                   INFINIPATH_IBCC_LINKINITCMD_SHIFT);
+               /* don't wait */
+               ret = 0;
+               goto bail;
+
+       case IPATH_IB_LINKDOWN_SLEEP:
+               ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKINITCMD_SLEEP <<
+                                   INFINIPATH_IBCC_LINKINITCMD_SHIFT);
+               /* don't wait */
+               ret = 0;
+               goto bail;
+
+       case IPATH_IB_LINKDOWN_DISABLE:
+               ipath_set_ib_lstate(dd,
+                                   INFINIPATH_IBCC_LINKINITCMD_DISABLE <<
+                                   INFINIPATH_IBCC_LINKINITCMD_SHIFT);
+               /* don't wait */
+               ret = 0;
+               goto bail;
+
+       case IPATH_IB_LINKINIT:
+               if (dd->ipath_flags & IPATH_LINKINIT) {
+                       ret = 0;
+                       goto bail;
+               }
+               ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_INIT <<
+                                   INFINIPATH_IBCC_LINKCMD_SHIFT);
+               lstate = IPATH_LINKINIT;
+               break;
+
+       case IPATH_IB_LINKARM:
+               if (dd->ipath_flags & IPATH_LINKARMED) {
+                       ret = 0;
+                       goto bail;
+               }
+               if (!(dd->ipath_flags &
+                     (IPATH_LINKINIT | IPATH_LINKACTIVE))) {
+                       ret = -EINVAL;
+                       goto bail;
+               }
+               ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ARMED <<
+                                   INFINIPATH_IBCC_LINKCMD_SHIFT);
+               /*
+                * Since the port can transition to ACTIVE by receiving
+                * a non VL 15 packet, wait for either state.
+                */
+               lstate = IPATH_LINKARMED | IPATH_LINKACTIVE;
+               break;
+
+       case IPATH_IB_LINKACTIVE:
+               if (dd->ipath_flags & IPATH_LINKACTIVE) {
+                       ret = 0;
+                       goto bail;
+               }
+               if (!(dd->ipath_flags & IPATH_LINKARMED)) {
+                       ret = -EINVAL;
+                       goto bail;
+               }
+               ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ACTIVE <<
+                                   INFINIPATH_IBCC_LINKCMD_SHIFT);
+               lstate = IPATH_LINKACTIVE;
+               break;
+
+       default:
+               ipath_dbg("Invalid linkstate 0x%x requested\n", newstate);
+               ret = -EINVAL;
+               goto bail;
+       }
+       ret = ipath_wait_linkstate(dd, lstate, 2000);
+
+bail:
+       return ret;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_set_linkstate);
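
As a usage sketch (not part of this patch, helper name invented), bringing the
link up typically means arming it and then making it active, since ARMED is
only reachable from INIT or ACTIVE and ACTIVE only from ARMED:

	static int ex_bring_link_active(struct ipath_devdata *dd)
	{
		int ret;

		ret = ipath_layer_set_linkstate(dd, IPATH_IB_LINKARM);
		if (!ret)
			ret = ipath_layer_set_linkstate(dd, IPATH_IB_LINKACTIVE);
		return ret;	/* each call waits up to 2 seconds for the state */
	}
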
+
+/**
+ * ipath_layer_set_mtu - set the MTU
+ * @dd: the infinipath device
+ * @arg: the new MTU
+ *
+ * We can handle "any" incoming size; the issue here is whether we
+ * need to restrict our outgoing size.  For now, we don't do any
+ * sanity checking on this, and we don't deal with what happens to
+ * programs that are already running when the size changes.
+ * NOTE: changing the MTU will usually cause the IBC to go back to
+ * link initialize (IPATH_IBSTATE_INIT) state...
+ */
+int ipath_layer_set_mtu(struct ipath_devdata *dd, u16 arg)
+{
+       u32 piosize;
+       int changed = 0;
+       int ret;
+
+       /*
+        * mtu is IB data payload max.  It's the largest power of 2 less
+        * than piosize (or even larger, since it only really controls the
+        * largest we can receive; we can send the max of the mtu and
+        * piosize).  We check that it's one of the valid IB sizes.
+        */
+       if (arg != 256 && arg != 512 && arg != 1024 && arg != 2048 &&
+           arg != 4096) {
+               ipath_dbg("Trying to set invalid mtu %u, failing\n", arg);
+               ret = -EINVAL;
+               goto bail;
+       }
+       if (dd->ipath_ibmtu == arg) {
+               ret = 0;        /* same as current */
+               goto bail;
+       }
+
+       piosize = dd->ipath_ibmaxlen;
+       dd->ipath_ibmtu = arg;
+
+       if (arg >= (piosize - IPATH_PIO_MAXIBHDR)) {
+               /* Only if it's not the initial value (or reset to it) */
+               if (piosize != dd->ipath_init_ibmaxlen) {
+                       dd->ipath_ibmaxlen = piosize;
+                       changed = 1;
+               }
+       } else if ((arg + IPATH_PIO_MAXIBHDR) != dd->ipath_ibmaxlen) {
+               piosize = arg + IPATH_PIO_MAXIBHDR;
+               ipath_cdbg(VERBOSE, "ibmaxlen was 0x%x, setting to 0x%x "
+                          "(mtu 0x%x)\n", dd->ipath_ibmaxlen, piosize,
+                          arg);
+               dd->ipath_ibmaxlen = piosize;
+               changed = 1;
+       }
+
+       if (changed) {
+               /*
+                * set the IBC maxpktlength to the size of our pio
+                * buffers in words
+                */
+               u64 ibc = dd->ipath_ibcctrl;
+               ibc &= ~(INFINIPATH_IBCC_MAXPKTLEN_MASK <<
+                        INFINIPATH_IBCC_MAXPKTLEN_SHIFT);
+
+               piosize = piosize - 2 * sizeof(u32);    /* ignore pbc */
+               dd->ipath_ibmaxlen = piosize;
+               piosize /= sizeof(u32); /* in words */
+               /*
+                * for ICRC, which we only send in diag test pkt mode, and
+                * we don't need to worry about that for mtu
+                */
+               piosize += 1;
+
+               ibc |= piosize << INFINIPATH_IBCC_MAXPKTLEN_SHIFT;
+               dd->ipath_ibcctrl = ibc;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+                                dd->ipath_ibcctrl);
+               dd->ipath_f_tidtemplate(dd);
+       }
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_set_mtu);
+
+int ipath_set_sps_lid(struct ipath_devdata *dd, u32 arg, u8 lmc)
+{
+       ipath_stats.sps_lid[dd->ipath_unit] = arg;
+       dd->ipath_lid = arg;
+       dd->ipath_lmc = lmc;
+
+       mutex_lock(&ipath_layer_mutex);
+
+       if (dd->ipath_layer.l_arg && layer_intr)
+               layer_intr(dd->ipath_layer.l_arg, IPATH_LAYER_INT_LID);
+
+       mutex_unlock(&ipath_layer_mutex);
+
+       return 0;
+}
+
+EXPORT_SYMBOL_GPL(ipath_set_sps_lid);
+
+int ipath_layer_set_guid(struct ipath_devdata *dd, __be64 guid)
+{
+       /* XXX - need to inform anyone who cares this just happened. */
+       dd->ipath_guid = guid;
+       return 0;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_set_guid);
+
+__be64 ipath_layer_get_guid(struct ipath_devdata *dd)
+{
+       return dd->ipath_guid;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_get_guid);
+
+u32 ipath_layer_get_nguid(struct ipath_devdata *dd)
+{
+       return dd->ipath_nguid;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_get_nguid);
+
+int ipath_layer_query_device(struct ipath_devdata *dd, u32 * vendor,
+                            u32 * boardrev, u32 * majrev, u32 * minrev)
+{
+       *vendor = dd->ipath_vendorid;
+       *boardrev = dd->ipath_boardrev;
+       *majrev = dd->ipath_majrev;
+       *minrev = dd->ipath_minrev;
+
+       return 0;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_query_device);
+
+u32 ipath_layer_get_flags(struct ipath_devdata *dd)
+{
+       return dd->ipath_flags;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_get_flags);
+
+struct device *ipath_layer_get_device(struct ipath_devdata *dd)
+{
+       return &dd->pcidev->dev;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_get_device);
+
+u16 ipath_layer_get_deviceid(struct ipath_devdata *dd)
+{
+       return dd->ipath_deviceid;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_get_deviceid);
+
+u64 ipath_layer_get_lastibcstat(struct ipath_devdata *dd)
+{
+       return dd->ipath_lastibcstat;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_get_lastibcstat);
+
+u32 ipath_layer_get_ibmtu(struct ipath_devdata *dd)
+{
+       return dd->ipath_ibmtu;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_get_ibmtu);
+
+void ipath_layer_add(struct ipath_devdata *dd)
+{
+       mutex_lock(&ipath_layer_mutex);
+
+       if (layer_add_one)
+               dd->ipath_layer.l_arg =
+                       layer_add_one(dd->ipath_unit, dd);
+
+       if (verbs_add_one)
+               dd->verbs_layer.l_arg =
+                       verbs_add_one(dd->ipath_unit, dd);
+
+       mutex_unlock(&ipath_layer_mutex);
+}
+
+void ipath_layer_del(struct ipath_devdata *dd)
+{
+       mutex_lock(&ipath_layer_mutex);
+
+       if (dd->ipath_layer.l_arg && layer_remove_one) {
+               layer_remove_one(dd->ipath_layer.l_arg);
+               dd->ipath_layer.l_arg = NULL;
+       }
+
+       if (dd->verbs_layer.l_arg && verbs_remove_one) {
+               verbs_remove_one(dd->verbs_layer.l_arg);
+               dd->verbs_layer.l_arg = NULL;
+       }
+
+       mutex_unlock(&ipath_layer_mutex);
+}
+
+int ipath_layer_register(void *(*l_add)(int, struct ipath_devdata *),
+                        void (*l_remove)(void *),
+                        int (*l_intr)(void *, u32),
+                        int (*l_rcv)(void *, void *, struct sk_buff *),
+                        u16 l_rcv_opcode,
+                        int (*l_rcv_lid)(void *, void *))
+{
+       struct ipath_devdata *dd, *tmp;
+       unsigned long flags;
+
+       mutex_lock(&ipath_layer_mutex);
+
+       layer_add_one = l_add;
+       layer_remove_one = l_remove;
+       layer_intr = l_intr;
+       layer_rcv = l_rcv;
+       layer_rcv_lid = l_rcv_lid;
+       ipath_layer_rcv_opcode = l_rcv_opcode;
+
+       spin_lock_irqsave(&ipath_devs_lock, flags);
+
+       list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) {
+               if (!(dd->ipath_flags & IPATH_INITTED))
+                       continue;
+
+               if (dd->ipath_layer.l_arg)
+                       continue;
+
+               if (!(*dd->ipath_statusp & IPATH_STATUS_SMA))
+                       *dd->ipath_statusp |= IPATH_STATUS_OIB_SMA;
+
+               spin_unlock_irqrestore(&ipath_devs_lock, flags);
+               dd->ipath_layer.l_arg = l_add(dd->ipath_unit, dd);
+               spin_lock_irqsave(&ipath_devs_lock, flags);
+       }
+
+       spin_unlock_irqrestore(&ipath_devs_lock, flags);
+       mutex_unlock(&ipath_layer_mutex);
+
+       return 0;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_register);
+
+void ipath_layer_unregister(void)
+{
+       struct ipath_devdata *dd, *tmp;
+       unsigned long flags;
+
+       mutex_lock(&ipath_layer_mutex);
+       spin_lock_irqsave(&ipath_devs_lock, flags);
+
+       list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) {
+               if (dd->ipath_layer.l_arg && layer_remove_one) {
+                       spin_unlock_irqrestore(&ipath_devs_lock, flags);
+                       layer_remove_one(dd->ipath_layer.l_arg);
+                       spin_lock_irqsave(&ipath_devs_lock, flags);
+                       dd->ipath_layer.l_arg = NULL;
+               }
+       }
+
+       spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+       layer_add_one = NULL;
+       layer_remove_one = NULL;
+       layer_intr = NULL;
+       layer_rcv = NULL;
+       layer_rcv_lid = NULL;
+
+       mutex_unlock(&ipath_layer_mutex);
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_unregister);
+
+static void __ipath_verbs_timer(unsigned long arg)
+{
+       struct ipath_devdata *dd = (struct ipath_devdata *) arg;
+
+       /*
+        * If port 0 receive packet interrupts are not available, or
+        * can be missed, poll the receive queue
+        */
+       if (dd->ipath_flags & IPATH_POLL_RX_INTR)
+               ipath_kreceive(dd);
+
+       /* Handle verbs layer timeouts. */
+       if (dd->verbs_layer.l_arg && verbs_timer_cb)
+               verbs_timer_cb(dd->verbs_layer.l_arg);
+
+       mod_timer(&dd->verbs_layer.l_timer, jiffies + 1);
+}
+
+/**
+ * ipath_verbs_register - verbs layer registration
+ * @l_add: callback when a device is added
+ * @l_remove: callback when a device is removed
+ * @l_piobufavail: callback for when PIO buffers become available
+ * @l_rcv: callback for receiving a packet
+ * @l_timer_cb: timer callback
+ */
+int ipath_verbs_register(void *(*l_add)(int, struct ipath_devdata *),
+                        void (*l_remove)(void *arg),
+                        int (*l_piobufavail) (void *arg),
+                        void (*l_rcv) (void *arg, void *rhdr,
+                                       void *data, u32 tlen),
+                        void (*l_timer_cb) (void *arg))
+{
+       struct ipath_devdata *dd, *tmp;
+       unsigned long flags;
+
+       mutex_lock(&ipath_layer_mutex);
+
+       verbs_add_one = l_add;
+       verbs_remove_one = l_remove;
+       verbs_piobufavail = l_piobufavail;
+       verbs_rcv = l_rcv;
+       verbs_timer_cb = l_timer_cb;
+
+       spin_lock_irqsave(&ipath_devs_lock, flags);
+
+       list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) {
+               if (!(dd->ipath_flags & IPATH_INITTED))
+                       continue;
+
+               if (dd->verbs_layer.l_arg)
+                       continue;
+
+               spin_unlock_irqrestore(&ipath_devs_lock, flags);
+               dd->verbs_layer.l_arg = l_add(dd->ipath_unit, dd);
+               spin_lock_irqsave(&ipath_devs_lock, flags);
+       }
+
+       spin_unlock_irqrestore(&ipath_devs_lock, flags);
+       mutex_unlock(&ipath_layer_mutex);
+
+       ipath_verbs_registered = 1;
+
+       return 0;
+}
+
+EXPORT_SYMBOL_GPL(ipath_verbs_register);
+
+void ipath_verbs_unregister(void)
+{
+       struct ipath_devdata *dd, *tmp;
+       unsigned long flags;
+
+       mutex_lock(&ipath_layer_mutex);
+       spin_lock_irqsave(&ipath_devs_lock, flags);
+
+       list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) {
+               *dd->ipath_statusp &= ~IPATH_STATUS_OIB_SMA;
+
+               if (dd->verbs_layer.l_arg && verbs_remove_one) {
+                       spin_unlock_irqrestore(&ipath_devs_lock, flags);
+                       verbs_remove_one(dd->verbs_layer.l_arg);
+                       spin_lock_irqsave(&ipath_devs_lock, flags);
+                       dd->verbs_layer.l_arg = NULL;
+               }
+       }
+
+       spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+       verbs_add_one = NULL;
+       verbs_remove_one = NULL;
+       verbs_piobufavail = NULL;
+       verbs_rcv = NULL;
+       verbs_timer_cb = NULL;
+
+       mutex_unlock(&ipath_layer_mutex);
+}
+
+EXPORT_SYMBOL_GPL(ipath_verbs_unregister);
+
+int ipath_layer_open(struct ipath_devdata *dd, u32 * pktmax)
+{
+       int ret;
+       u32 intval = 0;
+
+       mutex_lock(&ipath_layer_mutex);
+
+       if (!dd->ipath_layer.l_arg) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       ret = ipath_setrcvhdrsize(dd, NUM_OF_EXTRA_WORDS_IN_HEADER_QUEUE);
+
+       if (ret < 0)
+               goto bail;
+
+       *pktmax = dd->ipath_ibmaxlen;
+
+       if (*dd->ipath_statusp & IPATH_STATUS_IB_READY)
+               intval |= IPATH_LAYER_INT_IF_UP;
+       if (ipath_stats.sps_lid[dd->ipath_unit])
+               intval |= IPATH_LAYER_INT_LID;
+       if (ipath_stats.sps_mlid[dd->ipath_unit])
+               intval |= IPATH_LAYER_INT_BCAST;
+       /*
+        * do this on open, in case the low level is already up and
+        * just the layered driver was reloaded, etc.
+        */
+       if (intval)
+               layer_intr(dd->ipath_layer.l_arg, intval);
+
+       ret = 0;
+bail:
+       mutex_unlock(&ipath_layer_mutex);
+
+       return ret;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_open);
+
+u16 ipath_layer_get_lid(struct ipath_devdata *dd)
+{
+       return dd->ipath_lid;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_get_lid);
+
+/**
+ * ipath_layer_get_mac - get the MAC address
+ * @dd: the infinipath device
+ * @mac: the MAC is put here
+ *
+ * This is the EUI-64 OUI octets (top 3), then we skip the next 2
+ * (which should both be zero or 0xff) and use the remaining 3 octets.
+ * The returned MAC is in network order.
+ * mac points to at least 6 bytes of buffer.
+ * We assume that by the time the LID is set, the GUID is as valid
+ * as it's ever going to be, rather than adding yet another status bit.
+ */
+
+int ipath_layer_get_mac(struct ipath_devdata *dd, u8 * mac)
+{
+       u8 *guid;
+
+       guid = (u8 *) &dd->ipath_guid;
+
+       mac[0] = guid[0];
+       mac[1] = guid[1];
+       mac[2] = guid[2];
+       mac[3] = guid[5];
+       mac[4] = guid[6];
+       mac[5] = guid[7];
+       if ((guid[3] || guid[4]) && !(guid[3] == 0xff && guid[4] == 0xff))
+               ipath_dbg("Warning, guid bytes 3 and 4 not 0 or 0xffff: "
+                         "%x %x\n", guid[3], guid[4]);
+       return 0;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_get_mac);
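
The EUI-64 to 6-byte MAC mapping above (keep the OUI in bytes 0-2 and the low
three bytes 5-7, drop the filler in bytes 3-4) can be shown with a small
stand-alone sketch; the GUID value here is made up for illustration.

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint8_t guid[8] = { 0x00, 0x11, 0x75, 0xff, 0xff,
				    0x01, 0x02, 0x03 };
		uint8_t mac[6] = { guid[0], guid[1], guid[2],
				   guid[5], guid[6], guid[7] };

		printf("mac = %02x:%02x:%02x:%02x:%02x:%02x\n",
		       mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
		return 0;
	}
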
+
+u16 ipath_layer_get_bcast(struct ipath_devdata *dd)
+{
+       return dd->ipath_mlid;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_get_bcast);
+
+u32 ipath_layer_get_cr_errpkey(struct ipath_devdata *dd)
+{
+       return ipath_read_creg32(dd, dd->ipath_cregs->cr_errpkey);
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_get_cr_errpkey);
+
+static void update_sge(struct ipath_sge_state *ss, u32 length)
+{
+       struct ipath_sge *sge = &ss->sge;
+
+       sge->vaddr += length;
+       sge->length -= length;
+       sge->sge_length -= length;
+       if (sge->sge_length == 0) {
+               if (--ss->num_sge)
+                       *sge = *ss->sg_list++;
+       } else if (sge->length == 0 && sge->mr != NULL) {
+               if (++sge->n >= IPATH_SEGSZ) {
+                       if (++sge->m >= sge->mr->mapsz)
+                               return;
+                       sge->n = 0;
+               }
+               sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
+               sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
+       }
+}
+
+#ifdef __LITTLE_ENDIAN
+static inline u32 get_upper_bits(u32 data, u32 shift)
+{
+       return data >> shift;
+}
+
+static inline u32 set_upper_bits(u32 data, u32 shift)
+{
+       return data << shift;
+}
+
+static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
+{
+       data <<= ((sizeof(u32) - n) * BITS_PER_BYTE);
+       data >>= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
+       return data;
+}
+#else
+static inline u32 get_upper_bits(u32 data, u32 shift)
+{
+       return data << shift;
+}
+
+static inline u32 set_upper_bits(u32 data, u32 shift)
+{
+       return data >> shift;
+}
+
+static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
+{
+       data >>= ((sizeof(u32) - n) * BITS_PER_BYTE);
+       data <<= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
+       return data;
+}
+#endif
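
A little-endian stand-alone sketch of the helpers above shows how two
misaligned byte runs are packed into one 32-bit word before it is written to
the PIO buffer; the byte values are invented, and only the little-endian
variants are reproduced here.

	#include <stdint.h>
	#include <stdio.h>

	static uint32_t ex_get_upper_bits(uint32_t data, uint32_t shift)
	{
		return data >> shift;
	}

	static uint32_t ex_set_upper_bits(uint32_t data, uint32_t shift)
	{
		return data << shift;
	}

	int main(void)
	{
		/* bytes 0xAA 0xBB already accumulated ("extra" = 2) */
		uint32_t data = 0x0000BBAA;
		uint32_t next = 0x44332211;	/* next aligned source word */

		/* low half of 'next' fills the top of the accumulated word */
		data |= ex_set_upper_bits(next, 2 * 8);
		printf("word written: 0x%08x\n", data);		/* 0x2211BBAA */

		/* top half of 'next' is carried into the next word */
		printf("carried over: 0x%08x\n",
		       ex_get_upper_bits(next, 32 - 2 * 8));	/* 0x00004433 */
		return 0;
	}
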
+
+static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss,
+                   u32 length)
+{
+       u32 extra = 0;
+       u32 data = 0;
+       u32 last;
+
+       while (1) {
+               u32 len = ss->sge.length;
+               u32 off;
+
+               BUG_ON(len == 0);
+               if (len > length)
+                       len = length;
+               if (len > ss->sge.sge_length)
+                       len = ss->sge.sge_length;
+               /* If the source address is not aligned, try to align it. */
+               off = (unsigned long)ss->sge.vaddr & (sizeof(u32) - 1);
+               if (off) {
+                       u32 *addr = (u32 *)((unsigned long)ss->sge.vaddr &
+                                           ~(sizeof(u32) - 1));
+                       u32 v = get_upper_bits(*addr, off * BITS_PER_BYTE);
+                       u32 y;
+
+                       y = sizeof(u32) - off;
+                       if (len > y)
+                               len = y;
+                       if (len + extra >= sizeof(u32)) {
+                               data |= set_upper_bits(v, extra *
+                                                      BITS_PER_BYTE);
+                               len = sizeof(u32) - extra;
+                               if (len == length) {
+                                       last = data;
+                                       break;
+                               }
+                               __raw_writel(data, piobuf);
+                               piobuf++;
+                               extra = 0;
+                               data = 0;
+                       } else {
+                               /* Clear unused upper bytes */
+                               data |= clear_upper_bytes(v, len, extra);
+                               if (len == length) {
+                                       last = data;
+                                       break;
+                               }
+                               extra += len;
+                       }
+               } else if (extra) {
+                       /* Source address is aligned. */
+                       u32 *addr = (u32 *) ss->sge.vaddr;
+                       int shift = extra * BITS_PER_BYTE;
+                       int ushift = 32 - shift;
+                       u32 l = len;
+
+                       while (l >= sizeof(u32)) {
+                               u32 v = *addr;
+
+                               data |= set_upper_bits(v, shift);
+                               __raw_writel(data, piobuf);
+                               data = get_upper_bits(v, ushift);
+                               piobuf++;
+                               addr++;
+                               l -= sizeof(u32);
+                       }
+                       /*
+                        * We still have 'l' bytes left over from this sge.
+                        */
+                       if (l) {
+                               u32 v = *addr;
+
+                               if (l + extra >= sizeof(u32)) {
+                                       data |= set_upper_bits(v, shift);
+                                       len -= l + extra - sizeof(u32);
+                                       if (len == length) {
+                                               last = data;
+                                               break;
+                                       }
+                                       __raw_writel(data, piobuf);
+                                       piobuf++;
+                                       extra = 0;
+                                       data = 0;
+                               } else {
+                                       /* Clear unused upper bytes */
+                                       data |= clear_upper_bytes(v, l,
+                                                                 extra);
+                                       if (len == length) {
+                                               last = data;
+                                               break;
+                                       }
+                                       extra += l;
+                               }
+                       } else if (len == length) {
+                               last = data;
+                               break;
+                       }
+               } else if (len == length) {
+                       u32 w;
+
+                       /*
+                        * Need to round up for the last dword in the
+                        * packet.
+                        */
+                       w = (len + 3) >> 2;
+                       __iowrite32_copy(piobuf, ss->sge.vaddr, w - 1);
+                       piobuf += w - 1;
+                       last = ((u32 *) ss->sge.vaddr)[w - 1];
+                       break;
+               } else {
+                       u32 w = len >> 2;
+
+                       __iowrite32_copy(piobuf, ss->sge.vaddr, w);
+                       piobuf += w;
+
+                       extra = len & (sizeof(u32) - 1);
+                       if (extra) {
+                               u32 v = ((u32 *) ss->sge.vaddr)[w];
+
+                               /* Clear unused upper bytes */
+                               data = clear_upper_bytes(v, extra, 0);
+                       }
+               }
+               update_sge(ss, len);
+               length -= len;
+       }
+       /* must flush everything early, before the trigger word */
+       ipath_flush_wc();
+       __raw_writel(last, piobuf);
+       /* be sure trigger word is written */
+       ipath_flush_wc();
+       update_sge(ss, length);
+}
+
+/**
+ * ipath_verbs_send - send a packet from the verbs layer
+ * @dd: the infinipath device
+ * @hdrwords: the number of words in the header
+ * @hdr: the packet header
+ * @len: the length of the packet in bytes
+ * @ss: the SGE to send
+ *
+ * This is like ipath_sma_send_pkt() in that we need to be able to send
+ * packets after the chip is initialized (MADs) but also like
+ * ipath_layer_send_hdr() since it's used by the verbs layer.
+ */
+int ipath_verbs_send(struct ipath_devdata *dd, u32 hdrwords,
+                    u32 *hdr, u32 len, struct ipath_sge_state *ss)
+{
+       u32 __iomem *piobuf;
+       u32 plen;
+       int ret;
+
+       /* +1 is for the qword padding of pbc */
+       plen = hdrwords + ((len + 3) >> 2) + 1;
+       if (unlikely((plen << 2) > dd->ipath_ibmaxlen)) {
+               ipath_dbg("packet len 0x%x too long, failing\n", plen);
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       /* Get a PIO buffer to use. */
+       piobuf = ipath_getpiobuf(dd, NULL);
+       if (unlikely(piobuf == NULL)) {
+               ret = -EBUSY;
+               goto bail;
+       }
+
+       /*
+        * Write len to the control qword, no flags.
+        * We have to flush after the PBC for correctness on some CPUs,
+        * or the WC buffer can be written out of order.
+        */
+       writeq(plen, piobuf);
+       ipath_flush_wc();
+       piobuf += 2;
+       if (len == 0) {
+               /*
+                * If the packet is header-only, we must flush before
+                * writing the last header word for correctness, and
+                * flush again after the trigger word is written.
+                */
+               __iowrite32_copy(piobuf, hdr, hdrwords - 1);
+               ipath_flush_wc();
+               __raw_writel(hdr[hdrwords - 1], piobuf + hdrwords - 1);
+               ipath_flush_wc();
+               ret = 0;
+               goto bail;
+       }
+
+       __iowrite32_copy(piobuf, hdr, hdrwords);
+       piobuf += hdrwords;
+
+       /* The common case is aligned and contained in one segment. */
+       if (likely(ss->num_sge == 1 && len <= ss->sge.length &&
+                  !((unsigned long)ss->sge.vaddr & (sizeof(u32) - 1)))) {
+               u32 w;
+
+               /* Need to round up for the last dword in the packet. */
+               w = (len + 3) >> 2;
+               __iowrite32_copy(piobuf, ss->sge.vaddr, w - 1);
+               /* must flush early everything before trigger word */
+               ipath_flush_wc();
+               __raw_writel(((u32 *) ss->sge.vaddr)[w - 1],
+                            piobuf + w - 1);
+               /* be sure trigger word is written */
+               ipath_flush_wc();
+               update_sge(ss, len);
+               ret = 0;
+               goto bail;
+       }
+       copy_io(piobuf, ss, len);
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+EXPORT_SYMBOL_GPL(ipath_verbs_send);
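+
+/*
+ * Illustrative sketch only (not part of the driver): a verbs-layer
+ * caller that has already built its header in 32-bit words and set up
+ * an SGE state would post a packet roughly like this, asking for a
+ * PIO-buffer-available interrupt when no buffer is free:
+ *
+ *      ret = ipath_verbs_send(dd, hdrwords, hdr, payload_len, ss);
+ *      if (ret == -EBUSY)
+ *              ipath_layer_want_buffer(dd);
+ *
+ * The hdr, payload_len and ss names are hypothetical here; the real
+ * callers are the verbs RC/UC/UD send paths.
+ */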
+
+int ipath_layer_snapshot_counters(struct ipath_devdata *dd, u64 *swords,
+                                 u64 *rwords, u64 *spkts, u64 *rpkts,
+                                 u64 *xmit_wait)
+{
+       int ret;
+
+       if (!(dd->ipath_flags & IPATH_INITTED)) {
+               /* no hardware, freeze, etc. */
+               ipath_dbg("unit %u not usable\n", dd->ipath_unit);
+               ret = -EINVAL;
+               goto bail;
+       }
+       *swords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt);
+       *rwords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
+       *spkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt);
+       *rpkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt);
+       *xmit_wait = ipath_snap_cntr(dd, dd->ipath_cregs->cr_sendstallcnt);
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_snapshot_counters);
+
+/**
+ * ipath_layer_get_counters - get various chip counters
+ * @dd: the infinipath device
+ * @cntrs: counters are placed here
+ *
+ * Return the counters needed by recv_pma_get_portcounters().
+ */
+int ipath_layer_get_counters(struct ipath_devdata *dd,
+                             struct ipath_layer_counters *cntrs)
+{
+       int ret;
+
+       if (!(dd->ipath_flags & IPATH_INITTED)) {
+               /* no hardware, freeze, etc. */
+               ipath_dbg("unit %u not usable\n", dd->ipath_unit);
+               ret = -EINVAL;
+               goto bail;
+       }
+       cntrs->symbol_error_counter =
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_ibsymbolerrcnt);
+       cntrs->link_error_recovery_counter =
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_iblinkerrrecovcnt);
+       cntrs->link_downed_counter =
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_iblinkdowncnt);
+       cntrs->port_rcv_errors =
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_rxdroppktcnt) +
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_rcvovflcnt) +
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_portovflcnt) +
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_errrcvflowctrlcnt) +
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_err_rlencnt) +
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_invalidrlencnt) +
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_erricrccnt) +
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_errvcrccnt) +
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_errlpcrccnt) +
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_errlinkcnt) +
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_badformatcnt);
+       cntrs->port_rcv_remphys_errors =
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_rcvebpcnt);
+       cntrs->port_xmit_discards =
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_unsupvlcnt);
+       cntrs->port_xmit_data =
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt);
+       cntrs->port_rcv_data =
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
+       cntrs->port_xmit_packets =
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt);
+       cntrs->port_rcv_packets =
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt);
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_get_counters);
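+
+/*
+ * Sketch of a typical consumer (an assumption based on the kernel-doc
+ * above, not a quote of the MAD code): the PMA handler snapshots the
+ * chip counters and then converts them to the wire format, e.g.
+ *
+ *      struct ipath_layer_counters cntrs;
+ *
+ *      if (!ipath_layer_get_counters(dd, &cntrs))
+ *              pmp_symbol_errors = (u16) cntrs.symbol_error_counter;
+ *
+ * pmp_symbol_errors is a hypothetical name used only for this sketch.
+ */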
+
+int ipath_layer_want_buffer(struct ipath_devdata *dd)
+{
+       set_bit(IPATH_S_PIOINTBUFAVAIL, &dd->ipath_sendctrl);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+                        dd->ipath_sendctrl);
+
+       return 0;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_want_buffer);
+
+int ipath_layer_send_hdr(struct ipath_devdata *dd, struct ether_header *hdr)
+{
+       int ret = 0;
+       u32 __iomem *piobuf;
+       u32 plen, *uhdr;
+       size_t count;
+       __be16 vlsllnh;
+
+       if (!(dd->ipath_flags & IPATH_RCVHDRSZ_SET)) {
+               ipath_dbg("send while not open\n");
+               ret = -EINVAL;
+       } else if ((dd->ipath_flags & (IPATH_LINKUNK | IPATH_LINKDOWN)) ||
+                  dd->ipath_lid == 0) {
+               /*
+                * The LID check covers the case where the SMA hasn't
+                * configured the LID yet.
+                */
+               ret = -ENETDOWN;
+               ipath_cdbg(VERBOSE, "send while not ready, "
+                          "mylid=%u, flags=0x%x\n",
+                          dd->ipath_lid, dd->ipath_flags);
+       }
+
+       vlsllnh = *((__be16 *) hdr);
+       if (vlsllnh != htons(IPS_LRH_BTH)) {
+               ipath_dbg("Warning: lrh[0] wrong (%x, not %x); "
+                         "not sending\n", be16_to_cpu(vlsllnh),
+                         IPS_LRH_BTH);
+               ret = -EINVAL;
+       }
+       if (ret)
+               goto done;
+
+       /* Get a PIO buffer to use. */
+       piobuf = ipath_getpiobuf(dd, NULL);
+       if (piobuf == NULL) {
+               ret = -EBUSY;
+               goto done;
+       }
+
+       plen = (sizeof(*hdr) >> 2); /* actual length */
+       ipath_cdbg(EPKT, "0x%x+1w pio %p\n", plen, piobuf);
+
+       writeq(plen+1, piobuf); /* len (+1 for pad) to pbc, no flags */
+       ipath_flush_wc();
+       piobuf += 2;
+       uhdr = (u32 *)hdr;
+       count = plen-1; /* amount we can copy before trigger word */
+       __iowrite32_copy(piobuf, uhdr, count);
+       ipath_flush_wc();
+       __raw_writel(uhdr[count], piobuf + count);
+       ipath_flush_wc(); /* ensure it's sent, now */
+
+       ipath_stats.sps_ether_spkts++;  /* ether packet sent */
+
+done:
+       return ret;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_send_hdr);
+
+int ipath_layer_set_piointbufavail_int(struct ipath_devdata *dd)
+{
+       set_bit(IPATH_S_PIOINTBUFAVAIL, &dd->ipath_sendctrl);
+
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+                        dd->ipath_sendctrl);
+       return 0;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_set_piointbufavail_int);
+
+int ipath_layer_enable_timer(struct ipath_devdata *dd)
+{
+       /*
+        * The HT-400 has a design flaw where the chip's and the kernel's
+        * ideas of the tail register don't always agree, so we won't
+        * get an interrupt on the next packet received.
+        * If the board supports per packet receive interrupts, use it.
+        * Otherwise, the timer function periodically checks for packets
+        * to cover this case.
+        * Either way, the timer is needed for verbs layer related
+        * processing.
+        */
+       if (dd->ipath_flags & IPATH_GPIO_INTR) {
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_debugportselect,
+                                0x2074076542310ULL);
+               /* Enable GPIO bit 2 interrupt */
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
+                                (u64) (1 << 2));
+       }
+
+       init_timer(&dd->verbs_layer.l_timer);
+       dd->verbs_layer.l_timer.function = __ipath_verbs_timer;
+       dd->verbs_layer.l_timer.data = (unsigned long)dd;
+       dd->verbs_layer.l_timer.expires = jiffies + 1;
+       add_timer(&dd->verbs_layer.l_timer);
+
+       return 0;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_enable_timer);
+
+int ipath_layer_disable_timer(struct ipath_devdata *dd)
+{
+       /* Disable GPIO bit 2 interrupt */
+       if (dd->ipath_flags & IPATH_GPIO_INTR)
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask, 0);
+
+       del_timer_sync(&dd->verbs_layer.l_timer);
+
+       return 0;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_disable_timer);
+
+/**
+ * ipath_layer_set_verbs_flags - set the verbs layer flags
+ * @dd: the infinipath device
+ * @flags: the flags to set
+ */
+int ipath_layer_set_verbs_flags(struct ipath_devdata *dd, unsigned flags)
+{
+       struct ipath_devdata *ss;
+       unsigned long lflags;
+
+       spin_lock_irqsave(&ipath_devs_lock, lflags);
+
+       list_for_each_entry(ss, &ipath_dev_list, ipath_list) {
+               if (!(ss->ipath_flags & IPATH_INITTED))
+                       continue;
+               if ((flags & IPATH_VERBS_KERNEL_SMA) &&
+                   !(*ss->ipath_statusp & IPATH_STATUS_SMA))
+                       *ss->ipath_statusp |= IPATH_STATUS_OIB_SMA;
+               else
+                       *ss->ipath_statusp &= ~IPATH_STATUS_OIB_SMA;
+       }
+
+       spin_unlock_irqrestore(&ipath_devs_lock, lflags);
+
+       return 0;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_set_verbs_flags);
+
+/**
+ * ipath_layer_get_npkeys - return the size of the PKEY table for port 0
+ * @dd: the infinipath device
+ */
+unsigned ipath_layer_get_npkeys(struct ipath_devdata *dd)
+{
+       return ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys);
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_get_npkeys);
+
+/**
+ * ipath_layer_get_pkey - return the indexed PKEY from the port 0 PKEY table
+ * @dd: the infinipath device
+ * @index: the PKEY index
+ */
+unsigned ipath_layer_get_pkey(struct ipath_devdata *dd, unsigned index)
+{
+       unsigned ret;
+
+       if (index >= ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys))
+               ret = 0;
+       else
+               ret = dd->ipath_pd[0]->port_pkeys[index];
+
+       return ret;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_get_pkey);
+
+/**
+ * ipath_layer_get_pkeys - return the PKEY table for port 0
+ * @dd: the infinipath device
+ * @pkeys: the pkey table is placed here
+ */
+int ipath_layer_get_pkeys(struct ipath_devdata *dd, u16 *pkeys)
+{
+       struct ipath_portdata *pd = dd->ipath_pd[0];
+
+       memcpy(pkeys, pd->port_pkeys, sizeof(pd->port_pkeys));
+
+       return 0;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_get_pkeys);
+
+/**
+ * rm_pkey - decrement the reference count for the given PKEY
+ * @dd: the infinipath device
+ * @key: the PKEY index
+ *
+ * Return true if this was the last reference and the hardware table entry
+ * needs to be changed.
+ */
+static int rm_pkey(struct ipath_devdata *dd, u16 key)
+{
+       int i;
+       int ret;
+
+       for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+               if (dd->ipath_pkeys[i] != key)
+                       continue;
+               if (atomic_dec_and_test(&dd->ipath_pkeyrefs[i])) {
+                       dd->ipath_pkeys[i] = 0;
+                       ret = 1;
+                       goto bail;
+               }
+               break;
+       }
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/**
+ * add_pkey - add the given PKEY to the hardware table
+ * @dd: the infinipath device
+ * @key: the PKEY
+ *
+ * Return an error code if unable to add the entry, zero if no change,
+ * or 1 if the hardware PKEY register needs to be updated.
+ */
+static int add_pkey(struct ipath_devdata *dd, u16 key)
+{
+       int i;
+       u16 lkey = key & 0x7FFF;
+       int any = 0;
+       int ret;
+
+       if (lkey == 0x7FFF) {
+               ret = 0;
+               goto bail;
+       }
+
+       /* Look for an empty slot or a matching PKEY. */
+       for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+               if (!dd->ipath_pkeys[i]) {
+                       any++;
+                       continue;
+               }
+               /* If it matches exactly, try to increment the ref count */
+               if (dd->ipath_pkeys[i] == key) {
+                       if (atomic_inc_return(&dd->ipath_pkeyrefs[i]) > 1) {
+                               ret = 0;
+                               goto bail;
+                       }
+                       /* Lost the race. Look for an empty slot below. */
+                       atomic_dec(&dd->ipath_pkeyrefs[i]);
+                       any++;
+               }
+               /*
+                * It makes no sense to have both the limited and unlimited
+                * PKEY set at the same time since the unlimited one will
+                * disable the limited one.
+                */
+               if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) {
+                       ret = -EEXIST;
+                       goto bail;
+               }
+       }
+       if (!any) {
+               ret = -EBUSY;
+               goto bail;
+       }
+       for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+               if (!dd->ipath_pkeys[i] &&
+                   atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) {
+                       /* for ipathstats, etc. */
+                       ipath_stats.sps_pkeys[i] = lkey;
+                       dd->ipath_pkeys[i] = key;
+                       ret = 1;
+                       goto bail;
+               }
+       }
+       ret = -EBUSY;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_layer_set_pkeys - set the PKEY table for port 0
+ * @dd: the infinipath device
+ * @pkeys: the PKEY table
+ */
+int ipath_layer_set_pkeys(struct ipath_devdata *dd, u16 *pkeys)
+{
+       struct ipath_portdata *pd;
+       int i;
+       int changed = 0;
+
+       pd = dd->ipath_pd[0];
+
+       for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
+               u16 key = pkeys[i];
+               u16 okey = pd->port_pkeys[i];
+
+               if (key == okey)
+                       continue;
+               /*
+                * The value of this PKEY table entry is changing.
+                * Remove the old entry in the hardware's array of PKEYs.
+                */
+               if (okey & 0x7FFF)
+                       changed |= rm_pkey(dd, okey);
+               if (key & 0x7FFF) {
+                       int ret = add_pkey(dd, key);
+
+                       if (ret < 0)
+                               key = 0;
+                       else
+                               changed |= ret;
+               }
+               pd->port_pkeys[i] = key;
+       }
+       if (changed) {
+               u64 pkey;
+
+               pkey = (u64) dd->ipath_pkeys[0] |
+                       ((u64) dd->ipath_pkeys[1] << 16) |
+                       ((u64) dd->ipath_pkeys[2] << 32) |
+                       ((u64) dd->ipath_pkeys[3] << 48);
+               ipath_cdbg(VERBOSE, "p0 new pkey reg %llx\n",
+                          (unsigned long long) pkey);
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey,
+                                pkey);
+       }
+       return 0;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_set_pkeys);
+
+/**
+ * ipath_layer_get_linkdowndefaultstate - get the default linkdown state
+ * @dd: the infinipath device
+ *
+ * Returns zero if the default is POLL, 1 if the default is SLEEP.
+ */
+int ipath_layer_get_linkdowndefaultstate(struct ipath_devdata *dd)
+{
+       return !!(dd->ipath_ibcctrl & INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE);
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_get_linkdowndefaultstate);
+
+/**
+ * ipath_layer_set_linkdowndefaultstate - set the default linkdown state
+ * @dd: the infinipath device
+ * @sleep: the new state
+ *
+ * Note that this will only take effect when the link state changes.
+ */
+int ipath_layer_set_linkdowndefaultstate(struct ipath_devdata *dd,
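+
+/*
+ * As a worked example of the macro above: with q == 1 for counters 0..4
+ * the shifts are 27, 24, 21, 18 and 15 bits, so COUNTER_MASK0_9 expands
+ * to __constant_cpu_to_be32(0x09248000).
+ */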
+                                        int sleep)
+{
+       if (sleep)
+               dd->ipath_ibcctrl |= INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE;
+       else
+               dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+                        dd->ipath_ibcctrl);
+       return 0;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_set_linkdowndefaultstate);
+
+int ipath_layer_get_phyerrthreshold(struct ipath_devdata *dd)
+{
+       return (dd->ipath_ibcctrl >>
+               INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
+               INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_get_phyerrthreshold);
+
+/**
+ * ipath_layer_set_phyerrthreshold - set the physical error threshold
+ * @dd: the infinipath device
+ * @n: the new threshold
+ *
+ * Note that this will only take effect when the link state changes.
+ */
+int ipath_layer_set_phyerrthreshold(struct ipath_devdata *dd, unsigned n)
+{
+       unsigned v;
+
+       v = (dd->ipath_ibcctrl >> INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
+               INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;
+       if (v != n) {
+               dd->ipath_ibcctrl &=
+                       ~(INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK <<
+                         INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT);
+               dd->ipath_ibcctrl |=
+                       (u64) n << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+                                dd->ipath_ibcctrl);
+       }
+       return 0;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_set_phyerrthreshold);
+
+int ipath_layer_get_overrunthreshold(struct ipath_devdata *dd)
+{
+       return (dd->ipath_ibcctrl >>
+               INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT) &
+               INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_get_overrunthreshold);
+
+/**
+ * ipath_layer_set_overrunthreshold - set the overrun threshold
+ * @dd: the infinipath device
+ * @n: the new threshold
+ *
+ * Note that this will only take effect when the link state changes.
+ */
+int ipath_layer_set_overrunthreshold(struct ipath_devdata *dd, unsigned n)
+{
+       unsigned v;
+
+       v = (dd->ipath_ibcctrl >> INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT) &
+               INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK;
+       if (v != n) {
+               dd->ipath_ibcctrl &=
+                       ~(INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK <<
+                         INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT);
+               dd->ipath_ibcctrl |=
+                       (u64) n << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+                                dd->ipath_ibcctrl);
+       }
+       return 0;
+}
+
+EXPORT_SYMBOL_GPL(ipath_layer_set_overrunthreshold);
+
+int ipath_layer_get_boardname(struct ipath_devdata *dd, char *name,
+                             size_t namelen)
+{
+       return dd->ipath_f_get_boardname(dd, name, namelen);
+}
+EXPORT_SYMBOL_GPL(ipath_layer_get_boardname);
+
+u32 ipath_layer_get_rcvhdrentsize(struct ipath_devdata *dd)
+{
+       return dd->ipath_rcvhdrentsize;
+}
+EXPORT_SYMBOL_GPL(ipath_layer_get_rcvhdrentsize);
diff --git a/drivers/infiniband/hw/ipath/ipath_layer.h b/drivers/infiniband/hw/ipath/ipath_layer.h
new file mode 100644 (file)
index 0000000..6fefd15
--- /dev/null
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPATH_LAYER_H
+#define _IPATH_LAYER_H
+
+/*
+ * This header file is for symbols shared between the infinipath driver
+ * and drivers layered upon it (such as the ipath_ether network driver).
+ */
+
+struct sk_buff;
+struct ipath_sge_state;
+struct ipath_devdata;
+struct ether_header;
+
+struct ipath_layer_counters {
+       u64 symbol_error_counter;
+       u64 link_error_recovery_counter;
+       u64 link_downed_counter;
+       u64 port_rcv_errors;
+       u64 port_rcv_remphys_errors;
+       u64 port_xmit_discards;
+       u64 port_xmit_data;
+       u64 port_rcv_data;
+       u64 port_xmit_packets;
+       u64 port_rcv_packets;
+};
+
+/*
+ * A segment is a linear region of low physical memory.
+ * XXX Maybe we should use phys addr here and kmap()/kunmap().
+ * Used by the verbs layer.
+ */
+struct ipath_seg {
+       void *vaddr;
+       size_t length;
+};
+
+/* The number of ipath_segs that fit in a page. */
+#define IPATH_SEGSZ     (PAGE_SIZE / sizeof (struct ipath_seg))
+
+struct ipath_segarray {
+       struct ipath_seg segs[IPATH_SEGSZ];
+};
+
+struct ipath_mregion {
+       u64 user_base;          /* User's address for this region */
+       u64 iova;               /* IB start address of this region */
+       size_t length;
+       u32 lkey;
+       u32 offset;             /* offset (bytes) to start of region */
+       int access_flags;
+       u32 max_segs;           /* number of ipath_segs in all the arrays */
+       u32 mapsz;              /* size of the map array */
+       struct ipath_segarray *map[0];  /* the segments */
+};
+
+/*
+ * These keep track of the copy progress within a memory region.
+ * Used by the verbs layer.
+ */
+struct ipath_sge {
+       struct ipath_mregion *mr;
+       void *vaddr;            /* current pointer into the segment */
+       u32 sge_length;         /* length of the SGE */
+       u32 length;             /* remaining length of the segment */
+       u16 m;                  /* current index: mr->map[m] */
+       u16 n;                  /* current index: mr->map[m]->segs[n] */
+};
+
+struct ipath_sge_state {
+       struct ipath_sge *sg_list;      /* next SGE to be used if any */
+       struct ipath_sge sge;   /* progress state for the current SGE */
+       u8 num_sge;
+};
+
+int ipath_layer_register(void *(*l_add)(int, struct ipath_devdata *),
+                        void (*l_remove)(void *),
+                        int (*l_intr)(void *, u32),
+                        int (*l_rcv)(void *, void *,
+                                     struct sk_buff *),
+                        u16 rcv_opcode,
+                        int (*l_rcv_lid)(void *, void *));
+int ipath_verbs_register(void *(*l_add)(int, struct ipath_devdata *),
+                        void (*l_remove)(void *arg),
+                        int (*l_piobufavail)(void *arg),
+                        void (*l_rcv)(void *arg, void *rhdr,
+                                      void *data, u32 tlen),
+                        void (*l_timer_cb)(void *arg));
+void ipath_layer_unregister(void);
+void ipath_verbs_unregister(void);
+int ipath_layer_open(struct ipath_devdata *, u32 *pktmax);
+u16 ipath_layer_get_lid(struct ipath_devdata *dd);
+int ipath_layer_get_mac(struct ipath_devdata *dd, u8 *);
+u16 ipath_layer_get_bcast(struct ipath_devdata *dd);
+u32 ipath_layer_get_cr_errpkey(struct ipath_devdata *dd);
+int ipath_layer_set_linkstate(struct ipath_devdata *dd, u8 state);
+int ipath_layer_set_mtu(struct ipath_devdata *, u16);
+int ipath_set_sps_lid(struct ipath_devdata *, u32, u8);
+int ipath_layer_send_hdr(struct ipath_devdata *dd,
+                        struct ether_header *hdr);
+int ipath_verbs_send(struct ipath_devdata *dd, u32 hdrwords,
+                    u32 *hdr, u32 len, struct ipath_sge_state *ss);
+int ipath_layer_set_piointbufavail_int(struct ipath_devdata *dd);
+int ipath_layer_get_boardname(struct ipath_devdata *dd, char *name,
+                             size_t namelen);
+int ipath_layer_snapshot_counters(struct ipath_devdata *dd, u64 *swords,
+                                 u64 *rwords, u64 *spkts, u64 *rpkts,
+                                 u64 *xmit_wait);
+int ipath_layer_get_counters(struct ipath_devdata *dd,
+                            struct ipath_layer_counters *cntrs);
+int ipath_layer_want_buffer(struct ipath_devdata *dd);
+int ipath_layer_set_guid(struct ipath_devdata *, __be64 guid);
+__be64 ipath_layer_get_guid(struct ipath_devdata *);
+u32 ipath_layer_get_nguid(struct ipath_devdata *);
+int ipath_layer_query_device(struct ipath_devdata *, u32 *vendor,
+                            u32 *boardrev, u32 *majrev, u32 *minrev);
+u32 ipath_layer_get_flags(struct ipath_devdata *dd);
+struct device *ipath_layer_get_device(struct ipath_devdata *dd);
+u16 ipath_layer_get_deviceid(struct ipath_devdata *dd);
+u64 ipath_layer_get_lastibcstat(struct ipath_devdata *dd);
+u32 ipath_layer_get_ibmtu(struct ipath_devdata *dd);
+int ipath_layer_enable_timer(struct ipath_devdata *dd);
+int ipath_layer_disable_timer(struct ipath_devdata *dd);
+int ipath_layer_set_verbs_flags(struct ipath_devdata *dd, unsigned flags);
+unsigned ipath_layer_get_npkeys(struct ipath_devdata *dd);
+unsigned ipath_layer_get_pkey(struct ipath_devdata *dd, unsigned index);
+int ipath_layer_get_pkeys(struct ipath_devdata *dd, u16 *pkeys);
+int ipath_layer_set_pkeys(struct ipath_devdata *dd, u16 *pkeys);
+int ipath_layer_get_linkdowndefaultstate(struct ipath_devdata *dd);
+int ipath_layer_set_linkdowndefaultstate(struct ipath_devdata *dd,
+                                        int sleep);
+int ipath_layer_get_phyerrthreshold(struct ipath_devdata *dd);
+int ipath_layer_set_phyerrthreshold(struct ipath_devdata *dd, unsigned n);
+int ipath_layer_get_overrunthreshold(struct ipath_devdata *dd);
+int ipath_layer_set_overrunthreshold(struct ipath_devdata *dd, unsigned n);
+u32 ipath_layer_get_rcvhdrentsize(struct ipath_devdata *dd);
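+
+/*
+ * Illustrative sketch only (assumed callback names, not a real client):
+ * a layered driver such as ipath_ether would register its callbacks and
+ * then open a unit roughly like this:
+ *
+ *      ipath_layer_register(eth_add, eth_remove, eth_intr, eth_rcv,
+ *                           ETH_OPCODE, eth_rcv_lid);
+ *      ipath_layer_open(dd, &pktmax);
+ *
+ * eth_* and ETH_OPCODE are hypothetical names; the core presumably
+ * invokes the l_add callback for each unit it knows about.
+ */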
+
+/* ipath_ether interrupt values */
+#define IPATH_LAYER_INT_IF_UP 0x2
+#define IPATH_LAYER_INT_IF_DOWN 0x4
+#define IPATH_LAYER_INT_LID 0x8
+#define IPATH_LAYER_INT_SEND_CONTINUE 0x10
+#define IPATH_LAYER_INT_BCAST 0x40
+
+/* _verbs_layer.l_flags */
+#define IPATH_VERBS_KERNEL_SMA 0x1
+
+extern unsigned ipath_debug; /* debugging bit mask */
+
+#endif                         /* _IPATH_LAYER_H */
diff --git a/drivers/infiniband/hw/ipath/ipath_mad.c b/drivers/infiniband/hw/ipath/ipath_mad.c
new file mode 100644 (file)
index 0000000..f7f8391
--- /dev/null
@@ -0,0 +1,1352 @@
+/*
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_smi.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+#include "ips_common.h"
+
+#define IB_SMP_UNSUP_VERSION   __constant_htons(0x0004)
+#define IB_SMP_UNSUP_METHOD    __constant_htons(0x0008)
+#define IB_SMP_UNSUP_METH_ATTR __constant_htons(0x000C)
+#define IB_SMP_INVALID_FIELD   __constant_htons(0x001C)
+
+static int reply(struct ib_smp *smp)
+{
+       /*
+        * The verbs framework will handle the directed/LID route
+        * packet changes.
+        */
+       smp->method = IB_MGMT_METHOD_GET_RESP;
+       if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+               smp->status |= IB_SMP_DIRECTION;
+       return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+static int recv_subn_get_nodedescription(struct ib_smp *smp,
+                                        struct ib_device *ibdev)
+{
+       if (smp->attr_mod)
+               smp->status |= IB_SMP_INVALID_FIELD;
+
+       strncpy(smp->data, ibdev->node_desc, sizeof(smp->data));
+
+       return reply(smp);
+}
+
+struct nodeinfo {
+       u8 base_version;
+       u8 class_version;
+       u8 node_type;
+       u8 num_ports;
+       __be64 sys_guid;
+       __be64 node_guid;
+       __be64 port_guid;
+       __be16 partition_cap;
+       __be16 device_id;
+       __be32 revision;
+       u8 local_port_num;
+       u8 vendor_id[3];
+} __attribute__ ((packed));
+
+static int recv_subn_get_nodeinfo(struct ib_smp *smp,
+                                 struct ib_device *ibdev, u8 port)
+{
+       struct nodeinfo *nip = (struct nodeinfo *)&smp->data;
+       struct ipath_devdata *dd = to_idev(ibdev)->dd;
+       u32 vendor, boardid, majrev, minrev;
+
+       if (smp->attr_mod)
+               smp->status |= IB_SMP_INVALID_FIELD;
+
+       nip->base_version = 1;
+       nip->class_version = 1;
+       nip->node_type = 1;     /* channel adapter */
+       /*
+        * XXX The num_ports value will need a layer function to get
+        * the value if we ever have more than one IB port on a chip.
+        * We will also need to get the GUID for the port.
+        */
+       nip->num_ports = ibdev->phys_port_cnt;
+       /* This is already in network order */
+       nip->sys_guid = to_idev(ibdev)->sys_image_guid;
+       nip->node_guid = ipath_layer_get_guid(dd);
+       nip->port_guid = nip->sys_guid;
+       nip->partition_cap = cpu_to_be16(ipath_layer_get_npkeys(dd));
+       nip->device_id = cpu_to_be16(ipath_layer_get_deviceid(dd));
+       ipath_layer_query_device(dd, &vendor, &boardid, &majrev, &minrev);
+       nip->revision = cpu_to_be32((majrev << 16) | minrev);
+       nip->local_port_num = port;
+       nip->vendor_id[0] = 0;
+       nip->vendor_id[1] = vendor >> 8;
+       nip->vendor_id[2] = vendor;
+
+       return reply(smp);
+}
+
+static int recv_subn_get_guidinfo(struct ib_smp *smp,
+                                 struct ib_device *ibdev)
+{
+       u32 startgx = 8 * be32_to_cpu(smp->attr_mod);
+       __be64 *p = (__be64 *) smp->data;
+
+       /* 32 blocks of 8 64-bit GUIDs per block */
+
+       memset(smp->data, 0, sizeof(smp->data));
+
+       /*
+        * We only support one GUID for now.  If this changes, the
+        * portinfo.guid_cap field needs to be updated too.
+        */
+       if (startgx == 0)
+               /* The first is a copy of the read-only HW GUID. */
+               *p = ipath_layer_get_guid(to_idev(ibdev)->dd);
+       else
+               smp->status |= IB_SMP_INVALID_FIELD;
+
+       return reply(smp);
+}
+
+struct port_info {
+       __be64 mkey;
+       __be64 gid_prefix;
+       __be16 lid;
+       __be16 sm_lid;
+       __be32 cap_mask;
+       __be16 diag_code;
+       __be16 mkey_lease_period;
+       u8 local_port_num;
+       u8 link_width_enabled;
+       u8 link_width_supported;
+       u8 link_width_active;
+       u8 linkspeed_portstate;                 /* 4 bits, 4 bits */
+       u8 portphysstate_linkdown;              /* 4 bits, 4 bits */
+       u8 mkeyprot_resv_lmc;                   /* 2 bits, 3, 3 */
+       u8 linkspeedactive_enabled;             /* 4 bits, 4 bits */
+       u8 neighbormtu_mastersmsl;              /* 4 bits, 4 bits */
+       u8 vlcap_inittype;                      /* 4 bits, 4 bits */
+       u8 vl_high_limit;
+       u8 vl_arb_high_cap;
+       u8 vl_arb_low_cap;
+       u8 inittypereply_mtucap;                /* 4 bits, 4 bits */
+       u8 vlstallcnt_hoqlife;                  /* 3 bits, 5 bits */
+       u8 operationalvl_pei_peo_fpi_fpo;       /* 4 bits, 1, 1, 1, 1 */
+       __be16 mkey_violations;
+       __be16 pkey_violations;
+       __be16 qkey_violations;
+       u8 guid_cap;
+       u8 clientrereg_resv_subnetto;           /* 1 bit, 2 bits, 5 */
+       u8 resv_resptimevalue;                  /* 3 bits, 5 bits */
+       u8 localphyerrors_overrunerrors;        /* 4 bits, 4 bits */
+       __be16 max_credit_hint;
+       u8 resv;
+       u8 link_roundtrip_latency[3];
+} __attribute__ ((packed));
+
+static int recv_subn_get_portinfo(struct ib_smp *smp,
+                                 struct ib_device *ibdev, u8 port)
+{
+       struct ipath_ibdev *dev;
+       struct port_info *pip = (struct port_info *)smp->data;
+       u16 lid;
+       u8 ibcstat;
+       u8 mtu;
+       int ret;
+
+       if (be32_to_cpu(smp->attr_mod) > ibdev->phys_port_cnt) {
+               smp->status |= IB_SMP_INVALID_FIELD;
+               ret = reply(smp);
+               goto bail;
+       }
+
+       dev = to_idev(ibdev);
+
+       /* Clear all fields.  Only set the non-zero fields. */
+       memset(smp->data, 0, sizeof(smp->data));
+
+       /* Only return the mkey if the protection field allows it. */
+       if (smp->method == IB_MGMT_METHOD_SET || dev->mkey == smp->mkey ||
+           (dev->mkeyprot_resv_lmc >> 6) == 0)
+               pip->mkey = dev->mkey;
+       pip->gid_prefix = dev->gid_prefix;
+       lid = ipath_layer_get_lid(dev->dd);
+       pip->lid = lid ? cpu_to_be16(lid) : IB_LID_PERMISSIVE;
+       pip->sm_lid = cpu_to_be16(dev->sm_lid);
+       pip->cap_mask = cpu_to_be32(dev->port_cap_flags);
+       /* pip->diag_code; */
+       pip->mkey_lease_period = cpu_to_be16(dev->mkey_lease_period);
+       pip->local_port_num = port;
+       pip->link_width_enabled = dev->link_width_enabled;
+       pip->link_width_supported = 3;  /* 1x or 4x */
+       pip->link_width_active = 2;     /* 4x */
+       pip->linkspeed_portstate = 0x10;        /* 2.5Gbps */
+       ibcstat = ipath_layer_get_lastibcstat(dev->dd);
+       pip->linkspeed_portstate |= ((ibcstat >> 4) & 0x3) + 1;
+       pip->portphysstate_linkdown =
+               (ipath_cvt_physportstate[ibcstat & 0xf] << 4) |
+               (ipath_layer_get_linkdowndefaultstate(dev->dd) ? 1 : 2);
+       pip->mkeyprot_resv_lmc = dev->mkeyprot_resv_lmc;
+       pip->linkspeedactive_enabled = 0x11;    /* 2.5Gbps, 2.5Gbps */
+       switch (ipath_layer_get_ibmtu(dev->dd)) {
+       case 4096:
+               mtu = IB_MTU_4096;
+               break;
+       case 2048:
+               mtu = IB_MTU_2048;
+               break;
+       case 1024:
+               mtu = IB_MTU_1024;
+               break;
+       case 512:
+               mtu = IB_MTU_512;
+               break;
+       case 256:
+               mtu = IB_MTU_256;
+               break;
+       default:                /* oops, something is wrong */
+               mtu = IB_MTU_2048;
+               break;
+       }
+       pip->neighbormtu_mastersmsl = (mtu << 4) | dev->sm_sl;
+       pip->vlcap_inittype = 0x10;     /* VLCap = VL0, InitType = 0 */
+       pip->vl_high_limit = dev->vl_high_limit;
+       /* pip->vl_arb_high_cap; // only one VL */
+       /* pip->vl_arb_low_cap; // only one VL */
+       /* InitTypeReply = 0 */
+       pip->inittypereply_mtucap = IB_MTU_4096;
+       /* HCAs ignore VLStallCount and HOQLife */
+       /* pip->vlstallcnt_hoqlife; */
+       pip->operationalvl_pei_peo_fpi_fpo = 0x10;      /* OVLs = 1 */
+       pip->mkey_violations = cpu_to_be16(dev->mkey_violations);
+       /* P_KeyViolations are counted by hardware. */
+       pip->pkey_violations =
+               cpu_to_be16((ipath_layer_get_cr_errpkey(dev->dd) -
+                            dev->n_pkey_violations) & 0xFFFF);
+       pip->qkey_violations = cpu_to_be16(dev->qkey_violations);
+       /* Only the hardware GUID is supported for now */
+       pip->guid_cap = 1;
+       pip->clientrereg_resv_subnetto = dev->subnet_timeout;
+       /* 32.768 usec. response time (guessing) */
+       pip->resv_resptimevalue = 3;
+       pip->localphyerrors_overrunerrors =
+               (ipath_layer_get_phyerrthreshold(dev->dd) << 4) |
+               ipath_layer_get_overrunthreshold(dev->dd);
+       /* pip->max_credit_hint; */
+       /* pip->link_roundtrip_latency[3]; */
+
+       ret = reply(smp);
+
+bail:
+       return ret;
+}
+
+static int recv_subn_get_pkeytable(struct ib_smp *smp,
+                                  struct ib_device *ibdev)
+{
+       u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff);
+       u16 *p = (u16 *) smp->data;
+       __be16 *q = (__be16 *) smp->data;
+
+       /* 64 blocks of 32 16-bit P_Key entries */
+
+       memset(smp->data, 0, sizeof(smp->data));
+       if (startpx == 0) {
+               struct ipath_ibdev *dev = to_idev(ibdev);
+               unsigned i, n = ipath_layer_get_npkeys(dev->dd);
+
+               ipath_layer_get_pkeys(dev->dd, p);
+
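+               /*
+                * p and q alias smp->data, so the conversion to network
+                * byte order below is done in place.
+                */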
+               for (i = 0; i < n; i++)
+                       q[i] = cpu_to_be16(p[i]);
+       } else
+               smp->status |= IB_SMP_INVALID_FIELD;
+
+       return reply(smp);
+}
+
+static int recv_subn_set_guidinfo(struct ib_smp *smp,
+                                 struct ib_device *ibdev)
+{
+       /* The only GUID we support is the first read-only entry. */
+       return recv_subn_get_guidinfo(smp, ibdev);
+}
+
+/**
+ * recv_subn_set_portinfo - set port information
+ * @smp: the incoming SM packet
+ * @ibdev: the infiniband device
+ * @port: the port on the device
+ *
+ * Set Portinfo (see ch. 14.2.5.6).
+ */
+static int recv_subn_set_portinfo(struct ib_smp *smp,
+                                 struct ib_device *ibdev, u8 port)
+{
+       struct port_info *pip = (struct port_info *)smp->data;
+       struct ib_event event;
+       struct ipath_ibdev *dev;
+       u32 flags;
+       char clientrereg = 0;
+       u16 lid, smlid;
+       u8 lwe;
+       u8 lse;
+       u8 state;
+       u16 lstate;
+       u32 mtu;
+       int ret;
+
+       if (be32_to_cpu(smp->attr_mod) > ibdev->phys_port_cnt)
+               goto err;
+
+       dev = to_idev(ibdev);
+       event.device = ibdev;
+       event.element.port_num = port;
+
+       dev->mkey = pip->mkey;
+       dev->gid_prefix = pip->gid_prefix;
+       dev->mkey_lease_period = be16_to_cpu(pip->mkey_lease_period);
+
+       lid = be16_to_cpu(pip->lid);
+       if (lid != ipath_layer_get_lid(dev->dd)) {
+               /* Must be a valid unicast LID address. */
+               if (lid == 0 || lid >= IPS_MULTICAST_LID_BASE)
+                       goto err;
+               ipath_set_sps_lid(dev->dd, lid, pip->mkeyprot_resv_lmc & 7);
+               event.event = IB_EVENT_LID_CHANGE;
+               ib_dispatch_event(&event);
+       }
+
+       smlid = be16_to_cpu(pip->sm_lid);
+       if (smlid != dev->sm_lid) {
+               /* Must be a valid unicast LID address. */
+               if (smlid == 0 || smlid >= IPS_MULTICAST_LID_BASE)
+                       goto err;
+               dev->sm_lid = smlid;
+               event.event = IB_EVENT_SM_CHANGE;
+               ib_dispatch_event(&event);
+       }
+
+       /* Only 4x supported but allow 1x or 4x to be set (see 14.2.6.6). */
+       lwe = pip->link_width_enabled;
+       if ((lwe >= 4 && lwe <= 8) || (lwe >= 0xC && lwe <= 0xFE))
+               goto err;
+       if (lwe == 0xFF)
+               dev->link_width_enabled = 3;    /* 1x or 4x */
+       else if (lwe)
+               dev->link_width_enabled = lwe;
+
+       /* Only 2.5 Gbps supported. */
+       lse = pip->linkspeedactive_enabled & 0xF;
+       if (lse >= 2 && lse <= 0xE)
+               goto err;
+
+       /* Set link down default state. */
+       switch (pip->portphysstate_linkdown & 0xF) {
+       case 0: /* NOP */
+               break;
+       case 1: /* SLEEP */
+               if (ipath_layer_set_linkdowndefaultstate(dev->dd, 1))
+                       goto err;
+               break;
+       case 2: /* POLL */
+               if (ipath_layer_set_linkdowndefaultstate(dev->dd, 0))
+                       goto err;
+               break;
+       default:
+               goto err;
+       }
+
+       dev->mkeyprot_resv_lmc = pip->mkeyprot_resv_lmc;
+       dev->vl_high_limit = pip->vl_high_limit;
+
+       switch ((pip->neighbormtu_mastersmsl >> 4) & 0xF) {
+       case IB_MTU_256:
+               mtu = 256;
+               break;
+       case IB_MTU_512:
+               mtu = 512;
+               break;
+       case IB_MTU_1024:
+               mtu = 1024;
+               break;
+       case IB_MTU_2048:
+               mtu = 2048;
+               break;
+       case IB_MTU_4096:
+               mtu = 4096;
+               break;
+       default:
+               /* XXX We have already partially updated our state! */
+               goto err;
+       }
+       ipath_layer_set_mtu(dev->dd, mtu);
+
+       dev->sm_sl = pip->neighbormtu_mastersmsl & 0xF;
+
+       /* We only support VL0 */
+       if (((pip->operationalvl_pei_peo_fpi_fpo >> 4) & 0xF) > 1)
+               goto err;
+
+       if (pip->mkey_violations == 0)
+               dev->mkey_violations = 0;
+
+       /*
+        * Hardware counter can't be reset so snapshot and subtract
+        * later.
+        */
+       if (pip->pkey_violations == 0)
+               dev->n_pkey_violations =
+                       ipath_layer_get_cr_errpkey(dev->dd);
+
+       if (pip->qkey_violations == 0)
+               dev->qkey_violations = 0;
+
+       if (ipath_layer_set_phyerrthreshold(
+                   dev->dd,
+                   (pip->localphyerrors_overrunerrors >> 4) & 0xF))
+               goto err;
+
+       if (ipath_layer_set_overrunthreshold(
+                   dev->dd,
+                   (pip->localphyerrors_overrunerrors & 0xF)))
+               goto err;
+
+       dev->subnet_timeout = pip->clientrereg_resv_subnetto & 0x1F;
+
+       if (pip->clientrereg_resv_subnetto & 0x80) {
+               clientrereg = 1;
+               event.event = IB_EVENT_LID_CHANGE;
+               ib_dispatch_event(&event);
+       }
+
+       /*
+        * Do the port state change now that the other link parameters
+        * have been set.
+        * Changing the port physical state only makes sense if the link
+        * is down or is being set to down.
+        */
+       state = pip->linkspeed_portstate & 0xF;
+       flags = ipath_layer_get_flags(dev->dd);
+       lstate = (pip->portphysstate_linkdown >> 4) & 0xF;
+       if (lstate && !(state == IB_PORT_DOWN || state == IB_PORT_NOP))
+               goto err;
+
+       /*
+        * Only state changes of DOWN, ARM, and ACTIVE are valid
+        * and must be in the correct state to take effect (see 7.2.6).
+        */
+       switch (state) {
+       case IB_PORT_NOP:
+               if (lstate == 0)
+                       break;
+               /* FALLTHROUGH */
+       case IB_PORT_DOWN:
+               if (lstate == 0) {
+                       if (ipath_layer_get_linkdowndefaultstate(dev->dd))
+                               lstate = IPATH_IB_LINKDOWN_SLEEP;
+                       else
+                               lstate = IPATH_IB_LINKDOWN;
+               } else if (lstate == 1)
+                       lstate = IPATH_IB_LINKDOWN_SLEEP;
+               else if (lstate == 2)
+                       lstate = IPATH_IB_LINKDOWN;
+               else if (lstate == 3)
+                       lstate = IPATH_IB_LINKDOWN_DISABLE;
+               else
+                       goto err;
+               ipath_layer_set_linkstate(dev->dd, lstate);
+               if (flags & IPATH_LINKACTIVE) {
+                       event.event = IB_EVENT_PORT_ERR;
+                       ib_dispatch_event(&event);
+               }
+               break;
+       case IB_PORT_ARMED:
+               if (!(flags & (IPATH_LINKINIT | IPATH_LINKACTIVE)))
+                       break;
+               ipath_layer_set_linkstate(dev->dd, IPATH_IB_LINKARM);
+               if (flags & IPATH_LINKACTIVE) {
+                       event.event = IB_EVENT_PORT_ERR;
+                       ib_dispatch_event(&event);
+               }
+               break;
+       case IB_PORT_ACTIVE:
+               if (!(flags & IPATH_LINKARMED))
+                       break;
+               ipath_layer_set_linkstate(dev->dd, IPATH_IB_LINKACTIVE);
+               event.event = IB_EVENT_PORT_ACTIVE;
+               ib_dispatch_event(&event);
+               break;
+       default:
+               /* XXX We have already partially updated our state! */
+               goto err;
+       }
+
+       ret = recv_subn_get_portinfo(smp, ibdev, port);
+
+       if (clientrereg)
+               pip->clientrereg_resv_subnetto |= 0x80;
+
+       goto done;
+
+err:
+       smp->status |= IB_SMP_INVALID_FIELD;
+       ret = recv_subn_get_portinfo(smp, ibdev, port);
+
+done:
+       return ret;
+}
+
+static int recv_subn_set_pkeytable(struct ib_smp *smp,
+                                  struct ib_device *ibdev)
+{
+       u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff);
+       __be16 *p = (__be16 *) smp->data;
+       u16 *q = (u16 *) smp->data;
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       unsigned i, n = ipath_layer_get_npkeys(dev->dd);
+
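+       /* p and q alias smp->data, so the swap to CPU order is done in place. */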
+       for (i = 0; i < n; i++)
+               q[i] = be16_to_cpu(p[i]);
+
+       if (startpx != 0 ||
+           ipath_layer_set_pkeys(dev->dd, q) != 0)
+               smp->status |= IB_SMP_INVALID_FIELD;
+
+       return recv_subn_get_pkeytable(smp, ibdev);
+}
+
+#define IB_PMA_CLASS_PORT_INFO         __constant_htons(0x0001)
+#define IB_PMA_PORT_SAMPLES_CONTROL    __constant_htons(0x0010)
+#define IB_PMA_PORT_SAMPLES_RESULT     __constant_htons(0x0011)
+#define IB_PMA_PORT_COUNTERS           __constant_htons(0x0012)
+#define IB_PMA_PORT_COUNTERS_EXT       __constant_htons(0x001D)
+#define IB_PMA_PORT_SAMPLES_RESULT_EXT __constant_htons(0x001E)
+
+struct ib_perf {
+       u8 base_version;
+       u8 mgmt_class;
+       u8 class_version;
+       u8 method;
+       __be16 status;
+       __be16 unused;
+       __be64 tid;
+       __be16 attr_id;
+       __be16 resv;
+       __be32 attr_mod;
+       u8 reserved[40];
+       u8 data[192];
+} __attribute__ ((packed));
+
+struct ib_pma_classportinfo {
+       u8 base_version;
+       u8 class_version;
+       __be16 cap_mask;
+       u8 reserved[3];
+       u8 resp_time_value;     /* only lower 5 bits */
+       union ib_gid redirect_gid;
+       __be32 redirect_tc_sl_fl;       /* 8, 4, 20 bits respectively */
+       __be16 redirect_lid;
+       __be16 redirect_pkey;
+       __be32 redirect_qp;     /* only lower 24 bits */
+       __be32 redirect_qkey;
+       union ib_gid trap_gid;
+       __be32 trap_tc_sl_fl;   /* 8, 4, 20 bits respectively */
+       __be16 trap_lid;
+       __be16 trap_pkey;
+       __be32 trap_hl_qp;      /* 8, 24 bits respectively */
+       __be32 trap_qkey;
+} __attribute__ ((packed));
+
+struct ib_pma_portsamplescontrol {
+       u8 opcode;
+       u8 port_select;
+       u8 tick;
+       u8 counter_width;       /* only lower 3 bits */
+       __be32 counter_mask0_9; /* 2, 10 * 3, bits */
+       __be16 counter_mask10_14;       /* 1, 5 * 3, bits */
+       u8 sample_mechanisms;
+       u8 sample_status;       /* only lower 2 bits */
+       __be64 option_mask;
+       __be64 vendor_mask;
+       __be32 sample_start;
+       __be32 sample_interval;
+       __be16 tag;
+       __be16 counter_select[15];
+} __attribute__ ((packed));
+
+struct ib_pma_portsamplesresult {
+       __be16 tag;
+       __be16 sample_status;   /* only lower 2 bits */
+       __be32 counter[15];
+} __attribute__ ((packed));
+
+struct ib_pma_portsamplesresult_ext {
+       __be16 tag;
+       __be16 sample_status;   /* only lower 2 bits */
+       __be32 extended_width;  /* only upper 2 bits */
+       __be64 counter[15];
+} __attribute__ ((packed));
+
+struct ib_pma_portcounters {
+       u8 reserved;
+       u8 port_select;
+       __be16 counter_select;
+       __be16 symbol_error_counter;
+       u8 link_error_recovery_counter;
+       u8 link_downed_counter;
+       __be16 port_rcv_errors;
+       __be16 port_rcv_remphys_errors;
+       __be16 port_rcv_switch_relay_errors;
+       __be16 port_xmit_discards;
+       u8 port_xmit_constraint_errors;
+       u8 port_rcv_constraint_errors;
+       u8 reserved1;
+       u8 lli_ebor_errors;     /* 4, 4, bits */
+       __be16 reserved2;
+       __be16 vl15_dropped;
+       __be32 port_xmit_data;
+       __be32 port_rcv_data;
+       __be32 port_xmit_packets;
+       __be32 port_rcv_packets;
+} __attribute__ ((packed));
+
+#define IB_PMA_SEL_SYMBOL_ERROR                        __constant_htons(0x0001)
+#define IB_PMA_SEL_LINK_ERROR_RECOVERY         __constant_htons(0x0002)
+#define IB_PMA_SEL_LINK_DOWNED                 __constant_htons(0x0004)
+#define IB_PMA_SEL_PORT_RCV_ERRORS             __constant_htons(0x0008)
+#define IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS     __constant_htons(0x0010)
+#define IB_PMA_SEL_PORT_XMIT_DISCARDS          __constant_htons(0x0040)
+#define IB_PMA_SEL_PORT_XMIT_DATA              __constant_htons(0x1000)
+#define IB_PMA_SEL_PORT_RCV_DATA               __constant_htons(0x2000)
+#define IB_PMA_SEL_PORT_XMIT_PACKETS           __constant_htons(0x4000)
+#define IB_PMA_SEL_PORT_RCV_PACKETS            __constant_htons(0x8000)
+
+struct ib_pma_portcounters_ext {
+       u8 reserved;
+       u8 port_select;
+       __be16 counter_select;
+       __be32 reserved1;
+       __be64 port_xmit_data;
+       __be64 port_rcv_data;
+       __be64 port_xmit_packets;
+       __be64 port_rcv_packets;
+       __be64 port_unicast_xmit_packets;
+       __be64 port_unicast_rcv_packets;
+       __be64 port_multicast_xmit_packets;
+       __be64 port_multicast_rcv_packets;
+} __attribute__ ((packed));
+
+#define IB_PMA_SELX_PORT_XMIT_DATA             __constant_htons(0x0001)
+#define IB_PMA_SELX_PORT_RCV_DATA              __constant_htons(0x0002)
+#define IB_PMA_SELX_PORT_XMIT_PACKETS          __constant_htons(0x0004)
+#define IB_PMA_SELX_PORT_RCV_PACKETS           __constant_htons(0x0008)
+#define IB_PMA_SELX_PORT_UNI_XMIT_PACKETS      __constant_htons(0x0010)
+#define IB_PMA_SELX_PORT_UNI_RCV_PACKETS       __constant_htons(0x0020)
+#define IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS    __constant_htons(0x0040)
+#define IB_PMA_SELX_PORT_MULTI_RCV_PACKETS     __constant_htons(0x0080)
+
+static int recv_pma_get_classportinfo(struct ib_perf *pmp)
+{
+       struct ib_pma_classportinfo *p =
+               (struct ib_pma_classportinfo *)pmp->data;
+
+       memset(pmp->data, 0, sizeof(pmp->data));
+
+       if (pmp->attr_mod != 0)
+               pmp->status |= IB_SMP_INVALID_FIELD;
+
+       /* Indicate AllPortSelect is valid (only one port anyway) */
+       p->cap_mask = __constant_cpu_to_be16(1 << 8);
+       p->base_version = 1;
+       p->class_version = 1;
+       /*
+        * Expected response time is 4.096 usec. * 2^18 == 1.073741824
+        * sec.
+        */
+       p->resp_time_value = 18;
+
+       return reply((struct ib_smp *) pmp);
+}
+
+/*
+ * The PortSamplesControl.CounterMasks field is an array of 3 bit fields
+ * which specify the N'th counter's capabilities. See ch. 16.1.3.2.
+ * We support 5 counters which only count the mandatory quantities.
+ */
+#define COUNTER_MASK(q, n) (q << ((9 - n) * 3))
+#define COUNTER_MASK0_9 \
+       __constant_cpu_to_be32(COUNTER_MASK(1, 0) | \
+                              COUNTER_MASK(1, 1) | \
+                              COUNTER_MASK(1, 2) | \
+                              COUNTER_MASK(1, 3) | \
+                              COUNTER_MASK(1, 4))
+
+static int recv_pma_get_portsamplescontrol(struct ib_perf *pmp,
+                                          struct ib_device *ibdev, u8 port)
+{
+       struct ib_pma_portsamplescontrol *p =
+               (struct ib_pma_portsamplescontrol *)pmp->data;
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       unsigned long flags;
+       u8 port_select = p->port_select;
+
+       memset(pmp->data, 0, sizeof(pmp->data));
+
+       p->port_select = port_select;
+       if (pmp->attr_mod != 0 ||
+           (port_select != port && port_select != 0xFF))
+               pmp->status |= IB_SMP_INVALID_FIELD;
+       /*
+        * Ticks are 10x the link transfer period which for 2.5Gbs is 4
+        * nsec.  0 == 4 nsec., 1 == 8 nsec., ..., 255 == 1020 nsec.  Sample
+        * intervals are counted in ticks.  Since we use Linux timers, that
+        * count in jiffies, we can't sample for less than 1000 ticks if HZ
+        * == 1000 (4000 ticks if HZ is 250).
+        */
+       /* XXX This is WRONG. */
+       p->tick = 250;          /* 1 usec. */
+       p->counter_width = 4;   /* 32 bit counters */
+       p->counter_mask0_9 = COUNTER_MASK0_9;
+       spin_lock_irqsave(&dev->pending_lock, flags);
+       p->sample_status = dev->pma_sample_status;
+       p->sample_start = cpu_to_be32(dev->pma_sample_start);
+       p->sample_interval = cpu_to_be32(dev->pma_sample_interval);
+       p->tag = cpu_to_be16(dev->pma_tag);
+       p->counter_select[0] = dev->pma_counter_select[0];
+       p->counter_select[1] = dev->pma_counter_select[1];
+       p->counter_select[2] = dev->pma_counter_select[2];
+       p->counter_select[3] = dev->pma_counter_select[3];
+       p->counter_select[4] = dev->pma_counter_select[4];
+       spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+       return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_set_portsamplescontrol(struct ib_perf *pmp,
+                                          struct ib_device *ibdev, u8 port)
+{
+       struct ib_pma_portsamplescontrol *p =
+               (struct ib_pma_portsamplescontrol *)pmp->data;
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       unsigned long flags;
+       u32 start;
+       int ret;
+
+       if (pmp->attr_mod != 0 ||
+           (p->port_select != port && p->port_select != 0xFF)) {
+               pmp->status |= IB_SMP_INVALID_FIELD;
+               ret = reply((struct ib_smp *) pmp);
+               goto bail;
+       }
+
+       start = be32_to_cpu(p->sample_start);
+       if (start != 0) {
+               spin_lock_irqsave(&dev->pending_lock, flags);
+               if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_DONE) {
+                       dev->pma_sample_status =
+                               IB_PMA_SAMPLE_STATUS_STARTED;
+                       dev->pma_sample_start = start;
+                       dev->pma_sample_interval =
+                               be32_to_cpu(p->sample_interval);
+                       dev->pma_tag = be16_to_cpu(p->tag);
+                       if (p->counter_select[0])
+                               dev->pma_counter_select[0] =
+                                       p->counter_select[0];
+                       if (p->counter_select[1])
+                               dev->pma_counter_select[1] =
+                                       p->counter_select[1];
+                       if (p->counter_select[2])
+                               dev->pma_counter_select[2] =
+                                       p->counter_select[2];
+                       if (p->counter_select[3])
+                               dev->pma_counter_select[3] =
+                                       p->counter_select[3];
+                       if (p->counter_select[4])
+                               dev->pma_counter_select[4] =
+                                       p->counter_select[4];
+               }
+               spin_unlock_irqrestore(&dev->pending_lock, flags);
+       }
+       ret = recv_pma_get_portsamplescontrol(pmp, ibdev, port);
+
+bail:
+       return ret;
+}
+
+static u64 get_counter(struct ipath_ibdev *dev, __be16 sel)
+{
+       u64 ret;
+
+       switch (sel) {
+       case IB_PMA_PORT_XMIT_DATA:
+               ret = dev->ipath_sword;
+               break;
+       case IB_PMA_PORT_RCV_DATA:
+               ret = dev->ipath_rword;
+               break;
+       case IB_PMA_PORT_XMIT_PKTS:
+               ret = dev->ipath_spkts;
+               break;
+       case IB_PMA_PORT_RCV_PKTS:
+               ret = dev->ipath_rpkts;
+               break;
+       case IB_PMA_PORT_XMIT_WAIT:
+               ret = dev->ipath_xmit_wait;
+               break;
+       default:
+               ret = 0;
+       }
+
+       return ret;
+}
+
+static int recv_pma_get_portsamplesresult(struct ib_perf *pmp,
+                                         struct ib_device *ibdev)
+{
+       struct ib_pma_portsamplesresult *p =
+               (struct ib_pma_portsamplesresult *)pmp->data;
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       int i;
+
+       memset(pmp->data, 0, sizeof(pmp->data));
+       p->tag = cpu_to_be16(dev->pma_tag);
+       p->sample_status = cpu_to_be16(dev->pma_sample_status);
+       for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++)
+               p->counter[i] = cpu_to_be32(
+                       get_counter(dev, dev->pma_counter_select[i]));
+
+       return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_get_portsamplesresult_ext(struct ib_perf *pmp,
+                                             struct ib_device *ibdev)
+{
+       struct ib_pma_portsamplesresult_ext *p =
+               (struct ib_pma_portsamplesresult_ext *)pmp->data;
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       int i;
+
+       memset(pmp->data, 0, sizeof(pmp->data));
+       p->tag = cpu_to_be16(dev->pma_tag);
+       p->sample_status = cpu_to_be16(dev->pma_sample_status);
+       /* 64 bits */
+       p->extended_width = __constant_cpu_to_be32(0x80000000);
+       for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++)
+               p->counter[i] = cpu_to_be64(
+                       get_counter(dev, dev->pma_counter_select[i]));
+
+       return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_get_portcounters(struct ib_perf *pmp,
+                                    struct ib_device *ibdev, u8 port)
+{
+       struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
+               pmp->data;
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       struct ipath_layer_counters cntrs;
+       u8 port_select = p->port_select;
+
+       ipath_layer_get_counters(dev->dd, &cntrs);
+
+       /* Adjust counters for any resets done. */
+       cntrs.symbol_error_counter -= dev->n_symbol_error_counter;
+       cntrs.link_error_recovery_counter -=
+               dev->n_link_error_recovery_counter;
+       cntrs.link_downed_counter -= dev->n_link_downed_counter;
+       cntrs.port_rcv_errors += dev->rcv_errors;
+       cntrs.port_rcv_errors -= dev->n_port_rcv_errors;
+       cntrs.port_rcv_remphys_errors -= dev->n_port_rcv_remphys_errors;
+       cntrs.port_xmit_discards -= dev->n_port_xmit_discards;
+       cntrs.port_xmit_data -= dev->n_port_xmit_data;
+       cntrs.port_rcv_data -= dev->n_port_rcv_data;
+       cntrs.port_xmit_packets -= dev->n_port_xmit_packets;
+       cntrs.port_rcv_packets -= dev->n_port_rcv_packets;
+
+       memset(pmp->data, 0, sizeof(pmp->data));
+
+       p->port_select = port_select;
+       if (pmp->attr_mod != 0 ||
+           (port_select != port && port_select != 0xFF))
+               pmp->status |= IB_SMP_INVALID_FIELD;
+
+       if (cntrs.symbol_error_counter > 0xFFFFUL)
+               p->symbol_error_counter = __constant_cpu_to_be16(0xFFFF);
+       else
+               p->symbol_error_counter =
+                       cpu_to_be16((u16)cntrs.symbol_error_counter);
+       if (cntrs.link_error_recovery_counter > 0xFFUL)
+               p->link_error_recovery_counter = 0xFF;
+       else
+               p->link_error_recovery_counter =
+                       (u8)cntrs.link_error_recovery_counter;
+       if (cntrs.link_downed_counter > 0xFFUL)
+               p->link_downed_counter = 0xFF;
+       else
+               p->link_downed_counter = (u8)cntrs.link_downed_counter;
+       if (cntrs.port_rcv_errors > 0xFFFFUL)
+               p->port_rcv_errors = __constant_cpu_to_be16(0xFFFF);
+       else
+               p->port_rcv_errors =
+                       cpu_to_be16((u16) cntrs.port_rcv_errors);
+       if (cntrs.port_rcv_remphys_errors > 0xFFFFUL)
+               p->port_rcv_remphys_errors = __constant_cpu_to_be16(0xFFFF);
+       else
+               p->port_rcv_remphys_errors =
+                       cpu_to_be16((u16)cntrs.port_rcv_remphys_errors);
+       if (cntrs.port_xmit_discards > 0xFFFFUL)
+               p->port_xmit_discards = __constant_cpu_to_be16(0xFFFF);
+       else
+               p->port_xmit_discards =
+                       cpu_to_be16((u16)cntrs.port_xmit_discards);
+       if (cntrs.port_xmit_data > 0xFFFFFFFFUL)
+               p->port_xmit_data = __constant_cpu_to_be32(0xFFFFFFFF);
+       else
+               p->port_xmit_data = cpu_to_be32((u32)cntrs.port_xmit_data);
+       if (cntrs.port_rcv_data > 0xFFFFFFFFUL)
+               p->port_rcv_data = __constant_cpu_to_be32(0xFFFFFFFF);
+       else
+               p->port_rcv_data = cpu_to_be32((u32)cntrs.port_rcv_data);
+       if (cntrs.port_xmit_packets > 0xFFFFFFFFUL)
+               p->port_xmit_packets = __constant_cpu_to_be32(0xFFFFFFFF);
+       else
+               p->port_xmit_packets =
+                       cpu_to_be32((u32)cntrs.port_xmit_packets);
+       if (cntrs.port_rcv_packets > 0xFFFFFFFFUL)
+               p->port_rcv_packets = __constant_cpu_to_be32(0xFFFFFFFF);
+       else
+               p->port_rcv_packets =
+                       cpu_to_be32((u32) cntrs.port_rcv_packets);
+
+       return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_get_portcounters_ext(struct ib_perf *pmp,
+                                        struct ib_device *ibdev, u8 port)
+{
+       struct ib_pma_portcounters_ext *p =
+               (struct ib_pma_portcounters_ext *)pmp->data;
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       u64 swords, rwords, spkts, rpkts, xwait;
+       u8 port_select = p->port_select;
+
+       ipath_layer_snapshot_counters(dev->dd, &swords, &rwords, &spkts,
+                                     &rpkts, &xwait);
+
+       /* Adjust counters for any resets done. */
+       swords -= dev->n_port_xmit_data;
+       rwords -= dev->n_port_rcv_data;
+       spkts -= dev->n_port_xmit_packets;
+       rpkts -= dev->n_port_rcv_packets;
+
+       memset(pmp->data, 0, sizeof(pmp->data));
+
+       p->port_select = port_select;
+       if (pmp->attr_mod != 0 ||
+           (port_select != port && port_select != 0xFF))
+               pmp->status |= IB_SMP_INVALID_FIELD;
+
+       p->port_xmit_data = cpu_to_be64(swords);
+       p->port_rcv_data = cpu_to_be64(rwords);
+       p->port_xmit_packets = cpu_to_be64(spkts);
+       p->port_rcv_packets = cpu_to_be64(rpkts);
+       p->port_unicast_xmit_packets = cpu_to_be64(dev->n_unicast_xmit);
+       p->port_unicast_rcv_packets = cpu_to_be64(dev->n_unicast_rcv);
+       p->port_multicast_xmit_packets = cpu_to_be64(dev->n_multicast_xmit);
+       p->port_multicast_rcv_packets = cpu_to_be64(dev->n_multicast_rcv);
+
+       return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_set_portcounters(struct ib_perf *pmp,
+                                    struct ib_device *ibdev, u8 port)
+{
+       struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
+               pmp->data;
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       struct ipath_layer_counters cntrs;
+
+       /*
+        * Since the HW doesn't support clearing counters, we save the
+        * current count and subtract it from future responses.
+        */
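+       /*
+        * For example, a Set selecting IB_PMA_SEL_PORT_XMIT_DATA records
+        * the current cntrs.port_xmit_data in dev->n_port_xmit_data below;
+        * recv_pma_get_portcounters() then reports the difference
+        * cntrs.port_xmit_data - dev->n_port_xmit_data, so the counter
+        * appears to have been cleared.
+        */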
+       ipath_layer_get_counters(dev->dd, &cntrs);
+
+       if (p->counter_select & IB_PMA_SEL_SYMBOL_ERROR)
+               dev->n_symbol_error_counter = cntrs.symbol_error_counter;
+
+       if (p->counter_select & IB_PMA_SEL_LINK_ERROR_RECOVERY)
+               dev->n_link_error_recovery_counter =
+                       cntrs.link_error_recovery_counter;
+
+       if (p->counter_select & IB_PMA_SEL_LINK_DOWNED)
+               dev->n_link_downed_counter = cntrs.link_downed_counter;
+
+       if (p->counter_select & IB_PMA_SEL_PORT_RCV_ERRORS)
+               dev->n_port_rcv_errors =
+                       cntrs.port_rcv_errors + dev->rcv_errors;
+
+       if (p->counter_select & IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS)
+               dev->n_port_rcv_remphys_errors =
+                       cntrs.port_rcv_remphys_errors;
+
+       if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DISCARDS)
+               dev->n_port_xmit_discards = cntrs.port_xmit_discards;
+
+       if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DATA)
+               dev->n_port_xmit_data = cntrs.port_xmit_data;
+
+       if (p->counter_select & IB_PMA_SEL_PORT_RCV_DATA)
+               dev->n_port_rcv_data = cntrs.port_rcv_data;
+
+       if (p->counter_select & IB_PMA_SEL_PORT_XMIT_PACKETS)
+               dev->n_port_xmit_packets = cntrs.port_xmit_packets;
+
+       if (p->counter_select & IB_PMA_SEL_PORT_RCV_PACKETS)
+               dev->n_port_rcv_packets = cntrs.port_rcv_packets;
+
+       return recv_pma_get_portcounters(pmp, ibdev, port);
+}
+
+static int recv_pma_set_portcounters_ext(struct ib_perf *pmp,
+                                        struct ib_device *ibdev, u8 port)
+{
+       struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
+               pmp->data;
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       u64 swords, rwords, spkts, rpkts, xwait;
+
+       ipath_layer_snapshot_counters(dev->dd, &swords, &rwords, &spkts,
+                                     &rpkts, &xwait);
+
+       if (p->counter_select & IB_PMA_SELX_PORT_XMIT_DATA)
+               dev->n_port_xmit_data = swords;
+
+       if (p->counter_select & IB_PMA_SELX_PORT_RCV_DATA)
+               dev->n_port_rcv_data = rwords;
+
+       if (p->counter_select & IB_PMA_SELX_PORT_XMIT_PACKETS)
+               dev->n_port_xmit_packets = spkts;
+
+       if (p->counter_select & IB_PMA_SELX_PORT_RCV_PACKETS)
+               dev->n_port_rcv_packets = rpkts;
+
+       if (p->counter_select & IB_PMA_SELX_PORT_UNI_XMIT_PACKETS)
+               dev->n_unicast_xmit = 0;
+
+       if (p->counter_select & IB_PMA_SELX_PORT_UNI_RCV_PACKETS)
+               dev->n_unicast_rcv = 0;
+
+       if (p->counter_select & IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS)
+               dev->n_multicast_xmit = 0;
+
+       if (p->counter_select & IB_PMA_SELX_PORT_MULTI_RCV_PACKETS)
+               dev->n_multicast_rcv = 0;
+
+       return recv_pma_get_portcounters_ext(pmp, ibdev, port);
+}
+
+static int process_subn(struct ib_device *ibdev, int mad_flags,
+                       u8 port_num, struct ib_mad *in_mad,
+                       struct ib_mad *out_mad)
+{
+       struct ib_smp *smp = (struct ib_smp *)out_mad;
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       int ret;
+
+       *out_mad = *in_mad;
+       if (smp->class_version != 1) {
+               smp->status |= IB_SMP_UNSUP_VERSION;
+               ret = reply(smp);
+               goto bail;
+       }
+
+       /* Is the mkey in the process of expiring? */
+       if (dev->mkey_lease_timeout && jiffies >= dev->mkey_lease_timeout) {
+               /* Clear timeout and mkey protection field. */
+               dev->mkey_lease_timeout = 0;
+               dev->mkeyprot_resv_lmc &= 0x3F;
+       }
+
+       /*
+        * M_Key checking depends on
+        * Portinfo:M_Key_protect_bits
+        */
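+       /*
+        * Concretely, the check below fails a Set whenever a nonzero mkey
+        * does not match, but fails a Get only when the high M_Key protect
+        * bit (dev->mkeyprot_resv_lmc >> 7) is also set.  Violations are
+        * counted (saturating at 0xFFFF), and the lease timeout is started
+        * if a lease period is configured and no timeout is running yet.
+        */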
+       if ((mad_flags & IB_MAD_IGNORE_MKEY) == 0 && dev->mkey != 0 &&
+           dev->mkey != smp->mkey &&
+           (smp->method == IB_MGMT_METHOD_SET ||
+            (smp->method == IB_MGMT_METHOD_GET &&
+             (dev->mkeyprot_resv_lmc >> 7) != 0))) {
+               if (dev->mkey_violations != 0xFFFF)
+                       ++dev->mkey_violations;
+               if (dev->mkey_lease_timeout ||
+                   dev->mkey_lease_period == 0) {
+                       ret = IB_MAD_RESULT_SUCCESS |
+                               IB_MAD_RESULT_CONSUMED;
+                       goto bail;
+               }
+               dev->mkey_lease_timeout = jiffies +
+                       dev->mkey_lease_period * HZ;
+               /* Future: Generate a trap notice. */
+               ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+               goto bail;
+       } else if (dev->mkey_lease_timeout)
+               dev->mkey_lease_timeout = 0;
+
+       switch (smp->method) {
+       case IB_MGMT_METHOD_GET:
+               switch (smp->attr_id) {
+               case IB_SMP_ATTR_NODE_DESC:
+                       ret = recv_subn_get_nodedescription(smp, ibdev);
+                       goto bail;
+               case IB_SMP_ATTR_NODE_INFO:
+                       ret = recv_subn_get_nodeinfo(smp, ibdev, port_num);
+                       goto bail;
+               case IB_SMP_ATTR_GUID_INFO:
+                       ret = recv_subn_get_guidinfo(smp, ibdev);
+                       goto bail;
+               case IB_SMP_ATTR_PORT_INFO:
+                       ret = recv_subn_get_portinfo(smp, ibdev, port_num);
+                       goto bail;
+               case IB_SMP_ATTR_PKEY_TABLE:
+                       ret = recv_subn_get_pkeytable(smp, ibdev);
+                       goto bail;
+               case IB_SMP_ATTR_SM_INFO:
+                       if (dev->port_cap_flags & IB_PORT_SM_DISABLED) {
+                               ret = IB_MAD_RESULT_SUCCESS |
+                                       IB_MAD_RESULT_CONSUMED;
+                               goto bail;
+                       }
+                       if (dev->port_cap_flags & IB_PORT_SM) {
+                               ret = IB_MAD_RESULT_SUCCESS;
+                               goto bail;
+                       }
+                       /* FALLTHROUGH */
+               default:
+                       smp->status |= IB_SMP_UNSUP_METH_ATTR;
+                       ret = reply(smp);
+                       goto bail;
+               }
+
+       case IB_MGMT_METHOD_SET:
+               switch (smp->attr_id) {
+               case IB_SMP_ATTR_GUID_INFO:
+                       ret = recv_subn_set_guidinfo(smp, ibdev);
+                       goto bail;
+               case IB_SMP_ATTR_PORT_INFO:
+                       ret = recv_subn_set_portinfo(smp, ibdev, port_num);
+                       goto bail;
+               case IB_SMP_ATTR_PKEY_TABLE:
+                       ret = recv_subn_set_pkeytable(smp, ibdev);
+                       goto bail;
+               case IB_SMP_ATTR_SM_INFO:
+                       if (dev->port_cap_flags & IB_PORT_SM_DISABLED) {
+                               ret = IB_MAD_RESULT_SUCCESS |
+                                       IB_MAD_RESULT_CONSUMED;
+                               goto bail;
+                       }
+                       if (dev->port_cap_flags & IB_PORT_SM) {
+                               ret = IB_MAD_RESULT_SUCCESS;
+                               goto bail;
+                       }
+                       /* FALLTHROUGH */
+               default:
+                       smp->status |= IB_SMP_UNSUP_METH_ATTR;
+                       ret = reply(smp);
+                       goto bail;
+               }
+
+       case IB_MGMT_METHOD_GET_RESP:
+               /*
+                * The ib_mad module will call us to process responses
+                * before checking for other consumers.
+                * Just tell the caller to process it normally.
+                */
+               ret = IB_MAD_RESULT_FAILURE;
+               goto bail;
+       default:
+               smp->status |= IB_SMP_UNSUP_METHOD;
+               ret = reply(smp);
+       }
+
+bail:
+       return ret;
+}
+
+static int process_perf(struct ib_device *ibdev, u8 port_num,
+                       struct ib_mad *in_mad,
+                       struct ib_mad *out_mad)
+{
+       struct ib_perf *pmp = (struct ib_perf *)out_mad;
+       int ret;
+
+       *out_mad = *in_mad;
+       if (pmp->class_version != 1) {
+               pmp->status |= IB_SMP_UNSUP_VERSION;
+               ret = reply((struct ib_smp *) pmp);
+               goto bail;
+       }
+
+       switch (pmp->method) {
+       case IB_MGMT_METHOD_GET:
+               switch (pmp->attr_id) {
+               case IB_PMA_CLASS_PORT_INFO:
+                       ret = recv_pma_get_classportinfo(pmp);
+                       goto bail;
+               case IB_PMA_PORT_SAMPLES_CONTROL:
+                       ret = recv_pma_get_portsamplescontrol(pmp, ibdev,
+                                                             port_num);
+                       goto bail;
+               case IB_PMA_PORT_SAMPLES_RESULT:
+                       ret = recv_pma_get_portsamplesresult(pmp, ibdev);
+                       goto bail;
+               case IB_PMA_PORT_SAMPLES_RESULT_EXT:
+                       ret = recv_pma_get_portsamplesresult_ext(pmp,
+                                                                ibdev);
+                       goto bail;
+               case IB_PMA_PORT_COUNTERS:
+                       ret = recv_pma_get_portcounters(pmp, ibdev,
+                                                       port_num);
+                       goto bail;
+               case IB_PMA_PORT_COUNTERS_EXT:
+                       ret = recv_pma_get_portcounters_ext(pmp, ibdev,
+                                                           port_num);
+                       goto bail;
+               default:
+                       pmp->status |= IB_SMP_UNSUP_METH_ATTR;
+                       ret = reply((struct ib_smp *) pmp);
+                       goto bail;
+               }
+
+       case IB_MGMT_METHOD_SET:
+               switch (pmp->attr_id) {
+               case IB_PMA_PORT_SAMPLES_CONTROL:
+                       ret = recv_pma_set_portsamplescontrol(pmp, ibdev,
+                                                             port_num);
+                       goto bail;
+               case IB_PMA_PORT_COUNTERS:
+                       ret = recv_pma_set_portcounters(pmp, ibdev,
+                                                       port_num);
+                       goto bail;
+               case IB_PMA_PORT_COUNTERS_EXT:
+                       ret = recv_pma_set_portcounters_ext(pmp, ibdev,
+                                                           port_num);
+                       goto bail;
+               default:
+                       pmp->status |= IB_SMP_UNSUP_METH_ATTR;
+                       ret = reply((struct ib_smp *) pmp);
+                       goto bail;
+               }
+
+       case IB_MGMT_METHOD_GET_RESP:
+               /*
+                * The ib_mad module will call us to process responses
+                * before checking for other consumers.
+                * Just tell the caller to process it normally.
+                */
+               ret = IB_MAD_RESULT_FAILURE;
+               goto bail;
+       default:
+               pmp->status |= IB_SMP_UNSUP_METHOD;
+               ret = reply((struct ib_smp *) pmp);
+       }
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_process_mad - process an incoming MAD packet
+ * @ibdev: the infiniband device this packet came in on
+ * @mad_flags: MAD flags
+ * @port_num: the port number this packet came in on
+ * @in_wc: the work completion entry for this packet
+ * @in_grh: the global route header for this packet
+ * @in_mad: the incoming MAD
+ * @out_mad: any outgoing MAD reply
+ *
+ * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not
+ * interested in processing.
+ *
+ * Note that the verbs framework has already done the MAD sanity checks,
+ * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+ * MADs.
+ *
+ * This is called by the ib_mad module.
+ */
+int ipath_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+                     struct ib_wc *in_wc, struct ib_grh *in_grh,
+                     struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       int ret;
+
+       /*
+        * Snapshot current HW counters to "clear" them.
+        * This should be done when the driver is loaded except that for
+        * some reason we get a zillion errors when bringing up the link.
+        */
+       if (dev->rcv_errors == 0) {
+               struct ipath_layer_counters cntrs;
+
+               ipath_layer_get_counters(to_idev(ibdev)->dd, &cntrs);
+               dev->rcv_errors++;
+               dev->n_symbol_error_counter = cntrs.symbol_error_counter;
+               dev->n_link_error_recovery_counter =
+                       cntrs.link_error_recovery_counter;
+               dev->n_link_downed_counter = cntrs.link_downed_counter;
+               dev->n_port_rcv_errors = cntrs.port_rcv_errors + 1;
+               dev->n_port_rcv_remphys_errors =
+                       cntrs.port_rcv_remphys_errors;
+               dev->n_port_xmit_discards = cntrs.port_xmit_discards;
+               dev->n_port_xmit_data = cntrs.port_xmit_data;
+               dev->n_port_rcv_data = cntrs.port_rcv_data;
+               dev->n_port_xmit_packets = cntrs.port_xmit_packets;
+               dev->n_port_rcv_packets = cntrs.port_rcv_packets;
+       }
+       switch (in_mad->mad_hdr.mgmt_class) {
+       case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+       case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+               ret = process_subn(ibdev, mad_flags, port_num,
+                                  in_mad, out_mad);
+               goto bail;
+       case IB_MGMT_CLASS_PERF_MGMT:
+               ret = process_perf(ibdev, port_num, in_mad, out_mad);
+               goto bail;
+       default:
+               ret = IB_MAD_RESULT_SUCCESS;
+       }
+
+bail:
+       return ret;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_mr.c b/drivers/infiniband/hw/ipath/ipath_mr.c
new file mode 100644 (file)
index 0000000..69ffec6
--- /dev/null
@@ -0,0 +1,383 @@
+/*
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_pack.h>
+#include <rdma/ib_smi.h>
+
+#include "ipath_verbs.h"
+
+/**
+ * ipath_get_dma_mr - get a DMA memory region
+ * @pd: protection domain for this memory region
+ * @acc: access flags
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc)
+{
+       struct ipath_mr *mr;
+       struct ib_mr *ret;
+
+       mr = kzalloc(sizeof *mr, GFP_KERNEL);
+       if (!mr) {
+               ret = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+
+       mr->mr.access_flags = acc;
+       ret = &mr->ibmr;
+
+bail:
+       return ret;
+}
+
+static struct ipath_mr *alloc_mr(int count,
+                                struct ipath_lkey_table *lk_table)
+{
+       struct ipath_mr *mr;
+       int m, i = 0;
+
+       /* Allocate struct plus pointers to first level page tables. */
+       m = (count + IPATH_SEGSZ - 1) / IPATH_SEGSZ;
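+       /*
+        * m is the number of first-level map blocks needed, i.e. the
+        * ceiling of count / IPATH_SEGSZ; each block holds IPATH_SEGSZ
+        * segment descriptors, which the callers fill in.
+        */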
+       mr = kmalloc(sizeof *mr + m * sizeof mr->mr.map[0], GFP_KERNEL);
+       if (!mr)
+               goto done;
+
+       /* Allocate first level page tables. */
+       for (; i < m; i++) {
+               mr->mr.map[i] = kmalloc(sizeof *mr->mr.map[0], GFP_KERNEL);
+               if (!mr->mr.map[i])
+                       goto bail;
+       }
+       mr->mr.mapsz = m;
+
+       /*
+        * ib_reg_phys_mr() will initialize mr->ibmr except for
+        * lkey and rkey.
+        */
+       if (!ipath_alloc_lkey(lk_table, &mr->mr))
+               goto bail;
+       mr->ibmr.rkey = mr->ibmr.lkey = mr->mr.lkey;
+
+       goto done;
+
+bail:
+       while (i) {
+               i--;
+               kfree(mr->mr.map[i]);
+       }
+       kfree(mr);
+       mr = NULL;
+
+done:
+       return mr;
+}
+
+/**
+ * ipath_reg_phys_mr - register a physical memory region
+ * @pd: protection domain for this memory region
+ * @buffer_list: pointer to the list of physical buffers to register
+ * @num_phys_buf: the number of physical buffers to register
+ * @acc: access flags for this memory region
+ * @iova_start: the starting address passed over IB which maps to this MR
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd,
+                               struct ib_phys_buf *buffer_list,
+                               int num_phys_buf, int acc, u64 *iova_start)
+{
+       struct ipath_mr *mr;
+       int n, m, i;
+       struct ib_mr *ret;
+
+       mr = alloc_mr(num_phys_buf, &to_idev(pd->device)->lk_table);
+       if (mr == NULL) {
+               ret = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+
+       mr->mr.user_base = *iova_start;
+       mr->mr.iova = *iova_start;
+       mr->mr.length = 0;
+       mr->mr.offset = 0;
+       mr->mr.access_flags = acc;
+       mr->mr.max_segs = num_phys_buf;
+
+       m = 0;
+       n = 0;
+       for (i = 0; i < num_phys_buf; i++) {
+               mr->mr.map[m]->segs[n].vaddr =
+                       phys_to_virt(buffer_list[i].addr);
+               mr->mr.map[m]->segs[n].length = buffer_list[i].size;
+               mr->mr.length += buffer_list[i].size;
+               n++;
+               if (n == IPATH_SEGSZ) {
+                       m++;
+                       n = 0;
+               }
+       }
+
+       ret = &mr->ibmr;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_reg_user_mr - register a userspace memory region
+ * @pd: protection domain for this memory region
+ * @region: the user memory region
+ * @mr_access_flags: access flags for this memory region
+ * @udata: unused by the InfiniPath driver
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, struct ib_umem *region,
+                               int mr_access_flags, struct ib_udata *udata)
+{
+       struct ipath_mr *mr;
+       struct ib_umem_chunk *chunk;
+       int n, m, i;
+       struct ib_mr *ret;
+
+       n = 0;
+       list_for_each_entry(chunk, &region->chunk_list, list)
+               n += chunk->nents;
+
+       mr = alloc_mr(n, &to_idev(pd->device)->lk_table);
+       if (!mr) {
+               ret = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+
+       mr->mr.user_base = region->user_base;
+       mr->mr.iova = region->virt_base;
+       mr->mr.length = region->length;
+       mr->mr.offset = region->offset;
+       mr->mr.access_flags = mr_access_flags;
+       mr->mr.max_segs = n;
+
+       m = 0;
+       n = 0;
+       list_for_each_entry(chunk, &region->chunk_list, list) {
+               for (i = 0; i < chunk->nmap; i++) {
+                       mr->mr.map[m]->segs[n].vaddr =
+                               page_address(chunk->page_list[i].page);
+                       mr->mr.map[m]->segs[n].length = region->page_size;
+                       n++;
+                       if (n == IPATH_SEGSZ) {
+                               m++;
+                               n = 0;
+                       }
+               }
+       }
+       ret = &mr->ibmr;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_dereg_mr - unregister and free a memory region
+ * @ibmr: the memory region to free
+ *
+ * Returns 0 on success.
+ *
+ * Note that this is called to free MRs created by ipath_get_dma_mr()
+ * or ipath_reg_user_mr().
+ */
+int ipath_dereg_mr(struct ib_mr *ibmr)
+{
+       struct ipath_mr *mr = to_imr(ibmr);
+       int i;
+
+       ipath_free_lkey(&to_idev(ibmr->device)->lk_table, ibmr->lkey);
+       i = mr->mr.mapsz;
+       while (i) {
+               i--;
+               kfree(mr->mr.map[i]);
+       }
+       kfree(mr);
+       return 0;
+}
+
+/**
+ * ipath_alloc_fmr - allocate a fast memory region
+ * @pd: the protection domain for this memory region
+ * @mr_access_flags: access flags for this memory region
+ * @fmr_attr: fast memory region attributes
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+                              struct ib_fmr_attr *fmr_attr)
+{
+       struct ipath_fmr *fmr;
+       int m, i = 0;
+       struct ib_fmr *ret;
+
+       /* Allocate struct plus pointers to first level page tables. */
+       m = (fmr_attr->max_pages + IPATH_SEGSZ - 1) / IPATH_SEGSZ;
+       fmr = kmalloc(sizeof *fmr + m * sizeof fmr->mr.map[0], GFP_KERNEL);
+       if (!fmr)
+               goto bail;
+
+       /* Allocate first level page tables. */
+       for (; i < m; i++) {
+               fmr->mr.map[i] = kmalloc(sizeof *fmr->mr.map[0],
+                                        GFP_KERNEL);
+               if (!fmr->mr.map[i])
+                       goto bail;
+       }
+       fmr->mr.mapsz = m;
+
+       /*
+        * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey &
+        * rkey.
+        */
+       if (!ipath_alloc_lkey(&to_idev(pd->device)->lk_table, &fmr->mr))
+               goto bail;
+       fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mr.lkey;
+       /*
+        * Resources are allocated but no valid mapping (RKEY can't be
+        * used).
+        */
+       fmr->mr.user_base = 0;
+       fmr->mr.iova = 0;
+       fmr->mr.length = 0;
+       fmr->mr.offset = 0;
+       fmr->mr.access_flags = mr_access_flags;
+       fmr->mr.max_segs = fmr_attr->max_pages;
+       fmr->page_shift = fmr_attr->page_shift;
+
+       ret = &fmr->ibfmr;
+       goto done;
+
+bail:
+       while (i)
+               kfree(fmr->mr.map[--i]);
+       kfree(fmr);
+       ret = ERR_PTR(-ENOMEM);
+
+done:
+       return ret;
+}
+
+/**
+ * ipath_map_phys_fmr - set up a fast memory region
+ * @ibfmr: the fast memory region to set up
+ * @page_list: the list of pages to associate with the fast memory region
+ * @list_len: the number of pages to associate with the fast memory region
+ * @iova: the virtual address of the start of the fast memory region
+ *
+ * This may be called from interrupt context.
+ */
+int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+                      int list_len, u64 iova)
+{
+       struct ipath_fmr *fmr = to_ifmr(ibfmr);
+       struct ipath_lkey_table *rkt;
+       unsigned long flags;
+       int m, n, i;
+       u32 ps;
+       int ret;
+
+       if (list_len > fmr->mr.max_segs) {
+               ret = -EINVAL;
+               goto bail;
+       }
+       rkt = &to_idev(ibfmr->device)->lk_table;
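+       /*
+        * Use the irqsave form of the lock since, per the note above, this
+        * function may be called from interrupt context.
+        */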
+       spin_lock_irqsave(&rkt->lock, flags);
+       fmr->mr.user_base = iova;
+       fmr->mr.iova = iova;
+       ps = 1 << fmr->page_shift;
+       fmr->mr.length = list_len * ps;
+       m = 0;
+       n = 0;
+       for (i = 0; i < list_len; i++) {
+               fmr->mr.map[m]->segs[n].vaddr = phys_to_virt(page_list[i]);
+               fmr->mr.map[m]->segs[n].length = ps;
+               if (++n == IPATH_SEGSZ) {
+                       m++;
+                       n = 0;
+               }
+       }
+       spin_unlock_irqrestore(&rkt->lock, flags);
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_unmap_fmr - unmap fast memory regions
+ * @fmr_list: the list of fast memory regions to unmap
+ *
+ * Returns 0 on success.
+ */
+int ipath_unmap_fmr(struct list_head *fmr_list)
+{
+       struct ipath_fmr *fmr;
+       struct ipath_lkey_table *rkt;
+       unsigned long flags;
+
+       list_for_each_entry(fmr, fmr_list, ibfmr.list) {
+               rkt = &to_idev(fmr->ibfmr.device)->lk_table;
+               spin_lock_irqsave(&rkt->lock, flags);
+               fmr->mr.user_base = 0;
+               fmr->mr.iova = 0;
+               fmr->mr.length = 0;
+               spin_unlock_irqrestore(&rkt->lock, flags);
+       }
+       return 0;
+}
+
+/**
+ * ipath_dealloc_fmr - deallocate a fast memory region
+ * @ibfmr: the fast memory region to deallocate
+ *
+ * Returns 0 on success.
+ */
+int ipath_dealloc_fmr(struct ib_fmr *ibfmr)
+{
+       struct ipath_fmr *fmr = to_ifmr(ibfmr);
+       int i;
+
+       ipath_free_lkey(&to_idev(ibfmr->device)->lk_table, ibfmr->lkey);
+       i = fmr->mr.mapsz;
+       while (i)
+               kfree(fmr->mr.map[--i]);
+       kfree(fmr);
+       return 0;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_pe800.c b/drivers/infiniband/hw/ipath/ipath_pe800.c
new file mode 100644 (file)
index 0000000..e693a7a
--- /dev/null
@@ -0,0 +1,1247 @@
+/*
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * This file contains all of the code that is specific to the
+ * InfiniPath PE-800 chip.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+
+
+#include "ipath_kernel.h"
+#include "ipath_registers.h"
+
+/*
+ * This file contains all the chip-specific register information and
+ * access functions for the PathScale PE800, the PCI-Express chip.
+ *
+ * This lists the InfiniPath PE800 registers, in the actual chip layout.
+ * This structure should never be directly accessed.
+ */
+struct _infinipath_do_not_use_kernel_regs {
+       unsigned long long Revision;
+       unsigned long long Control;
+       unsigned long long PageAlign;
+       unsigned long long PortCnt;
+       unsigned long long DebugPortSelect;
+       unsigned long long Reserved0;
+       unsigned long long SendRegBase;
+       unsigned long long UserRegBase;
+       unsigned long long CounterRegBase;
+       unsigned long long Scratch;
+       unsigned long long Reserved1;
+       unsigned long long Reserved2;
+       unsigned long long IntBlocked;
+       unsigned long long IntMask;
+       unsigned long long IntStatus;
+       unsigned long long IntClear;
+       unsigned long long ErrorMask;
+       unsigned long long ErrorStatus;
+       unsigned long long ErrorClear;
+       unsigned long long HwErrMask;
+       unsigned long long HwErrStatus;
+       unsigned long long HwErrClear;
+       unsigned long long HwDiagCtrl;
+       unsigned long long MDIO;
+       unsigned long long IBCStatus;
+       unsigned long long IBCCtrl;
+       unsigned long long ExtStatus;
+       unsigned long long ExtCtrl;
+       unsigned long long GPIOOut;
+       unsigned long long GPIOMask;
+       unsigned long long GPIOStatus;
+       unsigned long long GPIOClear;
+       unsigned long long RcvCtrl;
+       unsigned long long RcvBTHQP;
+       unsigned long long RcvHdrSize;
+       unsigned long long RcvHdrCnt;
+       unsigned long long RcvHdrEntSize;
+       unsigned long long RcvTIDBase;
+       unsigned long long RcvTIDCnt;
+       unsigned long long RcvEgrBase;
+       unsigned long long RcvEgrCnt;
+       unsigned long long RcvBufBase;
+       unsigned long long RcvBufSize;
+       unsigned long long RxIntMemBase;
+       unsigned long long RxIntMemSize;
+       unsigned long long RcvPartitionKey;
+       unsigned long long Reserved3;
+       unsigned long long RcvPktLEDCnt;
+       unsigned long long Reserved4[8];
+       unsigned long long SendCtrl;
+       unsigned long long SendPIOBufBase;
+       unsigned long long SendPIOSize;
+       unsigned long long SendPIOBufCnt;
+       unsigned long long SendPIOAvailAddr;
+       unsigned long long TxIntMemBase;
+       unsigned long long TxIntMemSize;
+       unsigned long long Reserved5;
+       unsigned long long PCIeRBufTestReg0;
+       unsigned long long PCIeRBufTestReg1;
+       unsigned long long Reserved51[6];
+       unsigned long long SendBufferError;
+       unsigned long long SendBufferErrorCONT1;
+       unsigned long long Reserved6SBE[6];
+       unsigned long long RcvHdrAddr0;
+       unsigned long long RcvHdrAddr1;
+       unsigned long long RcvHdrAddr2;
+       unsigned long long RcvHdrAddr3;
+       unsigned long long RcvHdrAddr4;
+       unsigned long long Reserved7RHA[11];
+       unsigned long long RcvHdrTailAddr0;
+       unsigned long long RcvHdrTailAddr1;
+       unsigned long long RcvHdrTailAddr2;
+       unsigned long long RcvHdrTailAddr3;
+       unsigned long long RcvHdrTailAddr4;
+       unsigned long long Reserved8RHTA[11];
+       unsigned long long Reserved9SW[8];
+       unsigned long long SerdesConfig0;
+       unsigned long long SerdesConfig1;
+       unsigned long long SerdesStatus;
+       unsigned long long XGXSConfig;
+       unsigned long long IBPLLCfg;
+       unsigned long long Reserved10SW2[3];
+       unsigned long long PCIEQ0SerdesConfig0;
+       unsigned long long PCIEQ0SerdesConfig1;
+       unsigned long long PCIEQ0SerdesStatus;
+       unsigned long long Reserved11;
+       unsigned long long PCIEQ1SerdesConfig0;
+       unsigned long long PCIEQ1SerdesConfig1;
+       unsigned long long PCIEQ1SerdesStatus;
+       unsigned long long Reserved12;
+};
+
+#define IPATH_KREG_OFFSET(field) (offsetof(struct \
+    _infinipath_do_not_use_kernel_regs, field) / sizeof(u64))
+#define IPATH_CREG_OFFSET(field) (offsetof( \
+    struct infinipath_counters, field) / sizeof(u64))
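+
+/*
+ * For example, IPATH_KREG_OFFSET(Control) evaluates to 1: offsets are in
+ * units of 64-bit registers from the start of the kernel register block,
+ * and Control is the second register in the layout above.
+ */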
+
+static const struct ipath_kregs ipath_pe_kregs = {
+       .kr_control = IPATH_KREG_OFFSET(Control),
+       .kr_counterregbase = IPATH_KREG_OFFSET(CounterRegBase),
+       .kr_debugportselect = IPATH_KREG_OFFSET(DebugPortSelect),
+       .kr_errorclear = IPATH_KREG_OFFSET(ErrorClear),
+       .kr_errormask = IPATH_KREG_OFFSET(ErrorMask),
+       .kr_errorstatus = IPATH_KREG_OFFSET(ErrorStatus),
+       .kr_extctrl = IPATH_KREG_OFFSET(ExtCtrl),
+       .kr_extstatus = IPATH_KREG_OFFSET(ExtStatus),
+       .kr_gpio_clear = IPATH_KREG_OFFSET(GPIOClear),
+       .kr_gpio_mask = IPATH_KREG_OFFSET(GPIOMask),
+       .kr_gpio_out = IPATH_KREG_OFFSET(GPIOOut),
+       .kr_gpio_status = IPATH_KREG_OFFSET(GPIOStatus),
+       .kr_hwdiagctrl = IPATH_KREG_OFFSET(HwDiagCtrl),
+       .kr_hwerrclear = IPATH_KREG_OFFSET(HwErrClear),
+       .kr_hwerrmask = IPATH_KREG_OFFSET(HwErrMask),
+       .kr_hwerrstatus = IPATH_KREG_OFFSET(HwErrStatus),
+       .kr_ibcctrl = IPATH_KREG_OFFSET(IBCCtrl),
+       .kr_ibcstatus = IPATH_KREG_OFFSET(IBCStatus),
+       .kr_intblocked = IPATH_KREG_OFFSET(IntBlocked),
+       .kr_intclear = IPATH_KREG_OFFSET(IntClear),
+       .kr_intmask = IPATH_KREG_OFFSET(IntMask),
+       .kr_intstatus = IPATH_KREG_OFFSET(IntStatus),
+       .kr_mdio = IPATH_KREG_OFFSET(MDIO),
+       .kr_pagealign = IPATH_KREG_OFFSET(PageAlign),
+       .kr_partitionkey = IPATH_KREG_OFFSET(RcvPartitionKey),
+       .kr_portcnt = IPATH_KREG_OFFSET(PortCnt),
+       .kr_rcvbthqp = IPATH_KREG_OFFSET(RcvBTHQP),
+       .kr_rcvbufbase = IPATH_KREG_OFFSET(RcvBufBase),
+       .kr_rcvbufsize = IPATH_KREG_OFFSET(RcvBufSize),
+       .kr_rcvctrl = IPATH_KREG_OFFSET(RcvCtrl),
+       .kr_rcvegrbase = IPATH_KREG_OFFSET(RcvEgrBase),
+       .kr_rcvegrcnt = IPATH_KREG_OFFSET(RcvEgrCnt),
+       .kr_rcvhdrcnt = IPATH_KREG_OFFSET(RcvHdrCnt),
+       .kr_rcvhdrentsize = IPATH_KREG_OFFSET(RcvHdrEntSize),
+       .kr_rcvhdrsize = IPATH_KREG_OFFSET(RcvHdrSize),
+       .kr_rcvintmembase = IPATH_KREG_OFFSET(RxIntMemBase),
+       .kr_rcvintmemsize = IPATH_KREG_OFFSET(RxIntMemSize),
+       .kr_rcvtidbase = IPATH_KREG_OFFSET(RcvTIDBase),
+       .kr_rcvtidcnt = IPATH_KREG_OFFSET(RcvTIDCnt),
+       .kr_revision = IPATH_KREG_OFFSET(Revision),
+       .kr_scratch = IPATH_KREG_OFFSET(Scratch),
+       .kr_sendbuffererror = IPATH_KREG_OFFSET(SendBufferError),
+       .kr_sendctrl = IPATH_KREG_OFFSET(SendCtrl),
+       .kr_sendpioavailaddr = IPATH_KREG_OFFSET(SendPIOAvailAddr),
+       .kr_sendpiobufbase = IPATH_KREG_OFFSET(SendPIOBufBase),
+       .kr_sendpiobufcnt = IPATH_KREG_OFFSET(SendPIOBufCnt),
+       .kr_sendpiosize = IPATH_KREG_OFFSET(SendPIOSize),
+       .kr_sendregbase = IPATH_KREG_OFFSET(SendRegBase),
+       .kr_txintmembase = IPATH_KREG_OFFSET(TxIntMemBase),
+       .kr_txintmemsize = IPATH_KREG_OFFSET(TxIntMemSize),
+       .kr_userregbase = IPATH_KREG_OFFSET(UserRegBase),
+       .kr_serdesconfig0 = IPATH_KREG_OFFSET(SerdesConfig0),
+       .kr_serdesconfig1 = IPATH_KREG_OFFSET(SerdesConfig1),
+       .kr_serdesstatus = IPATH_KREG_OFFSET(SerdesStatus),
+       .kr_xgxsconfig = IPATH_KREG_OFFSET(XGXSConfig),
+       .kr_ibpllcfg = IPATH_KREG_OFFSET(IBPLLCfg),
+
+       /*
+        * These should not be used directly via ipath_read_kreg64();
+        * use them with ipath_read_kreg64_port().
+        */
+       .kr_rcvhdraddr = IPATH_KREG_OFFSET(RcvHdrAddr0),
+       .kr_rcvhdrtailaddr = IPATH_KREG_OFFSET(RcvHdrTailAddr0),
+
+       /* This group is PE-800-specific and used only in this file */
+       /* The rcvpktled register controls one of the debug port signals, so
+        * a packet activity LED can be connected to it. */
+       .kr_rcvpktledcnt = IPATH_KREG_OFFSET(RcvPktLEDCnt),
+       .kr_pcierbuftestreg0 = IPATH_KREG_OFFSET(PCIeRBufTestReg0),
+       .kr_pcierbuftestreg1 = IPATH_KREG_OFFSET(PCIeRBufTestReg1),
+       .kr_pcieq0serdesconfig0 = IPATH_KREG_OFFSET(PCIEQ0SerdesConfig0),
+       .kr_pcieq0serdesconfig1 = IPATH_KREG_OFFSET(PCIEQ0SerdesConfig1),
+       .kr_pcieq0serdesstatus = IPATH_KREG_OFFSET(PCIEQ0SerdesStatus),
+       .kr_pcieq1serdesconfig0 = IPATH_KREG_OFFSET(PCIEQ1SerdesConfig0),
+       .kr_pcieq1serdesconfig1 = IPATH_KREG_OFFSET(PCIEQ1SerdesConfig1),
+       .kr_pcieq1serdesstatus = IPATH_KREG_OFFSET(PCIEQ1SerdesStatus)
+};
+
+static const struct ipath_cregs ipath_pe_cregs = {
+       .cr_badformatcnt = IPATH_CREG_OFFSET(RxBadFormatCnt),
+       .cr_erricrccnt = IPATH_CREG_OFFSET(RxICRCErrCnt),
+       .cr_errlinkcnt = IPATH_CREG_OFFSET(RxLinkProblemCnt),
+       .cr_errlpcrccnt = IPATH_CREG_OFFSET(RxLPCRCErrCnt),
+       .cr_errpkey = IPATH_CREG_OFFSET(RxPKeyMismatchCnt),
+       .cr_errrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowCtrlErrCnt),
+       .cr_err_rlencnt = IPATH_CREG_OFFSET(RxLenErrCnt),
+       .cr_errslencnt = IPATH_CREG_OFFSET(TxLenErrCnt),
+       .cr_errtidfull = IPATH_CREG_OFFSET(RxTIDFullErrCnt),
+       .cr_errtidvalid = IPATH_CREG_OFFSET(RxTIDValidErrCnt),
+       .cr_errvcrccnt = IPATH_CREG_OFFSET(RxVCRCErrCnt),
+       .cr_ibstatuschange = IPATH_CREG_OFFSET(IBStatusChangeCnt),
+       .cr_intcnt = IPATH_CREG_OFFSET(LBIntCnt),
+       .cr_invalidrlencnt = IPATH_CREG_OFFSET(RxMaxMinLenErrCnt),
+       .cr_invalidslencnt = IPATH_CREG_OFFSET(TxMaxMinLenErrCnt),
+       .cr_lbflowstallcnt = IPATH_CREG_OFFSET(LBFlowStallCnt),
+       .cr_pktrcvcnt = IPATH_CREG_OFFSET(RxDataPktCnt),
+       .cr_pktrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowPktCnt),
+       .cr_pktsendcnt = IPATH_CREG_OFFSET(TxDataPktCnt),
+       .cr_pktsendflowcnt = IPATH_CREG_OFFSET(TxFlowPktCnt),
+       .cr_portovflcnt = IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt),
+       .cr_rcvebpcnt = IPATH_CREG_OFFSET(RxEBPCnt),
+       .cr_rcvovflcnt = IPATH_CREG_OFFSET(RxBufOvflCnt),
+       .cr_senddropped = IPATH_CREG_OFFSET(TxDroppedPktCnt),
+       .cr_sendstallcnt = IPATH_CREG_OFFSET(TxFlowStallCnt),
+       .cr_sendunderruncnt = IPATH_CREG_OFFSET(TxUnderrunCnt),
+       .cr_wordrcvcnt = IPATH_CREG_OFFSET(RxDwordCnt),
+       .cr_wordsendcnt = IPATH_CREG_OFFSET(TxDwordCnt),
+       .cr_unsupvlcnt = IPATH_CREG_OFFSET(TxUnsupVLErrCnt),
+       .cr_rxdroppktcnt = IPATH_CREG_OFFSET(RxDroppedPktCnt),
+       .cr_iblinkerrrecovcnt = IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt),
+       .cr_iblinkdowncnt = IPATH_CREG_OFFSET(IBLinkDownedCnt),
+       .cr_ibsymbolerrcnt = IPATH_CREG_OFFSET(IBSymbolErrCnt)
+};
+
+/* kr_intstatus, kr_intclear, kr_intmask bits */
+#define INFINIPATH_I_RCVURG_MASK 0x1F
+#define INFINIPATH_I_RCVAVAIL_MASK 0x1F
+
+/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */
+#define INFINIPATH_HWE_PCIEMEMPARITYERR_MASK  0x000000000000003fULL
+#define INFINIPATH_HWE_PCIEMEMPARITYERR_SHIFT 0
+#define INFINIPATH_HWE_PCIEPOISONEDTLP      0x0000000010000000ULL
+#define INFINIPATH_HWE_PCIECPLTIMEOUT       0x0000000020000000ULL
+#define INFINIPATH_HWE_PCIEBUSPARITYXTLH    0x0000000040000000ULL
+#define INFINIPATH_HWE_PCIEBUSPARITYXADM    0x0000000080000000ULL
+#define INFINIPATH_HWE_PCIEBUSPARITYRADM    0x0000000100000000ULL
+#define INFINIPATH_HWE_COREPLL_FBSLIP       0x0080000000000000ULL
+#define INFINIPATH_HWE_COREPLL_RFSLIP       0x0100000000000000ULL
+#define INFINIPATH_HWE_PCIE1PLLFAILED       0x0400000000000000ULL
+#define INFINIPATH_HWE_PCIE0PLLFAILED       0x0800000000000000ULL
+#define INFINIPATH_HWE_SERDESPLLFAILED      0x1000000000000000ULL
+
+/* kr_extstatus bits */
+#define INFINIPATH_EXTS_FREQSEL 0x2
+#define INFINIPATH_EXTS_SERDESSEL 0x4
+#define INFINIPATH_EXTS_MEMBIST_ENDTEST     0x0000000000004000
+#define INFINIPATH_EXTS_MEMBIST_FOUND       0x0000000000008000
+
+#define _IPATH_GPIO_SDA_NUM 1
+#define _IPATH_GPIO_SCL_NUM 0
+
+#define IPATH_GPIO_SDA (1ULL << \
+       (_IPATH_GPIO_SDA_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT))
+#define IPATH_GPIO_SCL (1ULL << \
+       (_IPATH_GPIO_SCL_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT))
+
+/**
+ * ipath_pe_handle_hwerrors - display hardware errors.
+ * @dd: the infinipath device
+ * @msg: the output buffer
+ * @msgl: the size of the output buffer
+ *
+ * Most hardware errors are catastrophic, but for right now we'll print
+ * them and continue.  We reuse the same message buffer as
+ * ipath_handle_errors() to avoid excessive stack usage.
+ */
+void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg,
+       size_t msgl)
+{
+       ipath_err_t hwerrs;
+       u32 bits, ctrl;
+       int isfatal = 0;
+       char bitsmsg[64];
+
+       hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus);
+       if (!hwerrs) {
+               /*
+                * Better than printing confusing messages.  This seems to
+                * be related to clearing the CRC error, or the PLL error,
+                * during init.
+                */
+               ipath_cdbg(VERBOSE, "Called but no hardware errors set\n");
+               return;
+       } else if (hwerrs == ~0ULL) {
+               ipath_dev_err(dd, "Read of hardware error status failed "
+                             "(all bits set); ignoring\n");
+               return;
+       }
+       ipath_stats.sps_hwerrs++;
+
+       /* Always clear the error status register, except MEMBISTFAIL,
+        * regardless of whether we continue or stop using the chip.
+        * We want that set so we know it failed, even across driver reload.
+        * We'll still ignore it in the hwerrmask.  We do this partly for
+        * diagnostics, but also for support. */
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+                        hwerrs&~INFINIPATH_HWE_MEMBISTFAILED);
+
+       hwerrs &= dd->ipath_hwerrmask;
+
+       /*
+        * make sure we get this much out, unless told to be quiet,
+        * or it's occurred within the last 5 seconds
+        */
+       if ((hwerrs & ~dd->ipath_lasthwerror) ||
+           (ipath_debug & __IPATH_VERBDBG))
+               dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx "
+                        "(cleared)\n", (unsigned long long) hwerrs);
+       dd->ipath_lasthwerror |= hwerrs;
+
+       if (hwerrs & ~infinipath_hwe_bitsextant)
+               ipath_dev_err(dd, "hwerror interrupt with unknown errors "
+                             "%llx set\n", (unsigned long long)
+                             (hwerrs & ~infinipath_hwe_bitsextant));
+
+       ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
+       if (ctrl & INFINIPATH_C_FREEZEMODE) {
+               if (hwerrs) {
+                       /*
+                        * if any set that we aren't ignoring only make the
+                        * complaint once, in case it's stuck or recurring,
+                        * and we get here multiple times
+                        */
+                       if (dd->ipath_flags & IPATH_INITTED) {
+                               ipath_dev_err(dd, "Fatal Error (freeze "
+                                             "mode), no longer usable\n");
+                               isfatal = 1;
+                       }
+                       /*
+                        * Mark as having had an error for driver, and also
+                        * for /sys and status word mapped to user programs.
+                        * This marks unit as not usable, until reset
+                        */
+                       *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
+                       *dd->ipath_statusp |= IPATH_STATUS_HWERROR;
+                       dd->ipath_flags &= ~IPATH_INITTED;
+               } else {
+                       ipath_dbg("Clearing freezemode on ignored hardware "
+                                 "error\n");
+                       ctrl &= ~INFINIPATH_C_FREEZEMODE;
+                       ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+                                        ctrl);
+               }
+       }
+
+       *msg = '\0';
+
+       if (hwerrs & INFINIPATH_HWE_MEMBISTFAILED) {
+               strlcat(msg, "[Memory BIST test failed, PE-800 unusable]",
+                       msgl);
+               /* ignore from now on, so disable until driver reloaded */
+               *dd->ipath_statusp |= IPATH_STATUS_HWERROR;
+               dd->ipath_hwerrmask &= ~INFINIPATH_HWE_MEMBISTFAILED;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+                                dd->ipath_hwerrmask);
+       }
+       if (hwerrs & (INFINIPATH_HWE_RXEMEMPARITYERR_MASK
+                     << INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT)) {
+               bits = (u32) ((hwerrs >>
+                              INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) &
+                             INFINIPATH_HWE_RXEMEMPARITYERR_MASK);
+               snprintf(bitsmsg, sizeof bitsmsg, "[RXE Parity Errs %x] ",
+                        bits);
+               strlcat(msg, bitsmsg, msgl);
+       }
+       if (hwerrs & (INFINIPATH_HWE_TXEMEMPARITYERR_MASK
+                     << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) {
+               bits = (u32) ((hwerrs >>
+                              INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) &
+                             INFINIPATH_HWE_TXEMEMPARITYERR_MASK);
+               snprintf(bitsmsg, sizeof bitsmsg, "[TXE Parity Errs %x] ",
+                        bits);
+               strlcat(msg, bitsmsg, msgl);
+       }
+       if (hwerrs & (INFINIPATH_HWE_PCIEMEMPARITYERR_MASK
+                     << INFINIPATH_HWE_PCIEMEMPARITYERR_SHIFT)) {
+               bits = (u32) ((hwerrs >>
+                              INFINIPATH_HWE_PCIEMEMPARITYERR_SHIFT) &
+                             INFINIPATH_HWE_PCIEMEMPARITYERR_MASK);
+               snprintf(bitsmsg, sizeof bitsmsg,
+                        "[PCIe Mem Parity Errs %x] ", bits);
+               strlcat(msg, bitsmsg, msgl);
+       }
+       if (hwerrs & INFINIPATH_HWE_IBCBUSTOSPCPARITYERR)
+               strlcat(msg, "[IB2IPATH Parity]", msgl);
+       if (hwerrs & INFINIPATH_HWE_IBCBUSFRSPCPARITYERR)
+               strlcat(msg, "[IPATH2IB Parity]", msgl);
+
+#define _IPATH_PLL_FAIL (INFINIPATH_HWE_COREPLL_FBSLIP |       \
+                        INFINIPATH_HWE_COREPLL_RFSLIP )
+
+       if (hwerrs & _IPATH_PLL_FAIL) {
+               snprintf(bitsmsg, sizeof bitsmsg,
+                        "[PLL failed (%llx), PE-800 unusable]",
+                        (unsigned long long) hwerrs & _IPATH_PLL_FAIL);
+               strlcat(msg, bitsmsg, msgl);
+               /* ignore from now on, so disable until driver reloaded */
+               dd->ipath_hwerrmask &= ~(hwerrs & _IPATH_PLL_FAIL);
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+                                dd->ipath_hwerrmask);
+       }
+
+       if (hwerrs & INFINIPATH_HWE_SERDESPLLFAILED) {
+               /*
+                * If it occurs, it is left masked since the external
+                * interface is unused.
+                */
+               dd->ipath_hwerrmask &= ~INFINIPATH_HWE_SERDESPLLFAILED;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+                                dd->ipath_hwerrmask);
+       }
+
+       if (hwerrs & INFINIPATH_HWE_PCIEPOISONEDTLP)
+               strlcat(msg, "[PCIe Poisoned TLP]", msgl);
+       if (hwerrs & INFINIPATH_HWE_PCIECPLTIMEOUT)
+               strlcat(msg, "[PCIe completion timeout]", msgl);
+
+       /*
+        * In practice, it's unlikely that we'll see PCIe PLL, bus
+        * parity, or memory parity failures, because most likely we
+        * won't be able to talk to the core of the chip.  Nonetheless, we
+        * might see them, if they are in parts of the PCIe core that aren't
+        * essential.
+        */
+       if (hwerrs & INFINIPATH_HWE_PCIE1PLLFAILED)
+               strlcat(msg, "[PCIePLL1]", msgl);
+       if (hwerrs & INFINIPATH_HWE_PCIE0PLLFAILED)
+               strlcat(msg, "[PCIePLL0]", msgl);
+       if (hwerrs & INFINIPATH_HWE_PCIEBUSPARITYXTLH)
+               strlcat(msg, "[PCIe XTLH core parity]", msgl);
+       if (hwerrs & INFINIPATH_HWE_PCIEBUSPARITYXADM)
+               strlcat(msg, "[PCIe ADM TX core parity]", msgl);
+       if (hwerrs & INFINIPATH_HWE_PCIEBUSPARITYRADM)
+               strlcat(msg, "[PCIe ADM RX core parity]", msgl);
+
+       if (hwerrs & INFINIPATH_HWE_RXDSYNCMEMPARITYERR)
+               strlcat(msg, "[Rx Dsync]", msgl);
+       if (hwerrs & INFINIPATH_HWE_SERDESPLLFAILED)
+               strlcat(msg, "[SerDes PLL]", msgl);
+
+       ipath_dev_err(dd, "%s hardware error\n", msg);
+       if (isfatal && !ipath_diag_inuse && dd->ipath_freezemsg) {
+               /*
+                * for the /sys status file; if no trailing } is copied, we'll
+                * know it was truncated.
+                */
+               snprintf(dd->ipath_freezemsg, dd->ipath_freezelen,
+                        "{%s}", msg);
+       }
+}
+
+/**
+ * ipath_pe_boardname - fill in the board name
+ * @dd: the infinipath device
+ * @name: the output buffer
+ * @namelen: the size of the output buffer
+ *
+ * info is based on the board revision register
+ */
+static int ipath_pe_boardname(struct ipath_devdata *dd, char *name,
+                             size_t namelen)
+{
+       char *n = NULL;
+       u8 boardrev = dd->ipath_boardrev;
+       int ret;
+
+       switch (boardrev) {
+       case 0:
+               n = "InfiniPath_Emulation";
+               break;
+       case 1:
+               n = "InfiniPath_PE-800-Bringup";
+               break;
+       case 2:
+               n = "InfiniPath_PE-880";
+               break;
+       case 3:
+               n = "InfiniPath_PE-850";
+               break;
+       case 4:
+               n = "InfiniPath_PE-860";
+               break;
+       default:
+               ipath_dev_err(dd,
+                             "Don't yet know about board with ID %u\n",
+                             boardrev);
+               snprintf(name, namelen, "Unknown_InfiniPath_PE-8xx_%u",
+                        boardrev);
+               break;
+       }
+       if (n)
+               snprintf(name, namelen, "%s", n);
+
+       if (dd->ipath_majrev != 4 || dd->ipath_minrev != 1) {
+               ipath_dev_err(dd, "Unsupported PE-800 revision %u.%u!\n",
+                             dd->ipath_majrev, dd->ipath_minrev);
+               ret = 1;
+       } else
+               ret = 0;
+
+       return ret;
+}
+
+/**
+ * ipath_pe_init_hwerrors - enable hardware errors
+ * @dd: the infinipath device
+ *
+ * now that we have finished initializing everything that might reasonably
+ * cause a hardware error, and cleared those error bits as they occur,
+ * we can enable hardware errors in the mask (potentially enabling
+ * freeze mode), and enable hardware errors as errors (along with
+ * everything else) in errormask
+ */
+void ipath_pe_init_hwerrors(struct ipath_devdata *dd)
+{
+       ipath_err_t val;
+       u64 extsval;
+
+       extsval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus);
+
+       if (!(extsval & INFINIPATH_EXTS_MEMBIST_ENDTEST))
+               ipath_dev_err(dd, "MemBIST did not complete!\n");
+
+       val = ~0ULL;    /* barring bugs, all hwerrors become interrupts, */
+
+       if (!dd->ipath_boardrev)        /* no PLL for Emulator */
+               val &= ~INFINIPATH_HWE_SERDESPLLFAILED;
+
+       /* workaround bug 9460 in internal interface bus parity checking */
+       val &= ~INFINIPATH_HWE_PCIEBUSPARITYRADM;
+
+       dd->ipath_hwerrmask = val;
+}
+
+/**
+ * ipath_pe_bringup_serdes - bring up the serdes
+ * @dd: the infinipath device
+ */
+int ipath_pe_bringup_serdes(struct ipath_devdata *dd)
+{
+       u64 val, tmp, config1;
+       int ret = 0, change = 0;
+
+       ipath_dbg("Trying to bringup serdes\n");
+
+       if (ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus) &
+           INFINIPATH_HWE_SERDESPLLFAILED) {
+               ipath_dbg("At start, serdes PLL failed bit set "
+                         "in hwerrstatus, clearing and continuing\n");
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+                                INFINIPATH_HWE_SERDESPLLFAILED);
+       }
+
+       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
+       config1 = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig1);
+
+       ipath_cdbg(VERBOSE, "SerDes status config0=%llx config1=%llx, "
+                  "xgxsconfig %llx\n", (unsigned long long) val,
+                  (unsigned long long) config1, (unsigned long long)
+                  ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig));
+
+       /*
+        * Force reset on, also set rxdetect enable.  Must do before reading
+        * serdesstatus at least for simulation, or some of the bits in
+        * serdes status will come back as undefined and cause simulation
+        * failures
+        */
+       val |= INFINIPATH_SERDC0_RESET_PLL | INFINIPATH_SERDC0_RXDETECT_EN
+               | INFINIPATH_SERDC0_L1PWR_DN;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val);
+       /* be sure chip saw it */
+       tmp = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+       udelay(5);              /* need pll reset set at least for a bit */
+       /*
+        * after PLL is reset, set the per-lane Resets and TxIdle and
+        * clear the PLL reset and rxdetect (to get falling edge).
+        * Leave L1PWR bits set (permanently)
+        */
+       val &= ~(INFINIPATH_SERDC0_RXDETECT_EN | INFINIPATH_SERDC0_RESET_PLL
+                | INFINIPATH_SERDC0_L1PWR_DN);
+       val |= INFINIPATH_SERDC0_RESET_MASK | INFINIPATH_SERDC0_TXIDLE;
+       ipath_cdbg(VERBOSE, "Clearing pll reset and setting lane resets "
+                  "and txidle (%llx)\n", (unsigned long long) val);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val);
+       /* be sure chip saw it */
+       tmp = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+       /* need PLL reset clear for at least 11 usec before lane
+        * resets cleared; give it a few more to be sure */
+       udelay(15);
+       val &= ~(INFINIPATH_SERDC0_RESET_MASK | INFINIPATH_SERDC0_TXIDLE);
+
+       ipath_cdbg(VERBOSE, "Clearing lane resets and txidle "
+                  "(writing %llx)\n", (unsigned long long) val);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val);
+       /* be sure chip saw it */
+       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+
+       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
+       if (((val >> INFINIPATH_XGXS_MDIOADDR_SHIFT) &
+            INFINIPATH_XGXS_MDIOADDR_MASK) != 3) {
+               val &=
+                       ~(INFINIPATH_XGXS_MDIOADDR_MASK <<
+                         INFINIPATH_XGXS_MDIOADDR_SHIFT);
+               /* MDIO address 3 */
+               val |= 3ULL << INFINIPATH_XGXS_MDIOADDR_SHIFT;
+               change = 1;
+       }
+       if (val & INFINIPATH_XGXS_RESET) {
+               val &= ~INFINIPATH_XGXS_RESET;
+               change = 1;
+       }
+       if (change)
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
+
+       val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
+
+       /* clear current and de-emphasis bits */
+       config1 &= ~0x0ffffffff00ULL;
+       /* set current to 20ma */
+       config1 |= 0x00000000000ULL;
+       /* set de-emphasis to -5.68dB */
+       config1 |= 0x0cccc000000ULL;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig1, config1);
+
+       ipath_cdbg(VERBOSE, "done: SerDes status config0=%llx "
+                  "config1=%llx, sstatus=%llx xgxs=%llx\n",
+                  (unsigned long long) val, (unsigned long long) config1,
+                  (unsigned long long)
+                  ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus),
+                  (unsigned long long)
+                  ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig));
+
+       if (!ipath_waitfor_mdio_cmdready(dd)) {
+               ipath_write_kreg(
+                       dd, dd->ipath_kregs->kr_mdio,
+                       ipath_mdio_req(IPATH_MDIO_CMD_READ, 31,
+                                      IPATH_MDIO_CTRL_XGXS_REG_8, 0));
+               if (ipath_waitfor_complete(dd, dd->ipath_kregs->kr_mdio,
+                                          IPATH_MDIO_DATAVALID, &val))
+                       ipath_dbg("Never got MDIO data for XGXS "
+                                 "status read\n");
+               else
+                       ipath_cdbg(VERBOSE, "MDIO Read reg8, "
+                                  "'bank' 31 %x\n", (u32) val);
+       } else
+               ipath_dbg("Never got MDIO cmdready for XGXS status read\n");
+
+       return ret;
+}
+
+/**
+ * ipath_pe_quiet_serdes - set serdes to txidle
+ * @dd: the infinipath device
+ * Called when driver is being unloaded
+ */
+void ipath_pe_quiet_serdes(struct ipath_devdata *dd)
+{
+       u64 val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
+
+       val |= INFINIPATH_SERDC0_TXIDLE;
+       ipath_dbg("Setting TxIdleEn on serdes (config0 = %llx)\n",
+                 (unsigned long long) val);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val);
+}
+
+/* this is not yet needed on the PE-800, so just return 0. */
+static int ipath_pe_intconfig(struct ipath_devdata *dd)
+{
+       return 0;
+}
+
+/**
+ * ipath_setup_pe_setextled - set the state of the two external LEDs
+ * @dd: the infinipath device
+ * @lst: the L state
+ * @ltst: the LT state
+ *
+ * These LEDs indicate the physical and logical state of the IB link.
+ * For this chip (at least with recommended board pinouts), LED1
+ * is Yellow (logical state) and LED2 is Green (physical state).
+ *
+ * Note:  We try to match the Mellanox HCA LED behavior as best
+ * we can.  Green indicates physical link state is OK (something is
+ * plugged in, and we can train).
+ * Amber indicates the link is logically up (ACTIVE).
+ * Mellanox further blinks the amber LED to indicate data packet
+ * activity, but we have no hardware support for that, so it would
+ * require waking up every 10-20 msecs and checking the counters
+ * on the chip, and then turning the LED off if appropriate.  That's
+ * visible overhead, so not something we will do.
+ *
+ */
+static void ipath_setup_pe_setextled(struct ipath_devdata *dd, u64 lst,
+                                    u64 ltst)
+{
+       u64 extctl;
+
+       /* the diags use the LED to indicate diag info, so we leave
+        * the external LED alone when the diags are running */
+       if (ipath_diag_inuse)
+               return;
+
+       extctl = dd->ipath_extctrl & ~(INFINIPATH_EXTC_LED1PRIPORT_ON |
+                                      INFINIPATH_EXTC_LED2PRIPORT_ON);
+
+       if (ltst & INFINIPATH_IBCS_LT_STATE_LINKUP)
+               extctl |= INFINIPATH_EXTC_LED2PRIPORT_ON;
+       if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE)
+               extctl |= INFINIPATH_EXTC_LED1PRIPORT_ON;
+       dd->ipath_extctrl = extctl;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, extctl);
+}
+
+/**
+ * ipath_setup_pe_cleanup - clean up any per-chip chip-specific stuff
+ * @dd: the infinipath device
+ *
+ * This is called during driver unload.
+ * We do the pci_disable_msi here, not in generic code, because it
+ * isn't used for the HT-400. If we do end up needing pci_enable_msi
+ * at some point in the future for HT-400, we'll move the call back
+ * into the main init_one code.
+ */
+static void ipath_setup_pe_cleanup(struct ipath_devdata *dd)
+{
+       dd->ipath_msi_lo = 0;   /* just in case unload fails */
+       pci_disable_msi(dd->pcidev);
+}
+
+/**
+ * ipath_setup_pe_config - setup PCIe config related stuff
+ * @dd: the infinipath device
+ * @pdev: the PCI device
+ *
+ * The pci_enable_msi() call will fail on systems with MSI quirks
+ * such as those with AMD8131, even if the device of interest is not
+ * attached to that device (in the 2.6.13 - 2.6.15 kernels, at least;
+ * fixed late in 2.6.16).
+ * All that can be done is to edit the kernel source to remove the quirk
+ * check until that is fixed.
+ * We do not need to call enable_msi() for our HyperTransport chip (HT-400),
+ * even though it uses MSI, and we want to avoid the quirk warning,
+ * so we call enable_msi only for the PE-800.  If we do end up needing
+ * pci_enable_msi at some point in the future for HT-400, we'll move the
+ * call back into the main init_one code.
+ * We save the msi lo and hi values, so we can restore them after
+ * chip reset (the kernel PCI infrastructure doesn't yet handle that
+ * correctly).
+ */
+static int ipath_setup_pe_config(struct ipath_devdata *dd,
+                                struct pci_dev *pdev)
+{
+       int pos, ret;
+
+       dd->ipath_msi_lo = 0;   /* used as a flag during reset processing */
+       ret = pci_enable_msi(dd->pcidev);
+       if (ret)
+               ipath_dev_err(dd, "pci_enable_msi failed: %d, "
+                             "interrupts may not work\n", ret);
+       /* continue even if it fails, we may still be OK... */
+
+       if ((pos = pci_find_capability(dd->pcidev, PCI_CAP_ID_MSI))) {
+               u16 control;
+               pci_read_config_dword(dd->pcidev, pos + PCI_MSI_ADDRESS_LO,
+                                     &dd->ipath_msi_lo);
+               pci_read_config_dword(dd->pcidev, pos + PCI_MSI_ADDRESS_HI,
+                                     &dd->ipath_msi_hi);
+               pci_read_config_word(dd->pcidev, pos + PCI_MSI_FLAGS,
+                                    &control);
+               /* now save the data (vector) info */
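+               /* per the PCI MSI capability layout, the data register is
+                * at offset 8 (32-bit) or offset 12 (64-bit addressing) */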
+               pci_read_config_word(dd->pcidev,
+                                    pos + ((control & PCI_MSI_FLAGS_64BIT)
+                                           ? 12 : 8),
+                                    &dd->ipath_msi_data);
+               ipath_cdbg(VERBOSE, "Read msi data 0x%x from config offset "
+                          "0x%x, control=0x%x\n", dd->ipath_msi_data,
+                          pos + ((control & PCI_MSI_FLAGS_64BIT) ? 12 : 8),
+                          control);
+               /* we save the cachelinesize also, although it doesn't
+                * really matter */
+               pci_read_config_byte(dd->pcidev, PCI_CACHE_LINE_SIZE,
+                                    &dd->ipath_pci_cacheline);
+       } else
+               ipath_dev_err(dd, "Can't find MSI capability, "
+                             "can't save MSI settings for reset\n");
+       if ((pos = pci_find_capability(dd->pcidev, PCI_CAP_ID_EXP))) {
+               u16 linkstat;
+               pci_read_config_word(dd->pcidev, pos + PCI_EXP_LNKSTA,
+                                    &linkstat);
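+               /* negotiated link width is in bits 9:4 of the PCIe link
+                * status register; the PE-800 expects a full x8 link */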
+               linkstat >>= 4;
+               linkstat &= 0x1f;
+               if (linkstat != 8)
+                       ipath_dev_err(dd, "PCIe width %u, "
+                                     "performance reduced\n", linkstat);
+       } else
+               ipath_dev_err(dd, "Can't find PCI Express "
+                             "capability!\n");
+       return 0;
+}
+
+static void ipath_init_pe_variables(void)
+{
+       /*
+        * bits for selecting i2c direction and values,
+        * used for I2C serial flash
+        */
+       ipath_gpio_sda_num = _IPATH_GPIO_SDA_NUM;
+       ipath_gpio_scl_num = _IPATH_GPIO_SCL_NUM;
+       ipath_gpio_sda = IPATH_GPIO_SDA;
+       ipath_gpio_scl = IPATH_GPIO_SCL;
+
+       /* variables for sanity checking interrupt and errors */
+       infinipath_hwe_bitsextant =
+               (INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
+                INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) |
+               (INFINIPATH_HWE_PCIEMEMPARITYERR_MASK <<
+                INFINIPATH_HWE_PCIEMEMPARITYERR_SHIFT) |
+               INFINIPATH_HWE_PCIE1PLLFAILED |
+               INFINIPATH_HWE_PCIE0PLLFAILED |
+               INFINIPATH_HWE_PCIEPOISONEDTLP |
+               INFINIPATH_HWE_PCIECPLTIMEOUT |
+               INFINIPATH_HWE_PCIEBUSPARITYXTLH |
+               INFINIPATH_HWE_PCIEBUSPARITYXADM |
+               INFINIPATH_HWE_PCIEBUSPARITYRADM |
+               INFINIPATH_HWE_MEMBISTFAILED |
+               INFINIPATH_HWE_COREPLL_FBSLIP |
+               INFINIPATH_HWE_COREPLL_RFSLIP |
+               INFINIPATH_HWE_SERDESPLLFAILED |
+               INFINIPATH_HWE_IBCBUSTOSPCPARITYERR |
+               INFINIPATH_HWE_IBCBUSFRSPCPARITYERR;
+       infinipath_i_bitsextant =
+               (INFINIPATH_I_RCVURG_MASK << INFINIPATH_I_RCVURG_SHIFT) |
+               (INFINIPATH_I_RCVAVAIL_MASK <<
+                INFINIPATH_I_RCVAVAIL_SHIFT) |
+               INFINIPATH_I_ERROR | INFINIPATH_I_SPIOSENT |
+               INFINIPATH_I_SPIOBUFAVAIL | INFINIPATH_I_GPIO;
+       infinipath_e_bitsextant =
+               INFINIPATH_E_RFORMATERR | INFINIPATH_E_RVCRC |
+               INFINIPATH_E_RICRC | INFINIPATH_E_RMINPKTLEN |
+               INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RLONGPKTLEN |
+               INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RUNEXPCHAR |
+               INFINIPATH_E_RUNSUPVL | INFINIPATH_E_REBP |
+               INFINIPATH_E_RIBFLOW | INFINIPATH_E_RBADVERSION |
+               INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
+               INFINIPATH_E_RBADTID | INFINIPATH_E_RHDRLEN |
+               INFINIPATH_E_RHDR | INFINIPATH_E_RIBLOSTLINK |
+               INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SMAXPKTLEN |
+               INFINIPATH_E_SUNDERRUN | INFINIPATH_E_SPKTLEN |
+               INFINIPATH_E_SDROPPEDSMPPKT | INFINIPATH_E_SDROPPEDDATAPKT |
+               INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM |
+               INFINIPATH_E_SUNSUPVL | INFINIPATH_E_IBSTATUSCHANGED |
+               INFINIPATH_E_INVALIDADDR | INFINIPATH_E_RESET |
+               INFINIPATH_E_HARDWARE;
+
+       infinipath_i_rcvavail_mask = INFINIPATH_I_RCVAVAIL_MASK;
+       infinipath_i_rcvurg_mask = INFINIPATH_I_RCVURG_MASK;
+}
+
+/* setup the MSI stuff again after a reset.  I'd like to just call
+ * pci_enable_msi() and request_irq() again, but when I do that,
+ * the MSI enable bit doesn't get set in the command word, and
+ * we switch to a different interrupt vector, which is confusing,
+ * so I instead just do it all inline.  Perhaps we can somehow tie this
+ * into the PCIe hotplug support at some point.
+ * Note, because I'm doing it all here, I don't call pci_disable_msi()
+ * or free_irq() at the start of ipath_setup_pe_reset().
+ */
+static int ipath_reinit_msi(struct ipath_devdata *dd)
+{
+       int pos;
+       u16 control;
+       int ret;
+
+       if (!dd->ipath_msi_lo) {
+               dev_info(&dd->pcidev->dev, "Can't restore MSI config, "
+                        "initial setup failed?\n");
+               ret = 0;
+               goto bail;
+       }
+
+       if (!(pos = pci_find_capability(dd->pcidev, PCI_CAP_ID_MSI))) {
+               ipath_dev_err(dd, "Can't find MSI capability, "
+                             "can't restore MSI settings\n");
+               ret = 0;
+               goto bail;
+       }
+       ipath_cdbg(VERBOSE, "Writing msi_lo 0x%x to config offset 0x%x\n",
+                  dd->ipath_msi_lo, pos + PCI_MSI_ADDRESS_LO);
+       pci_write_config_dword(dd->pcidev, pos + PCI_MSI_ADDRESS_LO,
+                              dd->ipath_msi_lo);
+       ipath_cdbg(VERBOSE, "Writing msi_hi 0x%x to config offset 0x%x\n",
+                  dd->ipath_msi_hi, pos + PCI_MSI_ADDRESS_HI);
+       pci_write_config_dword(dd->pcidev, pos + PCI_MSI_ADDRESS_HI,
+                              dd->ipath_msi_hi);
+       pci_read_config_word(dd->pcidev, pos + PCI_MSI_FLAGS, &control);
+       if (!(control & PCI_MSI_FLAGS_ENABLE)) {
+               ipath_cdbg(VERBOSE, "MSI control at off %x was %x, "
+                          "setting MSI enable (%x)\n", pos + PCI_MSI_FLAGS,
+                          control, control | PCI_MSI_FLAGS_ENABLE);
+               control |= PCI_MSI_FLAGS_ENABLE;
+               pci_write_config_word(dd->pcidev, pos + PCI_MSI_FLAGS,
+                                     control);
+       }
+       /* now rewrite the data (vector) info */
+       pci_write_config_word(dd->pcidev, pos +
+                             ((control & PCI_MSI_FLAGS_64BIT) ? 12 : 8),
+                             dd->ipath_msi_data);
+       /* we restore the cachelinesize also, although it doesn't really
+        * matter */
+       pci_write_config_byte(dd->pcidev, PCI_CACHE_LINE_SIZE,
+                             dd->ipath_pci_cacheline);
+       /* and now set the pci master bit again */
+       pci_set_master(dd->pcidev);
+       ret = 1;
+
+bail:
+       return ret;
+}
+
+/* This routine sleeps, so it can only be called from user context, not
+ * from interrupt context.  If we need interrupt context, we can split
+ * it into two routines.
+ */
+static int ipath_setup_pe_reset(struct ipath_devdata *dd)
+{
+       u64 val;
+       int i;
+       int ret;
+
+       /* Use ERROR so it shows up in logs, etc. */
+       ipath_dev_err(dd, "Resetting PE-800 unit %u\n",
+                     dd->ipath_unit);
+       val = dd->ipath_control | INFINIPATH_C_RESET;
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_control, val);
+       mb();
+
+       for (i = 1; i <= 5; i++) {
+               int r;
+               /* allow MBIST, etc. to complete; longer on each retry.
+                * We sometimes get machine checks from bus timeout if no
+                * response, so for now, make it *really* long.
+                */
+               msleep(1000 + (1 + i) * 2000);
+               if ((r =
+                    pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0,
+                                           dd->ipath_pcibar0)))
+                       ipath_dev_err(dd, "rewrite of BAR0 failed: %d\n",
+                                     r);
+               if ((r =
+                    pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1,
+                                           dd->ipath_pcibar1)))
+                       ipath_dev_err(dd, "rewrite of BAR1 failed: %d\n",
+                                     r);
+               /* now re-enable memory access */
+               if ((r = pci_enable_device(dd->pcidev)))
+                       ipath_dev_err(dd, "pci_enable_device failed after "
+                                     "reset: %d\n", r);
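+               /* reading the revision register back tells us whether the
+                * chip has come out of reset and is decoding MMIO again */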
+               val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_revision);
+               if (val == dd->ipath_revision) {
+                       ipath_cdbg(VERBOSE, "Got matching revision "
+                                  "register %llx on try %d\n",
+                                  (unsigned long long) val, i);
+                       ret = ipath_reinit_msi(dd);
+                       goto bail;
+               }
+               /* Probably getting -1 back */
+               ipath_dbg("Didn't get expected revision register, "
+                         "got %llx, try %d\n", (unsigned long long) val,
+                         i + 1);
+       }
+       ret = 0; /* failed */
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_pe_put_tid - write a TID in chip
+ * @dd: the infinipath device
+ * @tidptr: pointer to the expected TID (in chip) to update
+ * @type: 0 for eager, 1 for expected
+ * @pa: physical address of in memory buffer; ipath_tidinvalid if freeing
+ *
+ * This exists as a separate routine to allow for special locking etc.
+ * It's used both for the full cleanup on exit and for the normal
+ * setup and teardown.
+ */
+static void ipath_pe_put_tid(struct ipath_devdata *dd, u64 __iomem *tidptr,
+                            u32 type, unsigned long pa)
+{
+       u32 __iomem *tidp32 = (u32 __iomem *)tidptr;
+       unsigned long flags = 0; /* keep gcc quiet */
+
+       if (pa != dd->ipath_tidinvalid) {
+               if (pa & ((1U << 11) - 1)) {
+                       dev_info(&dd->pcidev->dev, "BUG: physaddr %lx "
+                                "not 4KB aligned!\n", pa);
+                       return;
+               }
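+               /* the TID word holds the physical address shifted down by
+                * 11 bits, with a buffer-size code in bits 30:29 */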
+               pa >>= 11;
+               /* paranoia check */
+               if (pa & (7<<29))
+                       ipath_dev_err(dd,
+                                     "BUG: Physical page address 0x%lx "
+                                     "has bits set in 31-29\n", pa);
+
+               if (type == 0)
+                       pa |= dd->ipath_tidtemplate;
+               else /* for now, always full 4KB page */
+                       pa |= 2 << 29;
+       }
+
+       /* workaround chip bug 9437 by writing each TID twice
+        * and holding a spinlock around the writes, so they don't
+        * intermix with other TID (eager or expected) writes
+        * Unfortunately, this call can be done from interrupt level
+        * for the port 0 eager TIDs, so we have to use irqsave
+        */
+       spin_lock_irqsave(&dd->ipath_tid_lock, flags);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_scratch, 0xfeeddeaf);
+       if (dd->ipath_kregbase)
+               writel(pa, tidp32);
+       ipath_write_kreg(dd, dd->ipath_kregs->kr_scratch, 0xdeadbeef);
+       mmiowb();
+       spin_unlock_irqrestore(&dd->ipath_tid_lock, flags);
+}
+
+/**
+ * ipath_pe_clear_tid - clear all TID entries for a port, expected and eager
+ * @dd: the infinipath device
+ * @port: the port
+ *
+ * clear all TID entries for a port, expected and eager.
+ * Used from ipath_close().  On PE800, TIDs are only 32 bits,
+ * not 64, but they are still on 64 bit boundaries, so tidbase
+ * is declared as u64 * for the pointer math, even though we write 32 bits
+ */
+static void ipath_pe_clear_tids(struct ipath_devdata *dd, unsigned port)
+{
+       u64 __iomem *tidbase;
+       unsigned long tidinv;
+       int i;
+
+       if (!dd->ipath_kregbase)
+               return;
+
+       ipath_cdbg(VERBOSE, "Invalidate TIDs for port %u\n", port);
+
+       tidinv = dd->ipath_tidinvalid;
+       tidbase = (u64 __iomem *)
+               ((char __iomem *)(dd->ipath_kregbase) +
+                dd->ipath_rcvtidbase +
+                port * dd->ipath_rcvtidcnt * sizeof(*tidbase));
+
+       for (i = 0; i < dd->ipath_rcvtidcnt; i++)
+               ipath_pe_put_tid(dd, &tidbase[i], 0, tidinv);
+
+       tidbase = (u64 __iomem *)
+               ((char __iomem *)(dd->ipath_kregbase) +
+                dd->ipath_rcvegrbase +
+                port * dd->ipath_rcvegrcnt * sizeof(*tidbase));
+
+       for (i = 0; i < dd->ipath_rcvegrcnt; i++)
+               ipath_pe_put_tid(dd, &tidbase[i], 1, tidinv);
+}
+
+/**
+ * ipath_pe_tidtemplate - setup constants for TID updates
+ * @dd: the infinipath device
+ *
+ * We set up values that we use a lot, to avoid recalculating them each time
+ */
+static void ipath_pe_tidtemplate(struct ipath_devdata *dd)
+{
+       u32 egrsize = dd->ipath_rcvegrbufsize;
+
+       /* For now, we always allocate 4KB buffers (at init) so we can
+        * receive max size packets.  We may want a module parameter to
+        * specify 2KB or 4KB and/or make it per-port instead of per-device
+        * for those who want to reduce memory footprint.  Note that
+        * ipath_rcvhdrentsize must be large enough to hold the largest
+        * IB header (currently 96 bytes) that we expect to handle (plus of
+        * course the 2 dwords of RHF).
+        */
+       if (egrsize == 2048)
+               dd->ipath_tidtemplate = 1U << 29;
+       else if (egrsize == 4096)
+               dd->ipath_tidtemplate = 2U << 29;
+       else {
+               egrsize = 4096;
+               dev_info(&dd->pcidev->dev, "BUG: unsupported egrbufsize "
+                        "%u, using %u\n", dd->ipath_rcvegrbufsize,
+                        egrsize);
+               dd->ipath_tidtemplate = 2U << 29;
+       }
+       dd->ipath_tidinvalid = 0;
+}
+
+static int ipath_pe_early_init(struct ipath_devdata *dd)
+{
+       dd->ipath_flags |= IPATH_4BYTE_TID;
+
+       /*
+        * For openib, we need to be able to handle an IB header of 96 bytes
+        * or 24 dwords.  HT-400 has arbitrarily sized receive buffers, so we
+        * made them the same size as the PIO buffers.  The PE-800 does not
+        * handle arbitrary size buffers, so we need the header large enough
+        * to handle largest IB header, but still have room for a 2KB MTU
+        * standard IB packet.
+        */
+       dd->ipath_rcvhdrentsize = 24;
+       dd->ipath_rcvhdrsize = IPATH_DFLT_RCVHDRSIZE;
+
+       /* For HT-400, we allocate a somewhat overly large eager buffer,
+        * such that we can guarantee that we can receive the largest packet
+        * that we can send out.  To truly support a 4KB MTU, we need to
+        * bump this to a larger value.  We'll do this when I get around to
+        * testing 4KB sends on the PE-800, which I have not yet done.
+        */
+       dd->ipath_rcvegrbufsize = 2048;
+       /*
+        * the min() check here is currently a nop, but it may not always
+        * be, depending on just how we do ipath_rcvegrbufsize
+        */
+       dd->ipath_ibmaxlen = min(dd->ipath_piosize2k,
+                                dd->ipath_rcvegrbufsize +
+                                (dd->ipath_rcvhdrentsize << 2));
+       dd->ipath_init_ibmaxlen = dd->ipath_ibmaxlen;
+
+       /*
+        * For PE-800, we can request a receive interrupt for 1 or
+        * more packets from current offset.  For now, we set this
+        * up for a single packet, to match the HT-400 behavior.
+        */
+       dd->ipath_rhdrhead_intr_off = 1ULL<<32;
+
+       return 0;
+}
+
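+/* weak default: a stronger, architecture-specific definition overrides
+ * this where write combining may reorder writes; 0 means ordered WC */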
+int __attribute__((weak)) ipath_unordered_wc(void)
+{
+       return 0;
+}
+
+/**
+ * ipath_pe_get_base_info - set chip-specific flags for user code
+ * @pd: the infinipath port data
+ * @kbase: ipath_base_info pointer
+ *
+ * We set the PCIE flag because the lower bandwidth on PCIe vs
+ * HyperTransport can affect some user packet algorithms.
+ */
+static int ipath_pe_get_base_info(struct ipath_portdata *pd, void *kbase)
+{
+       struct ipath_base_info *kinfo = kbase;
+
+       if (ipath_unordered_wc()) {
+               kinfo->spi_runtime_flags |= IPATH_RUNTIME_FORCE_WC_ORDER;
+               ipath_cdbg(PROC, "Intel processor, forcing WC order\n");
+       }
+       } else
+
+       kinfo->spi_runtime_flags |= IPATH_RUNTIME_PCIE;
+
+       return 0;
+}
+
+/**
+ * ipath_init_pe800_funcs - set up the chip-specific function pointers
+ * @dd: the infinipath device
+ *
+ * This is global, and is called directly at init to set up the
+ * chip-specific function pointers for later use.
+ */
+void ipath_init_pe800_funcs(struct ipath_devdata *dd)
+{
+       dd->ipath_f_intrsetup = ipath_pe_intconfig;
+       dd->ipath_f_bus = ipath_setup_pe_config;
+       dd->ipath_f_reset = ipath_setup_pe_reset;
+       dd->ipath_f_get_boardname = ipath_pe_boardname;
+       dd->ipath_f_init_hwerrors = ipath_pe_init_hwerrors;
+       dd->ipath_f_early_init = ipath_pe_early_init;
+       dd->ipath_f_handle_hwerrors = ipath_pe_handle_hwerrors;
+       dd->ipath_f_quiet_serdes = ipath_pe_quiet_serdes;
+       dd->ipath_f_bringup_serdes = ipath_pe_bringup_serdes;
+       dd->ipath_f_clear_tids = ipath_pe_clear_tids;
+       dd->ipath_f_put_tid = ipath_pe_put_tid;
+       dd->ipath_f_cleanup = ipath_setup_pe_cleanup;
+       dd->ipath_f_setextled = ipath_setup_pe_setextled;
+       dd->ipath_f_get_base_info = ipath_pe_get_base_info;
+
+       /* initialize chip-specific variables */
+       dd->ipath_f_tidtemplate = ipath_pe_tidtemplate;
+
+       /*
+        * setup the register offsets, since they are different for each
+        * chip
+        */
+       dd->ipath_kregs = &ipath_pe_kregs;
+       dd->ipath_cregs = &ipath_pe_cregs;
+
+       ipath_init_pe_variables();
+}
+
diff --git a/drivers/infiniband/hw/ipath/ipath_qp.c b/drivers/infiniband/hw/ipath/ipath_qp.c
new file mode 100644 (file)
index 0000000..6058d70
--- /dev/null
@@ -0,0 +1,913 @@
+/*
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_verbs.h"
+#include "ips_common.h"
+
+#define BITS_PER_PAGE          (PAGE_SIZE*BITS_PER_BYTE)
+#define BITS_PER_PAGE_MASK     (BITS_PER_PAGE-1)
+#define mk_qpn(qpt, map, off)  (((map) - (qpt)->map) * BITS_PER_PAGE + \
+                                (off))
+#define find_next_offset(map, off) find_next_zero_bit((map)->page, \
+                                                     BITS_PER_PAGE, off)
+
+#define TRANS_INVALID  0
+#define TRANS_ANY2RST  1
+#define TRANS_RST2INIT 2
+#define TRANS_INIT2INIT        3
+#define TRANS_INIT2RTR 4
+#define TRANS_RTR2RTS  5
+#define TRANS_RTS2RTS  6
+#define TRANS_SQERR2RTS        7
+#define TRANS_ANY2ERR  8
+#define TRANS_RTS2SQD  9  /* XXX Wait for expected ACKs & signal event */
+#define TRANS_SQD2SQD  10 /* error if not drained & parameter change */
+#define TRANS_SQD2RTS  11 /* error if not drained */
+
+/*
+ * Convert the AETH credit code into the number of credits.
+ */
+static u32 credit_table[31] = {
+       0,                      /* 0 */
+       1,                      /* 1 */
+       2,                      /* 2 */
+       3,                      /* 3 */
+       4,                      /* 4 */
+       6,                      /* 5 */
+       8,                      /* 6 */
+       12,                     /* 7 */
+       16,                     /* 8 */
+       24,                     /* 9 */
+       32,                     /* A */
+       48,                     /* B */
+       64,                     /* C */
+       96,                     /* D */
+       128,                    /* E */
+       192,                    /* F */
+       256,                    /* 10 */
+       384,                    /* 11 */
+       512,                    /* 12 */
+       768,                    /* 13 */
+       1024,                   /* 14 */
+       1536,                   /* 15 */
+       2048,                   /* 16 */
+       3072,                   /* 17 */
+       4096,                   /* 18 */
+       6144,                   /* 19 */
+       8192,                   /* 1A */
+       12288,                  /* 1B */
+       16384,                  /* 1C */
+       24576,                  /* 1D */
+       32768                   /* 1E */
+};
+
+static u32 alloc_qpn(struct ipath_qp_table *qpt)
+{
+       u32 i, offset, max_scan, qpn;
+       struct qpn_map *map;
+       u32 ret;
+
+       qpn = qpt->last + 1;
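+       /* QPNs 0 and 1 are reserved for the SMI and GSI QPs, so wrap
+        * back to 2 rather than 0 when the allocator runs off the end */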
+       if (qpn >= QPN_MAX)
+               qpn = 2;
+       offset = qpn & BITS_PER_PAGE_MASK;
+       map = &qpt->map[qpn / BITS_PER_PAGE];
+       max_scan = qpt->nmaps - !offset;
+       for (i = 0;;) {
+               if (unlikely(!map->page)) {
+                       unsigned long page = get_zeroed_page(GFP_KERNEL);
+                       unsigned long flags;
+
+                       /*
+                        * Free the page if someone raced with us
+                        * installing it:
+                        */
+                       spin_lock_irqsave(&qpt->lock, flags);
+                       if (map->page)
+                               free_page(page);
+                       else
+                               map->page = (void *)page;
+                       spin_unlock_irqrestore(&qpt->lock, flags);
+                       if (unlikely(!map->page))
+                               break;
+               }
+               if (likely(atomic_read(&map->n_free))) {
+                       do {
+                               if (!test_and_set_bit(offset, map->page)) {
+                                       atomic_dec(&map->n_free);
+                                       qpt->last = qpn;
+                                       ret = qpn;
+                                       goto bail;
+                               }
+                               offset = find_next_offset(map, offset);
+                               qpn = mk_qpn(qpt, map, offset);
+                               /*
+                                * This test differs from alloc_pidmap().
+                                * If find_next_offset() does find a zero
+                                * bit, we don't need to check for QPN
+                                * wrapping around past our starting QPN.
+                                * We just need to be sure we don't loop
+                                * forever.
+                                */
+                       } while (offset < BITS_PER_PAGE && qpn < QPN_MAX);
+               }
+               /*
+                * In order to keep the number of pages allocated to a
+                * minimum, we scan all the existing pages before increasing
+                * the size of the bitmap table.
+                */
+               if (++i > max_scan) {
+                       if (qpt->nmaps == QPNMAP_ENTRIES)
+                               break;
+                       map = &qpt->map[qpt->nmaps++];
+                       offset = 0;
+               } else if (map < &qpt->map[qpt->nmaps]) {
+                       ++map;
+                       offset = 0;
+               } else {
+                       map = &qpt->map[0];
+                       offset = 2;
+               }
+               qpn = mk_qpn(qpt, map, offset);
+       }
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+static void free_qpn(struct ipath_qp_table *qpt, u32 qpn)
+{
+       struct qpn_map *map;
+
+       map = qpt->map + qpn / BITS_PER_PAGE;
+       if (map->page)
+               clear_bit(qpn & BITS_PER_PAGE_MASK, map->page);
+       atomic_inc(&map->n_free);
+}
+
+/**
+ * ipath_alloc_qpn - allocate a QP number
+ * @qpt: the QP table
+ * @qp: the QP
+ * @type: the QP type (IB_QPT_SMI and IB_QPT_GSI are special)
+ *
+ * Allocate the next available QPN and put the QP into the hash table.
+ * The hash table holds a reference to the QP.
+ */
+int ipath_alloc_qpn(struct ipath_qp_table *qpt, struct ipath_qp *qp,
+                   enum ib_qp_type type)
+{
+       unsigned long flags;
+       u32 qpn;
+       int ret;
+
+       if (type == IB_QPT_SMI)
+               qpn = 0;
+       else if (type == IB_QPT_GSI)
+               qpn = 1;
+       else {
+               /* Allocate the next available QPN */
+               qpn = alloc_qpn(qpt);
+               if (qpn == 0) {
+                       ret = -ENOMEM;
+                       goto bail;
+               }
+       }
+       qp->ibqp.qp_num = qpn;
+
+       /* Add the QP to the hash table. */
+       spin_lock_irqsave(&qpt->lock, flags);
+
+       qpn %= qpt->max;
+       qp->next = qpt->table[qpn];
+       qpt->table[qpn] = qp;
+       atomic_inc(&qp->refcount);
+
+       spin_unlock_irqrestore(&qpt->lock, flags);
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_free_qp - remove a QP from the QP table
+ * @qpt: the QP table
+ * @qp: the QP to remove
+ *
+ * Remove the QP from the table so it can't be found asynchronously by
+ * the receive interrupt routine.
+ */
+void ipath_free_qp(struct ipath_qp_table *qpt, struct ipath_qp *qp)
+{
+       struct ipath_qp *q, **qpp;
+       unsigned long flags;
+       int fnd = 0;
+
+       spin_lock_irqsave(&qpt->lock, flags);
+
+       /* Remove QP from the hash table. */
+       qpp = &qpt->table[qp->ibqp.qp_num % qpt->max];
+       for (; (q = *qpp) != NULL; qpp = &q->next) {
+               if (q == qp) {
+                       *qpp = qp->next;
+                       qp->next = NULL;
+                       atomic_dec(&qp->refcount);
+                       fnd = 1;
+                       break;
+               }
+       }
+
+       spin_unlock_irqrestore(&qpt->lock, flags);
+
+       if (!fnd)
+               return;
+
+       /* If QPN is not reserved, mark QPN free in the bitmap. */
+       if (qp->ibqp.qp_num > 1)
+               free_qpn(qpt, qp->ibqp.qp_num);
+
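+       /* wait for any lookups still holding a reference (e.g. from the
+        * receive interrupt path) to drop it before the QP is freed */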
+       wait_event(qp->wait, !atomic_read(&qp->refcount));
+}
+
+/**
+ * ipath_free_all_qps - remove all QPs from the table
+ * @qpt: the QP table to empty
+ */
+void ipath_free_all_qps(struct ipath_qp_table *qpt)
+{
+       unsigned long flags;
+       struct ipath_qp *qp, *nqp;
+       u32 n;
+
+       for (n = 0; n < qpt->max; n++) {
+               spin_lock_irqsave(&qpt->lock, flags);
+               qp = qpt->table[n];
+               qpt->table[n] = NULL;
+               spin_unlock_irqrestore(&qpt->lock, flags);
+
+               while (qp) {
+                       nqp = qp->next;
+                       if (qp->ibqp.qp_num > 1)
+                               free_qpn(qpt, qp->ibqp.qp_num);
+                       if (!atomic_dec_and_test(&qp->refcount) ||
+                           !ipath_destroy_qp(&qp->ibqp))
+                               _VERBS_INFO("QP memory leak!\n");
+                       qp = nqp;
+               }
+       }
+
+       for (n = 0; n < ARRAY_SIZE(qpt->map); n++) {
+               if (qpt->map[n].page)
+                       free_page((unsigned long)qpt->map[n].page);
+       }
+}
+
+/**
+ * ipath_lookup_qpn - return the QP with the given QPN
+ * @qpt: the QP table
+ * @qpn: the QP number to look up
+ *
+ * The caller is responsible for decrementing the QP reference count
+ * when done.
+ */
+struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn)
+{
+       unsigned long flags;
+       struct ipath_qp *qp;
+
+       spin_lock_irqsave(&qpt->lock, flags);
+
+       for (qp = qpt->table[qpn % qpt->max]; qp; qp = qp->next) {
+               if (qp->ibqp.qp_num == qpn) {
+                       atomic_inc(&qp->refcount);
+                       break;
+               }
+       }
+
+       spin_unlock_irqrestore(&qpt->lock, flags);
+       return qp;
+}
+
+/**
+ * ipath_reset_qp - initialize the QP state to the reset state
+ * @qp: the QP to reset
+ */
+static void ipath_reset_qp(struct ipath_qp *qp)
+{
+       qp->remote_qpn = 0;
+       qp->qkey = 0;
+       qp->qp_access_flags = 0;
+       qp->s_hdrwords = 0;
+       qp->s_psn = 0;
+       qp->r_psn = 0;
+       atomic_set(&qp->msn, 0);
+       if (qp->ibqp.qp_type == IB_QPT_RC) {
+               qp->s_state = IB_OPCODE_RC_SEND_LAST;
+               qp->r_state = IB_OPCODE_RC_SEND_LAST;
+       } else {
+               qp->s_state = IB_OPCODE_UC_SEND_LAST;
+               qp->r_state = IB_OPCODE_UC_SEND_LAST;
+       }
+       qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
+       qp->s_nak_state = 0;
+       qp->s_rnr_timeout = 0;
+       qp->s_head = 0;
+       qp->s_tail = 0;
+       qp->s_cur = 0;
+       qp->s_last = 0;
+       qp->s_ssn = 1;
+       qp->s_lsn = 0;
+       qp->r_rq.head = 0;
+       qp->r_rq.tail = 0;
+       qp->r_reuse_sge = 0;
+}
+
+/**
+ * ipath_modify_qp - modify the attributes of a queue pair
+ * @ibqp: the queue pair whose attributes we're modifying
+ * @attr: the new attributes
+ * @attr_mask: the mask of attributes to modify
+ *
+ * Returns 0 on success, otherwise returns an errno.
+ */
+int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+                   int attr_mask)
+{
+       struct ipath_qp *qp = to_iqp(ibqp);
+       enum ib_qp_state cur_state, new_state;
+       unsigned long flags;
+       int ret;
+
+       spin_lock_irqsave(&qp->r_rq.lock, flags);
+       spin_lock(&qp->s_lock);
+
+       cur_state = attr_mask & IB_QP_CUR_STATE ?
+               attr->cur_qp_state : qp->state;
+       new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+       if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
+                               attr_mask))
+               goto inval;
+
+       switch (new_state) {
+       case IB_QPS_RESET:
+               ipath_reset_qp(qp);
+               break;
+
+       case IB_QPS_ERR:
+               ipath_error_qp(qp);
+               break;
+
+       default:
+               break;
+
+       }
+
+       if (attr_mask & IB_QP_PKEY_INDEX) {
+               struct ipath_ibdev *dev = to_idev(ibqp->device);
+
+               if (attr->pkey_index >= ipath_layer_get_npkeys(dev->dd))
+                       goto inval;
+               qp->s_pkey_index = attr->pkey_index;
+       }
+
+       if (attr_mask & IB_QP_DEST_QPN)
+               qp->remote_qpn = attr->dest_qp_num;
+
+       if (attr_mask & IB_QP_SQ_PSN) {
+               qp->s_next_psn = attr->sq_psn;
+               qp->s_last_psn = qp->s_next_psn - 1;
+       }
+
+       if (attr_mask & IB_QP_RQ_PSN)
+               qp->r_psn = attr->rq_psn;
+
+       if (attr_mask & IB_QP_ACCESS_FLAGS)
+               qp->qp_access_flags = attr->qp_access_flags;
+
+       if (attr_mask & IB_QP_AV) {
+               if (attr->ah_attr.dlid == 0 ||
+                   attr->ah_attr.dlid >= IPS_MULTICAST_LID_BASE)
+                       goto inval;
+               qp->remote_ah_attr = attr->ah_attr;
+       }
+
+       if (attr_mask & IB_QP_PATH_MTU)
+               qp->path_mtu = attr->path_mtu;
+
+       if (attr_mask & IB_QP_RETRY_CNT)
+               qp->s_retry = qp->s_retry_cnt = attr->retry_cnt;
+
+       if (attr_mask & IB_QP_RNR_RETRY) {
+               qp->s_rnr_retry = attr->rnr_retry;
+               if (qp->s_rnr_retry > 7)
+                       qp->s_rnr_retry = 7;
+               qp->s_rnr_retry_cnt = qp->s_rnr_retry;
+       }
+
+       if (attr_mask & IB_QP_MIN_RNR_TIMER) {
+               if (attr->min_rnr_timer > 31)
+                       goto inval;
+               qp->s_min_rnr_timer = attr->min_rnr_timer;
+       }
+
+       if (attr_mask & IB_QP_QKEY)
+               qp->qkey = attr->qkey;
+
+       if (attr_mask & IB_QP_PKEY_INDEX)
+               qp->s_pkey_index = attr->pkey_index;
+
+       qp->state = new_state;
+       spin_unlock(&qp->s_lock);
+       spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+
+       /*
+        * If QP1 changed to the RTS state, try to move the link to INIT
+        * even if it was ACTIVE so the SM will reinitialize the SMA's
+        * state.
+        */
+       if (qp->ibqp.qp_num == 1 && new_state == IB_QPS_RTS) {
+               struct ipath_ibdev *dev = to_idev(ibqp->device);
+
+               ipath_layer_set_linkstate(dev->dd, IPATH_IB_LINKDOWN);
+       }
+       ret = 0;
+       goto bail;
+
+inval:
+       spin_unlock(&qp->s_lock);
+       spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+       ret = -EINVAL;
+
+bail:
+       return ret;
+}
+
+int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+                  int attr_mask, struct ib_qp_init_attr *init_attr)
+{
+       struct ipath_qp *qp = to_iqp(ibqp);
+
+       attr->qp_state = qp->state;
+       attr->cur_qp_state = attr->qp_state;
+       attr->path_mtu = qp->path_mtu;
+       attr->path_mig_state = 0;
+       attr->qkey = qp->qkey;
+       attr->rq_psn = qp->r_psn;
+       attr->sq_psn = qp->s_next_psn;
+       attr->dest_qp_num = qp->remote_qpn;
+       attr->qp_access_flags = qp->qp_access_flags;
+       attr->cap.max_send_wr = qp->s_size - 1;
+       attr->cap.max_recv_wr = qp->r_rq.size - 1;
+       attr->cap.max_send_sge = qp->s_max_sge;
+       attr->cap.max_recv_sge = qp->r_rq.max_sge;
+       attr->cap.max_inline_data = 0;
+       attr->ah_attr = qp->remote_ah_attr;
+       memset(&attr->alt_ah_attr, 0, sizeof(attr->alt_ah_attr));
+       attr->pkey_index = qp->s_pkey_index;
+       attr->alt_pkey_index = 0;
+       attr->en_sqd_async_notify = 0;
+       attr->sq_draining = 0;
+       attr->max_rd_atomic = 1;
+       attr->max_dest_rd_atomic = 1;
+       attr->min_rnr_timer = qp->s_min_rnr_timer;
+       attr->port_num = 1;
+       attr->timeout = 0;
+       attr->retry_cnt = qp->s_retry_cnt;
+       attr->rnr_retry = qp->s_rnr_retry;
+       attr->alt_port_num = 0;
+       attr->alt_timeout = 0;
+
+       init_attr->event_handler = qp->ibqp.event_handler;
+       init_attr->qp_context = qp->ibqp.qp_context;
+       init_attr->send_cq = qp->ibqp.send_cq;
+       init_attr->recv_cq = qp->ibqp.recv_cq;
+       init_attr->srq = qp->ibqp.srq;
+       init_attr->cap = attr->cap;
+       init_attr->sq_sig_type =
+               (qp->s_flags & (1 << IPATH_S_SIGNAL_REQ_WR))
+               ? IB_SIGNAL_REQ_WR : 0;
+       init_attr->qp_type = qp->ibqp.qp_type;
+       init_attr->port_num = 1;
+       return 0;
+}
+
+/**
+ * ipath_compute_aeth - compute the AETH (syndrome + MSN)
+ * @qp: the queue pair to compute the AETH for
+ *
+ * Returns the AETH.
+ *
+ * The QP s_lock should be held.
+ */
+__be32 ipath_compute_aeth(struct ipath_qp *qp)
+{
+       u32 aeth = atomic_read(&qp->msn) & IPS_MSN_MASK;
+
+       if (qp->s_nak_state) {
+               aeth |= qp->s_nak_state << IPS_AETH_CREDIT_SHIFT;
+       } else if (qp->ibqp.srq) {
+               /*
+                * Shared receive queues don't generate credits.
+                * Set the credit field to the invalid value.
+                */
+               aeth |= IPS_AETH_CREDIT_INVAL << IPS_AETH_CREDIT_SHIFT;
+       } else {
+               u32 min, max, x;
+               u32 credits;
+
+               /*
+                * Compute the number of credits available (RWQEs).
+                * XXX Not holding the r_rq.lock here so there is a small
+                * chance that the pair of reads is not atomic.
+                */
+               credits = qp->r_rq.head - qp->r_rq.tail;
+               if ((int)credits < 0)
+                       credits += qp->r_rq.size;
+               /*
+                * Binary search the credit table to find the code to
+                * use.
+                */
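+               /* e.g. 10 available RWQEs encode as code 6 (8 credits):
+                * the largest table entry not exceeding the count */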
+               min = 0;
+               max = 31;
+               for (;;) {
+                       x = (min + max) / 2;
+                       if (credit_table[x] == credits)
+                               break;
+                       if (credit_table[x] > credits)
+                               max = x;
+                       else if (min == x)
+                               break;
+                       else
+                               min = x;
+               }
+               aeth |= x << IPS_AETH_CREDIT_SHIFT;
+       }
+       return cpu_to_be32(aeth);
+}
+
+/**
+ * ipath_create_qp - create a queue pair for a device
+ * @ibpd: the protection domain whose device we create the queue pair for
+ * @init_attr: the attributes of the queue pair
+ * @udata: unused by InfiniPath
+ *
+ * Returns the queue pair on success, otherwise returns an errno.
+ *
+ * Called by the ib_create_qp() core verbs function.
+ */
+struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
+                             struct ib_qp_init_attr *init_attr,
+                             struct ib_udata *udata)
+{
+       struct ipath_qp *qp;
+       int err;
+       struct ipath_swqe *swq = NULL;
+       struct ipath_ibdev *dev;
+       size_t sz;
+       struct ib_qp *ret;
+
+       if (init_attr->cap.max_send_sge > 255 ||
+           init_attr->cap.max_recv_sge > 255) {
+               ret = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+
+       switch (init_attr->qp_type) {
+       case IB_QPT_UC:
+       case IB_QPT_RC:
+               sz = sizeof(struct ipath_sge) *
+                       init_attr->cap.max_send_sge +
+                       sizeof(struct ipath_swqe);
+               swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz);
+               if (swq == NULL) {
+                       ret = ERR_PTR(-ENOMEM);
+                       goto bail;
+               }
+               /* FALLTHROUGH */
+       case IB_QPT_UD:
+       case IB_QPT_SMI:
+       case IB_QPT_GSI:
+               qp = kmalloc(sizeof(*qp), GFP_KERNEL);
+               if (!qp) {
+                       ret = ERR_PTR(-ENOMEM);
+                       goto bail;
+               }
+               qp->r_rq.size = init_attr->cap.max_recv_wr + 1;
+               sz = sizeof(struct ipath_sge) *
+                       init_attr->cap.max_recv_sge +
+                       sizeof(struct ipath_rwqe);
+               qp->r_rq.wq = vmalloc(qp->r_rq.size * sz);
+               if (!qp->r_rq.wq) {
+                       kfree(qp);
+                       ret = ERR_PTR(-ENOMEM);
+                       goto bail;
+               }
+
+               /*
+                * ib_create_qp() will initialize qp->ibqp
+                * except for qp->ibqp.qp_num.
+                */
+               spin_lock_init(&qp->s_lock);
+               spin_lock_init(&qp->r_rq.lock);
+               atomic_set(&qp->refcount, 0);
+               init_waitqueue_head(&qp->wait);
+               tasklet_init(&qp->s_task,
+                            init_attr->qp_type == IB_QPT_RC ?
+                            ipath_do_rc_send : ipath_do_uc_send,
+                            (unsigned long)qp);
+               qp->piowait.next = LIST_POISON1;
+               qp->piowait.prev = LIST_POISON2;
+               qp->timerwait.next = LIST_POISON1;
+               qp->timerwait.prev = LIST_POISON2;
+               qp->state = IB_QPS_RESET;
+               qp->s_wq = swq;
+               qp->s_size = init_attr->cap.max_send_wr + 1;
+               qp->s_max_sge = init_attr->cap.max_send_sge;
+               qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
+               qp->s_flags = init_attr->sq_sig_type == IB_SIGNAL_REQ_WR ?
+                       1 << IPATH_S_SIGNAL_REQ_WR : 0;
+               dev = to_idev(ibpd->device);
+               err = ipath_alloc_qpn(&dev->qp_table, qp,
+                                     init_attr->qp_type);
+               if (err) {
+                       vfree(swq);
+                       vfree(qp->r_rq.wq);
+                       kfree(qp);
+                       ret = ERR_PTR(err);
+                       goto bail;
+               }
+               ipath_reset_qp(qp);
+
+               /* Tell the core driver that the kernel SMA is present. */
+               if (qp->ibqp.qp_type == IB_QPT_SMI)
+                       ipath_layer_set_verbs_flags(dev->dd,
+                                                   IPATH_VERBS_KERNEL_SMA);
+               break;
+
+       default:
+               /* Don't support raw QPs */
+               ret = ERR_PTR(-ENOSYS);
+               goto bail;
+       }
+
+       init_attr->cap.max_inline_data = 0;
+
+       ret = &qp->ibqp;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_destroy_qp - destroy a queue pair
+ * @ibqp: the queue pair to destroy
+ *
+ * Returns 0 on success.
+ *
+ * Note that this can be called while the QP is actively sending or
+ * receiving!
+ */
+int ipath_destroy_qp(struct ib_qp *ibqp)
+{
+       struct ipath_qp *qp = to_iqp(ibqp);
+       struct ipath_ibdev *dev = to_idev(ibqp->device);
+       unsigned long flags;
+
+       /* Tell the core driver that the kernel SMA is gone. */
+       if (qp->ibqp.qp_type == IB_QPT_SMI)
+               ipath_layer_set_verbs_flags(dev->dd, 0);
+
+       spin_lock_irqsave(&qp->r_rq.lock, flags);
+       spin_lock(&qp->s_lock);
+       qp->state = IB_QPS_ERR;
+       spin_unlock(&qp->s_lock);
+       spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+
+       /* Stop the sending tasklet. */
+       tasklet_kill(&qp->s_task);
+
+       /* Make sure the QP isn't on the timeout list. */
+       spin_lock_irqsave(&dev->pending_lock, flags);
+       if (qp->timerwait.next != LIST_POISON1)
+               list_del(&qp->timerwait);
+       if (qp->piowait.next != LIST_POISON1)
+               list_del(&qp->piowait);
+       spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+       /*
+        * Make sure that the QP is not in the QPN table so receive
+        * interrupts will discard packets for this QP.  XXX Also remove QP
+        * from multicast table.
+        */
+       if (atomic_read(&qp->refcount) != 0)
+               ipath_free_qp(&dev->qp_table, qp);
+
+       vfree(qp->s_wq);
+       vfree(qp->r_rq.wq);
+       kfree(qp);
+       return 0;
+}
+
+/**
+ * ipath_init_qp_table - initialize the QP table for a device
+ * @idev: the device whose QP table we're initializing
+ * @size: the size of the QP table
+ *
+ * Returns 0 on success, otherwise returns an errno.
+ */
+int ipath_init_qp_table(struct ipath_ibdev *idev, int size)
+{
+       int i;
+       int ret;
+
+       idev->qp_table.last = 1;        /* QPN 0 and 1 are special. */
+       idev->qp_table.max = size;
+       idev->qp_table.nmaps = 1;
+       idev->qp_table.table = kzalloc(size * sizeof(*idev->qp_table.table),
+                                      GFP_KERNEL);
+       if (idev->qp_table.table == NULL) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+
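+       /*
+        * Each QPN bitmap page starts out unallocated with all
+        * BITS_PER_PAGE entries free.
+        */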
+       for (i = 0; i < ARRAY_SIZE(idev->qp_table.map); i++) {
+               atomic_set(&idev->qp_table.map[i].n_free, BITS_PER_PAGE);
+               idev->qp_table.map[i].page = NULL;
+       }
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_sqerror_qp - put a QP's send queue into an error state
+ * @qp: QP whose send queue will be put into an error state
+ * @wc: the WC responsible for putting the QP in this state
+ *
+ * Flushes the send work queue.
+ * The QP s_lock should be held.
+ */
+
+void ipath_sqerror_qp(struct ipath_qp *qp, struct ib_wc *wc)
+{
+       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+       struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
+
+       _VERBS_INFO("Send queue error on QP%d/%d: err: %d\n",
+                   qp->ibqp.qp_num, qp->remote_qpn, wc->status);
+
+       spin_lock(&dev->pending_lock);
+       /* XXX What if it's already removed by the timeout code? */
+       if (qp->timerwait.next != LIST_POISON1)
+               list_del(&qp->timerwait);
+       if (qp->piowait.next != LIST_POISON1)
+               list_del(&qp->piowait);
+       spin_unlock(&dev->pending_lock);
+
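+       /*
+        * Complete the failed WQE with the caller's status, then flush
+        * the rest of the send queue with IB_WC_WR_FLUSH_ERR.
+        */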
+       ipath_cq_enter(to_icq(qp->ibqp.send_cq), wc, 1);
+       if (++qp->s_last >= qp->s_size)
+               qp->s_last = 0;
+
+       wc->status = IB_WC_WR_FLUSH_ERR;
+
+       while (qp->s_last != qp->s_head) {
+               wc->wr_id = wqe->wr.wr_id;
+               wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
+               ipath_cq_enter(to_icq(qp->ibqp.send_cq), wc, 1);
+               if (++qp->s_last >= qp->s_size)
+                       qp->s_last = 0;
+               wqe = get_swqe_ptr(qp, qp->s_last);
+       }
+       qp->s_cur = qp->s_tail = qp->s_head;
+       qp->state = IB_QPS_SQE;
+}
+
+/**
+ * ipath_error_qp - put a QP into an error state
+ * @qp: the QP to put into an error state
+ *
+ * Flushes both send and receive work queues.
+ * QP r_rq.lock and s_lock should be held.
+ */
+
+void ipath_error_qp(struct ipath_qp *qp)
+{
+       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+       struct ib_wc wc;
+
+       _VERBS_INFO("QP%d/%d in error state\n",
+                   qp->ibqp.qp_num, qp->remote_qpn);
+
+       spin_lock(&dev->pending_lock);
+       /* XXX What if it's already removed by the timeout code? */
+       if (qp->timerwait.next != LIST_POISON1)
+               list_del(&qp->timerwait);
+       if (qp->piowait.next != LIST_POISON1)
+               list_del(&qp->piowait);
+       spin_unlock(&dev->pending_lock);
+
+       wc.status = IB_WC_WR_FLUSH_ERR;
+       wc.vendor_err = 0;
+       wc.byte_len = 0;
+       wc.imm_data = 0;
+       wc.qp_num = qp->ibqp.qp_num;
+       wc.src_qp = 0;
+       wc.wc_flags = 0;
+       wc.pkey_index = 0;
+       wc.slid = 0;
+       wc.sl = 0;
+       wc.dlid_path_bits = 0;
+       wc.port_num = 0;
+
+       while (qp->s_last != qp->s_head) {
+               struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
+
+               wc.wr_id = wqe->wr.wr_id;
+               wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
+               if (++qp->s_last >= qp->s_size)
+                       qp->s_last = 0;
+               ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 1);
+       }
+       qp->s_cur = qp->s_tail = qp->s_head;
+       qp->s_hdrwords = 0;
+       qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
+
+       wc.opcode = IB_WC_RECV;
+       while (qp->r_rq.tail != qp->r_rq.head) {
+               wc.wr_id = get_rwqe_ptr(&qp->r_rq, qp->r_rq.tail)->wr_id;
+               if (++qp->r_rq.tail >= qp->r_rq.size)
+                       qp->r_rq.tail = 0;
+               ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+       }
+}
+
+/**
+ * ipath_get_credit - handle a credit update contained in an AETH
+ * @qp: the QP whose send work queue may be restarted
+ * @aeth: the Acknowledge Extended Transport Header
+ *
+ * The QP s_lock should be held.
+ */
+void ipath_get_credit(struct ipath_qp *qp, u32 aeth)
+{
+       u32 credit = (aeth >> IPS_AETH_CREDIT_SHIFT) & IPS_AETH_CREDIT_MASK;
+
+       /*
+        * If the credit is invalid, we can send
+        * as many packets as we like.  Otherwise, we have to
+        * honor the credit field.
+        */
+       if (credit == IPS_AETH_CREDIT_INVAL) {
+               qp->s_lsn = (u32) -1;
+       } else if (qp->s_lsn != (u32) -1) {
+               /* Compute new LSN (i.e., MSN + credit) */
+               credit = (aeth + credit_table[credit]) & IPS_MSN_MASK;
+               if (ipath_cmp24(credit, qp->s_lsn) > 0)
+                       qp->s_lsn = credit;
+       }
+
+       /* Restart sending if it was blocked due to lack of credits. */
+       if (qp->s_cur != qp->s_head &&
+           (qp->s_lsn == (u32) -1 ||
+            ipath_cmp24(get_swqe_ptr(qp, qp->s_cur)->ssn,
+                        qp->s_lsn + 1) <= 0))
+               tasklet_hi_schedule(&qp->s_task);
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_rc.c b/drivers/infiniband/hw/ipath/ipath_rc.c
new file mode 100644 (file)
index 0000000..a4055ca
--- /dev/null
@@ -0,0 +1,1857 @@
+/*
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ipath_verbs.h"
+#include "ips_common.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_RC_##x
+
+/**
+ * ipath_init_restart - initialize the qp->s_sge after a restart
+ * @qp: the QP whose SGE we're restarting
+ * @wqe: the work queue to initialize the QP's SGE from
+ *
+ * The QP s_lock should be held.
+ */
+static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe)
+{
+       struct ipath_ibdev *dev;
+       u32 len;
+
+       len = ((qp->s_psn - wqe->psn) & IPS_PSN_MASK) *
+               ib_mtu_enum_to_int(qp->path_mtu);
+       qp->s_sge.sge = wqe->sg_list[0];
+       qp->s_sge.sg_list = wqe->sg_list + 1;
+       qp->s_sge.num_sge = wqe->wr.num_sge;
+       ipath_skip_sge(&qp->s_sge, len);
+       qp->s_len = wqe->length - len;
+       dev = to_idev(qp->ibqp.device);
+       spin_lock(&dev->pending_lock);
+       if (qp->timerwait.next == LIST_POISON1)
+               list_add_tail(&qp->timerwait,
+                             &dev->pending[dev->pending_index]);
+       spin_unlock(&dev->pending_lock);
+}
+
+/**
+ * ipath_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
+ * @qp: a pointer to the QP
+ * @ohdr: a pointer to the IB header being constructed
+ * @pmtu: the path MTU
+ *
+ * Return bth0 if constructed; otherwise, return 0.
+ * Note the QP s_lock must be held.
+ */
+static inline u32 ipath_make_rc_ack(struct ipath_qp *qp,
+                                   struct ipath_other_headers *ohdr,
+                                   u32 pmtu)
+{
+       struct ipath_sge_state *ss;
+       u32 hwords;
+       u32 len;
+       u32 bth0;
+
+       /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+       hwords = 5;
+
+       /*
+        * Send a response.  Note that we are in the responder's
+        * side of the QP context.
+        */
+       switch (qp->s_ack_state) {
+       case OP(RDMA_READ_REQUEST):
+               ss = &qp->s_rdma_sge;
+               len = qp->s_rdma_len;
+               if (len > pmtu) {
+                       len = pmtu;
+                       qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
+               } else
+                       qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
+               qp->s_rdma_len -= len;
+               bth0 = qp->s_ack_state << 24;
+               ohdr->u.aeth = ipath_compute_aeth(qp);
+               hwords++;
+               break;
+
+       case OP(RDMA_READ_RESPONSE_FIRST):
+               qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+               /* FALLTHROUGH */
+       case OP(RDMA_READ_RESPONSE_MIDDLE):
+               ss = &qp->s_rdma_sge;
+               len = qp->s_rdma_len;
+               if (len > pmtu)
+                       len = pmtu;
+               else {
+                       ohdr->u.aeth = ipath_compute_aeth(qp);
+                       hwords++;
+                       qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+               }
+               qp->s_rdma_len -= len;
+               bth0 = qp->s_ack_state << 24;
+               break;
+
+       case OP(RDMA_READ_RESPONSE_LAST):
+       case OP(RDMA_READ_RESPONSE_ONLY):
+               /*
+                * We have to prevent new requests from changing
+                * the r_sge state while an ipath_verbs_send()
+                * is in progress.
+                * Changing r_state allows the receiver
+                * to continue processing new packets.
+                * We do it here now instead of above so
+                * that we are sure the packet was sent before
+                * changing the state.
+                */
+               qp->r_state = OP(RDMA_READ_RESPONSE_LAST);
+               qp->s_ack_state = OP(ACKNOWLEDGE);
+               return 0;
+
+       case OP(COMPARE_SWAP):
+       case OP(FETCH_ADD):
+               ss = NULL;
+               len = 0;
+               qp->r_state = OP(SEND_LAST);
+               qp->s_ack_state = OP(ACKNOWLEDGE);
+               bth0 = IB_OPCODE_ATOMIC_ACKNOWLEDGE << 24;
+               ohdr->u.at.aeth = ipath_compute_aeth(qp);
+               ohdr->u.at.atomic_ack_eth = cpu_to_be64(qp->s_ack_atomic);
+               hwords += sizeof(ohdr->u.at) / 4;
+               break;
+
+       default:
+               /* Send a regular ACK. */
+               ss = NULL;
+               len = 0;
+               qp->s_ack_state = OP(ACKNOWLEDGE);
+               bth0 = qp->s_ack_state << 24;
+               ohdr->u.aeth = ipath_compute_aeth(qp);
+               hwords++;
+       }
+       qp->s_hdrwords = hwords;
+       qp->s_cur_sge = ss;
+       qp->s_cur_size = len;
+
+       return bth0;
+}
+
+/**
+ * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
+ * @qp: a pointer to the QP
+ * @ohdr: a pointer to the IB header being constructed
+ * @pmtu: the path MTU
+ * @bth0p: pointer to the BTH opcode word
+ * @bth2p: pointer to the BTH PSN word
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ * Note the QP s_lock must be held.
+ */
+static inline int ipath_make_rc_req(struct ipath_qp *qp,
+                                   struct ipath_other_headers *ohdr,
+                                   u32 pmtu, u32 *bth0p, u32 *bth2p)
+{
+       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+       struct ipath_sge_state *ss;
+       struct ipath_swqe *wqe;
+       u32 hwords;
+       u32 len;
+       u32 bth0;
+       u32 bth2;
+       char newreq;
+
+       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) ||
+           qp->s_rnr_timeout)
+               goto done;
+
+       /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+       hwords = 5;
+       bth0 = 0;
+
+       /* Send a request. */
+       wqe = get_swqe_ptr(qp, qp->s_cur);
+       switch (qp->s_state) {
+       default:
+               /*
+                * Resend an old request or start a new one.
+                *
+                * We keep track of the current SWQE so that
+                * we don't reset the "furthest progress" state
+                * if we need to back up.
+                */
+               newreq = 0;
+               if (qp->s_cur == qp->s_tail) {
+                       /* Check if send work queue is empty. */
+                       if (qp->s_tail == qp->s_head)
+                               goto done;
+                       qp->s_psn = wqe->psn = qp->s_next_psn;
+                       newreq = 1;
+               }
+               /*
+                * Note that we have to be careful not to modify the
+                * original work request since we may need to resend
+                * it.
+                */
+               qp->s_sge.sge = wqe->sg_list[0];
+               qp->s_sge.sg_list = wqe->sg_list + 1;
+               qp->s_sge.num_sge = wqe->wr.num_sge;
+               qp->s_len = len = wqe->length;
+               ss = &qp->s_sge;
+               bth2 = 0;
+               switch (wqe->wr.opcode) {
+               case IB_WR_SEND:
+               case IB_WR_SEND_WITH_IMM:
+                       /* If no credit, return. */
+                       if (qp->s_lsn != (u32) -1 &&
+                           ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0)
+                               goto done;
+                       wqe->lpsn = wqe->psn;
+                       if (len > pmtu) {
+                               wqe->lpsn += (len - 1) / pmtu;
+                               qp->s_state = OP(SEND_FIRST);
+                               len = pmtu;
+                               break;
+                       }
+                       if (wqe->wr.opcode == IB_WR_SEND)
+                               qp->s_state = OP(SEND_ONLY);
+                       else {
+                               qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
+                               /* Immediate data comes after the BTH */
+                               ohdr->u.imm_data = wqe->wr.imm_data;
+                               hwords += 1;
+                       }
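+                       /* Solicited sends set the SE bit in the BTH. */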
+                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                               bth0 |= 1 << 23;
+                       bth2 = 1 << 31; /* Request ACK. */
+                       if (++qp->s_cur == qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
+               case IB_WR_RDMA_WRITE:
+                       if (newreq)
+                               qp->s_lsn++;
+                       /* FALLTHROUGH */
+               case IB_WR_RDMA_WRITE_WITH_IMM:
+                       /* If no credit, return. */
+                       if (qp->s_lsn != (u32) -1 &&
+                           ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0)
+                               goto done;
+                       ohdr->u.rc.reth.vaddr =
+                               cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+                       ohdr->u.rc.reth.rkey =
+                               cpu_to_be32(wqe->wr.wr.rdma.rkey);
+                       ohdr->u.rc.reth.length = cpu_to_be32(len);
+                       hwords += sizeof(struct ib_reth) / 4;
+                       wqe->lpsn = wqe->psn;
+                       if (len > pmtu) {
+                               wqe->lpsn += (len - 1) / pmtu;
+                               qp->s_state = OP(RDMA_WRITE_FIRST);
+                               len = pmtu;
+                               break;
+                       }
+                       if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+                               qp->s_state = OP(RDMA_WRITE_ONLY);
+                       else {
+                               qp->s_state =
+                                       OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+                               /* Immediate data comes after RETH */
+                               ohdr->u.rc.imm_data = wqe->wr.imm_data;
+                               hwords += 1;
+                               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                                       bth0 |= 1 << 23;
+                       }
+                       bth2 = 1 << 31; /* Request ACK. */
+                       if (++qp->s_cur == qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
+               case IB_WR_RDMA_READ:
+                       ohdr->u.rc.reth.vaddr =
+                               cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+                       ohdr->u.rc.reth.rkey =
+                               cpu_to_be32(wqe->wr.wr.rdma.rkey);
+                       ohdr->u.rc.reth.length = cpu_to_be32(len);
+                       qp->s_state = OP(RDMA_READ_REQUEST);
+                       hwords += sizeof(ohdr->u.rc.reth) / 4;
+                       if (newreq) {
+                               qp->s_lsn++;
+                               /*
+                                * Adjust s_next_psn to count the
+                                * expected number of responses.
+                                */
+                               if (len > pmtu)
+                                       qp->s_next_psn += (len - 1) / pmtu;
+                               wqe->lpsn = qp->s_next_psn++;
+                       }
+                       ss = NULL;
+                       len = 0;
+                       if (++qp->s_cur == qp->s_size)
+                               qp->s_cur = 0;
+                       break;
+
+               case IB_WR_ATOMIC_CMP_AND_SWP:
+               case IB_WR_ATOMIC_FETCH_AND_ADD:
+                       if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP)
+                               qp->s_state = OP(COMPARE_SWAP);
+                       else
+                               qp->s_state = OP(FETCH_ADD);
+                       ohdr->u.atomic_eth.vaddr = cpu_to_be64(
+                               wqe->wr.wr.atomic.remote_addr);
+                       ohdr->u.atomic_eth.rkey = cpu_to_be32(
+                               wqe->wr.wr.atomic.rkey);
+                       ohdr->u.atomic_eth.swap_data = cpu_to_be64(
+                               wqe->wr.wr.atomic.swap);
+                       ohdr->u.atomic_eth.compare_data = cpu_to_be64(
+                               wqe->wr.wr.atomic.compare_add);
+                       hwords += sizeof(struct ib_atomic_eth) / 4;
+                       if (newreq) {
+                               qp->s_lsn++;
+                               wqe->lpsn = wqe->psn;
+                       }
+                       if (++qp->s_cur == qp->s_size)
+                               qp->s_cur = 0;
+                       ss = NULL;
+                       len = 0;
+                       break;
+
+               default:
+                       goto done;
+               }
+               if (newreq) {
+                       qp->s_tail++;
+                       if (qp->s_tail >= qp->s_size)
+                               qp->s_tail = 0;
+               }
+               bth2 |= qp->s_psn++ & IPS_PSN_MASK;
+               if ((int)(qp->s_psn - qp->s_next_psn) > 0)
+                       qp->s_next_psn = qp->s_psn;
+               spin_lock(&dev->pending_lock);
+               if (qp->timerwait.next == LIST_POISON1)
+                       list_add_tail(&qp->timerwait,
+                                     &dev->pending[dev->pending_index]);
+               spin_unlock(&dev->pending_lock);
+               break;
+
+       case OP(RDMA_READ_RESPONSE_FIRST):
+               /*
+                * This case can only happen if a send is restarted.  See
+                * ipath_restart_rc().
+                */
+               ipath_init_restart(qp, wqe);
+               /* FALLTHROUGH */
+       case OP(SEND_FIRST):
+               qp->s_state = OP(SEND_MIDDLE);
+               /* FALLTHROUGH */
+       case OP(SEND_MIDDLE):
+               bth2 = qp->s_psn++ & IPS_PSN_MASK;
+               if ((int)(qp->s_psn - qp->s_next_psn) > 0)
+                       qp->s_next_psn = qp->s_psn;
+               ss = &qp->s_sge;
+               len = qp->s_len;
+               if (len > pmtu) {
+                       /*
+                        * Request an ACK every 1/2 MB to avoid retransmit
+                        * timeouts.
+                        */
+                       if (((wqe->length - len) % (512 * 1024)) == 0)
+                               bth2 |= 1 << 31;
+                       len = pmtu;
+                       break;
+               }
+               if (wqe->wr.opcode == IB_WR_SEND)
+                       qp->s_state = OP(SEND_LAST);
+               else {
+                       qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+                       /* Immediate data comes after the BTH */
+                       ohdr->u.imm_data = wqe->wr.imm_data;
+                       hwords += 1;
+               }
+               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                       bth0 |= 1 << 23;
+               bth2 |= 1 << 31;        /* Request ACK. */
+               qp->s_cur++;
+               if (qp->s_cur >= qp->s_size)
+                       qp->s_cur = 0;
+               break;
+
+       case OP(RDMA_READ_RESPONSE_LAST):
+               /*
+                * This case can only happen if a RDMA write is restarted.
+                * See ipath_restart_rc().
+                */
+               ipath_init_restart(qp, wqe);
+               /* FALLTHROUGH */
+       case OP(RDMA_WRITE_FIRST):
+               qp->s_state = OP(RDMA_WRITE_MIDDLE);
+               /* FALLTHROUGH */
+       case OP(RDMA_WRITE_MIDDLE):
+               bth2 = qp->s_psn++ & IPS_PSN_MASK;
+               if ((int)(qp->s_psn - qp->s_next_psn) > 0)
+                       qp->s_next_psn = qp->s_psn;
+               ss = &qp->s_sge;
+               len = qp->s_len;
+               if (len > pmtu) {
+                       /*
+                        * Request an ACK every 1/2 MB to avoid retransmit
+                        * timeouts.
+                        */
+                       if (((wqe->length - len) % (512 * 1024)) == 0)
+                               bth2 |= 1 << 31;
+                       len = pmtu;
+                       break;
+               }
+               if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+                       qp->s_state = OP(RDMA_WRITE_LAST);
+               else {
+                       qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+                       /* Immediate data comes after the BTH */
+                       ohdr->u.imm_data = wqe->wr.imm_data;
+                       hwords += 1;
+                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                               bth0 |= 1 << 23;
+               }
+               bth2 |= 1 << 31;        /* Request ACK. */
+               qp->s_cur++;
+               if (qp->s_cur >= qp->s_size)
+                       qp->s_cur = 0;
+               break;
+
+       case OP(RDMA_READ_RESPONSE_MIDDLE):
+               /*
+                * This case can only happen if a RDMA read is restarted.
+                * See ipath_restart_rc().
+                */
+               ipath_init_restart(qp, wqe);
+               len = ((qp->s_psn - wqe->psn) & IPS_PSN_MASK) * pmtu;
+               ohdr->u.rc.reth.vaddr =
+                       cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
+               ohdr->u.rc.reth.rkey =
+                       cpu_to_be32(wqe->wr.wr.rdma.rkey);
+               ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len);
+               qp->s_state = OP(RDMA_READ_REQUEST);
+               hwords += sizeof(ohdr->u.rc.reth) / 4;
+               bth2 = qp->s_psn++ & IPS_PSN_MASK;
+               if ((int)(qp->s_psn - qp->s_next_psn) > 0)
+                       qp->s_next_psn = qp->s_psn;
+               ss = NULL;
+               len = 0;
+               qp->s_cur++;
+               if (qp->s_cur == qp->s_size)
+                       qp->s_cur = 0;
+               break;
+
+       case OP(RDMA_READ_REQUEST):
+       case OP(COMPARE_SWAP):
+       case OP(FETCH_ADD):
+               /*
+                * We shouldn't start anything new until this request is
+                * finished.  The ACK will handle rescheduling us.  XXX The
+                * number of outstanding ones is negotiated at connection
+                * setup time (see pg. 258,289)?  XXX Also, if we support
+                * multiple outstanding requests, we need to check the WQE
+                * IB_SEND_FENCE flag and not send a new request if a RDMA
+                * read or atomic is pending.
+                */
+               goto done;
+       }
+       qp->s_len -= len;
+       qp->s_hdrwords = hwords;
+       qp->s_cur_sge = ss;
+       qp->s_cur_size = len;
+       *bth0p = bth0 | (qp->s_state << 24);
+       *bth2p = bth2;
+       return 1;
+
+done:
+       return 0;
+}
+
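+/**
+ * ipath_make_rc_grh - construct a GRH for a globally routed packet
+ * @qp: the QP being sent on
+ * @grh: the global route attributes to put in the header
+ * @nwords: the payload length in 32-bit words
+ *
+ * Fills in the GRH in qp->s_hdr and adds its size to qp->s_hdrwords.
+ */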
+static inline void ipath_make_rc_grh(struct ipath_qp *qp,
+                                    struct ib_global_route *grh,
+                                    u32 nwords)
+{
+       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+
+       /* GRH header size in 32-bit words. */
+       qp->s_hdrwords += 10;
+       qp->s_hdr.u.l.grh.version_tclass_flow =
+               cpu_to_be32((6 << 28) |
+                           (grh->traffic_class << 20) |
+                           grh->flow_label);
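+       /*
+        * PayLen covers everything after the GRH: the header words past
+        * the 2-word LRH and 10-word GRH, the padded payload, and the
+        * ICRC, in bytes.
+        */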
+       qp->s_hdr.u.l.grh.paylen =
+               cpu_to_be16(((qp->s_hdrwords - 12) + nwords +
+                            SIZE_OF_CRC) << 2);
+       /* next_hdr is defined by C8-7 in ch. 8.4.1 */
+       qp->s_hdr.u.l.grh.next_hdr = 0x1B;
+       qp->s_hdr.u.l.grh.hop_limit = grh->hop_limit;
+       /* The SGID is 32-bit aligned. */
+       qp->s_hdr.u.l.grh.sgid.global.subnet_prefix = dev->gid_prefix;
+       qp->s_hdr.u.l.grh.sgid.global.interface_id =
+               ipath_layer_get_guid(dev->dd);
+       qp->s_hdr.u.l.grh.dgid = grh->dgid;
+}
+
+/**
+ * ipath_do_rc_send - perform a send on an RC QP
+ * @data: contains a pointer to the QP
+ *
+ * Process entries in the send work queue until credit or queue is
+ * exhausted.  Only allow one CPU to send a packet per QP (tasklet).
+ * Otherwise, after we drop the QP s_lock, two threads could send
+ * packets out of order.
+ */
+void ipath_do_rc_send(unsigned long data)
+{
+       struct ipath_qp *qp = (struct ipath_qp *)data;
+       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+       unsigned long flags;
+       u16 lrh0;
+       u32 nwords;
+       u32 extra_bytes;
+       u32 bth0;
+       u32 bth2;
+       u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+       struct ipath_other_headers *ohdr;
+
+       if (test_and_set_bit(IPATH_S_BUSY, &qp->s_flags))
+               goto bail;
+
+       if (unlikely(qp->remote_ah_attr.dlid ==
+                    ipath_layer_get_lid(dev->dd))) {
+               struct ib_wc wc;
+
+               /*
+                * Pass in an uninitialized ib_wc to be consistent with
+                * other places where ipath_ruc_loopback() is called.
+                */
+               ipath_ruc_loopback(qp, &wc);
+               goto clear;
+       }
+
+       ohdr = &qp->s_hdr.u.oth;
+       if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+               ohdr = &qp->s_hdr.u.l.oth;
+
+again:
+       /* Check for a constructed packet to be sent. */
+       if (qp->s_hdrwords != 0) {
+               /*
+                * If no PIO bufs are available, return.  An interrupt will
+                * call ipath_ib_piobufavail() when one is available.
+                */
+               _VERBS_INFO("h %u %p\n", qp->s_hdrwords, &qp->s_hdr);
+               _VERBS_INFO("d %u %p %u %p %u %u %u %u\n", qp->s_cur_size,
+                           qp->s_cur_sge->sg_list,
+                           qp->s_cur_sge->num_sge,
+                           qp->s_cur_sge->sge.vaddr,
+                           qp->s_cur_sge->sge.sge_length,
+                           qp->s_cur_sge->sge.length,
+                           qp->s_cur_sge->sge.m,
+                           qp->s_cur_sge->sge.n);
+               if (ipath_verbs_send(dev->dd, qp->s_hdrwords,
+                                    (u32 *) &qp->s_hdr, qp->s_cur_size,
+                                    qp->s_cur_sge)) {
+                       ipath_no_bufs_available(qp, dev);
+                       goto bail;
+               }
+               dev->n_unicast_xmit++;
+               /* Record that we sent the packet and s_hdr is empty. */
+               qp->s_hdrwords = 0;
+       }
+
+       /*
+        * The lock is needed to synchronize between setting
+        * qp->s_ack_state, resend timer, and post_send().
+        */
+       spin_lock_irqsave(&qp->s_lock, flags);
+
+       /* Sending responses has higher priority than sending requests. */
+       if (qp->s_ack_state != OP(ACKNOWLEDGE) &&
+           (bth0 = ipath_make_rc_ack(qp, ohdr, pmtu)) != 0)
+               bth2 = qp->s_ack_psn++ & IPS_PSN_MASK;
+       else if (!ipath_make_rc_req(qp, ohdr, pmtu, &bth0, &bth2))
+               goto done;
+
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+
+       /* Construct the header. */
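+       /*
+        * Pad the payload to a multiple of four bytes; nwords is the
+        * padded payload length in 32-bit words.
+        */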
+       extra_bytes = (4 - qp->s_cur_size) & 3;
+       nwords = (qp->s_cur_size + extra_bytes) >> 2;
+       lrh0 = IPS_LRH_BTH;
+       if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+               ipath_make_rc_grh(qp, &qp->remote_ah_attr.grh, nwords);
+               lrh0 = IPS_LRH_GRH;
+       }
+       lrh0 |= qp->remote_ah_attr.sl << 4;
+       qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
+       qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+       qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords +
+                                      SIZE_OF_CRC);
+       qp->s_hdr.lrh[3] = cpu_to_be16(ipath_layer_get_lid(dev->dd));
+       bth0 |= ipath_layer_get_pkey(dev->dd, qp->s_pkey_index);
+       bth0 |= extra_bytes << 20;
+       ohdr->bth[0] = cpu_to_be32(bth0);
+       ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
+       ohdr->bth[2] = cpu_to_be32(bth2);
+
+       /* Check for more work to do. */
+       goto again;
+
+done:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+clear:
+       clear_bit(IPATH_S_BUSY, &qp->s_flags);
+bail:
+       return;
+}
+
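+/**
+ * send_rc_ack - construct and send an ACK packet for a QP
+ * @qp: a pointer to the QP
+ *
+ * Builds the ACK (or atomic ACK) header in qp->s_hdr and tries to send
+ * it immediately; if the send succeeds, the ACK state is cleared.
+ */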
+static void send_rc_ack(struct ipath_qp *qp)
+{
+       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+       u16 lrh0;
+       u32 bth0;
+       struct ipath_other_headers *ohdr;
+
+       /* Construct the header. */
+       ohdr = &qp->s_hdr.u.oth;
+       lrh0 = IPS_LRH_BTH;
+       /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
+       qp->s_hdrwords = 6;
+       if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+               ipath_make_rc_grh(qp, &qp->remote_ah_attr.grh, 0);
+               ohdr = &qp->s_hdr.u.l.oth;
+               lrh0 = IPS_LRH_GRH;
+       }
+       bth0 = ipath_layer_get_pkey(dev->dd, qp->s_pkey_index);
+       ohdr->u.aeth = ipath_compute_aeth(qp);
+       if (qp->s_ack_state >= OP(COMPARE_SWAP)) {
+               bth0 |= IB_OPCODE_ATOMIC_ACKNOWLEDGE << 24;
+               ohdr->u.at.atomic_ack_eth = cpu_to_be64(qp->s_ack_atomic);
+               qp->s_hdrwords += sizeof(ohdr->u.at.atomic_ack_eth) / 4;
+       } else
+               bth0 |= OP(ACKNOWLEDGE) << 24;
+       lrh0 |= qp->remote_ah_attr.sl << 4;
+       qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
+       qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+       qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + SIZE_OF_CRC);
+       qp->s_hdr.lrh[3] = cpu_to_be16(ipath_layer_get_lid(dev->dd));
+       ohdr->bth[0] = cpu_to_be32(bth0);
+       ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
+       ohdr->bth[2] = cpu_to_be32(qp->s_ack_psn & IPS_PSN_MASK);
+
+       /*
+        * If we can send the ACK, clear the ACK state.
+        */
+       if (ipath_verbs_send(dev->dd, qp->s_hdrwords, (u32 *) &qp->s_hdr,
+                            0, NULL) == 0) {
+               qp->s_ack_state = OP(ACKNOWLEDGE);
+               dev->n_rc_qacks++;
+               dev->n_unicast_xmit++;
+       }
+}
+
+/**
+ * ipath_restart_rc - back up requester to resend the last un-ACKed request
+ * @qp: the QP to restart
+ * @psn: packet sequence number for the request
+ * @wc: the work completion to fill in if the QP errors out
+ *
+ * The QP s_lock should be held.
+ */
+void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc)
+{
+       struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
+       struct ipath_ibdev *dev;
+       u32 n;
+
+       /*
+        * If there are no requests pending, we are done.
+        */
+       if (ipath_cmp24(psn, qp->s_next_psn) >= 0 ||
+           qp->s_last == qp->s_tail)
+               goto done;
+
+       if (qp->s_retry == 0) {
+               wc->wr_id = wqe->wr.wr_id;
+               wc->status = IB_WC_RETRY_EXC_ERR;
+               wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
+               wc->vendor_err = 0;
+               wc->byte_len = 0;
+               wc->qp_num = qp->ibqp.qp_num;
+               wc->src_qp = qp->remote_qpn;
+               wc->pkey_index = 0;
+               wc->slid = qp->remote_ah_attr.dlid;
+               wc->sl = qp->remote_ah_attr.sl;
+               wc->dlid_path_bits = 0;
+               wc->port_num = 0;
+               ipath_sqerror_qp(qp, wc);
+               goto bail;
+       }
+       qp->s_retry--;
+
+       /*
+        * Remove the QP from the timeout queue.
+        * Note: it may already have been removed by ipath_ib_timer().
+        */
+       dev = to_idev(qp->ibqp.device);
+       spin_lock(&dev->pending_lock);
+       if (qp->timerwait.next != LIST_POISON1)
+               list_del(&qp->timerwait);
+       spin_unlock(&dev->pending_lock);
+
+       if (wqe->wr.opcode == IB_WR_RDMA_READ)
+               dev->n_rc_resends++;
+       else
+               dev->n_rc_resends += (int)qp->s_psn - (int)psn;
+
+       /*
+        * If we are starting the request from the beginning, let the normal
+        * send code handle initialization.
+        */
+       qp->s_cur = qp->s_last;
+       if (ipath_cmp24(psn, wqe->psn) <= 0) {
+               qp->s_state = OP(SEND_LAST);
+               qp->s_psn = wqe->psn;
+       } else {
+               n = qp->s_cur;
+               for (;;) {
+                       if (++n == qp->s_size)
+                               n = 0;
+                       if (n == qp->s_tail) {
+                               if (ipath_cmp24(psn, qp->s_next_psn) >= 0) {
+                                       qp->s_cur = n;
+                                       wqe = get_swqe_ptr(qp, n);
+                               }
+                               break;
+                       }
+                       wqe = get_swqe_ptr(qp, n);
+                       if (ipath_cmp24(psn, wqe->psn) < 0)
+                               break;
+                       qp->s_cur = n;
+               }
+               qp->s_psn = psn;
+
+               /*
+                * Reset the state to restart in the middle of a request.
+                * Don't change the s_sge, s_cur_sge, or s_cur_size.
+                * See ipath_do_rc_send().
+                */
+               switch (wqe->wr.opcode) {
+               case IB_WR_SEND:
+               case IB_WR_SEND_WITH_IMM:
+                       qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
+                       break;
+
+               case IB_WR_RDMA_WRITE:
+               case IB_WR_RDMA_WRITE_WITH_IMM:
+                       qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
+                       break;
+
+               case IB_WR_RDMA_READ:
+                       qp->s_state =
+                               OP(RDMA_READ_RESPONSE_MIDDLE);
+                       break;
+
+               default:
+                       /*
+                        * This case shouldn't happen since there is only
+                        * one PSN per request.
+                        */
+                       qp->s_state = OP(SEND_LAST);
+               }
+       }
+
+done:
+       tasklet_hi_schedule(&qp->s_task);
+
+bail:
+       return;
+}
+
+/**
+ * reset_psn - reset the QP state to send starting from PSN
+ * @qp: the QP
+ * @psn: the packet sequence number to restart at
+ *
+ * This is called from ipath_rc_rcv() to process an incoming RC ACK
+ * for the given QP.
+ * Called at interrupt level with the QP s_lock held.
+ */
+static void reset_psn(struct ipath_qp *qp, u32 psn)
+{
+       struct ipath_swqe *wqe;
+       u32 n;
+
+       n = qp->s_cur;
+       wqe = get_swqe_ptr(qp, n);
+       for (;;) {
+               if (++n == qp->s_size)
+                       n = 0;
+               if (n == qp->s_tail) {
+                       if (ipath_cmp24(psn, qp->s_next_psn) >= 0) {
+                               qp->s_cur = n;
+                               wqe = get_swqe_ptr(qp, n);
+                       }
+                       break;
+               }
+               wqe = get_swqe_ptr(qp, n);
+               if (ipath_cmp24(psn, wqe->psn) < 0)
+                       break;
+               qp->s_cur = n;
+       }
+       qp->s_psn = psn;
+
+       /*
+        * Set the state to restart in the middle of a
+        * request.  Don't change the s_sge, s_cur_sge, or
+        * s_cur_size.  See ipath_do_rc_send().
+        */
+       switch (wqe->wr.opcode) {
+       case IB_WR_SEND:
+       case IB_WR_SEND_WITH_IMM:
+               qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
+               break;
+
+       case IB_WR_RDMA_WRITE:
+       case IB_WR_RDMA_WRITE_WITH_IMM:
+               qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
+               break;
+
+       case IB_WR_RDMA_READ:
+               qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+               break;
+
+       default:
+               /*
+                * This case shouldn't happen since there is only
+                * one PSN per request.
+                */
+               qp->s_state = OP(SEND_LAST);
+       }
+}
+
+/**
+ * do_rc_ack - process an incoming RC ACK
+ * @qp: the QP the ACK came in on
+ * @psn: the packet sequence number of the ACK
+ * @opcode: the opcode of the request that resulted in the ACK
+ *
+ * This is called from ipath_rc_rcv() to process an incoming RC ACK
+ * for the given QP.
+ * Called at interrupt level with the QP s_lock held.
+ * Returns 1 if OK, 0 if current operation should be aborted (NAK).
+ */
+static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode)
+{
+       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+       struct ib_wc wc;
+       struct ipath_swqe *wqe;
+       int ret = 0;
+
+       /*
+        * Remove the QP from the timeout queue (or RNR timeout queue).
+        * If ipath_ib_timer() has already removed it,
+        * it's OK since we hold the QP s_lock and ipath_restart_rc()
+        * just won't find anything to restart if we ACK everything.
+        */
+       spin_lock(&dev->pending_lock);
+       if (qp->timerwait.next != LIST_POISON1)
+               list_del(&qp->timerwait);
+       spin_unlock(&dev->pending_lock);
+
+       /*
+        * Note that NAKs implicitly ACK outstanding SEND and RDMA write
+        * requests and implicitly NAK RDMA read and atomic requests issued
+        * before the NAK'ed request.  The MSN won't include the NAK'ed
+        * request but will include any ACK'ed requests.
+        */
+       wqe = get_swqe_ptr(qp, qp->s_last);
+
+       /* Nothing is pending to ACK/NAK. */
+       if (qp->s_last == qp->s_tail)
+               goto bail;
+
+       /*
+        * The MSN might be for a later WQE than the PSN indicates so
+        * only complete WQEs that the PSN finishes.
+        */
+       while (ipath_cmp24(psn, wqe->lpsn) >= 0) {
+               /* If we are ACKing a WQE, the MSN should be >= the SSN. */
+               if (ipath_cmp24(aeth, wqe->ssn) < 0)
+                       break;
+               /*
+                * If this request is a RDMA read or atomic, and the ACK is
+                * for a later operation, this ACK NAKs the RDMA read or
+                * atomic.  In other words, only a RDMA_READ_LAST or ONLY
+                * can ACK a RDMA read and likewise for atomic ops.  Note
+                * that the NAK case can only happen if relaxed ordering is
+                * used and requests are sent after an RDMA read or atomic
+                * is sent but before the response is received.
+                */
+               if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
+                    opcode != OP(RDMA_READ_RESPONSE_LAST)) ||
+                   ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+                     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
+                    (opcode != OP(ATOMIC_ACKNOWLEDGE) ||
+                     ipath_cmp24(wqe->psn, psn) != 0))) {
+                       /*
+                        * The last valid PSN seen is the previous
+                        * request's.
+                        */
+                       qp->s_last_psn = wqe->psn - 1;
+                       /* Retry this request. */
+                       ipath_restart_rc(qp, wqe->psn, &wc);
+                       /*
+                        * No need to process the ACK/NAK since we are
+                        * restarting an earlier request.
+                        */
+                       goto bail;
+               }
+               /* Post a send completion queue entry if requested. */
+               if (!test_bit(IPATH_S_SIGNAL_REQ_WR, &qp->s_flags) ||
+                   (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+                       wc.wr_id = wqe->wr.wr_id;
+                       wc.status = IB_WC_SUCCESS;
+                       wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
+                       wc.vendor_err = 0;
+                       wc.byte_len = wqe->length;
+                       wc.qp_num = qp->ibqp.qp_num;
+                       wc.src_qp = qp->remote_qpn;
+                       wc.pkey_index = 0;
+                       wc.slid = qp->remote_ah_attr.dlid;
+                       wc.sl = qp->remote_ah_attr.sl;
+                       wc.dlid_path_bits = 0;
+                       wc.port_num = 0;
+                       ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
+               }
+               qp->s_retry = qp->s_retry_cnt;
+               /*
+                * If we are completing a request which is in the process of
+                * being resent, we can stop resending it since we know the
+                * responder has already seen it.
+                */
+               if (qp->s_last == qp->s_cur) {
+                       if (++qp->s_cur >= qp->s_size)
+                               qp->s_cur = 0;
+                       wqe = get_swqe_ptr(qp, qp->s_cur);
+                       qp->s_state = OP(SEND_LAST);
+                       qp->s_psn = wqe->psn;
+               }
+               if (++qp->s_last >= qp->s_size)
+                       qp->s_last = 0;
+               wqe = get_swqe_ptr(qp, qp->s_last);
+               if (qp->s_last == qp->s_tail)
+                       break;
+       }
+
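+       /*
+        * The top three bits of the AETH distinguish an ACK (0), an RNR
+        * NAK (1), and a NAK (3); 2 is reserved.
+        */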
+       switch (aeth >> 29) {
+       case 0:         /* ACK */
+               dev->n_rc_acks++;
+               /* If this is a partial ACK, reset the retransmit timer. */
+               if (qp->s_last != qp->s_tail) {
+                       spin_lock(&dev->pending_lock);
+                       list_add_tail(&qp->timerwait,
+                                     &dev->pending[dev->pending_index]);
+                       spin_unlock(&dev->pending_lock);
+               }
+               ipath_get_credit(qp, aeth);
+               qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+               qp->s_retry = qp->s_retry_cnt;
+               qp->s_last_psn = psn;
+               ret = 1;
+               goto bail;
+
+       case 1:         /* RNR NAK */
+               dev->n_rnr_naks++;
+               if (qp->s_rnr_retry == 0) {
+                       if (qp->s_last == qp->s_tail)
+                               goto bail;
+
+                       wc.status = IB_WC_RNR_RETRY_EXC_ERR;
+                       goto class_b;
+               }
+               if (qp->s_rnr_retry_cnt < 7)
+                       qp->s_rnr_retry--;
+               if (qp->s_last == qp->s_tail)
+                       goto bail;
+
+               /* The last valid PSN seen is the previous request's. */
+               qp->s_last_psn = wqe->psn - 1;
+
+               dev->n_rc_resends += (int)qp->s_psn - (int)psn;
+
+               /*
+                * If we are starting the request from the beginning, let
+                * the normal send code handle initialization.
+                */
+               qp->s_cur = qp->s_last;
+               wqe = get_swqe_ptr(qp, qp->s_cur);
+               if (ipath_cmp24(psn, wqe->psn) <= 0) {
+                       qp->s_state = OP(SEND_LAST);
+                       qp->s_psn = wqe->psn;
+               } else
+                       reset_psn(qp, psn);
+
+               qp->s_rnr_timeout =
+                       ib_ipath_rnr_table[(aeth >> IPS_AETH_CREDIT_SHIFT) &
+                                          IPS_AETH_CREDIT_MASK];
+               ipath_insert_rnr_queue(qp);
+               goto bail;
+
+       case 3:         /* NAK */
+               /* The last valid PSN seen is the previous request's. */
+               if (qp->s_last != qp->s_tail)
+                       qp->s_last_psn = wqe->psn - 1;
+               switch ((aeth >> IPS_AETH_CREDIT_SHIFT) &
+                       IPS_AETH_CREDIT_MASK) {
+               case 0: /* PSN sequence error */
+                       dev->n_seq_naks++;
+                       /*
+                        * Back up to the responder's expected PSN.  XXX
+                        * Note that we might get a NAK in the middle of an
+                        * RDMA READ response which terminates the RDMA
+                        * READ.
+                        */
+                       if (qp->s_last == qp->s_tail)
+                               break;
+
+                       if (ipath_cmp24(psn, wqe->psn) < 0)
+                               break;
+
+                       /* Retry the request. */
+                       ipath_restart_rc(qp, psn, &wc);
+                       break;
+
+               case 1: /* Invalid Request */
+                       wc.status = IB_WC_REM_INV_REQ_ERR;
+                       dev->n_other_naks++;
+                       goto class_b;
+
+               case 2: /* Remote Access Error */
+                       wc.status = IB_WC_REM_ACCESS_ERR;
+                       dev->n_other_naks++;
+                       goto class_b;
+
+               case 3: /* Remote Operation Error */
+                       wc.status = IB_WC_REM_OP_ERR;
+                       dev->n_other_naks++;
+               class_b:
+                       wc.wr_id = wqe->wr.wr_id;
+                       wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
+                       wc.vendor_err = 0;
+                       wc.byte_len = 0;
+                       wc.qp_num = qp->ibqp.qp_num;
+                       wc.src_qp = qp->remote_qpn;
+                       wc.pkey_index = 0;
+                       wc.slid = qp->remote_ah_attr.dlid;
+                       wc.sl = qp->remote_ah_attr.sl;
+                       wc.dlid_path_bits = 0;
+                       wc.port_num = 0;
+                       ipath_sqerror_qp(qp, &wc);
+                       break;
+
+               default:
+                       /* Ignore other reserved NAK error codes */
+                       goto reserved;
+               }
+               qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+               goto bail;
+
+       default:                /* 2: reserved */
+       reserved:
+               /* Ignore reserved NAK codes. */
+               goto bail;
+       }
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_rc_rcv_resp - process an incoming RC response packet
+ * @dev: the device this packet came in on
+ * @ohdr: the other headers for this packet
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @hdrsize: the header length
+ * @pmtu: the path MTU
+ * @header_in_data: true if part of the header data is in the data buffer
+ *
+ * This is called from ipath_rc_rcv() to process an incoming RC response
+ * packet for the given QP.
+ * Called at interrupt level.
+ */
+static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
+                                    struct ipath_other_headers *ohdr,
+                                    void *data, u32 tlen,
+                                    struct ipath_qp *qp,
+                                    u32 opcode,
+                                    u32 psn, u32 hdrsize, u32 pmtu,
+                                    int header_in_data)
+{
+       unsigned long flags;
+       struct ib_wc wc;
+       int diff;
+       u32 pad;
+       u32 aeth;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
+
+       /* Ignore invalid responses. */
+       if (ipath_cmp24(psn, qp->s_next_psn) >= 0)
+               goto ack_done;
+
+       /* Ignore duplicate responses. */
+       diff = ipath_cmp24(psn, qp->s_last_psn);
+       if (unlikely(diff <= 0)) {
+               /* Update credits for "ghost" ACKs */
+               if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
+                       if (!header_in_data)
+                               aeth = be32_to_cpu(ohdr->u.aeth);
+                       else {
+                               aeth = be32_to_cpu(((__be32 *) data)[0]);
+                               data += sizeof(__be32);
+                       }
+                       if ((aeth >> 29) == 0)
+                               ipath_get_credit(qp, aeth);
+               }
+               goto ack_done;
+       }
+
+       switch (opcode) {
+       case OP(ACKNOWLEDGE):
+       case OP(ATOMIC_ACKNOWLEDGE):
+       case OP(RDMA_READ_RESPONSE_FIRST):
+               if (!header_in_data)
+                       aeth = be32_to_cpu(ohdr->u.aeth);
+               else {
+                       aeth = be32_to_cpu(((__be32 *) data)[0]);
+                       data += sizeof(__be32);
+               }
+               if (opcode == OP(ATOMIC_ACKNOWLEDGE))
+                       *(u64 *) qp->s_sge.sge.vaddr = *(u64 *) data;
+               if (!do_rc_ack(qp, aeth, psn, opcode) ||
+                   opcode != OP(RDMA_READ_RESPONSE_FIRST))
+                       goto ack_done;
+               hdrsize += 4;
+               /*
+                * do_rc_ack() has already checked the PSN so skip
+                * the sequence check.
+                */
+               goto rdma_read;
+
+       case OP(RDMA_READ_RESPONSE_MIDDLE):
+               /* no AETH, no ACK */
+               if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
+                       dev->n_rdma_seq++;
+                       ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
+                       goto ack_done;
+               }
+       rdma_read:
+       if (unlikely(qp->s_state != OP(RDMA_READ_REQUEST)))
+               goto ack_done;
+       if (unlikely(tlen != (hdrsize + pmtu + 4)))
+               goto ack_done;
+       if (unlikely(pmtu >= qp->s_len))
+               goto ack_done;
+       /* We got a response so update the timeout. */
+       if (unlikely(qp->s_last == qp->s_tail ||
+                    get_swqe_ptr(qp, qp->s_last)->wr.opcode !=
+                    IB_WR_RDMA_READ))
+               goto ack_done;
+       spin_lock(&dev->pending_lock);
+       if (qp->s_rnr_timeout == 0 &&
+           qp->timerwait.next != LIST_POISON1)
+               list_move_tail(&qp->timerwait,
+                              &dev->pending[dev->pending_index]);
+       spin_unlock(&dev->pending_lock);
+       /*
+        * Update the RDMA receive state but do the copy w/o holding the
+        * locks and blocking interrupts.  XXX Yet another place that
+        * affects relaxed RDMA order since we don't want s_sge modified.
+        */
+       qp->s_len -= pmtu;
+       qp->s_last_psn = psn;
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+       ipath_copy_sge(&qp->s_sge, data, pmtu);
+       goto bail;
+
+       case OP(RDMA_READ_RESPONSE_LAST):
+               /* ACKs READ req. */
+               if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
+                       dev->n_rdma_seq++;
+                       ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
+                       goto ack_done;
+               }
+               /* FALLTHROUGH */
+       case OP(RDMA_READ_RESPONSE_ONLY):
+               if (unlikely(qp->s_state != OP(RDMA_READ_REQUEST)))
+                       goto ack_done;
+               /*
+                * Get the number of bytes the message was padded by.
+                */
+               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+               /*
+                * Check that the data size is >= 1 && <= pmtu.
+                * Remember to account for the AETH header (4) and
+                * ICRC (4).
+                */
+               if (unlikely(tlen <= (hdrsize + pad + 8))) {
+                       /*
+                        * XXX Need to generate an error CQ
+                        * entry.
+                        */
+                       goto ack_done;
+               }
+               tlen -= hdrsize + pad + 8;
+               if (unlikely(tlen != qp->s_len)) {
+                       /*
+                        * XXX Need to generate an error CQ
+                        * entry.
+                        */
+                       goto ack_done;
+               }
+               if (!header_in_data)
+                       aeth = be32_to_cpu(ohdr->u.aeth);
+               else {
+                       aeth = be32_to_cpu(((__be32 *) data)[0]);
+                       data += sizeof(__be32);
+               }
+               ipath_copy_sge(&qp->s_sge, data, tlen);
+               if (do_rc_ack(qp, aeth, psn, OP(RDMA_READ_RESPONSE_LAST))) {
+                       /*
+                        * Change the state so we continue
+                        * processing new requests.
+                        */
+                       qp->s_state = OP(SEND_LAST);
+               }
+               goto ack_done;
+       }
+
+ack_done:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+bail:
+       return;
+}
+
+/**
+ * ipath_rc_rcv_error - process an incoming duplicate or error RC packet
+ * @dev: the device this packet came in on
+ * @ohdr: the other headers for this packet
+ * @data: the packet data
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @diff: the difference between the PSN and the expected PSN
+ * @header_in_data: true if part of the header data is in the data buffer
+ *
+ * This is called from ipath_rc_rcv() to process an unexpected
+ * incoming RC packet for the given QP.
+ * Called at interrupt level.
+ * Return 1 if no more processing is needed; otherwise return 0 to
+ * schedule a response to be sent, with the s_lock still held.
+ */
+static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
+                                    struct ipath_other_headers *ohdr,
+                                    void *data,
+                                    struct ipath_qp *qp,
+                                    u32 opcode,
+                                    u32 psn,
+                                    int diff,
+                                    int header_in_data)
+{
+       struct ib_reth *reth;
+
+       if (diff > 0) {
+               /*
+                * Packet sequence error.
+                * A NAK will ACK earlier sends and RDMA writes.
+                * Don't queue the NAK if an RDMA read, atomic, or
+                * NAK is pending though.
+                */
+               spin_lock(&qp->s_lock);
+               if ((qp->s_ack_state >= OP(RDMA_READ_REQUEST) &&
+                    qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE) ||
+                   qp->s_nak_state != 0) {
+                       spin_unlock(&qp->s_lock);
+                       goto done;
+               }
+               qp->s_ack_state = OP(SEND_ONLY);
+               qp->s_nak_state = IB_NAK_PSN_ERROR;
+               /* Use the expected PSN. */
+               qp->s_ack_psn = qp->r_psn;
+               goto resched;
+       }
+
+       /*
+        * Handle a duplicate request.  Don't re-execute SEND, RDMA
+        * write or atomic op.  Don't NAK errors, just silently drop
+        * the duplicate request.  Note that r_sge, r_len, and
+        * r_rcv_len may be in use so don't modify them.
+        *
+        * We are supposed to ACK the earliest duplicate PSN but we
+        * can coalesce an outstanding duplicate ACK.  We have to
+        * send the earliest so that RDMA reads can be restarted at
+        * the requester's expected PSN.
+        */
+       spin_lock(&qp->s_lock);
+       if (qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE &&
+           ipath_cmp24(psn, qp->s_ack_psn) >= 0) {
+               if (qp->s_ack_state < IB_OPCODE_RDMA_READ_REQUEST)
+                       qp->s_ack_psn = psn;
+               spin_unlock(&qp->s_lock);
+               goto done;
+       }
+       switch (opcode) {
+       case OP(RDMA_READ_REQUEST):
+               /*
+                * We have to be careful to not change s_rdma_sge
+                * while ipath_do_rc_send() is using it and not
+                * holding the s_lock.
+                */
+               if (qp->s_ack_state != OP(ACKNOWLEDGE) &&
+                   qp->s_ack_state >= IB_OPCODE_RDMA_READ_REQUEST) {
+                       spin_unlock(&qp->s_lock);
+                       dev->n_rdma_dup_busy++;
+                       goto done;
+               }
+               /* RETH comes after BTH */
+               if (!header_in_data)
+                       reth = &ohdr->u.rc.reth;
+               else {
+                       reth = (struct ib_reth *)data;
+                       data += sizeof(*reth);
+               }
+               qp->s_rdma_len = be32_to_cpu(reth->length);
+               if (qp->s_rdma_len != 0) {
+                       u32 rkey = be32_to_cpu(reth->rkey);
+                       u64 vaddr = be64_to_cpu(reth->vaddr);
+                       int ok;
+
+                       /*
+                        * Address range must be a subset of the original
+                        * request and start on pmtu boundaries.
+                        */
+                       ok = ipath_rkey_ok(dev, &qp->s_rdma_sge,
+                                          qp->s_rdma_len, vaddr, rkey,
+                                          IB_ACCESS_REMOTE_READ);
+                       if (unlikely(!ok))
+                               goto done;
+               } else {
+                       qp->s_rdma_sge.sg_list = NULL;
+                       qp->s_rdma_sge.num_sge = 0;
+                       qp->s_rdma_sge.sge.mr = NULL;
+                       qp->s_rdma_sge.sge.vaddr = NULL;
+                       qp->s_rdma_sge.sge.length = 0;
+                       qp->s_rdma_sge.sge.sge_length = 0;
+               }
+               break;
+
+       case OP(COMPARE_SWAP):
+       case OP(FETCH_ADD):
+               /*
+                * Check for the PSN of the last atomic operations
+                * performed and resend the result if found.
+                */
+               if ((psn & IPS_PSN_MASK) != qp->r_atomic_psn) {
+                       spin_unlock(&qp->s_lock);
+                       goto done;
+               }
+               qp->s_ack_atomic = qp->r_atomic_data;
+               break;
+       }
+       qp->s_ack_state = opcode;
+       qp->s_nak_state = 0;
+       qp->s_ack_psn = psn;
+resched:
+       return 0;
+
+done:
+       return 1;
+}
+
+/**
+ * ipath_rc_rcv - process an incoming RC packet
+ * @dev: the device this packet came in on
+ * @hdr: the header of this packet
+ * @has_grh: true if the header has a GRH
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP for this packet
+ *
+ * This is called from ipath_qp_rcv() to process an incoming RC packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+                 int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
+{
+       struct ipath_other_headers *ohdr;
+       u32 opcode;
+       u32 hdrsize;
+       u32 psn;
+       u32 pad;
+       unsigned long flags;
+       struct ib_wc wc;
+       u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+       int diff;
+       struct ib_reth *reth;
+       int header_in_data;
+
+       /* Check for GRH */
+       if (!has_grh) {
+               ohdr = &hdr->u.oth;
+               hdrsize = 8 + 12;       /* LRH + BTH */
+               psn = be32_to_cpu(ohdr->bth[2]);
+               header_in_data = 0;
+       } else {
+               ohdr = &hdr->u.l.oth;
+               hdrsize = 8 + 40 + 12;  /* LRH + GRH + BTH */
+               /*
+                * The header with GRH is 60 bytes and the core driver sets
+                * the eager header buffer size to 56 bytes so the last 4
+                * bytes of the BTH header (PSN) is in the data buffer.
+                */
+               header_in_data =
+                       ipath_layer_get_rcvhdrentsize(dev->dd) == 16;
+               if (header_in_data) {
+                       psn = be32_to_cpu(((__be32 *) data)[0]);
+                       data += sizeof(__be32);
+               } else
+                       psn = be32_to_cpu(ohdr->bth[2]);
+       }
+       /*
+        * The opcode is in the low byte when it's in network order
+        * (top byte when in host order).
+        */
+       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+
+       /*
+        * Process responses (ACKs) before anything else.  Note that the
+        * packet sequence number will be for something in the send work
+        * queue rather than the expected receive packet sequence number.
+        * In other words, this QP is the requester.
+        */
+       if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+           opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+               ipath_rc_rcv_resp(dev, ohdr, data, tlen, qp, opcode, psn,
+                                 hdrsize, pmtu, header_in_data);
+               goto bail;
+       }
+
+       spin_lock_irqsave(&qp->r_rq.lock, flags);
+
+       /* Compute 24 bits worth of difference. */
+       diff = ipath_cmp24(psn, qp->r_psn);
+       if (unlikely(diff)) {
+               if (ipath_rc_rcv_error(dev, ohdr, data, qp, opcode,
+                                      psn, diff, header_in_data))
+                       goto done;
+               goto resched;
+       }
+
+       /* Check for opcode sequence errors. */
+       switch (qp->r_state) {
+       case OP(SEND_FIRST):
+       case OP(SEND_MIDDLE):
+               if (opcode == OP(SEND_MIDDLE) ||
+                   opcode == OP(SEND_LAST) ||
+                   opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+                       break;
+       nack_inv:
+       /*
+        * A NAK will ACK earlier sends and RDMA writes.  Don't queue the
+        * NAK if an RDMA read, atomic, or NAK is pending though.
+        */
+       spin_lock(&qp->s_lock);
+       if (qp->s_ack_state >= OP(RDMA_READ_REQUEST) &&
+           qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE) {
+               spin_unlock(&qp->s_lock);
+               goto done;
+       }
+       /* XXX Flush WQEs */
+       qp->state = IB_QPS_ERR;
+       qp->s_ack_state = OP(SEND_ONLY);
+       qp->s_nak_state = IB_NAK_INVALID_REQUEST;
+       qp->s_ack_psn = qp->r_psn;
+       goto resched;
+
+       case OP(RDMA_WRITE_FIRST):
+       case OP(RDMA_WRITE_MIDDLE):
+               if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+                   opcode == OP(RDMA_WRITE_LAST) ||
+                   opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+                       break;
+               goto nack_inv;
+
+       case OP(RDMA_READ_REQUEST):
+       case OP(COMPARE_SWAP):
+       case OP(FETCH_ADD):
+               /*
+                * Drop all new requests until a response has been sent.  A
+                * new request then ACKs the RDMA response we sent.  Relaxed
+                * ordering would allow new requests to be processed but we
+                * would need to keep a queue of rwqe's for all that are in
+                * progress.  Note that we can't RNR NAK this request since
+                * the RDMA READ or atomic response is already queued to be
+                * sent (unless we implement a response send queue).
+                */
+               goto done;
+
+       default:
+               if (opcode == OP(SEND_MIDDLE) ||
+                   opcode == OP(SEND_LAST) ||
+                   opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
+                   opcode == OP(RDMA_WRITE_MIDDLE) ||
+                   opcode == OP(RDMA_WRITE_LAST) ||
+                   opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+                       goto nack_inv;
+               break;
+       }
+
+       wc.imm_data = 0;
+       wc.wc_flags = 0;
+
+       /* OK, process the packet. */
+       switch (opcode) {
+       case OP(SEND_FIRST):
+               if (!ipath_get_rwqe(qp, 0)) {
+               rnr_nak:
+                       /*
+                        * A RNR NAK will ACK earlier sends and RDMA writes.
+                        * Don't queue the NAK if an RDMA read or atomic
+                        * is pending though.
+                        */
+                       spin_lock(&qp->s_lock);
+                       if (qp->s_ack_state >=
+                           OP(RDMA_READ_REQUEST) &&
+                           qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE) {
+                               spin_unlock(&qp->s_lock);
+                               goto done;
+                       }
+                       qp->s_ack_state = OP(SEND_ONLY);
+                       qp->s_nak_state = IB_RNR_NAK | qp->s_min_rnr_timer;
+                       qp->s_ack_psn = qp->r_psn;
+                       goto resched;
+               }
+               qp->r_rcv_len = 0;
+               /* FALLTHROUGH */
+       case OP(SEND_MIDDLE):
+       case OP(RDMA_WRITE_MIDDLE):
+       send_middle:
+               /* Check for invalid length (one PMTU) or overrunning rwqe len. */
+               if (unlikely(tlen != (hdrsize + pmtu + 4)))
+                       goto nack_inv;
+               qp->r_rcv_len += pmtu;
+               if (unlikely(qp->r_rcv_len > qp->r_len))
+                       goto nack_inv;
+               ipath_copy_sge(&qp->r_sge, data, pmtu);
+               break;
+
+       case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+               /* consume RWQE */
+               if (!ipath_get_rwqe(qp, 1))
+                       goto rnr_nak;
+               goto send_last_imm;
+
+       case OP(SEND_ONLY):
+       case OP(SEND_ONLY_WITH_IMMEDIATE):
+               if (!ipath_get_rwqe(qp, 0))
+                       goto rnr_nak;
+               qp->r_rcv_len = 0;
+               if (opcode == OP(SEND_ONLY))
+                       goto send_last;
+               /* FALLTHROUGH */
+       case OP(SEND_LAST_WITH_IMMEDIATE):
+       send_last_imm:
+               if (header_in_data) {
+                       wc.imm_data = *(__be32 *) data;
+                       data += sizeof(__be32);
+               } else {
+                       /* Immediate data comes after BTH */
+                       wc.imm_data = ohdr->u.imm_data;
+               }
+               hdrsize += 4;
+               wc.wc_flags = IB_WC_WITH_IMM;
+               /* FALLTHROUGH */
+       case OP(SEND_LAST):
+       case OP(RDMA_WRITE_LAST):
+       send_last:
+               /* Get the number of bytes the message was padded by. */
+               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+               /* Check for invalid length. */
+               /* XXX LAST len should be >= 1 */
+               if (unlikely(tlen < (hdrsize + pad + 4)))
+                       goto nack_inv;
+               /* Don't count the CRC. */
+               tlen -= (hdrsize + pad + 4);
+               wc.byte_len = tlen + qp->r_rcv_len;
+               if (unlikely(wc.byte_len > qp->r_len))
+                       goto nack_inv;
+               ipath_copy_sge(&qp->r_sge, data, tlen);
+               atomic_inc(&qp->msn);
+               if (opcode == OP(RDMA_WRITE_LAST) ||
+                   opcode == OP(RDMA_WRITE_ONLY))
+                       break;
+               wc.wr_id = qp->r_wr_id;
+               wc.status = IB_WC_SUCCESS;
+               wc.opcode = IB_WC_RECV;
+               wc.vendor_err = 0;
+               wc.qp_num = qp->ibqp.qp_num;
+               wc.src_qp = qp->remote_qpn;
+               wc.pkey_index = 0;
+               wc.slid = qp->remote_ah_attr.dlid;
+               wc.sl = qp->remote_ah_attr.sl;
+               wc.dlid_path_bits = 0;
+               wc.port_num = 0;
+               /* Signal completion event if the solicited bit is set. */
+               ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+                              (ohdr->bth[0] &
+                               __constant_cpu_to_be32(1 << 23)) != 0);
+               break;
+
+       case OP(RDMA_WRITE_FIRST):
+       case OP(RDMA_WRITE_ONLY):
+       case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+               /* consume RWQE */
+               /* RETH comes after BTH */
+               if (!header_in_data)
+                       reth = &ohdr->u.rc.reth;
+               else {
+                       reth = (struct ib_reth *)data;
+                       data += sizeof(*reth);
+               }
+               hdrsize += sizeof(*reth);
+               qp->r_len = be32_to_cpu(reth->length);
+               qp->r_rcv_len = 0;
+               if (qp->r_len != 0) {
+                       u32 rkey = be32_to_cpu(reth->rkey);
+                       u64 vaddr = be64_to_cpu(reth->vaddr);
+                       int ok;
+
+                       /* Check rkey & NAK */
+                       ok = ipath_rkey_ok(dev, &qp->r_sge,
+                                          qp->r_len, vaddr, rkey,
+                                          IB_ACCESS_REMOTE_WRITE);
+                       if (unlikely(!ok)) {
+                       nack_acc:
+                               /*
+                                * A NAK will ACK earlier sends and RDMA
+                                * writes.  Don't queue the NAK if an RDMA
+                                * read, atomic, or NAK is pending though.
+                                */
+                               spin_lock(&qp->s_lock);
+                               if (qp->s_ack_state >=
+                                   OP(RDMA_READ_REQUEST) &&
+                                   qp->s_ack_state !=
+                                   IB_OPCODE_ACKNOWLEDGE) {
+                                       spin_unlock(&qp->s_lock);
+                                       goto done;
+                               }
+                               /* XXX Flush WQEs */
+                               qp->state = IB_QPS_ERR;
+                               qp->s_ack_state = OP(RDMA_WRITE_ONLY);
+                               qp->s_nak_state =
+                                       IB_NAK_REMOTE_ACCESS_ERROR;
+                               qp->s_ack_psn = qp->r_psn;
+                               goto resched;
+                       }
+               } else {
+                       qp->r_sge.sg_list = NULL;
+                       qp->r_sge.sge.mr = NULL;
+                       qp->r_sge.sge.vaddr = NULL;
+                       qp->r_sge.sge.length = 0;
+                       qp->r_sge.sge.sge_length = 0;
+               }
+               if (unlikely(!(qp->qp_access_flags &
+                              IB_ACCESS_REMOTE_WRITE)))
+                       goto nack_acc;
+               if (opcode == OP(RDMA_WRITE_FIRST))
+                       goto send_middle;
+               else if (opcode == OP(RDMA_WRITE_ONLY))
+                       goto send_last;
+               if (!ipath_get_rwqe(qp, 1))
+                       goto rnr_nak;
+               goto send_last_imm;
+
+       case OP(RDMA_READ_REQUEST):
+               /* RETH comes after BTH */
+               if (!header_in_data)
+                       reth = &ohdr->u.rc.reth;
+               else {
+                       reth = (struct ib_reth *)data;
+                       data += sizeof(*reth);
+               }
+               spin_lock(&qp->s_lock);
+               if (qp->s_ack_state != OP(ACKNOWLEDGE) &&
+                   qp->s_ack_state >= IB_OPCODE_RDMA_READ_REQUEST) {
+                       spin_unlock(&qp->s_lock);
+                       goto done;
+               }
+               qp->s_rdma_len = be32_to_cpu(reth->length);
+               if (qp->s_rdma_len != 0) {
+                       u32 rkey = be32_to_cpu(reth->rkey);
+                       u64 vaddr = be64_to_cpu(reth->vaddr);
+                       int ok;
+
+                       /* Check rkey & NAK */
+                       ok = ipath_rkey_ok(dev, &qp->s_rdma_sge,
+                                          qp->s_rdma_len, vaddr, rkey,
+                                          IB_ACCESS_REMOTE_READ);
+                       if (unlikely(!ok)) {
+                               spin_unlock(&qp->s_lock);
+                               goto nack_acc;
+                       }
+                       /*
+                        * Update the next expected PSN.  We add 1 later
+                        * below, so only add the remainder here.
+                        */
+                       if (qp->s_rdma_len > pmtu)
+                               qp->r_psn += (qp->s_rdma_len - 1) / pmtu;
+               } else {
+                       qp->s_rdma_sge.sg_list = NULL;
+                       qp->s_rdma_sge.num_sge = 0;
+                       qp->s_rdma_sge.sge.mr = NULL;
+                       qp->s_rdma_sge.sge.vaddr = NULL;
+                       qp->s_rdma_sge.sge.length = 0;
+                       qp->s_rdma_sge.sge.sge_length = 0;
+               }
+               if (unlikely(!(qp->qp_access_flags &
+                              IB_ACCESS_REMOTE_READ)))
+                       goto nack_acc;
+               /*
+                * We need to increment the MSN here instead of when we
+                * finish sending the result since a duplicate request would
+                * increment it more than once.
+                */
+               atomic_inc(&qp->msn);
+               qp->s_ack_state = opcode;
+               qp->s_nak_state = 0;
+               qp->s_ack_psn = psn;
+               qp->r_psn++;
+               qp->r_state = opcode;
+               goto rdmadone;
+
+       case OP(COMPARE_SWAP):
+       case OP(FETCH_ADD): {
+               struct ib_atomic_eth *ateth;
+               u64 vaddr;
+               u64 sdata;
+               u32 rkey;
+
+               if (!header_in_data)
+                       ateth = &ohdr->u.atomic_eth;
+               else {
+                       ateth = (struct ib_atomic_eth *)data;
+                       data += sizeof(*ateth);
+               }
+               vaddr = be64_to_cpu(ateth->vaddr);
+               if (unlikely(vaddr & (sizeof(u64) - 1)))
+                       goto nack_inv;
+               rkey = be32_to_cpu(ateth->rkey);
+               /* Check rkey & NAK */
+               if (unlikely(!ipath_rkey_ok(dev, &qp->r_sge,
+                                           sizeof(u64), vaddr, rkey,
+                                           IB_ACCESS_REMOTE_ATOMIC)))
+                       goto nack_acc;
+               if (unlikely(!(qp->qp_access_flags &
+                              IB_ACCESS_REMOTE_ATOMIC)))
+                       goto nack_acc;
+               /* Perform atomic OP and save result. */
+               sdata = be64_to_cpu(ateth->swap_data);
+               spin_lock(&dev->pending_lock);
+               qp->r_atomic_data = *(u64 *) qp->r_sge.sge.vaddr;
+               if (opcode == OP(FETCH_ADD))
+                       *(u64 *) qp->r_sge.sge.vaddr =
+                               qp->r_atomic_data + sdata;
+               else if (qp->r_atomic_data ==
+                        be64_to_cpu(ateth->compare_data))
+                       *(u64 *) qp->r_sge.sge.vaddr = sdata;
+               spin_unlock(&dev->pending_lock);
+               atomic_inc(&qp->msn);
+               qp->r_atomic_psn = psn & IPS_PSN_MASK;
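+               /*
+                * The top bit of the PSN word carries the BTH AckReq flag
+                * (tested below); set it so an ACK is always scheduled,
+                * since atomic operations require a response.
+                */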
+               psn |= 1 << 31;
+               break;
+       }
+
+       default:
+               /* Drop packet for unknown opcodes. */
+               goto done;
+       }
+       qp->r_psn++;
+       qp->r_state = opcode;
+       /* Send an ACK if requested or required. */
+       if (psn & (1 << 31)) {
+               /*
+                * Coalesce ACKs unless there is a RDMA READ or
+                * ATOMIC pending.
+                */
+               spin_lock(&qp->s_lock);
+               if (qp->s_ack_state == OP(ACKNOWLEDGE) ||
+                   qp->s_ack_state < IB_OPCODE_RDMA_READ_REQUEST) {
+                       qp->s_ack_state = opcode;
+                       qp->s_nak_state = 0;
+                       qp->s_ack_psn = psn;
+                       qp->s_ack_atomic = qp->r_atomic_data;
+                       goto resched;
+               }
+               spin_unlock(&qp->s_lock);
+       }
+done:
+       spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+       goto bail;
+
+resched:
+       /*
+        * Try to send ACK right away but not if ipath_do_rc_send() is
+        * active.
+        */
+       if (qp->s_hdrwords == 0 &&
+           (qp->s_ack_state < IB_OPCODE_RDMA_READ_REQUEST ||
+            qp->s_ack_state >= IB_OPCODE_COMPARE_SWAP))
+               send_rc_ack(qp);
+
+rdmadone:
+       spin_unlock(&qp->s_lock);
+       spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+
+       /* Call ipath_do_rc_send() in another thread. */
+       tasklet_hi_schedule(&qp->s_task);
+
+bail:
+       return;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_registers.h b/drivers/infiniband/hw/ipath/ipath_registers.h
new file mode 100644 (file)
index 0000000..1e59750
--- /dev/null
@@ -0,0 +1,446 @@
+/*
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPATH_REGISTERS_H
+#define _IPATH_REGISTERS_H
+
+/*
+ * This file should only be included by kernel source, and by the diags.
+ * It defines the registers and their contents for the InfiniPath HT-400 chip.
+ */
+
+/*
+ * These are the InfiniPath register and buffer bit definitions
+ * that are visible to software and needed only by the kernel
+ * and diag code.  A few that are visible to protocol and user
+ * code are in ipath_common.h.  Some bits are specific
+ * to a given chip implementation and have been moved to the
+ * chip-specific source file.
+ */
+
+/* kr_revision bits */
+#define INFINIPATH_R_CHIPREVMINOR_MASK 0xFF
+#define INFINIPATH_R_CHIPREVMINOR_SHIFT 0
+#define INFINIPATH_R_CHIPREVMAJOR_MASK 0xFF
+#define INFINIPATH_R_CHIPREVMAJOR_SHIFT 8
+#define INFINIPATH_R_ARCH_MASK 0xFF
+#define INFINIPATH_R_ARCH_SHIFT 16
+#define INFINIPATH_R_SOFTWARE_MASK 0xFF
+#define INFINIPATH_R_SOFTWARE_SHIFT 24
+#define INFINIPATH_R_BOARDID_MASK 0xFF
+#define INFINIPATH_R_BOARDID_SHIFT 32
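+
+/*
+ * Illustration only: fields such as those in kr_revision are decoded by
+ * shifting down first and then masking, e.g.
+ *
+ *     majrev = (u32) ((revision >> INFINIPATH_R_CHIPREVMAJOR_SHIFT) &
+ *                     INFINIPATH_R_CHIPREVMAJOR_MASK);
+ */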
+
+/* kr_control bits */
+#define INFINIPATH_C_FREEZEMODE 0x00000002
+#define INFINIPATH_C_LINKENABLE 0x00000004
+#define INFINIPATH_C_RESET 0x00000001
+
+/* kr_sendctrl bits */
+#define INFINIPATH_S_DISARMPIOBUF_SHIFT 16
+
+#define IPATH_S_ABORT          0
+#define IPATH_S_PIOINTBUFAVAIL 1
+#define IPATH_S_PIOBUFAVAILUPD 2
+#define IPATH_S_PIOENABLE      3
+#define IPATH_S_DISARM         31
+
+#define INFINIPATH_S_ABORT             (1U << IPATH_S_ABORT)
+#define INFINIPATH_S_PIOINTBUFAVAIL    (1U << IPATH_S_PIOINTBUFAVAIL)
+#define INFINIPATH_S_PIOBUFAVAILUPD    (1U << IPATH_S_PIOBUFAVAILUPD)
+#define INFINIPATH_S_PIOENABLE         (1U << IPATH_S_PIOENABLE)
+#define INFINIPATH_S_DISARM            (1U << IPATH_S_DISARM)
+
+/* kr_rcvctrl bits */
+#define INFINIPATH_R_PORTENABLE_SHIFT 0
+#define INFINIPATH_R_INTRAVAIL_SHIFT 16
+#define INFINIPATH_R_TAILUPD   0x80000000
+
+/* kr_intstatus, kr_intclear, kr_intmask bits */
+#define INFINIPATH_I_RCVURG_SHIFT 0
+#define INFINIPATH_I_RCVAVAIL_SHIFT 12
+#define INFINIPATH_I_ERROR        0x80000000
+#define INFINIPATH_I_SPIOSENT     0x40000000
+#define INFINIPATH_I_SPIOBUFAVAIL 0x20000000
+#define INFINIPATH_I_GPIO         0x10000000
+
+/* kr_errorstatus, kr_errorclear, kr_errormask bits */
+#define INFINIPATH_E_RFORMATERR      0x0000000000000001ULL
+#define INFINIPATH_E_RVCRC           0x0000000000000002ULL
+#define INFINIPATH_E_RICRC           0x0000000000000004ULL
+#define INFINIPATH_E_RMINPKTLEN      0x0000000000000008ULL
+#define INFINIPATH_E_RMAXPKTLEN      0x0000000000000010ULL
+#define INFINIPATH_E_RLONGPKTLEN     0x0000000000000020ULL
+#define INFINIPATH_E_RSHORTPKTLEN    0x0000000000000040ULL
+#define INFINIPATH_E_RUNEXPCHAR      0x0000000000000080ULL
+#define INFINIPATH_E_RUNSUPVL        0x0000000000000100ULL
+#define INFINIPATH_E_REBP            0x0000000000000200ULL
+#define INFINIPATH_E_RIBFLOW         0x0000000000000400ULL
+#define INFINIPATH_E_RBADVERSION     0x0000000000000800ULL
+#define INFINIPATH_E_RRCVEGRFULL     0x0000000000001000ULL
+#define INFINIPATH_E_RRCVHDRFULL     0x0000000000002000ULL
+#define INFINIPATH_E_RBADTID         0x0000000000004000ULL
+#define INFINIPATH_E_RHDRLEN         0x0000000000008000ULL
+#define INFINIPATH_E_RHDR            0x0000000000010000ULL
+#define INFINIPATH_E_RIBLOSTLINK     0x0000000000020000ULL
+#define INFINIPATH_E_SMINPKTLEN      0x0000000020000000ULL
+#define INFINIPATH_E_SMAXPKTLEN      0x0000000040000000ULL
+#define INFINIPATH_E_SUNDERRUN       0x0000000080000000ULL
+#define INFINIPATH_E_SPKTLEN         0x0000000100000000ULL
+#define INFINIPATH_E_SDROPPEDSMPPKT  0x0000000200000000ULL
+#define INFINIPATH_E_SDROPPEDDATAPKT 0x0000000400000000ULL
+#define INFINIPATH_E_SPIOARMLAUNCH   0x0000000800000000ULL
+#define INFINIPATH_E_SUNEXPERRPKTNUM 0x0000001000000000ULL
+#define INFINIPATH_E_SUNSUPVL        0x0000002000000000ULL
+#define INFINIPATH_E_IBSTATUSCHANGED 0x0001000000000000ULL
+#define INFINIPATH_E_INVALIDADDR     0x0002000000000000ULL
+#define INFINIPATH_E_RESET           0x0004000000000000ULL
+#define INFINIPATH_E_HARDWARE        0x0008000000000000ULL
+
+/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */
+/* TXEMEMPARITYERR bit 0: PIObuf, 1: PIOpbc, 2: launchfifo
+ * RXEMEMPARITYERR bit 0: rcvbuf, 1: lookupq, 2: eagerTID, 3: expTID
+ *             bit 4: flag buffer, 5: datainfo, 6: header info */
+#define INFINIPATH_HWE_TXEMEMPARITYERR_MASK 0xFULL
+#define INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT 40
+#define INFINIPATH_HWE_RXEMEMPARITYERR_MASK 0x7FULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT 44
+#define INFINIPATH_HWE_RXDSYNCMEMPARITYERR  0x0000000400000000ULL
+#define INFINIPATH_HWE_MEMBISTFAILED        0x0040000000000000ULL
+#define INFINIPATH_HWE_IBCBUSTOSPCPARITYERR 0x4000000000000000ULL
+#define INFINIPATH_HWE_IBCBUSFRSPCPARITYERR 0x8000000000000000ULL
+
+/* kr_hwdiagctrl bits */
+#define INFINIPATH_DC_FORCETXEMEMPARITYERR_MASK 0xFULL
+#define INFINIPATH_DC_FORCETXEMEMPARITYERR_SHIFT 40
+#define INFINIPATH_DC_FORCERXEMEMPARITYERR_MASK 0x7FULL
+#define INFINIPATH_DC_FORCERXEMEMPARITYERR_SHIFT 44
+#define INFINIPATH_DC_FORCERXDSYNCMEMPARITYERR  0x0000000400000000ULL
+#define INFINIPATH_DC_COUNTERDISABLE            0x1000000000000000ULL
+#define INFINIPATH_DC_COUNTERWREN               0x2000000000000000ULL
+#define INFINIPATH_DC_FORCEIBCBUSTOSPCPARITYERR 0x4000000000000000ULL
+#define INFINIPATH_DC_FORCEIBCBUSFRSPCPARITYERR 0x8000000000000000ULL
+
+/* kr_ibcctrl bits */
+#define INFINIPATH_IBCC_FLOWCTRLPERIOD_MASK 0xFFULL
+#define INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT 0
+#define INFINIPATH_IBCC_FLOWCTRLWATERMARK_MASK 0xFFULL
+#define INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT 8
+#define INFINIPATH_IBCC_LINKINITCMD_MASK 0x3ULL
+#define INFINIPATH_IBCC_LINKINITCMD_DISABLE 1
+#define INFINIPATH_IBCC_LINKINITCMD_POLL 2     /* cycle through TS1/TS2 till OK */
+#define INFINIPATH_IBCC_LINKINITCMD_SLEEP 3    /* wait for TS1, then go on */
+#define INFINIPATH_IBCC_LINKINITCMD_SHIFT 16
+#define INFINIPATH_IBCC_LINKCMD_MASK 0x3ULL
+#define INFINIPATH_IBCC_LINKCMD_INIT 1 /* move to 0x11 */
+#define INFINIPATH_IBCC_LINKCMD_ARMED 2        /* move to 0x21 */
+#define INFINIPATH_IBCC_LINKCMD_ACTIVE 3       /* move to 0x31 */
+#define INFINIPATH_IBCC_LINKCMD_SHIFT 18
+#define INFINIPATH_IBCC_MAXPKTLEN_MASK 0x7FFULL
+#define INFINIPATH_IBCC_MAXPKTLEN_SHIFT 20
+#define INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK 0xFULL
+#define INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT 32
+#define INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK 0xFULL
+#define INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT 36
+#define INFINIPATH_IBCC_CREDITSCALE_MASK 0x7ULL
+#define INFINIPATH_IBCC_CREDITSCALE_SHIFT 40
+#define INFINIPATH_IBCC_LOOPBACK             0x8000000000000000ULL
+#define INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE 0x4000000000000000ULL
+
+/* kr_ibcstatus bits */
+#define INFINIPATH_IBCS_LINKTRAININGSTATE_MASK 0xF
+#define INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT 0
+#define INFINIPATH_IBCS_LINKSTATE_MASK 0x7
+#define INFINIPATH_IBCS_LINKSTATE_SHIFT 4
+#define INFINIPATH_IBCS_TXREADY       0x40000000
+#define INFINIPATH_IBCS_TXCREDITOK    0x80000000
+/* link training states (shift by INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) */
+#define INFINIPATH_IBCS_LT_STATE_DISABLED      0x00
+#define INFINIPATH_IBCS_LT_STATE_LINKUP                0x01
+#define INFINIPATH_IBCS_LT_STATE_POLLACTIVE    0x02
+#define INFINIPATH_IBCS_LT_STATE_POLLQUIET     0x03
+#define INFINIPATH_IBCS_LT_STATE_SLEEPDELAY    0x04
+#define INFINIPATH_IBCS_LT_STATE_SLEEPQUIET    0x05
+#define INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE   0x08
+#define INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG    0x09
+#define INFINIPATH_IBCS_LT_STATE_CFGWAITRMT    0x0a
+#define INFINIPATH_IBCS_LT_STATE_CFGIDLE       0x0b
+#define INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN        0x0c
+#define INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT        0x0e
+#define INFINIPATH_IBCS_LT_STATE_RECOVERIDLE   0x0f
+/* link state machine states (shift by INFINIPATH_IBCS_LINKSTATE_SHIFT) */
+#define INFINIPATH_IBCS_L_STATE_DOWN           0x0
+#define INFINIPATH_IBCS_L_STATE_INIT           0x1
+#define INFINIPATH_IBCS_L_STATE_ARM            0x2
+#define INFINIPATH_IBCS_L_STATE_ACTIVE         0x3
+#define INFINIPATH_IBCS_L_STATE_ACT_DEFER      0x4
+
+/* combination link status states that we use with some frequency */
+#define IPATH_IBSTATE_MASK ((INFINIPATH_IBCS_LINKTRAININGSTATE_MASK \
+               << INFINIPATH_IBCS_LINKSTATE_SHIFT) | \
+               (INFINIPATH_IBCS_LINKSTATE_MASK \
+               <<INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT))
+#define IPATH_IBSTATE_INIT ((INFINIPATH_IBCS_L_STATE_INIT \
+               << INFINIPATH_IBCS_LINKSTATE_SHIFT) | \
+               (INFINIPATH_IBCS_LT_STATE_LINKUP \
+               <<INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT))
+#define IPATH_IBSTATE_ARM ((INFINIPATH_IBCS_L_STATE_ARM \
+               << INFINIPATH_IBCS_LINKSTATE_SHIFT) | \
+               (INFINIPATH_IBCS_LT_STATE_LINKUP \
+               <<INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT))
+#define IPATH_IBSTATE_ACTIVE ((INFINIPATH_IBCS_L_STATE_ACTIVE \
+               << INFINIPATH_IBCS_LINKSTATE_SHIFT) | \
+               (INFINIPATH_IBCS_LT_STATE_LINKUP \
+               <<INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT))
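+
+/*
+ * Illustration only (not used elsewhere in this header): the combination
+ * states above are intended to be compared against the masked value of
+ * kr_ibcstatus, e.g.
+ *
+ *     if ((ibcstatus & IPATH_IBSTATE_MASK) == IPATH_IBSTATE_ACTIVE)
+ *             the link is trained and in the Active state
+ */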
+
+/* kr_extstatus bits */
+#define INFINIPATH_EXTS_SERDESPLLLOCK 0x1
+#define INFINIPATH_EXTS_GPIOIN_MASK 0xFFFFULL
+#define INFINIPATH_EXTS_GPIOIN_SHIFT 48
+
+/* kr_extctrl bits */
+#define INFINIPATH_EXTC_GPIOINVERT_MASK 0xFFFFULL
+#define INFINIPATH_EXTC_GPIOINVERT_SHIFT 32
+#define INFINIPATH_EXTC_GPIOOE_MASK 0xFFFFULL
+#define INFINIPATH_EXTC_GPIOOE_SHIFT 48
+#define INFINIPATH_EXTC_SERDESENABLE         0x80000000ULL
+#define INFINIPATH_EXTC_SERDESCONNECT        0x40000000ULL
+#define INFINIPATH_EXTC_SERDESENTRUNKING     0x20000000ULL
+#define INFINIPATH_EXTC_SERDESDISRXFIFO      0x10000000ULL
+#define INFINIPATH_EXTC_SERDESENPLPBK1       0x08000000ULL
+#define INFINIPATH_EXTC_SERDESENPLPBK2       0x04000000ULL
+#define INFINIPATH_EXTC_SERDESENENCDEC       0x02000000ULL
+#define INFINIPATH_EXTC_LED1SECPORT_ON       0x00000020ULL
+#define INFINIPATH_EXTC_LED2SECPORT_ON       0x00000010ULL
+#define INFINIPATH_EXTC_LED1PRIPORT_ON       0x00000008ULL
+#define INFINIPATH_EXTC_LED2PRIPORT_ON       0x00000004ULL
+#define INFINIPATH_EXTC_LEDGBLOK_ON          0x00000002ULL
+#define INFINIPATH_EXTC_LEDGBLERR_OFF        0x00000001ULL
+
+/* kr_mdio bits */
+#define INFINIPATH_MDIO_CLKDIV_MASK 0x7FULL
+#define INFINIPATH_MDIO_CLKDIV_SHIFT 32
+#define INFINIPATH_MDIO_COMMAND_MASK 0x7ULL
+#define INFINIPATH_MDIO_COMMAND_SHIFT 26
+#define INFINIPATH_MDIO_DEVADDR_MASK 0x1FULL
+#define INFINIPATH_MDIO_DEVADDR_SHIFT 21
+#define INFINIPATH_MDIO_REGADDR_MASK 0x1FULL
+#define INFINIPATH_MDIO_REGADDR_SHIFT 16
+#define INFINIPATH_MDIO_DATA_MASK 0xFFFFULL
+#define INFINIPATH_MDIO_DATA_SHIFT 0
+#define INFINIPATH_MDIO_CMDVALID    0x0000000040000000ULL
+#define INFINIPATH_MDIO_RDDATAVALID 0x0000000080000000ULL
+
+/* kr_partitionkey bits */
+#define INFINIPATH_PKEY_SIZE 16
+#define INFINIPATH_PKEY_MASK 0xFFFF
+#define INFINIPATH_PKEY_DEFAULT_PKEY 0xFFFF
+
+/* kr_serdesconfig0 bits */
+#define INFINIPATH_SERDC0_RESET_MASK  0xfULL   /* overall reset bits */
+#define INFINIPATH_SERDC0_RESET_PLL   0x10000000ULL    /* pll reset */
+#define INFINIPATH_SERDC0_TXIDLE      0xF000ULL        /* tx idle enables (per lane) */
+#define INFINIPATH_SERDC0_RXDETECT_EN 0xF0000ULL       /* rx detect enables (per lane) */
+#define INFINIPATH_SERDC0_L1PWR_DN      0xF0ULL        /* L1 Power down; use with RXDETECT,
+                                                          otherwise not used on IB side */
+
+/* kr_xgxsconfig bits */
+#define INFINIPATH_XGXS_RESET          0x7ULL
+#define INFINIPATH_XGXS_MDIOADDR_MASK  0xfULL
+#define INFINIPATH_XGXS_MDIOADDR_SHIFT 4
+
+#define INFINIPATH_RT_ADDR_MASK 0xFFFFFFFFFFULL        /* 40 bits valid */
+
+/* TID entries (memory), HT400-only */
+#define INFINIPATH_RT_VALID 0x8000000000000000ULL
+#define INFINIPATH_RT_ADDR_SHIFT 0
+#define INFINIPATH_RT_BUFSIZE_MASK 0x3FFF
+#define INFINIPATH_RT_BUFSIZE_SHIFT 48
+
+/*
+ * IPATH_PIO_MAXIBHDR is the max IB header size allowed for in our
+ * PIO send buffers.  This is well beyond anything currently
+ * defined in the InfiniBand spec.
+ */
+#define IPATH_PIO_MAXIBHDR 128
+
+typedef u64 ipath_err_t;
+
+/* mask of defined bits for various registers */
+extern u64 infinipath_i_bitsextant;
+extern ipath_err_t infinipath_e_bitsextant, infinipath_hwe_bitsextant;
+
+/* masks that are different in various chips, or only exist in some chips */
+extern u32 infinipath_i_rcvavail_mask, infinipath_i_rcvurg_mask;
+
+/*
+ * register bits for selecting i2c direction and values, used for I2C serial
+ * flash
+ */
+extern u16 ipath_gpio_sda_num, ipath_gpio_scl_num;
+extern u64 ipath_gpio_sda, ipath_gpio_scl;
+
+/*
+ * These are the infinipath general register numbers (not offsets).
+ * The kernel registers are used directly, those beyond the kernel
+ * registers are calculated from one of the base registers.  The use of
+ * an integer type doesn't allow type-checking as thorough as, say,
+ * an enum but allows for better hiding of chip differences.
+ */
+typedef const u16 ipath_kreg,  /* infinipath general registers */
+ ipath_creg,                   /* infinipath counter registers */
+ ipath_sreg;                   /* kernel-only, infinipath send registers */
+
+/*
+ * These are the chip registers common to all infinipath chips, and
+ * used both by the kernel and the diagnostics or other user code.
+ * They are all implemented such that 64 bit accesses work.
+ * Some implement no more than 32 bits.  Because 64 bit reads
+ * require 2 HT cmds on opteron, we access those with 32 bit
+ * reads for efficiency (they are written as 64 bits, since
+ * the extra 32 bits are nearly free on writes, and it slightly reduces
+ * complexity).  The rest are all accessed as 64 bits.
+ */
+struct ipath_kregs {
+       /* These are the 32 bit group */
+       ipath_kreg kr_control;
+       ipath_kreg kr_counterregbase;
+       ipath_kreg kr_intmask;
+       ipath_kreg kr_intstatus;
+       ipath_kreg kr_pagealign;
+       ipath_kreg kr_portcnt;
+       ipath_kreg kr_rcvtidbase;
+       ipath_kreg kr_rcvtidcnt;
+       ipath_kreg kr_rcvegrbase;
+       ipath_kreg kr_rcvegrcnt;
+       ipath_kreg kr_scratch;
+       ipath_kreg kr_sendctrl;
+       ipath_kreg kr_sendpiobufbase;
+       ipath_kreg kr_sendpiobufcnt;
+       ipath_kreg kr_sendpiosize;
+       ipath_kreg kr_sendregbase;
+       ipath_kreg kr_userregbase;
+       /* These are the 64 bit group */
+       ipath_kreg kr_debugport;
+       ipath_kreg kr_debugportselect;
+       ipath_kreg kr_errorclear;
+       ipath_kreg kr_errormask;
+       ipath_kreg kr_errorstatus;
+       ipath_kreg kr_extctrl;
+       ipath_kreg kr_extstatus;
+       ipath_kreg kr_gpio_clear;
+       ipath_kreg kr_gpio_mask;
+       ipath_kreg kr_gpio_out;
+       ipath_kreg kr_gpio_status;
+       ipath_kreg kr_hwdiagctrl;
+       ipath_kreg kr_hwerrclear;
+       ipath_kreg kr_hwerrmask;
+       ipath_kreg kr_hwerrstatus;
+       ipath_kreg kr_ibcctrl;
+       ipath_kreg kr_ibcstatus;
+       ipath_kreg kr_intblocked;
+       ipath_kreg kr_intclear;
+       ipath_kreg kr_interruptconfig;
+       ipath_kreg kr_mdio;
+       ipath_kreg kr_partitionkey;
+       ipath_kreg kr_rcvbthqp;
+       ipath_kreg kr_rcvbufbase;
+       ipath_kreg kr_rcvbufsize;
+       ipath_kreg kr_rcvctrl;
+       ipath_kreg kr_rcvhdrcnt;
+       ipath_kreg kr_rcvhdrentsize;
+       ipath_kreg kr_rcvhdrsize;
+       ipath_kreg kr_rcvintmembase;
+       ipath_kreg kr_rcvintmemsize;
+       ipath_kreg kr_revision;
+       ipath_kreg kr_sendbuffererror;
+       ipath_kreg kr_sendpioavailaddr;
+       ipath_kreg kr_serdesconfig0;
+       ipath_kreg kr_serdesconfig1;
+       ipath_kreg kr_serdesstatus;
+       ipath_kreg kr_txintmembase;
+       ipath_kreg kr_txintmemsize;
+       ipath_kreg kr_xgxsconfig;
+       ipath_kreg kr_ibpllcfg;
+       /* use these two (and the following N ports) only with ipath_k*_kreg64_port();
+        * not *kreg64() */
+       ipath_kreg kr_rcvhdraddr;
+       ipath_kreg kr_rcvhdrtailaddr;
+
+       /* remaining registers are not present on all types of infinipath chips  */
+       ipath_kreg kr_rcvpktledcnt;
+       ipath_kreg kr_pcierbuftestreg0;
+       ipath_kreg kr_pcierbuftestreg1;
+       ipath_kreg kr_pcieq0serdesconfig0;
+       ipath_kreg kr_pcieq0serdesconfig1;
+       ipath_kreg kr_pcieq0serdesstatus;
+       ipath_kreg kr_pcieq1serdesconfig0;
+       ipath_kreg kr_pcieq1serdesconfig1;
+       ipath_kreg kr_pcieq1serdesstatus;
+};
+
+struct ipath_cregs {
+       ipath_creg cr_badformatcnt;
+       ipath_creg cr_erricrccnt;
+       ipath_creg cr_errlinkcnt;
+       ipath_creg cr_errlpcrccnt;
+       ipath_creg cr_errpkey;
+       ipath_creg cr_errrcvflowctrlcnt;
+       ipath_creg cr_err_rlencnt;
+       ipath_creg cr_errslencnt;
+       ipath_creg cr_errtidfull;
+       ipath_creg cr_errtidvalid;
+       ipath_creg cr_errvcrccnt;
+       ipath_creg cr_ibstatuschange;
+       ipath_creg cr_intcnt;
+       ipath_creg cr_invalidrlencnt;
+       ipath_creg cr_invalidslencnt;
+       ipath_creg cr_lbflowstallcnt;
+       ipath_creg cr_iblinkdowncnt;
+       ipath_creg cr_iblinkerrrecovcnt;
+       ipath_creg cr_ibsymbolerrcnt;
+       ipath_creg cr_pktrcvcnt;
+       ipath_creg cr_pktrcvflowctrlcnt;
+       ipath_creg cr_pktsendcnt;
+       ipath_creg cr_pktsendflowcnt;
+       ipath_creg cr_portovflcnt;
+       ipath_creg cr_rcvebpcnt;
+       ipath_creg cr_rcvovflcnt;
+       ipath_creg cr_rxdroppktcnt;
+       ipath_creg cr_senddropped;
+       ipath_creg cr_sendstallcnt;
+       ipath_creg cr_sendunderruncnt;
+       ipath_creg cr_unsupvlcnt;
+       ipath_creg cr_wordrcvcnt;
+       ipath_creg cr_wordsendcnt;
+};
+
+#endif                         /* _IPATH_REGISTERS_H */
diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/infiniband/hw/ipath/ipath_ruc.c
new file mode 100644 (file)
index 0000000..f232e77
--- /dev/null
@@ -0,0 +1,552 @@
+/*
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ipath_verbs.h"
+
+/*
+ * Convert the AETH RNR timeout code into the number of milliseconds.
+ */
+const u32 ib_ipath_rnr_table[32] = {
+       656,                    /* 0 */
+       1,                      /* 1 */
+       1,                      /* 2 */
+       1,                      /* 3 */
+       1,                      /* 4 */
+       1,                      /* 5 */
+       1,                      /* 6 */
+       1,                      /* 7 */
+       1,                      /* 8 */
+       1,                      /* 9 */
+       1,                      /* A */
+       1,                      /* B */
+       1,                      /* C */
+       1,                      /* D */
+       2,                      /* E */
+       2,                      /* F */
+       3,                      /* 10 */
+       4,                      /* 11 */
+       6,                      /* 12 */
+       8,                      /* 13 */
+       11,                     /* 14 */
+       16,                     /* 15 */
+       21,                     /* 16 */
+       31,                     /* 17 */
+       41,                     /* 18 */
+       62,                     /* 19 */
+       82,                     /* 1A */
+       123,                    /* 1B */
+       164,                    /* 1C */
+       246,                    /* 1D */
+       328,                    /* 1E */
+       492                     /* 1F */
+};
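+
+/*
+ * For example, an AETH RNR NAK timer code of 0x10 indexes the entry
+ * commented "10" above and so corresponds to a 3 millisecond delay.
+ */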
+
+/**
+ * ipath_insert_rnr_queue - put QP on the RNR timeout list for the device
+ * @qp: the QP
+ *
+ * XXX Use a simple list for now.  We might need a priority
+ * queue if we have lots of QPs waiting for RNR timeouts
+ * but that should be rare.
+ */
+void ipath_insert_rnr_queue(struct ipath_qp *qp)
+{
+       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+       unsigned long flags;
+
+       spin_lock_irqsave(&dev->pending_lock, flags);
+       if (list_empty(&dev->rnrwait))
+               list_add(&qp->timerwait, &dev->rnrwait);
+       else {
+               struct list_head *l = &dev->rnrwait;
+               struct ipath_qp *nqp = list_entry(l->next, struct ipath_qp,
+                                                 timerwait);
+
+               while (qp->s_rnr_timeout >= nqp->s_rnr_timeout) {
+                       qp->s_rnr_timeout -= nqp->s_rnr_timeout;
+                       l = l->next;
+                       if (l->next == &dev->rnrwait)
+                               break;
+                       nqp = list_entry(l->next, struct ipath_qp,
+                                        timerwait);
+               }
+               list_add(&qp->timerwait, l);
+       }
+       spin_unlock_irqrestore(&dev->pending_lock, flags);
+}
+
+/**
+ * ipath_get_rwqe - copy the next RWQE into the QP's RWQE
+ * @qp: the QP
+ * @wr_id_only: update wr_id only, not SGEs
+ *
+ * Return 0 if no RWQE is available, otherwise return 1.
+ *
+ * Called at interrupt level with the QP r_rq.lock held.
+ */
+int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
+{
+       struct ipath_rq *rq;
+       struct ipath_srq *srq;
+       struct ipath_rwqe *wqe;
+       int ret;
+
+       if (!qp->ibqp.srq) {
+               rq = &qp->r_rq;
+               if (unlikely(rq->tail == rq->head)) {
+                       ret = 0;
+                       goto bail;
+               }
+               wqe = get_rwqe_ptr(rq, rq->tail);
+               qp->r_wr_id = wqe->wr_id;
+               if (!wr_id_only) {
+                       qp->r_sge.sge = wqe->sg_list[0];
+                       qp->r_sge.sg_list = wqe->sg_list + 1;
+                       qp->r_sge.num_sge = wqe->num_sge;
+                       qp->r_len = wqe->length;
+               }
+               if (++rq->tail >= rq->size)
+                       rq->tail = 0;
+               ret = 1;
+               goto bail;
+       }
+
+       srq = to_isrq(qp->ibqp.srq);
+       rq = &srq->rq;
+       spin_lock(&rq->lock);
+       if (unlikely(rq->tail == rq->head)) {
+               spin_unlock(&rq->lock);
+               ret = 0;
+               goto bail;
+       }
+       wqe = get_rwqe_ptr(rq, rq->tail);
+       qp->r_wr_id = wqe->wr_id;
+       if (!wr_id_only) {
+               qp->r_sge.sge = wqe->sg_list[0];
+               qp->r_sge.sg_list = wqe->sg_list + 1;
+               qp->r_sge.num_sge = wqe->num_sge;
+               qp->r_len = wqe->length;
+       }
+       if (++rq->tail >= rq->size)
+               rq->tail = 0;
+       if (srq->ibsrq.event_handler) {
+               struct ib_event ev;
+               u32 n;
+
+               if (rq->head < rq->tail)
+                       n = rq->size + rq->head - rq->tail;
+               else
+                       n = rq->head - rq->tail;
+               if (n < srq->limit) {
+                       srq->limit = 0;
+                       spin_unlock(&rq->lock);
+                       ev.device = qp->ibqp.device;
+                       ev.element.srq = qp->ibqp.srq;
+                       ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+                       srq->ibsrq.event_handler(&ev,
+                                                srq->ibsrq.srq_context);
+               } else
+                       spin_unlock(&rq->lock);
+       } else
+               spin_unlock(&rq->lock);
+       ret = 1;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_ruc_loopback - handle UC and RC loopback requests
+ * @sqp: the loopback QP
+ * @wc: the work completion entry
+ *
+ * This is called from ipath_do_uc_send() or ipath_do_rc_send() to
+ * forward a WQE addressed to the same HCA.
+ * Note that although we are single threaded due to the tasklet, we still
+ * have to protect against post_send().  We don't have to worry about
+ * receive interrupts since this is a connected protocol and all packets
+ * will pass through here.
+ */
+void ipath_ruc_loopback(struct ipath_qp *sqp, struct ib_wc *wc)
+{
+       struct ipath_ibdev *dev = to_idev(sqp->ibqp.device);
+       struct ipath_qp *qp;
+       struct ipath_swqe *wqe;
+       struct ipath_sge *sge;
+       unsigned long flags;
+       u64 sdata;
+
+       qp = ipath_lookup_qpn(&dev->qp_table, sqp->remote_qpn);
+       if (!qp) {
+               dev->n_pkt_drops++;
+               return;
+       }
+
+again:
+       spin_lock_irqsave(&sqp->s_lock, flags);
+
+       if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_SEND_OK)) {
+               spin_unlock_irqrestore(&sqp->s_lock, flags);
+               goto done;
+       }
+
+       /* Get the next send request. */
+       if (sqp->s_last == sqp->s_head) {
+               /* Send work queue is empty. */
+               spin_unlock_irqrestore(&sqp->s_lock, flags);
+               goto done;
+       }
+
+       /*
+        * We can rely on the entry not changing without the s_lock
+        * being held until we update s_last.
+        */
+       wqe = get_swqe_ptr(sqp, sqp->s_last);
+       spin_unlock_irqrestore(&sqp->s_lock, flags);
+
+       wc->wc_flags = 0;
+       wc->imm_data = 0;
+
+       sqp->s_sge.sge = wqe->sg_list[0];
+       sqp->s_sge.sg_list = wqe->sg_list + 1;
+       sqp->s_sge.num_sge = wqe->wr.num_sge;
+       sqp->s_len = wqe->length;
+       switch (wqe->wr.opcode) {
+       case IB_WR_SEND_WITH_IMM:
+               wc->wc_flags = IB_WC_WITH_IMM;
+               wc->imm_data = wqe->wr.imm_data;
+               /* FALLTHROUGH */
+       case IB_WR_SEND:
+               spin_lock_irqsave(&qp->r_rq.lock, flags);
+               if (!ipath_get_rwqe(qp, 0)) {
+               rnr_nak:
+                       spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+                       /* Handle RNR NAK */
+                       if (qp->ibqp.qp_type == IB_QPT_UC)
+                               goto send_comp;
+                       if (sqp->s_rnr_retry == 0) {
+                               wc->status = IB_WC_RNR_RETRY_EXC_ERR;
+                               goto err;
+                       }
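+                       /* an RNR retry count of 7 means "retry forever" */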
+                       if (sqp->s_rnr_retry_cnt < 7)
+                               sqp->s_rnr_retry--;
+                       dev->n_rnr_naks++;
+                       sqp->s_rnr_timeout =
+                               ib_ipath_rnr_table[sqp->s_min_rnr_timer];
+                       ipath_insert_rnr_queue(sqp);
+                       goto done;
+               }
+               spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+               break;
+
+       case IB_WR_RDMA_WRITE_WITH_IMM:
+               wc->wc_flags = IB_WC_WITH_IMM;
+               wc->imm_data = wqe->wr.imm_data;
+               spin_lock_irqsave(&qp->r_rq.lock, flags);
+               if (!ipath_get_rwqe(qp, 1))
+                       goto rnr_nak;
+               spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+               /* FALLTHROUGH */
+       case IB_WR_RDMA_WRITE:
+               if (wqe->length == 0)
+                       break;
+               if (unlikely(!ipath_rkey_ok(dev, &qp->r_sge, wqe->length,
+                                           wqe->wr.wr.rdma.remote_addr,
+                                           wqe->wr.wr.rdma.rkey,
+                                           IB_ACCESS_REMOTE_WRITE))) {
+               acc_err:
+                       wc->status = IB_WC_REM_ACCESS_ERR;
+               err:
+                       wc->wr_id = wqe->wr.wr_id;
+                       wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
+                       wc->vendor_err = 0;
+                       wc->byte_len = 0;
+                       wc->qp_num = sqp->ibqp.qp_num;
+                       wc->src_qp = sqp->remote_qpn;
+                       wc->pkey_index = 0;
+                       wc->slid = sqp->remote_ah_attr.dlid;
+                       wc->sl = sqp->remote_ah_attr.sl;
+                       wc->dlid_path_bits = 0;
+                       wc->port_num = 0;
+                       ipath_sqerror_qp(sqp, wc);
+                       goto done;
+               }
+               break;
+
+       case IB_WR_RDMA_READ:
+               if (unlikely(!ipath_rkey_ok(dev, &sqp->s_sge, wqe->length,
+                                           wqe->wr.wr.rdma.remote_addr,
+                                           wqe->wr.wr.rdma.rkey,
+                                           IB_ACCESS_REMOTE_READ)))
+                       goto acc_err;
+               if (unlikely(!(qp->qp_access_flags &
+                              IB_ACCESS_REMOTE_READ)))
+                       goto acc_err;
+               qp->r_sge.sge = wqe->sg_list[0];
+               qp->r_sge.sg_list = wqe->sg_list + 1;
+               qp->r_sge.num_sge = wqe->wr.num_sge;
+               break;
+
+       case IB_WR_ATOMIC_CMP_AND_SWP:
+       case IB_WR_ATOMIC_FETCH_AND_ADD:
+               if (unlikely(!ipath_rkey_ok(dev, &qp->r_sge, sizeof(u64),
+                                           wqe->wr.wr.rdma.remote_addr,
+                                           wqe->wr.wr.rdma.rkey,
+                                           IB_ACCESS_REMOTE_ATOMIC)))
+                       goto acc_err;
+               /* Perform atomic OP and save result. */
+               sdata = wqe->wr.wr.atomic.swap;
+               spin_lock_irqsave(&dev->pending_lock, flags);
+               qp->r_atomic_data = *(u64 *) qp->r_sge.sge.vaddr;
+               if (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
+                       *(u64 *) qp->r_sge.sge.vaddr =
+                               qp->r_atomic_data + sdata;
+               else if (qp->r_atomic_data == wqe->wr.wr.atomic.compare_add)
+                       *(u64 *) qp->r_sge.sge.vaddr = sdata;
+               spin_unlock_irqrestore(&dev->pending_lock, flags);
+               *(u64 *) sqp->s_sge.sge.vaddr = qp->r_atomic_data;
+               goto send_comp;
+
+       default:
+               goto done;
+       }
+
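+       /*
+        * Copy the data from the sender's SGE list into the receiver's
+        * buffer, advancing through the segments as they are consumed.
+        */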
+       sge = &sqp->s_sge.sge;
+       while (sqp->s_len) {
+               u32 len = sqp->s_len;
+
+               if (len > sge->length)
+                       len = sge->length;
+               BUG_ON(len == 0);
+               ipath_copy_sge(&qp->r_sge, sge->vaddr, len);
+               sge->vaddr += len;
+               sge->length -= len;
+               sge->sge_length -= len;
+               if (sge->sge_length == 0) {
+                       if (--sqp->s_sge.num_sge)
+                               *sge = *sqp->s_sge.sg_list++;
+               } else if (sge->length == 0 && sge->mr != NULL) {
+                       if (++sge->n >= IPATH_SEGSZ) {
+                               if (++sge->m >= sge->mr->mapsz)
+                                       break;
+                               sge->n = 0;
+                       }
+                       sge->vaddr =
+                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
+                       sge->length =
+                               sge->mr->map[sge->m]->segs[sge->n].length;
+               }
+               sqp->s_len -= len;
+       }
+
+       if (wqe->wr.opcode == IB_WR_RDMA_WRITE ||
+           wqe->wr.opcode == IB_WR_RDMA_READ)
+               goto send_comp;
+
+       if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+               wc->opcode = IB_WC_RECV_RDMA_WITH_IMM;
+       else
+               wc->opcode = IB_WC_RECV;
+       wc->wr_id = qp->r_wr_id;
+       wc->status = IB_WC_SUCCESS;
+       wc->vendor_err = 0;
+       wc->byte_len = wqe->length;
+       wc->qp_num = qp->ibqp.qp_num;
+       wc->src_qp = qp->remote_qpn;
+       /* XXX do we know which pkey matched? Only needed for GSI. */
+       wc->pkey_index = 0;
+       wc->slid = qp->remote_ah_attr.dlid;
+       wc->sl = qp->remote_ah_attr.sl;
+       wc->dlid_path_bits = 0;
+       /* Signal completion event if the solicited bit is set. */
+       ipath_cq_enter(to_icq(qp->ibqp.recv_cq), wc,
+                      wqe->wr.send_flags & IB_SEND_SOLICITED);
+
+send_comp:
+       sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
+
+       if (!test_bit(IPATH_S_SIGNAL_REQ_WR, &sqp->s_flags) ||
+           (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+               wc->wr_id = wqe->wr.wr_id;
+               wc->status = IB_WC_SUCCESS;
+               wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
+               wc->vendor_err = 0;
+               wc->byte_len = wqe->length;
+               wc->qp_num = sqp->ibqp.qp_num;
+               wc->src_qp = 0;
+               wc->pkey_index = 0;
+               wc->slid = 0;
+               wc->sl = 0;
+               wc->dlid_path_bits = 0;
+               wc->port_num = 0;
+               ipath_cq_enter(to_icq(sqp->ibqp.send_cq), wc, 0);
+       }
+
+       /* Update s_last now that we are finished with the SWQE */
+       spin_lock_irqsave(&sqp->s_lock, flags);
+       if (++sqp->s_last >= sqp->s_size)
+               sqp->s_last = 0;
+       spin_unlock_irqrestore(&sqp->s_lock, flags);
+       goto again;
+
+done:
+       if (atomic_dec_and_test(&qp->refcount))
+               wake_up(&qp->wait);
+}
+
+/**
+ * ipath_no_bufs_available - tell the layer driver we need buffers
+ * @qp: the QP that caused the problem
+ * @dev: the device we ran out of buffers on
+ *
+ * Called when we run out of PIO buffers.
+ */
+void ipath_no_bufs_available(struct ipath_qp *qp, struct ipath_ibdev *dev)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&dev->pending_lock, flags);
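+       /*
+        * list_del() poisons ->next, so only queue the QP if it is not
+        * already on the piowait list.
+        */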
+       if (qp->piowait.next == LIST_POISON1)
+               list_add_tail(&qp->piowait, &dev->piowait);
+       spin_unlock_irqrestore(&dev->pending_lock, flags);
+       /*
+        * Note that as soon as ipath_layer_want_buffer() is called and
+        * possibly before it returns, ipath_ib_piobufavail()
+        * could be called.  If we are still in the tasklet function,
+        * tasklet_hi_schedule() will not call us until the next time
+        * tasklet_hi_schedule() is called.
+        * We clear the tasklet flag now since we are committing to return
+        * from the tasklet function.
+        */
+       clear_bit(IPATH_S_BUSY, &qp->s_flags);
+       tasklet_unlock(&qp->s_task);
+       ipath_layer_want_buffer(dev->dd);
+       dev->n_piowait++;
+}
+
+/**
+ * ipath_post_rc_send - post RC and UC sends
+ * @qp: the QP to post on
+ * @wr: the work request to send
+ */
+int ipath_post_rc_send(struct ipath_qp *qp, struct ib_send_wr *wr)
+{
+       struct ipath_swqe *wqe;
+       unsigned long flags;
+       u32 next;
+       int i, j;
+       int acc;
+       int ret;
+
+       /*
+        * Don't allow RDMA reads or atomic operations on UC, and reject
+        * undefined operations.
+        * Make sure the buffer is large enough to hold the result for atomics.
+        */
+       if (qp->ibqp.qp_type == IB_QPT_UC) {
+               if ((unsigned) wr->opcode >= IB_WR_RDMA_READ) {
+                       ret = -EINVAL;
+                       goto bail;
+               }
+       } else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD) {
+               ret = -EINVAL;
+               goto bail;
+       } else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP &&
+                  (wr->num_sge == 0 ||
+                   wr->sg_list[0].length < sizeof(u64) ||
+                   wr->sg_list[0].addr & (sizeof(u64) - 1))) {
+               ret = -EINVAL;
+               goto bail;
+       }
+       /* IB spec says that num_sge == 0 is OK. */
+       if (wr->num_sge > qp->s_max_sge) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+       spin_lock_irqsave(&qp->s_lock, flags);
+       next = qp->s_head + 1;
+       if (next >= qp->s_size)
+               next = 0;
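+       /*
+        * The send queue is full when advancing the head would reach
+        * s_last; one slot is always left empty to tell full from empty.
+        */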
+       if (next == qp->s_last) {
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       wqe = get_swqe_ptr(qp, qp->s_head);
+       wqe->wr = *wr;
+       wqe->ssn = qp->s_ssn++;
+       wqe->sg_list[0].mr = NULL;
+       wqe->sg_list[0].vaddr = NULL;
+       wqe->sg_list[0].length = 0;
+       wqe->sg_list[0].sge_length = 0;
+       wqe->length = 0;
+       acc = wr->opcode >= IB_WR_RDMA_READ ? IB_ACCESS_LOCAL_WRITE : 0;
+       for (i = 0, j = 0; i < wr->num_sge; i++) {
+               if (to_ipd(qp->ibqp.pd)->user && wr->sg_list[i].lkey == 0) {
+                       spin_unlock_irqrestore(&qp->s_lock, flags);
+                       ret = -EINVAL;
+                       goto bail;
+               }
+               if (wr->sg_list[i].length == 0)
+                       continue;
+               if (!ipath_lkey_ok(&to_idev(qp->ibqp.device)->lk_table,
+                                  &wqe->sg_list[j], &wr->sg_list[i],
+                                  acc)) {
+                       spin_unlock_irqrestore(&qp->s_lock, flags);
+                       ret = -EINVAL;
+                       goto bail;
+               }
+               wqe->length += wr->sg_list[i].length;
+               j++;
+       }
+       wqe->wr.num_sge = j;
+       qp->s_head = next;
+       /*
+        * Wake up the send tasklet if the QP is not waiting
+        * for an RNR timeout.
+        */
+       next = qp->s_rnr_timeout;
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+
+       if (next == 0) {
+               if (qp->ibqp.qp_type == IB_QPT_UC)
+                       ipath_do_uc_send((unsigned long) qp);
+               else
+                       ipath_do_rc_send((unsigned long) qp);
+       }
+
+       ret = 0;
+
+bail:
+       return ret;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_srq.c b/drivers/infiniband/hw/ipath/ipath_srq.c
new file mode 100644 (file)
index 0000000..01c4c6c
--- /dev/null
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_verbs.h"
+
+/**
+ * ipath_post_srq_receive - post a receive on a shared receive queue
+ * @ibsrq: the SRQ to post the receive on
+ * @wr: the list of work requests to post
+ * @bad_wr: the first WR to cause a problem is put here
+ *
+ * This may be called from interrupt context.
+ */
+int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+                          struct ib_recv_wr **bad_wr)
+{
+       struct ipath_srq *srq = to_isrq(ibsrq);
+       struct ipath_ibdev *dev = to_idev(ibsrq->device);
+       unsigned long flags;
+       int ret;
+
+       for (; wr; wr = wr->next) {
+               struct ipath_rwqe *wqe;
+               u32 next;
+               int i, j;
+
+               if (wr->num_sge > srq->rq.max_sge) {
+                       *bad_wr = wr;
+                       ret = -ENOMEM;
+                       goto bail;
+               }
+
+               spin_lock_irqsave(&srq->rq.lock, flags);
+               next = srq->rq.head + 1;
+               if (next >= srq->rq.size)
+                       next = 0;
+               if (next == srq->rq.tail) {
+                       spin_unlock_irqrestore(&srq->rq.lock, flags);
+                       *bad_wr = wr;
+                       ret = -ENOMEM;
+                       goto bail;
+               }
+
+               wqe = get_rwqe_ptr(&srq->rq, srq->rq.head);
+               wqe->wr_id = wr->wr_id;
+               wqe->sg_list[0].mr = NULL;
+               wqe->sg_list[0].vaddr = NULL;
+               wqe->sg_list[0].length = 0;
+               wqe->sg_list[0].sge_length = 0;
+               wqe->length = 0;
+               for (i = 0, j = 0; i < wr->num_sge; i++) {
+                       /* Check LKEY */
+                       if (to_ipd(srq->ibsrq.pd)->user &&
+                           wr->sg_list[i].lkey == 0) {
+                               spin_unlock_irqrestore(&srq->rq.lock,
+                                                      flags);
+                               *bad_wr = wr;
+                               ret = -EINVAL;
+                               goto bail;
+                       }
+                       if (wr->sg_list[i].length == 0)
+                               continue;
+                       if (!ipath_lkey_ok(&dev->lk_table,
+                                          &wqe->sg_list[j],
+                                          &wr->sg_list[i],
+                                          IB_ACCESS_LOCAL_WRITE)) {
+                               spin_unlock_irqrestore(&srq->rq.lock,
+                                                      flags);
+                               *bad_wr = wr;
+                               ret = -EINVAL;
+                               goto bail;
+                       }
+                       wqe->length += wr->sg_list[i].length;
+                       j++;
+               }
+               wqe->num_sge = j;
+               srq->rq.head = next;
+               spin_unlock_irqrestore(&srq->rq.lock, flags);
+       }
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_create_srq - create a shared receive queue
+ * @ibpd: the protection domain of the SRQ to create
+ * @srq_init_attr: the attributes of the SRQ
+ * @udata: not used by the InfiniPath verbs driver
+ */
+struct ib_srq *ipath_create_srq(struct ib_pd *ibpd,
+                               struct ib_srq_init_attr *srq_init_attr,
+                               struct ib_udata *udata)
+{
+       struct ipath_srq *srq;
+       u32 sz;
+       struct ib_srq *ret;
+
+       if (srq_init_attr->attr.max_sge < 1) {
+               ret = ERR_PTR(-EINVAL);
+               goto bail;
+       }
+
+       srq = kmalloc(sizeof(*srq), GFP_KERNEL);
+       if (!srq) {
+               ret = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+
+       /*
+        * Need to use vmalloc() if we want to support large #s of entries.
+        */
+       srq->rq.size = srq_init_attr->attr.max_wr + 1;
+       sz = sizeof(struct ipath_sge) * srq_init_attr->attr.max_sge +
+               sizeof(struct ipath_rwqe);
+       srq->rq.wq = vmalloc(srq->rq.size * sz);
+       if (!srq->rq.wq) {
+               kfree(srq);
+               ret = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+
+       /*
+        * ib_create_srq() will initialize srq->ibsrq.
+        */
+       spin_lock_init(&srq->rq.lock);
+       srq->rq.head = 0;
+       srq->rq.tail = 0;
+       srq->rq.max_sge = srq_init_attr->attr.max_sge;
+       srq->limit = srq_init_attr->attr.srq_limit;
+
+       ret = &srq->ibsrq;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_modify_srq - modify a shared receive queue
+ * @ibsrq: the SRQ to modify
+ * @attr: the new attributes of the SRQ
+ * @attr_mask: indicates which attributes to modify
+ */
+int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+                    enum ib_srq_attr_mask attr_mask)
+{
+       struct ipath_srq *srq = to_isrq(ibsrq);
+       unsigned long flags;
+       int ret;
+
+       if (attr_mask & IB_SRQ_LIMIT) {
+               spin_lock_irqsave(&srq->rq.lock, flags);
+               srq->limit = attr->srq_limit;
+               spin_unlock_irqrestore(&srq->rq.lock, flags);
+       }
+       if (attr_mask & IB_SRQ_MAX_WR) {
+               u32 size = attr->max_wr + 1;
+               struct ipath_rwqe *wq, *p;
+               u32 n;
+               u32 sz;
+
+               if (attr->max_sge < srq->rq.max_sge) {
+                       ret = -EINVAL;
+                       goto bail;
+               }
+
+               sz = sizeof(struct ipath_rwqe) +
+                       attr->max_sge * sizeof(struct ipath_sge);
+               wq = vmalloc(size * sz);
+               if (!wq) {
+                       ret = -ENOMEM;
+                       goto bail;
+               }
+
+               spin_lock_irqsave(&srq->rq.lock, flags);
+               if (srq->rq.head < srq->rq.tail)
+                       n = srq->rq.size + srq->rq.head - srq->rq.tail;
+               else
+                       n = srq->rq.head - srq->rq.tail;
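+               /*
+                * n is the number of WQEs currently queued; the new ring
+                * must be able to hold them and must exceed the SRQ limit.
+                */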
+               if (size <= n || size <= srq->limit) {
+                       spin_unlock_irqrestore(&srq->rq.lock, flags);
+                       vfree(wq);
+                       ret = -EINVAL;
+                       goto bail;
+               }
+               n = 0;
+               p = wq;
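+               /* Copy the queued WQEs into the new ring, starting at index 0. */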
+               while (srq->rq.tail != srq->rq.head) {
+                       struct ipath_rwqe *wqe;
+                       int i;
+
+                       wqe = get_rwqe_ptr(&srq->rq, srq->rq.tail);
+                       p->wr_id = wqe->wr_id;
+                       p->length = wqe->length;
+                       p->num_sge = wqe->num_sge;
+                       for (i = 0; i < wqe->num_sge; i++)
+                               p->sg_list[i] = wqe->sg_list[i];
+                       n++;
+                       p = (struct ipath_rwqe *)((char *) p + sz);
+                       if (++srq->rq.tail >= srq->rq.size)
+                               srq->rq.tail = 0;
+               }
+               vfree(srq->rq.wq);
+               srq->rq.wq = wq;
+               srq->rq.size = size;
+               srq->rq.head = n;
+               srq->rq.tail = 0;
+               srq->rq.max_sge = attr->max_sge;
+               spin_unlock_irqrestore(&srq->rq.lock, flags);
+       }
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
+{
+       struct ipath_srq *srq = to_isrq(ibsrq);
+
+       attr->max_wr = srq->rq.size - 1;
+       attr->max_sge = srq->rq.max_sge;
+       attr->srq_limit = srq->limit;
+       return 0;
+}
+
+/**
+ * ipath_destroy_srq - destroy a shared receive queue
+ * @ibsrq: the SRQ to destroy
+ */
+int ipath_destroy_srq(struct ib_srq *ibsrq)
+{
+       struct ipath_srq *srq = to_isrq(ibsrq);
+
+       vfree(srq->rq.wq);
+       kfree(srq);
+
+       return 0;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_stats.c b/drivers/infiniband/hw/ipath/ipath_stats.c
new file mode 100644 (file)
index 0000000..fe20913
--- /dev/null
@@ -0,0 +1,303 @@
+/*
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+
+#include "ipath_kernel.h"
+
+struct infinipath_stats ipath_stats;
+
+/**
+ * ipath_snap_cntr - snapshot a chip counter
+ * @dd: the infinipath device
+ * @creg: the counter to snapshot
+ *
+ * Called from add_timer and from user counter read calls, to deal with
+ * counters that wrap in "human time".  The words sent and received, and
+ * the packets sent and received, are all that we worry about.  For now,
+ * at least, we don't worry about error counters, because if they wrap
+ * that quickly we probably don't care.  We may eventually just make this
+ * handle all the counters.  Word counters can wrap in about 20 seconds
+ * of full-bandwidth traffic, packet counters in a few hours.
+ */
+
+u64 ipath_snap_cntr(struct ipath_devdata *dd, ipath_creg creg)
+{
+       u32 val, reg64 = 0;
+       u64 val64;
+       unsigned long t0, t1;
+       u64 ret;
+
+       t0 = jiffies;
+       /* If fast increment counters are only 32 bits, snapshot them,
+        * and maintain them as 64bit values in the driver */
+       if (!(dd->ipath_flags & IPATH_32BITCOUNTERS) &&
+           (creg == dd->ipath_cregs->cr_wordsendcnt ||
+            creg == dd->ipath_cregs->cr_wordrcvcnt ||
+            creg == dd->ipath_cregs->cr_pktsendcnt ||
+            creg == dd->ipath_cregs->cr_pktrcvcnt)) {
+               val64 = ipath_read_creg(dd, creg);
+               val = val64 == ~0ULL ? ~0U : 0;
+               reg64 = 1;
+       } else                  /* val64 just to keep gcc quiet... */
+               val64 = val = ipath_read_creg32(dd, creg);
+       /*
+        * See if a second has passed.  This is just a way to detect things
+        * that are quite broken.  Normally this should take just a few
+        * cycles (the check is long enough that we don't care if we get
+        * preempted).  An Opteron HT I/O read timeout is 4 seconds with
+        * normal NB values.
+        */
+       t1 = jiffies;
+       if (time_before(t0 + HZ, t1) && val == -1) {
+               ipath_dev_err(dd, "Error!  Read counter 0x%x timed out\n",
+                             creg);
+               ret = 0ULL;
+               goto bail;
+       }
+       if (reg64) {
+               ret = val64;
+               goto bail;
+       }
+
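+       /*
+        * Fold the 32-bit hardware value into the driver's 64-bit shadow
+        * counter by adding only the delta since the last snapshot.
+        */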
+       if (creg == dd->ipath_cregs->cr_wordsendcnt) {
+               if (val != dd->ipath_lastsword) {
+                       dd->ipath_sword += val - dd->ipath_lastsword;
+                       dd->ipath_lastsword = val;
+               }
+               val64 = dd->ipath_sword;
+       } else if (creg == dd->ipath_cregs->cr_wordrcvcnt) {
+               if (val != dd->ipath_lastrword) {
+                       dd->ipath_rword += val - dd->ipath_lastrword;
+                       dd->ipath_lastrword = val;
+               }
+               val64 = dd->ipath_rword;
+       } else if (creg == dd->ipath_cregs->cr_pktsendcnt) {
+               if (val != dd->ipath_lastspkts) {
+                       dd->ipath_spkts += val - dd->ipath_lastspkts;
+                       dd->ipath_lastspkts = val;
+               }
+               val64 = dd->ipath_spkts;
+       } else if (creg == dd->ipath_cregs->cr_pktrcvcnt) {
+               if (val != dd->ipath_lastrpkts) {
+                       dd->ipath_rpkts += val - dd->ipath_lastrpkts;
+                       dd->ipath_lastrpkts = val;
+               }
+               val64 = dd->ipath_rpkts;
+       } else
+               val64 = (u64) val;
+
+       ret = val64;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_qcheck - print delta of egrfull/hdrqfull errors for kernel ports
+ * @dd: the infinipath device
+ *
+ * Print the delta of egrfull/hdrqfull errors for kernel ports no more often
+ * than every 5 seconds.  User process deltas are printed when they close, but
+ * the kernel never closes, so we do it here.  This is a separate routine so it
+ * may be called from other places someday, and so the function name is
+ * meaningful when printed by _IPATH_INFO.
+ */
+static void ipath_qcheck(struct ipath_devdata *dd)
+{
+       static u64 last_tot_hdrqfull;
+       size_t blen = 0;
+       char buf[128];
+
+       *buf = 0;
+       if (dd->ipath_pd[0]->port_hdrqfull != dd->ipath_p0_hdrqfull) {
+               blen = snprintf(buf, sizeof buf, "port 0 hdrqfull %u",
+                               dd->ipath_pd[0]->port_hdrqfull -
+                               dd->ipath_p0_hdrqfull);
+               dd->ipath_p0_hdrqfull = dd->ipath_pd[0]->port_hdrqfull;
+       }
+       if (ipath_stats.sps_etidfull != dd->ipath_last_tidfull) {
+               blen += snprintf(buf + blen, sizeof buf - blen,
+                                "%srcvegrfull %llu",
+                                blen ? ", " : "",
+                                (unsigned long long)
+                                (ipath_stats.sps_etidfull -
+                                 dd->ipath_last_tidfull));
+               dd->ipath_last_tidfull = ipath_stats.sps_etidfull;
+       }
+
+       /*
+        * this is actually the number of hdrq full interrupts, not actual
+        * events, but at the moment that's mostly what I'm interested in.
+        * Actual count, etc. is in the counters, if needed.  For production
+        * users this won't ordinarily be printed.
+        */
+
+       if ((ipath_debug & (__IPATH_PKTDBG | __IPATH_DBG)) &&
+           ipath_stats.sps_hdrqfull != last_tot_hdrqfull) {
+               blen += snprintf(buf + blen, sizeof buf - blen,
+                                "%shdrqfull %llu (all ports)",
+                                blen ? ", " : "",
+                                (unsigned long long)
+                                (ipath_stats.sps_hdrqfull -
+                                 last_tot_hdrqfull));
+               last_tot_hdrqfull = ipath_stats.sps_hdrqfull;
+       }
+       if (blen)
+               ipath_dbg("%s\n", buf);
+
+       if (dd->ipath_port0head != (u32)
+           le64_to_cpu(*dd->ipath_hdrqtailptr)) {
+               if (dd->ipath_lastport0rcv_cnt ==
+                   ipath_stats.sps_port0pkts) {
+                       ipath_cdbg(PKT, "missing rcv interrupts? "
+                                  "port0 hd=%llx tl=%x; port0pkts %llx\n",
+                                  (unsigned long long)
+                                  le64_to_cpu(*dd->ipath_hdrqtailptr),
+                                  dd->ipath_port0head,
+                                  (unsigned long long)
+                                  ipath_stats.sps_port0pkts);
+                       ipath_kreceive(dd);
+               }
+               dd->ipath_lastport0rcv_cnt = ipath_stats.sps_port0pkts;
+       }
+}
+
+/**
+ * ipath_get_faststats - get word counters from chip before they overflow
+ * @opaque: a pointer to the infinipath device's ipath_devdata
+ *
+ * called from add_timer
+ */
+void ipath_get_faststats(unsigned long opaque)
+{
+       struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
+       u32 val;
+       static unsigned cnt;
+
+       /*
+        * don't access the chip while running diags, or memory diags can
+        * fail
+        */
+       if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT) ||
+           ipath_diag_inuse)
+               /* but re-arm the timer for the diags case; it won't hurt otherwise */
+               goto done;
+
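+       /*
+        * Snapshot the fast-wrapping word and packet counters so their
+        * 64-bit shadow values stay current.
+        */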
+       if (dd->ipath_flags & IPATH_32BITCOUNTERS) {
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt);
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt);
+               ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt);
+       }
+
+       ipath_qcheck(dd);
+
+       /*
+        * deal with repeat error suppression.  Doesn't really matter if
+        * last error was almost a full interval ago, or just a few usecs
+        * ago; still won't get more than 2 per interval.  We may want
+        * longer intervals for this eventually, could do with mod, counter
+        * or separate timer.  Also see code in ipath_handle_errors() and
+        * ipath_handle_hwerrors().
+        */
+
+       if (dd->ipath_lasterror)
+               dd->ipath_lasterror = 0;
+       if (dd->ipath_lasthwerror)
+               dd->ipath_lasthwerror = 0;
+       if ((dd->ipath_maskederrs & ~dd->ipath_ignorederrs)
+           && time_after(jiffies, dd->ipath_unmasktime)) {
+               char ebuf[256];
+               ipath_decode_err(ebuf, sizeof ebuf,
+                                (dd->ipath_maskederrs & ~dd->
+                                 ipath_ignorederrs));
+               if ((dd->ipath_maskederrs & ~dd->ipath_ignorederrs) &
+                   ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL))
+                       ipath_dev_err(dd, "Re-enabling masked errors "
+                                     "(%s)\n", ebuf);
+               else {
+                       /*
+                        * rcvegrfull and rcvhdrqfull are "normal", for some
+                        * types of processes (mostly benchmarks) that send
+                        * huge numbers of messages, while not processing
+                        * them.  So only complain about these at debug
+                        * level.
+                        */
+                       ipath_dbg("Disabling frequent queue full errors "
+                                 "(%s)\n", ebuf);
+               }
+               dd->ipath_maskederrs = dd->ipath_ignorederrs;
+               ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+                                ~dd->ipath_maskederrs);
+       }
+
+       /* limit qfull messages to ~one per minute per port */
+       if ((++cnt & 0x10)) {
+               for (val = dd->ipath_cfgports - 1; ((int)val) >= 0;
+                    val--) {
+                       if (dd->ipath_lastegrheads[val] != -1)
+                               dd->ipath_lastegrheads[val] = -1;
+                       if (dd->ipath_lastrcvhdrqtails[val] != -1)
+                               dd->ipath_lastrcvhdrqtails[val] = -1;
+               }
+       }
+
+       if (dd->ipath_nosma_bufs) {
+               dd->ipath_nosma_secs += 5;
+               if (dd->ipath_nosma_secs >= 30) {
+                       ipath_cdbg(SMA, "No SMA bufs avail %u seconds; "
+                                  "cancelling pending sends\n",
+                                  dd->ipath_nosma_secs);
+                       /*
+                        * issue an abort as well, in case we have a packet
+                        * stuck in launch fifo.  This could corrupt an
+                        * outgoing user packet in the worst case,
+                        * but this situation is pretty catastrophic anyway.
+                        */
+                       ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+                                        INFINIPATH_S_ABORT);
+                       ipath_disarm_piobufs(dd, dd->ipath_lastport_piobuf,
+                                            dd->ipath_piobcnt2k +
+                                            dd->ipath_piobcnt4k -
+                                            dd->ipath_lastport_piobuf);
+                       /* start again, if necessary */
+                       dd->ipath_nosma_secs = 0;
+               } else
+                       ipath_cdbg(SMA, "No SMA bufs avail %u tries, "
+                                  "after %u seconds\n",
+                                  dd->ipath_nosma_bufs,
+                                  dd->ipath_nosma_secs);
+       }
+
+done:
+       mod_timer(&dd->ipath_stats_timer, jiffies + HZ * 5);
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_sysfs.c b/drivers/infiniband/hw/ipath/ipath_sysfs.c
new file mode 100644 (file)
index 0000000..32acd80
--- /dev/null
@@ -0,0 +1,778 @@
+/*
+ * Copyright (c) 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/ctype.h>
+#include <linux/pci.h>
+
+#include "ipath_kernel.h"
+#include "ips_common.h"
+#include "ipath_layer.h"
+
+/**
+ * ipath_parse_ushort - parse an unsigned short value in an arbitrary base
+ * @str: the string containing the number
+ * @valp: where to put the result
+ *
+ * Returns the number of bytes consumed, or a negative value on error.
+ */
+int ipath_parse_ushort(const char *str, unsigned short *valp)
+{
+       unsigned long val;
+       char *end;
+       int ret;
+
+       if (!isdigit(str[0])) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       val = simple_strtoul(str, &end, 0);
+
+       if (val > 0xffff) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       *valp = val;
+
+       ret = end + 1 - str;
+       if (ret == 0)
+               ret = -EINVAL;
+
+bail:
+       return ret;
+}
+
+static ssize_t show_version(struct device_driver *dev, char *buf)
+{
+       /* The string printed here is already newline-terminated. */
+       return scnprintf(buf, PAGE_SIZE, "%s", ipath_core_version);
+}
+
+static ssize_t show_num_units(struct device_driver *dev, char *buf)
+{
+       return scnprintf(buf, PAGE_SIZE, "%d\n",
+                        ipath_count_units(NULL, NULL, NULL));
+}
+
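+/*
+ * Generate a read-only sysfs attribute that reports one field of the
+ * global ipath_stats structure; the attributes are collected into the
+ * driver's "stats" group below.
+ */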
+#define DRIVER_STAT(name, attr) \
+       static ssize_t show_stat_##name(struct device_driver *dev, \
+                                       char *buf) \
+       { \
+               return scnprintf( \
+                       buf, PAGE_SIZE, "%llu\n", \
+                       (unsigned long long) ipath_stats.sps_ ##attr); \
+       } \
+       static DRIVER_ATTR(name, S_IRUGO, show_stat_##name, NULL)
+
+DRIVER_STAT(intrs, ints);
+DRIVER_STAT(err_intrs, errints);
+DRIVER_STAT(errs, errs);
+DRIVER_STAT(pkt_errs, pkterrs);
+DRIVER_STAT(crc_errs, crcerrs);
+DRIVER_STAT(hw_errs, hwerrs);
+DRIVER_STAT(ib_link, iblink);
+DRIVER_STAT(port0_pkts, port0pkts);
+DRIVER_STAT(ether_spkts, ether_spkts);
+DRIVER_STAT(ether_rpkts, ether_rpkts);
+DRIVER_STAT(sma_spkts, sma_spkts);
+DRIVER_STAT(sma_rpkts, sma_rpkts);
+DRIVER_STAT(hdrq_full, hdrqfull);
+DRIVER_STAT(etid_full, etidfull);
+DRIVER_STAT(no_piobufs, nopiobufs);
+DRIVER_STAT(ports, ports);
+DRIVER_STAT(pkey0, pkeys[0]);
+DRIVER_STAT(pkey1, pkeys[1]);
+DRIVER_STAT(pkey2, pkeys[2]);
+DRIVER_STAT(pkey3, pkeys[3]);
+/* XXX fix the following when dynamic table of devices used */
+DRIVER_STAT(lid0, lid[0]);
+DRIVER_STAT(lid1, lid[1]);
+DRIVER_STAT(lid2, lid[2]);
+DRIVER_STAT(lid3, lid[3]);
+
+DRIVER_STAT(nports, nports);
+DRIVER_STAT(null_intr, nullintr);
+DRIVER_STAT(max_pkts_call, maxpkts_call);
+DRIVER_STAT(avg_pkts_call, avgpkts_call);
+DRIVER_STAT(page_locks, pagelocks);
+DRIVER_STAT(page_unlocks, pageunlocks);
+DRIVER_STAT(krdrops, krdrops);
+/* XXX fix the following when dynamic table of devices used */
+DRIVER_STAT(mlid0, mlid[0]);
+DRIVER_STAT(mlid1, mlid[1]);
+DRIVER_STAT(mlid2, mlid[2]);
+DRIVER_STAT(mlid3, mlid[3]);
+
+static struct attribute *driver_stat_attributes[] = {
+       &driver_attr_intrs.attr,
+       &driver_attr_err_intrs.attr,
+       &driver_attr_errs.attr,
+       &driver_attr_pkt_errs.attr,
+       &driver_attr_crc_errs.attr,
+       &driver_attr_hw_errs.attr,
+       &driver_attr_ib_link.attr,
+       &driver_attr_port0_pkts.attr,
+       &driver_attr_ether_spkts.attr,
+       &driver_attr_ether_rpkts.attr,
+       &driver_attr_sma_spkts.attr,
+       &driver_attr_sma_rpkts.attr,
+       &driver_attr_hdrq_full.attr,
+       &driver_attr_etid_full.attr,
+       &driver_attr_no_piobufs.attr,
+       &driver_attr_ports.attr,
+       &driver_attr_pkey0.attr,
+       &driver_attr_pkey1.attr,
+       &driver_attr_pkey2.attr,
+       &driver_attr_pkey3.attr,
+       &driver_attr_lid0.attr,
+       &driver_attr_lid1.attr,
+       &driver_attr_lid2.attr,
+       &driver_attr_lid3.attr,
+       &driver_attr_nports.attr,
+       &driver_attr_null_intr.attr,
+       &driver_attr_max_pkts_call.attr,
+       &driver_attr_avg_pkts_call.attr,
+       &driver_attr_page_locks.attr,
+       &driver_attr_page_unlocks.attr,
+       &driver_attr_krdrops.attr,
+       &driver_attr_mlid0.attr,
+       &driver_attr_mlid1.attr,
+       &driver_attr_mlid2.attr,
+       &driver_attr_mlid3.attr,
+       NULL
+};
+
+static struct attribute_group driver_stat_attr_group = {
+       .name = "stats",
+       .attrs = driver_stat_attributes
+};
+
+static ssize_t show_status(struct device *dev,
+                          struct device_attribute *attr,
+                          char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       ssize_t ret;
+
+       if (!dd->ipath_statusp) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       ret = scnprintf(buf, PAGE_SIZE, "0x%llx\n",
+                       (unsigned long long) *(dd->ipath_statusp));
+
+bail:
+       return ret;
+}
+
+static const char *ipath_status_str[] = {
+       "Initted",
+       "Disabled",
+       "Admin_Disabled",
+       "OIB_SMA",
+       "SMA",
+       "Present",
+       "IB_link_up",
+       "IB_configured",
+       "NoIBcable",
+       "Fatal_Hardware_Error",
+       NULL,
+};
+
+static ssize_t show_status_str(struct device *dev,
+                              struct device_attribute *attr,
+                              char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       int i, any;
+       u64 s;
+       ssize_t ret;
+
+       if (!dd->ipath_statusp) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       s = *(dd->ipath_statusp);
+       *buf = '\0';
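+       /* Walk the status bits, appending the name of each bit that is set. */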
+       for (any = i = 0; s && ipath_status_str[i]; i++) {
+               if (s & 1) {
+                       if (any && strlcat(buf, " ", PAGE_SIZE) >=
+                           PAGE_SIZE)
+                               /* overflow */
+                               break;
+                       if (strlcat(buf, ipath_status_str[i],
+                                   PAGE_SIZE) >= PAGE_SIZE)
+                               break;
+                       any = 1;
+               }
+               s >>= 1;
+       }
+       if (any)
+               strlcat(buf, "\n", PAGE_SIZE);
+
+       ret = strlen(buf);
+
+bail:
+       return ret;
+}
+
+static ssize_t show_boardversion(struct device *dev,
+                              struct device_attribute *attr,
+                              char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       /* The string printed here is already newline-terminated. */
+       return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_boardversion);
+}
+
+static ssize_t show_lid(struct device *dev,
+                       struct device_attribute *attr,
+                       char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+       return scnprintf(buf, PAGE_SIZE, "0x%x\n", dd->ipath_lid);
+}
+
+static ssize_t store_lid(struct device *dev,
+                        struct device_attribute *attr,
+                         const char *buf,
+                         size_t count)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       u16 lid;
+       int ret;
+
+       ret = ipath_parse_ushort(buf, &lid);
+       if (ret < 0)
+               goto invalid;
+
+       if (lid == 0 || lid >= 0xc000) {
+               ret = -EINVAL;
+               goto invalid;
+       }
+
+       ipath_set_sps_lid(dd, lid, 0);
+
+       goto bail;
+invalid:
+       ipath_dev_err(dd, "attempt to set invalid LID\n");
+bail:
+       return ret;
+}
+
+static ssize_t show_mlid(struct device *dev,
+                        struct device_attribute *attr,
+                        char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+       return scnprintf(buf, PAGE_SIZE, "0x%x\n", dd->ipath_mlid);
+}
+
+static ssize_t store_mlid(struct device *dev,
+                        struct device_attribute *attr,
+                         const char *buf,
+                         size_t count)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       int unit;
+       u16 mlid;
+       int ret;
+
+       ret = ipath_parse_ushort(buf, &mlid);
+       if (ret < 0)
+               goto invalid;
+
+       unit = dd->ipath_unit;
+
+       dd->ipath_mlid = mlid;
+       ipath_stats.sps_mlid[unit] = mlid;
+       ipath_layer_intr(dd, IPATH_LAYER_INT_BCAST);
+
+       goto bail;
+invalid:
+       ipath_dev_err(dd, "attempt to set invalid MLID\n");
+bail:
+       return ret;
+}
+
+static ssize_t show_guid(struct device *dev,
+                        struct device_attribute *attr,
+                        char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       u8 *guid;
+
+       guid = (u8 *) &dd->ipath_guid;
+
+       return scnprintf(buf, PAGE_SIZE,
+                        "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
+                        guid[0], guid[1], guid[2], guid[3],
+                        guid[4], guid[5], guid[6], guid[7]);
+}
+
+static ssize_t store_guid(struct device *dev,
+                        struct device_attribute *attr,
+                         const char *buf,
+                         size_t count)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       ssize_t ret;
+       unsigned short guid[8];
+       __be64 nguid;
+       u8 *ng;
+       int i;
+
+       if (sscanf(buf, "%hx:%hx:%hx:%hx:%hx:%hx:%hx:%hx",
+                  &guid[0], &guid[1], &guid[2], &guid[3],
+                  &guid[4], &guid[5], &guid[6], &guid[7]) != 8)
+               goto invalid;
+
+       ng = (u8 *) &nguid;
+
+       for (i = 0; i < 8; i++) {
+               if (guid[i] > 0xff)
+                       goto invalid;
+               ng[i] = guid[i];
+       }
+
+       dd->ipath_guid = nguid;
+       dd->ipath_nguid = 1;
+
+       ret = strlen(buf);
+       goto bail;
+
+invalid:
+       ipath_dev_err(dd, "attempt to set invalid GUID\n");
+       ret = -EINVAL;
+
+bail:
+       return ret;
+}
+
+static ssize_t show_nguid(struct device *dev,
+                         struct device_attribute *attr,
+                         char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+       return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_nguid);
+}
+
+static ssize_t show_serial(struct device *dev,
+                          struct device_attribute *attr,
+                          char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+       buf[sizeof dd->ipath_serial] = '\0';
+       memcpy(buf, dd->ipath_serial, sizeof dd->ipath_serial);
+       strcat(buf, "\n");
+       return strlen(buf);
+}
+
+static ssize_t show_unit(struct device *dev,
+                        struct device_attribute *attr,
+                        char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+       return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_unit);
+}
+
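+/*
+ * Generate a read-only per-device sysfs attribute that snapshots one chip
+ * counter, identified by its offset within struct infinipath_counters.
+ */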
+#define DEVICE_COUNTER(name, attr) \
+       static ssize_t show_counter_##name(struct device *dev, \
+                                          struct device_attribute *attr, \
+                                          char *buf) \
+       { \
+               struct ipath_devdata *dd = dev_get_drvdata(dev); \
+               return scnprintf(\
+                       buf, PAGE_SIZE, "%llu\n", (unsigned long long) \
+                       ipath_snap_cntr( \
+                               dd, offsetof(struct infinipath_counters, \
+                                            attr) / sizeof(u64)));     \
+       } \
+       static DEVICE_ATTR(name, S_IRUGO, show_counter_##name, NULL);
+
+DEVICE_COUNTER(ib_link_downeds, IBLinkDownedCnt);
+DEVICE_COUNTER(ib_link_err_recoveries, IBLinkErrRecoveryCnt);
+DEVICE_COUNTER(ib_status_changes, IBStatusChangeCnt);
+DEVICE_COUNTER(ib_symbol_errs, IBSymbolErrCnt);
+DEVICE_COUNTER(lb_flow_stalls, LBFlowStallCnt);
+DEVICE_COUNTER(lb_ints, LBIntCnt);
+DEVICE_COUNTER(rx_bad_formats, RxBadFormatCnt);
+DEVICE_COUNTER(rx_buf_ovfls, RxBufOvflCnt);
+DEVICE_COUNTER(rx_data_pkts, RxDataPktCnt);
+DEVICE_COUNTER(rx_dropped_pkts, RxDroppedPktCnt);
+DEVICE_COUNTER(rx_dwords, RxDwordCnt);
+DEVICE_COUNTER(rx_ebps, RxEBPCnt);
+DEVICE_COUNTER(rx_flow_ctrl_errs, RxFlowCtrlErrCnt);
+DEVICE_COUNTER(rx_flow_pkts, RxFlowPktCnt);
+DEVICE_COUNTER(rx_icrc_errs, RxICRCErrCnt);
+DEVICE_COUNTER(rx_len_errs, RxLenErrCnt);
+DEVICE_COUNTER(rx_link_problems, RxLinkProblemCnt);
+DEVICE_COUNTER(rx_lpcrc_errs, RxLPCRCErrCnt);
+DEVICE_COUNTER(rx_max_min_len_errs, RxMaxMinLenErrCnt);
+DEVICE_COUNTER(rx_p0_hdr_egr_ovfls, RxP0HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p1_hdr_egr_ovfls, RxP1HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p2_hdr_egr_ovfls, RxP2HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p3_hdr_egr_ovfls, RxP3HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p4_hdr_egr_ovfls, RxP4HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p5_hdr_egr_ovfls, RxP5HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p6_hdr_egr_ovfls, RxP6HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p7_hdr_egr_ovfls, RxP7HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p8_hdr_egr_ovfls, RxP8HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_pkey_mismatches, RxPKeyMismatchCnt);
+DEVICE_COUNTER(rx_tid_full_errs, RxTIDFullErrCnt);
+DEVICE_COUNTER(rx_tid_valid_errs, RxTIDValidErrCnt);
+DEVICE_COUNTER(rx_vcrc_errs, RxVCRCErrCnt);
+DEVICE_COUNTER(tx_data_pkts, TxDataPktCnt);
+DEVICE_COUNTER(tx_dropped_pkts, TxDroppedPktCnt);
+DEVICE_COUNTER(tx_dwords, TxDwordCnt);
+DEVICE_COUNTER(tx_flow_pkts, TxFlowPktCnt);
+DEVICE_COUNTER(tx_flow_stalls, TxFlowStallCnt);
+DEVICE_COUNTER(tx_len_errs, TxLenErrCnt);
+DEVICE_COUNTER(tx_max_min_len_errs, TxMaxMinLenErrCnt);
+DEVICE_COUNTER(tx_underruns, TxUnderrunCnt);
+DEVICE_COUNTER(tx_unsup_vl_errs, TxUnsupVLErrCnt);
+
+static struct attribute *dev_counter_attributes[] = {
+       &dev_attr_ib_link_downeds.attr,
+       &dev_attr_ib_link_err_recoveries.attr,
+       &dev_attr_ib_status_changes.attr,
+       &dev_attr_ib_symbol_errs.attr,
+       &dev_attr_lb_flow_stalls.attr,
+       &dev_attr_lb_ints.attr,
+       &dev_attr_rx_bad_formats.attr,
+       &dev_attr_rx_buf_ovfls.attr,
+       &dev_attr_rx_data_pkts.attr,
+       &dev_attr_rx_dropped_pkts.attr,
+       &dev_attr_rx_dwords.attr,
+       &dev_attr_rx_ebps.attr,
+       &dev_attr_rx_flow_ctrl_errs.attr,
+       &dev_attr_rx_flow_pkts.attr,
+       &dev_attr_rx_icrc_errs.attr,
+       &dev_attr_rx_len_errs.attr,
+       &dev_attr_rx_link_problems.attr,
+       &dev_attr_rx_lpcrc_errs.attr,
+       &dev_attr_rx_max_min_len_errs.attr,
+       &dev_attr_rx_p0_hdr_egr_ovfls.attr,
+       &dev_attr_rx_p1_hdr_egr_ovfls.attr,
+       &dev_attr_rx_p2_hdr_egr_ovfls.attr,
+       &dev_attr_rx_p3_hdr_egr_ovfls.attr,
+       &dev_attr_rx_p4_hdr_egr_ovfls.attr,
+       &dev_attr_rx_p5_hdr_egr_ovfls.attr,
+       &dev_attr_rx_p6_hdr_egr_ovfls.attr,
+       &dev_attr_rx_p7_hdr_egr_ovfls.attr,
+       &dev_attr_rx_p8_hdr_egr_ovfls.attr,
+       &dev_attr_rx_pkey_mismatches.attr,
+       &dev_attr_rx_tid_full_errs.attr,
+       &dev_attr_rx_tid_valid_errs.attr,
+       &dev_attr_rx_vcrc_errs.attr,
+       &dev_attr_tx_data_pkts.attr,
+       &dev_attr_tx_dropped_pkts.attr,
+       &dev_attr_tx_dwords.attr,
+       &dev_attr_tx_flow_pkts.attr,
+       &dev_attr_tx_flow_stalls.attr,
+       &dev_attr_tx_len_errs.attr,
+       &dev_attr_tx_max_min_len_errs.attr,
+       &dev_attr_tx_underruns.attr,
+       &dev_attr_tx_unsup_vl_errs.attr,
+       NULL
+};
+
+static struct attribute_group dev_counter_attr_group = {
+       .name = "counters",
+       .attrs = dev_counter_attributes
+};
+
+static ssize_t store_reset(struct device *dev,
+                        struct device_attribute *attr,
+                         const char *buf,
+                         size_t count)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       int ret;
+
+       if (count < 5 || memcmp(buf, "reset", 5)) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       if (dd->ipath_flags & IPATH_DISABLED) {
+               /*
+                * post-reset init would re-enable interrupts, etc.
+                * so don't allow reset on disabled devices.  Not a
+                * perfect error code, but about the best choice.
+                */
+               dev_info(dev, "Unit %d is disabled, can't reset\n",
+                        dd->ipath_unit);
+               ret = -EINVAL;
+               goto bail;
+       }
+       ret = ipath_reset_device(dd->ipath_unit);
+bail:
+       return ret < 0 ? ret : count;
+}
+
+static ssize_t store_link_state(struct device *dev,
+                        struct device_attribute *attr,
+                         const char *buf,
+                         size_t count)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       int ret, r;
+       u16 state;
+
+       ret = ipath_parse_ushort(buf, &state);
+       if (ret < 0)
+               goto invalid;
+
+       r = ipath_layer_set_linkstate(dd, state);
+       if (r < 0) {
+               ret = r;
+               goto bail;
+       }
+
+       goto bail;
+invalid:
+       ipath_dev_err(dd, "attempt to set invalid link state\n");
+bail:
+       return ret;
+}
+
+static ssize_t show_mtu(struct device *dev,
+                        struct device_attribute *attr,
+                        char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_ibmtu);
+}
+
+static ssize_t store_mtu(struct device *dev,
+                        struct device_attribute *attr,
+                         const char *buf,
+                         size_t count)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       ssize_t ret;
+       u16 mtu = 0;
+       int r;
+
+       ret = ipath_parse_ushort(buf, &mtu);
+       if (ret < 0)
+               goto invalid;
+
+       r = ipath_layer_set_mtu(dd, mtu);
+       if (r < 0)
+               ret = r;
+
+       goto bail;
+invalid:
+       ipath_dev_err(dd, "attempt to set invalid MTU\n");
+bail:
+       return ret;
+}
+
+static ssize_t show_enabled(struct device *dev,
+                        struct device_attribute *attr,
+                        char *buf)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       return scnprintf(buf, PAGE_SIZE, "%u\n",
+                        (dd->ipath_flags & IPATH_DISABLED) ? 0 : 1);
+}
+
+static ssize_t store_enabled(struct device *dev,
+                        struct device_attribute *attr,
+                         const char *buf,
+                         size_t count)
+{
+       struct ipath_devdata *dd = dev_get_drvdata(dev);
+       ssize_t ret;
+       u16 enable = 0;
+
+       ret = ipath_parse_ushort(buf, &enable);
+       if (ret < 0) {
+               ipath_dev_err(dd, "attempt to write a non-numeric value to enable\n");
+               goto bail;
+       }
+
+       if (enable) {
+               if (!(dd->ipath_flags & IPATH_DISABLED))
+                       goto bail;
+
+               dev_info(dev, "Enabling unit %d\n", dd->ipath_unit);
+               /* same as post-reset */
+               ret = ipath_init_chip(dd, 1);
+               if (ret)
+                       ipath_dev_err(dd, "Failed to enable unit %d\n",
+                                     dd->ipath_unit);
+               else {
+                       dd->ipath_flags &= ~IPATH_DISABLED;
+                       *dd->ipath_statusp &= ~IPATH_STATUS_ADMIN_DISABLED;
+               }
+       }
+       else if (!(dd->ipath_flags & IPATH_DISABLED)) {
+               dev_info(dev, "Disabling unit %d\n", dd->ipath_unit);
+               ipath_shutdown_device(dd);
+               dd->ipath_flags |= IPATH_DISABLED;
+               *dd->ipath_statusp |= IPATH_STATUS_ADMIN_DISABLED;
+       }
+
+bail:
+       return ret;
+}
+
+static DRIVER_ATTR(num_units, S_IRUGO, show_num_units, NULL);
+static DRIVER_ATTR(version, S_IRUGO, show_version, NULL);
+
+static struct attribute *driver_attributes[] = {
+       &driver_attr_num_units.attr,
+       &driver_attr_version.attr,
+       NULL
+};
+
+static struct attribute_group driver_attr_group = {
+       .attrs = driver_attributes
+};
+
+static DEVICE_ATTR(guid, S_IWUSR | S_IRUGO, show_guid, store_guid);
+static DEVICE_ATTR(lid, S_IWUSR | S_IRUGO, show_lid, store_lid);
+static DEVICE_ATTR(link_state, S_IWUSR, NULL, store_link_state);
+static DEVICE_ATTR(mlid, S_IWUSR | S_IRUGO, show_mlid, store_mlid);
+static DEVICE_ATTR(mtu, S_IWUSR | S_IRUGO, show_mtu, store_mtu);
+static DEVICE_ATTR(enabled, S_IWUSR | S_IRUGO, show_enabled, store_enabled);
+static DEVICE_ATTR(nguid, S_IRUGO, show_nguid, NULL);
+static DEVICE_ATTR(reset, S_IWUSR, NULL, store_reset);
+static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL);
+static DEVICE_ATTR(status, S_IRUGO, show_status, NULL);
+static DEVICE_ATTR(status_str, S_IRUGO, show_status_str, NULL);
+static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL);
+static DEVICE_ATTR(unit, S_IRUGO, show_unit, NULL);
+
+static struct attribute *dev_attributes[] = {
+       &dev_attr_guid.attr,
+       &dev_attr_lid.attr,
+       &dev_attr_link_state.attr,
+       &dev_attr_mlid.attr,
+       &dev_attr_mtu.attr,
+       &dev_attr_nguid.attr,
+       &dev_attr_serial.attr,
+       &dev_attr_status.attr,
+       &dev_attr_status_str.attr,
+       &dev_attr_boardversion.attr,
+       &dev_attr_unit.attr,
+       &dev_attr_enabled.attr,
+       NULL
+};
+
+static struct attribute_group dev_attr_group = {
+       .attrs = dev_attributes
+};
+
+/**
+ * ipath_expose_reset - create a device reset file
+ * @dev: the device structure
+ *
+ * Only expose a file that lets us reset the device after someone
+ * enters diag mode.  A device reset is quite likely to crash the
+ * machine entirely, so we don't normally want to make it
+ * available.
+ */
+int ipath_expose_reset(struct device *dev)
+{
+       return device_create_file(dev, &dev_attr_reset);
+}
+
+int ipath_driver_create_group(struct device_driver *drv)
+{
+       int ret;
+
+       ret = sysfs_create_group(&drv->kobj, &driver_attr_group);
+       if (ret)
+               goto bail;
+
+       ret = sysfs_create_group(&drv->kobj, &driver_stat_attr_group);
+       if (ret)
+               sysfs_remove_group(&drv->kobj, &driver_attr_group);
+
+bail:
+       return ret;
+}
+
+void ipath_driver_remove_group(struct device_driver *drv)
+{
+       sysfs_remove_group(&drv->kobj, &driver_stat_attr_group);
+       sysfs_remove_group(&drv->kobj, &driver_attr_group);
+}
+
+int ipath_device_create_group(struct device *dev, struct ipath_devdata *dd)
+{
+       int ret;
+       char unit[5];
+
+       ret = sysfs_create_group(&dev->kobj, &dev_attr_group);
+       if (ret)
+               goto bail;
+
+       ret = sysfs_create_group(&dev->kobj, &dev_counter_attr_group);
+       if (ret)
+               goto bail_attrs;
+
+       snprintf(unit, sizeof(unit), "%02d", dd->ipath_unit);
+       ret = sysfs_create_link(&dev->driver->kobj, &dev->kobj, unit);
+       if (ret == 0)
+               goto bail;
+
+       sysfs_remove_group(&dev->kobj, &dev_counter_attr_group);
+bail_attrs:
+       sysfs_remove_group(&dev->kobj, &dev_attr_group);
+bail:
+       return ret;
+}
+
+void ipath_device_remove_group(struct device *dev, struct ipath_devdata *dd)
+{
+       char unit[5];
+
+       snprintf(unit, sizeof(unit), "%02d", dd->ipath_unit);
+       sysfs_remove_link(&dev->driver->kobj, unit);
+
+       sysfs_remove_group(&dev->kobj, &dev_counter_attr_group);
+       sysfs_remove_group(&dev->kobj, &dev_attr_group);
+
+       device_remove_file(dev, &dev_attr_reset);
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_uc.c b/drivers/infiniband/hw/ipath/ipath_uc.c
new file mode 100644 (file)
index 0000000..0d6dbc0
--- /dev/null
@@ -0,0 +1,645 @@
+/*
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ipath_verbs.h"
+#include "ips_common.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_UC_##x
+
+static void complete_last_send(struct ipath_qp *qp, struct ipath_swqe *wqe,
+                              struct ib_wc *wc)
+{
+       if (++qp->s_last == qp->s_size)
+               qp->s_last = 0;
+       if (!test_bit(IPATH_S_SIGNAL_REQ_WR, &qp->s_flags) ||
+           (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+               wc->wr_id = wqe->wr.wr_id;
+               wc->status = IB_WC_SUCCESS;
+               wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
+               wc->vendor_err = 0;
+               wc->byte_len = wqe->length;
+               wc->qp_num = qp->ibqp.qp_num;
+               wc->src_qp = qp->remote_qpn;
+               wc->pkey_index = 0;
+               wc->slid = qp->remote_ah_attr.dlid;
+               wc->sl = qp->remote_ah_attr.sl;
+               wc->dlid_path_bits = 0;
+               wc->port_num = 0;
+               ipath_cq_enter(to_icq(qp->ibqp.send_cq), wc, 0);
+       }
+       wqe = get_swqe_ptr(qp, qp->s_last);
+}
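complete_last_send() above queues a completion only when the QP reports every work request, or when the individual WR carried the signaled flag. A minimal user-space sketch of that rule; the flag value and names are illustrative, not the kernel's:

#include <stdio.h>

#define SKETCH_SEND_SIGNALED 0x1	/* illustrative, not IB_SEND_SIGNALED's value */

static int wants_completion(int signal_all_wrs, unsigned int send_flags)
{
	return signal_all_wrs || (send_flags & SKETCH_SEND_SIGNALED);
}

int main(void)
{
	printf("%d %d %d\n",
	       wants_completion(1, 0),			/* QP signals every WR: 1 */
	       wants_completion(0, SKETCH_SEND_SIGNALED),	/* this WR asked: 1 */
	       wants_completion(0, 0));			/* neither: 0 */
	return 0;
}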
+
+/**
+ * ipath_do_uc_send - do a send on a UC queue
+ * @data: contains a pointer to the QP to send on
+ *
+ * Process entries in the send work queue until the queue is exhausted.
+ * Only allow one CPU to send a packet per QP (tasklet).
+ * Otherwise, after we drop the QP lock, two threads could send
+ * packets out of order.
+ * This is similar to ipath_do_rc_send() in ipath_rc.c, except we
+ * don't have timeouts or resends.
+ */
+void ipath_do_uc_send(unsigned long data)
+{
+       struct ipath_qp *qp = (struct ipath_qp *)data;
+       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+       struct ipath_swqe *wqe;
+       unsigned long flags;
+       u16 lrh0;
+       u32 hwords;
+       u32 nwords;
+       u32 extra_bytes;
+       u32 bth0;
+       u32 bth2;
+       u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+       u32 len;
+       struct ipath_other_headers *ohdr;
+       struct ib_wc wc;
+
+       if (test_and_set_bit(IPATH_S_BUSY, &qp->s_flags))
+               goto bail;
+
+       if (unlikely(qp->remote_ah_attr.dlid ==
+                    ipath_layer_get_lid(dev->dd))) {
+               /* Pass in an uninitialized ib_wc to save stack space. */
+               ipath_ruc_loopback(qp, &wc);
+               clear_bit(IPATH_S_BUSY, &qp->s_flags);
+               goto bail;
+       }
+
+       ohdr = &qp->s_hdr.u.oth;
+       if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+               ohdr = &qp->s_hdr.u.l.oth;
+
+again:
+       /* Check for a constructed packet to be sent. */
+       if (qp->s_hdrwords != 0) {
+               /*
+                * If no PIO bufs are available, return.
+                * An interrupt will call ipath_ib_piobufavail()
+                * when one is available.
+                */
+               if (ipath_verbs_send(dev->dd, qp->s_hdrwords,
+                                    (u32 *) &qp->s_hdr,
+                                    qp->s_cur_size,
+                                    qp->s_cur_sge)) {
+                       ipath_no_bufs_available(qp, dev);
+                       goto bail;
+               }
+               dev->n_unicast_xmit++;
+               /* Record that we sent the packet and s_hdr is empty. */
+               qp->s_hdrwords = 0;
+       }
+
+       lrh0 = IPS_LRH_BTH;
+       /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+       hwords = 5;
+
+       /*
+        * The lock is needed to synchronize between
+        * setting qp->s_ack_state and post_send().
+        */
+       spin_lock_irqsave(&qp->s_lock, flags);
+
+       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK))
+               goto done;
+
+       bth0 = ipath_layer_get_pkey(dev->dd, qp->s_pkey_index);
+
+       /* Send a request. */
+       wqe = get_swqe_ptr(qp, qp->s_last);
+       switch (qp->s_state) {
+       default:
+               /*
+                * Signal the completion of the last send (if there is
+                * one).
+                */
+               if (qp->s_last != qp->s_tail)
+                       complete_last_send(qp, wqe, &wc);
+
+               /* Check if send work queue is empty. */
+               if (qp->s_tail == qp->s_head)
+                       goto done;
+               /*
+                * Start a new request.
+                */
+               qp->s_psn = wqe->psn = qp->s_next_psn;
+               qp->s_sge.sge = wqe->sg_list[0];
+               qp->s_sge.sg_list = wqe->sg_list + 1;
+               qp->s_sge.num_sge = wqe->wr.num_sge;
+               qp->s_len = len = wqe->length;
+               switch (wqe->wr.opcode) {
+               case IB_WR_SEND:
+               case IB_WR_SEND_WITH_IMM:
+                       if (len > pmtu) {
+                               qp->s_state = OP(SEND_FIRST);
+                               len = pmtu;
+                               break;
+                       }
+                       if (wqe->wr.opcode == IB_WR_SEND)
+                               qp->s_state = OP(SEND_ONLY);
+                       else {
+                               qp->s_state =
+                                       OP(SEND_ONLY_WITH_IMMEDIATE);
+                               /* Immediate data comes after the BTH */
+                               ohdr->u.imm_data = wqe->wr.imm_data;
+                               hwords += 1;
+                       }
+                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                               bth0 |= 1 << 23;
+                       break;
+
+               case IB_WR_RDMA_WRITE:
+               case IB_WR_RDMA_WRITE_WITH_IMM:
+                       ohdr->u.rc.reth.vaddr =
+                               cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+                       ohdr->u.rc.reth.rkey =
+                               cpu_to_be32(wqe->wr.wr.rdma.rkey);
+                       ohdr->u.rc.reth.length = cpu_to_be32(len);
+                       hwords += sizeof(struct ib_reth) / 4;
+                       if (len > pmtu) {
+                               qp->s_state = OP(RDMA_WRITE_FIRST);
+                               len = pmtu;
+                               break;
+                       }
+                       if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+                               qp->s_state = OP(RDMA_WRITE_ONLY);
+                       else {
+                               qp->s_state =
+                                       OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+                               /* Immediate data comes after the RETH */
+                               ohdr->u.rc.imm_data = wqe->wr.imm_data;
+                               hwords += 1;
+                               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                                       bth0 |= 1 << 23;
+                       }
+                       break;
+
+               default:
+                       goto done;
+               }
+               if (++qp->s_tail >= qp->s_size)
+                       qp->s_tail = 0;
+               break;
+
+       case OP(SEND_FIRST):
+               qp->s_state = OP(SEND_MIDDLE);
+               /* FALLTHROUGH */
+       case OP(SEND_MIDDLE):
+               len = qp->s_len;
+               if (len > pmtu) {
+                       len = pmtu;
+                       break;
+               }
+               if (wqe->wr.opcode == IB_WR_SEND)
+                       qp->s_state = OP(SEND_LAST);
+               else {
+                       qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+                       /* Immediate data comes after the BTH */
+                       ohdr->u.imm_data = wqe->wr.imm_data;
+                       hwords += 1;
+               }
+               if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                       bth0 |= 1 << 23;
+               break;
+
+       case OP(RDMA_WRITE_FIRST):
+               qp->s_state = OP(RDMA_WRITE_MIDDLE);
+               /* FALLTHROUGH */
+       case OP(RDMA_WRITE_MIDDLE):
+               len = qp->s_len;
+               if (len > pmtu) {
+                       len = pmtu;
+                       break;
+               }
+               if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+                       qp->s_state = OP(RDMA_WRITE_LAST);
+               else {
+                       qp->s_state =
+                               OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+                       /* Immediate data comes after the BTH */
+                       ohdr->u.imm_data = wqe->wr.imm_data;
+                       hwords += 1;
+                       if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+                               bth0 |= 1 << 23;
+               }
+               break;
+       }
+       bth2 = qp->s_next_psn++ & IPS_PSN_MASK;
+       qp->s_len -= len;
+       bth0 |= qp->s_state << 24;
+
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+
+       /* Construct the header. */
+       extra_bytes = (4 - len) & 3;
+       nwords = (len + extra_bytes) >> 2;
+       if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+               /* Header size in 32-bit words. */
+               hwords += 10;
+               lrh0 = IPS_LRH_GRH;
+               qp->s_hdr.u.l.grh.version_tclass_flow =
+                       cpu_to_be32((6 << 28) |
+                                   (qp->remote_ah_attr.grh.traffic_class
+                                    << 20) |
+                                   qp->remote_ah_attr.grh.flow_label);
+               qp->s_hdr.u.l.grh.paylen =
+                       cpu_to_be16(((hwords - 12) + nwords +
+                                    SIZE_OF_CRC) << 2);
+               /* next_hdr is defined by C8-7 in ch. 8.4.1 */
+               qp->s_hdr.u.l.grh.next_hdr = 0x1B;
+               qp->s_hdr.u.l.grh.hop_limit =
+                       qp->remote_ah_attr.grh.hop_limit;
+               /* The SGID is 32-bit aligned. */
+               qp->s_hdr.u.l.grh.sgid.global.subnet_prefix =
+                       dev->gid_prefix;
+               qp->s_hdr.u.l.grh.sgid.global.interface_id =
+                       ipath_layer_get_guid(dev->dd);
+               qp->s_hdr.u.l.grh.dgid = qp->remote_ah_attr.grh.dgid;
+       }
+       qp->s_hdrwords = hwords;
+       qp->s_cur_sge = &qp->s_sge;
+       qp->s_cur_size = len;
+       lrh0 |= qp->remote_ah_attr.sl << 4;
+       qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
+       /* DEST LID */
+       qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+       qp->s_hdr.lrh[2] = cpu_to_be16(hwords + nwords + SIZE_OF_CRC);
+       qp->s_hdr.lrh[3] = cpu_to_be16(ipath_layer_get_lid(dev->dd));
+       bth0 |= extra_bytes << 20;
+       ohdr->bth[0] = cpu_to_be32(bth0);
+       ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
+       ohdr->bth[2] = cpu_to_be32(bth2);
+
+       /* Check for more work to do. */
+       goto again;
+
+done:
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+       clear_bit(IPATH_S_BUSY, &qp->s_flags);
+
+bail:
+       return;
+}
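The packet assembly above rounds the payload up to a 32-bit word, stores the pad count in BTH bits 21:20, and keeps the PSN inside a 24-bit window. A small stand-alone sketch of that arithmetic; SKETCH_PSN_MASK stands in for IPS_PSN_MASK, assumed here to be the usual 24-bit mask:

#include <stdio.h>
#include <stdint.h>

#define SKETCH_PSN_MASK 0xFFFFFF	/* assumed 24-bit PSN space */

int main(void)
{
	uint32_t len = 13;				/* payload bytes */
	uint32_t extra_bytes = (4 - len) & 3;		/* pad up to a word: 3 */
	uint32_t nwords = (len + extra_bytes) >> 2;	/* payload in 32-bit words: 4 */
	uint32_t bth0 = extra_bytes << 20;		/* pad count in BTH bits 21:20 */
	uint32_t psn = 0xFFFFFF;
	uint32_t bth2 = ++psn & SKETCH_PSN_MASK;	/* 24-bit PSN wraps to 0 */

	printf("pad=%u nwords=%u bth0=0x%08x bth2=0x%06x\n",
	       extra_bytes, nwords, bth0, bth2);
	return 0;
}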
+
+/**
+ * ipath_uc_rcv - handle an incoming UC packet
+ * @dev: the device the packet came in on
+ * @hdr: the header of the packet
+ * @has_grh: true if the packet has a GRH
+ * @data: the packet data
+ * @tlen: the length of the packet
+ * @qp: the QP for this packet.
+ *
+ * This is called from ipath_qp_rcv() to process an incoming UC packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+                 int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
+{
+       struct ipath_other_headers *ohdr;
+       int opcode;
+       u32 hdrsize;
+       u32 psn;
+       u32 pad;
+       unsigned long flags;
+       struct ib_wc wc;
+       u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+       struct ib_reth *reth;
+       int header_in_data;
+
+       /* Check for GRH */
+       if (!has_grh) {
+               ohdr = &hdr->u.oth;
+               hdrsize = 8 + 12;       /* LRH + BTH */
+               psn = be32_to_cpu(ohdr->bth[2]);
+               header_in_data = 0;
+       } else {
+               ohdr = &hdr->u.l.oth;
+               hdrsize = 8 + 40 + 12;  /* LRH + GRH + BTH */
+               /*
+                * The header with GRH is 60 bytes and the
+                * core driver sets the eager header buffer
+                * size to 56 bytes, so the last 4 bytes of
+                * the BTH (the PSN) are in the data buffer.
+                */
+               header_in_data =
+                       ipath_layer_get_rcvhdrentsize(dev->dd) == 16;
+               if (header_in_data) {
+                       psn = be32_to_cpu(((__be32 *) data)[0]);
+                       data += sizeof(__be32);
+               } else
+                       psn = be32_to_cpu(ohdr->bth[2]);
+       }
+       /*
+        * The opcode is in the low byte when it's in network order
+        * (top byte when in host order).
+        */
+       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+
+       wc.imm_data = 0;
+       wc.wc_flags = 0;
+
+       spin_lock_irqsave(&qp->r_rq.lock, flags);
+
+       /* Compare the PSN against the expected PSN. */
+       if (unlikely(ipath_cmp24(psn, qp->r_psn) != 0)) {
+               /*
+                * Handle a sequence error.
+                * Silently drop any current message.
+                */
+               qp->r_psn = psn;
+       inv:
+               qp->r_state = OP(SEND_LAST);
+               switch (opcode) {
+               case OP(SEND_FIRST):
+               case OP(SEND_ONLY):
+               case OP(SEND_ONLY_WITH_IMMEDIATE):
+                       goto send_first;
+
+               case OP(RDMA_WRITE_FIRST):
+               case OP(RDMA_WRITE_ONLY):
+               case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+                       goto rdma_first;
+
+               default:
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+       }
+
+       /* Check for opcode sequence errors. */
+       switch (qp->r_state) {
+       case OP(SEND_FIRST):
+       case OP(SEND_MIDDLE):
+               if (opcode == OP(SEND_MIDDLE) ||
+                   opcode == OP(SEND_LAST) ||
+                   opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+                       break;
+               goto inv;
+
+       case OP(RDMA_WRITE_FIRST):
+       case OP(RDMA_WRITE_MIDDLE):
+               if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+                   opcode == OP(RDMA_WRITE_LAST) ||
+                   opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+                       break;
+               goto inv;
+
+       default:
+               if (opcode == OP(SEND_FIRST) ||
+                   opcode == OP(SEND_ONLY) ||
+                   opcode == OP(SEND_ONLY_WITH_IMMEDIATE) ||
+                   opcode == OP(RDMA_WRITE_FIRST) ||
+                   opcode == OP(RDMA_WRITE_ONLY) ||
+                   opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+                       break;
+               goto inv;
+       }
+
+       /* OK, process the packet. */
+       switch (opcode) {
+       case OP(SEND_FIRST):
+       case OP(SEND_ONLY):
+       case OP(SEND_ONLY_WITH_IMMEDIATE):
+       send_first:
+               if (qp->r_reuse_sge) {
+                       qp->r_reuse_sge = 0;
+                       qp->r_sge = qp->s_rdma_sge;
+               } else if (!ipath_get_rwqe(qp, 0)) {
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+               /* Save the WQE so we can reuse it in case of an error. */
+               qp->s_rdma_sge = qp->r_sge;
+               qp->r_rcv_len = 0;
+               if (opcode == OP(SEND_ONLY))
+                       goto send_last;
+               else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE))
+                       goto send_last_imm;
+               /* FALLTHROUGH */
+       case OP(SEND_MIDDLE):
+               /* Check for an invalid length (not a full PMTU) or posted rwqe overrun. */
+               if (unlikely(tlen != (hdrsize + pmtu + 4))) {
+                       qp->r_reuse_sge = 1;
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+               qp->r_rcv_len += pmtu;
+               if (unlikely(qp->r_rcv_len > qp->r_len)) {
+                       qp->r_reuse_sge = 1;
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+               ipath_copy_sge(&qp->r_sge, data, pmtu);
+               break;
+
+       case OP(SEND_LAST_WITH_IMMEDIATE):
+       send_last_imm:
+               if (header_in_data) {
+                       wc.imm_data = *(__be32 *) data;
+                       data += sizeof(__be32);
+               } else {
+                       /* Immediate data comes after BTH */
+                       wc.imm_data = ohdr->u.imm_data;
+               }
+               hdrsize += 4;
+               wc.wc_flags = IB_WC_WITH_IMM;
+               /* FALLTHROUGH */
+       case OP(SEND_LAST):
+       send_last:
+               /* Get the number of bytes the message was padded by. */
+               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+               /* Check for invalid length. */
+               /* XXX LAST len should be >= 1 */
+               if (unlikely(tlen < (hdrsize + pad + 4))) {
+                       qp->r_reuse_sge = 1;
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+               /* Don't count the CRC. */
+               tlen -= (hdrsize + pad + 4);
+               wc.byte_len = tlen + qp->r_rcv_len;
+               if (unlikely(wc.byte_len > qp->r_len)) {
+                       qp->r_reuse_sge = 1;
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+               /* XXX Need to free SGEs */
+       last_imm:
+               ipath_copy_sge(&qp->r_sge, data, tlen);
+               wc.wr_id = qp->r_wr_id;
+               wc.status = IB_WC_SUCCESS;
+               wc.opcode = IB_WC_RECV;
+               wc.vendor_err = 0;
+               wc.qp_num = qp->ibqp.qp_num;
+               wc.src_qp = qp->remote_qpn;
+               wc.pkey_index = 0;
+               wc.slid = qp->remote_ah_attr.dlid;
+               wc.sl = qp->remote_ah_attr.sl;
+               wc.dlid_path_bits = 0;
+               wc.port_num = 0;
+               /* Signal completion event if the solicited bit is set. */
+               ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+                              (ohdr->bth[0] &
+                               __constant_cpu_to_be32(1 << 23)) != 0);
+               break;
+
+       case OP(RDMA_WRITE_FIRST):
+       case OP(RDMA_WRITE_ONLY):
+       case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */
+       rdma_first:
+               /* RETH comes after BTH */
+               if (!header_in_data)
+                       reth = &ohdr->u.rc.reth;
+               else {
+                       reth = (struct ib_reth *)data;
+                       data += sizeof(*reth);
+               }
+               hdrsize += sizeof(*reth);
+               qp->r_len = be32_to_cpu(reth->length);
+               qp->r_rcv_len = 0;
+               if (qp->r_len != 0) {
+                       u32 rkey = be32_to_cpu(reth->rkey);
+                       u64 vaddr = be64_to_cpu(reth->vaddr);
+
+                       /* Check rkey */
+                       if (unlikely(!ipath_rkey_ok(
+                                            dev, &qp->r_sge, qp->r_len,
+                                            vaddr, rkey,
+                                            IB_ACCESS_REMOTE_WRITE))) {
+                               dev->n_pkt_drops++;
+                               goto done;
+                       }
+               } else {
+                       qp->r_sge.sg_list = NULL;
+                       qp->r_sge.sge.mr = NULL;
+                       qp->r_sge.sge.vaddr = NULL;
+                       qp->r_sge.sge.length = 0;
+                       qp->r_sge.sge.sge_length = 0;
+               }
+               if (unlikely(!(qp->qp_access_flags &
+                              IB_ACCESS_REMOTE_WRITE))) {
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+               if (opcode == OP(RDMA_WRITE_ONLY))
+                       goto rdma_last;
+               else if (opcode ==
+                        OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+                       goto rdma_last_imm;
+               /* FALLTHROUGH */
+       case OP(RDMA_WRITE_MIDDLE):
+               /* Check for an invalid length (not a full PMTU) or posted rwqe overrun. */
+               if (unlikely(tlen != (hdrsize + pmtu + 4))) {
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+               qp->r_rcv_len += pmtu;
+               if (unlikely(qp->r_rcv_len > qp->r_len)) {
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+               ipath_copy_sge(&qp->r_sge, data, pmtu);
+               break;
+
+       case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+       rdma_last_imm:
+               /* Get the number of bytes the message was padded by. */
+               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+               /* Check for invalid length. */
+               /* XXX LAST len should be >= 1 */
+               if (unlikely(tlen < (hdrsize + pad + 4))) {
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+               /* Don't count the CRC. */
+               tlen -= (hdrsize + pad + 4);
+               if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) {
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+               if (qp->r_reuse_sge) {
+                       qp->r_reuse_sge = 0;
+               } else if (!ipath_get_rwqe(qp, 1)) {
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+               if (header_in_data) {
+                       wc.imm_data = *(__be32 *) data;
+                       data += sizeof(__be32);
+               } else {
+                       /* Immediate data comes after BTH */
+                       wc.imm_data = ohdr->u.imm_data;
+               }
+               hdrsize += 4;
+               wc.wc_flags = IB_WC_WITH_IMM;
+               wc.byte_len = 0;
+               goto last_imm;
+
+       case OP(RDMA_WRITE_LAST):
+       rdma_last:
+               /* Get the number of bytes the message was padded by. */
+               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+               /* Check for invalid length. */
+               /* XXX LAST len should be >= 1 */
+               if (unlikely(tlen < (hdrsize + pad + 4))) {
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+               /* Don't count the CRC. */
+               tlen -= (hdrsize + pad + 4);
+               if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) {
+                       dev->n_pkt_drops++;
+                       goto done;
+               }
+               ipath_copy_sge(&qp->r_sge, data, tlen);
+               break;
+
+       default:
+               /* Drop packet for unknown opcodes. */
+               spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+               dev->n_pkt_drops++;
+               goto bail;
+       }
+       qp->r_psn++;
+       qp->r_state = opcode;
+done:
+       spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+
+bail:
+       return;
+}
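The opcode state machine in this file splits a message larger than the path MTU into FIRST/MIDDLE/LAST packets on send and expects the same sequence on receive. A toy sketch of that segmentation; the payload and MTU values are made up:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t pmtu = 2048, len = 5000, off = 0;

	while (len > 0) {
		uint32_t chunk = len > pmtu ? pmtu : len;
		const char *kind = (off == 0)
			? (len > pmtu ? "SEND_FIRST" : "SEND_ONLY")
			: (len > pmtu ? "SEND_MIDDLE" : "SEND_LAST");

		printf("%-11s offset=%u bytes=%u\n", kind, off, chunk);
		off += chunk;
		len -= chunk;
	}
	return 0;
}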
diff --git a/drivers/infiniband/hw/ipath/ipath_ud.c b/drivers/infiniband/hw/ipath/ipath_ud.c
new file mode 100644 (file)
index 0000000..5ff3de6
--- /dev/null
@@ -0,0 +1,621 @@
+/*
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_smi.h>
+
+#include "ipath_verbs.h"
+#include "ips_common.h"
+
+/**
+ * ipath_ud_loopback - handle send on loopback QPs
+ * @sqp: the QP
+ * @ss: the SGE state
+ * @length: the length of the data to send
+ * @wr: the work request
+ * @wc: the work completion entry
+ *
+ * This is called from ipath_post_ud_send() to forward a WQE addressed
+ * to the same HCA.
+ */
+void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_sge_state *ss,
+                      u32 length, struct ib_send_wr *wr, struct ib_wc *wc)
+{
+       struct ipath_ibdev *dev = to_idev(sqp->ibqp.device);
+       struct ipath_qp *qp;
+       struct ib_ah_attr *ah_attr;
+       unsigned long flags;
+       struct ipath_rq *rq;
+       struct ipath_srq *srq;
+       struct ipath_sge_state rsge;
+       struct ipath_sge *sge;
+       struct ipath_rwqe *wqe;
+
+       qp = ipath_lookup_qpn(&dev->qp_table, wr->wr.ud.remote_qpn);
+       if (!qp)
+               return;
+
+       /*
+        * Check that the qkey matches (except for QP0, see 9.6.1.4.1).
+        * Qkeys with the high order bit set mean use the
+        * qkey from the QP context instead of the WR (see 10.2.5).
+        */
+       if (unlikely(qp->ibqp.qp_num &&
+                    ((int) wr->wr.ud.remote_qkey < 0
+                     ? qp->qkey : wr->wr.ud.remote_qkey) != qp->qkey)) {
+               /* XXX OK to lose a count once in a while. */
+               dev->qkey_violations++;
+               dev->n_pkt_drops++;
+               goto done;
+       }
+
+       /*
+        * A GRH is expected to precede the data even if not
+        * present on the wire.
+        */
+       wc->byte_len = length + sizeof(struct ib_grh);
+
+       if (wr->opcode == IB_WR_SEND_WITH_IMM) {
+               wc->wc_flags = IB_WC_WITH_IMM;
+               wc->imm_data = wr->imm_data;
+       } else {
+               wc->wc_flags = 0;
+               wc->imm_data = 0;
+       }
+
+       /*
+        * Get the next work request entry to find where to put the data.
+        * Note that it is safe to drop the lock after changing rq->tail
+        * since ipath_post_receive() won't fill the empty slot.
+        */
+       if (qp->ibqp.srq) {
+               srq = to_isrq(qp->ibqp.srq);
+               rq = &srq->rq;
+       } else {
+               srq = NULL;
+               rq = &qp->r_rq;
+       }
+       spin_lock_irqsave(&rq->lock, flags);
+       if (rq->tail == rq->head) {
+               spin_unlock_irqrestore(&rq->lock, flags);
+               dev->n_pkt_drops++;
+               goto done;
+       }
+       /* Silently drop packets which are too big. */
+       wqe = get_rwqe_ptr(rq, rq->tail);
+       if (wc->byte_len > wqe->length) {
+               spin_unlock_irqrestore(&rq->lock, flags);
+               dev->n_pkt_drops++;
+               goto done;
+       }
+       wc->wr_id = wqe->wr_id;
+       rsge.sge = wqe->sg_list[0];
+       rsge.sg_list = wqe->sg_list + 1;
+       rsge.num_sge = wqe->num_sge;
+       if (++rq->tail >= rq->size)
+               rq->tail = 0;
+       if (srq && srq->ibsrq.event_handler) {
+               u32 n;
+
+               if (rq->head < rq->tail)
+                       n = rq->size + rq->head - rq->tail;
+               else
+                       n = rq->head - rq->tail;
+               if (n < srq->limit) {
+                       struct ib_event ev;
+
+                       srq->limit = 0;
+                       spin_unlock_irqrestore(&rq->lock, flags);
+                       ev.device = qp->ibqp.device;
+                       ev.element.srq = qp->ibqp.srq;
+                       ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+                       srq->ibsrq.event_handler(&ev,
+                                                srq->ibsrq.srq_context);
+               } else
+                       spin_unlock_irqrestore(&rq->lock, flags);
+       } else
+               spin_unlock_irqrestore(&rq->lock, flags);
+       ah_attr = &to_iah(wr->wr.ud.ah)->attr;
+       if (ah_attr->ah_flags & IB_AH_GRH) {
+               ipath_copy_sge(&rsge, &ah_attr->grh, sizeof(struct ib_grh));
+               wc->wc_flags |= IB_WC_GRH;
+       } else
+               ipath_skip_sge(&rsge, sizeof(struct ib_grh));
+       sge = &ss->sge;
+       while (length) {
+               u32 len = sge->length;
+
+               if (len > length)
+                       len = length;
+               BUG_ON(len == 0);
+               ipath_copy_sge(&rsge, sge->vaddr, len);
+               sge->vaddr += len;
+               sge->length -= len;
+               sge->sge_length -= len;
+               if (sge->sge_length == 0) {
+                       if (--ss->num_sge)
+                               *sge = *ss->sg_list++;
+               } else if (sge->length == 0 && sge->mr != NULL) {
+                       if (++sge->n >= IPATH_SEGSZ) {
+                               if (++sge->m >= sge->mr->mapsz)
+                                       break;
+                               sge->n = 0;
+                       }
+                       sge->vaddr =
+                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
+                       sge->length =
+                               sge->mr->map[sge->m]->segs[sge->n].length;
+               }
+               length -= len;
+       }
+       wc->status = IB_WC_SUCCESS;
+       wc->opcode = IB_WC_RECV;
+       wc->vendor_err = 0;
+       wc->qp_num = qp->ibqp.qp_num;
+       wc->src_qp = sqp->ibqp.qp_num;
+       /* XXX do we know which pkey matched? Only needed for GSI. */
+       wc->pkey_index = 0;
+       wc->slid = ipath_layer_get_lid(dev->dd) |
+               (ah_attr->src_path_bits &
+                ((1 << (dev->mkeyprot_resv_lmc & 7)) - 1));
+       wc->sl = ah_attr->sl;
+       wc->dlid_path_bits =
+               ah_attr->dlid & ((1 << (dev->mkeyprot_resv_lmc & 7)) - 1);
+       /* Signal completion event if the solicited bit is set. */
+       ipath_cq_enter(to_icq(qp->ibqp.recv_cq), wc,
+                      wr->send_flags & IB_SEND_SOLICITED);
+
+done:
+       if (atomic_dec_and_test(&qp->refcount))
+               wake_up(&qp->wait);
+}
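The SRQ limit test above needs the number of receive WQEs still posted in the circular queue after the tail advances. The same head/tail arithmetic, pulled out into a stand-alone sketch with made-up names:

#include <stdio.h>
#include <stdint.h>

static uint32_t rq_count(uint32_t head, uint32_t tail, uint32_t size)
{
	return head < tail ? size + head - tail : head - tail;
}

int main(void)
{
	printf("%u\n", rq_count(6, 2, 8));	/* no wrap: 4 WQEs posted */
	printf("%u\n", rq_count(2, 6, 8));	/* head wrapped past tail: still 4 */
	return 0;
}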
+
+/**
+ * ipath_post_ud_send - post a UD send on QP
+ * @qp: the QP
+ * @wr: the work request
+ *
+ * Note that we actually send the data as it is posted instead of putting
+ * the request into a ring buffer.  If we wanted to use a ring buffer,
+ * we would need to save a reference to the destination address in the SWQE.
+ */
+int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr)
+{
+       struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+       struct ipath_other_headers *ohdr;
+       struct ib_ah_attr *ah_attr;
+       struct ipath_sge_state ss;
+       struct ipath_sge *sg_list = NULL;
+       struct ib_wc wc;
+       u32 hwords;
+       u32 nwords;
+       u32 len;
+       u32 extra_bytes;
+       u32 bth0;
+       u16 lrh0;
+       u16 lid;
+       int i;
+       int ret;
+
+       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) {
+               ret = 0;
+               goto bail;
+       }
+
+       /* IB spec says that num_sge == 0 is OK. */
+       if (wr->num_sge > qp->s_max_sge) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       if (wr->num_sge > 1) {
+               sg_list = kmalloc((qp->s_max_sge - 1) * sizeof(*sg_list),
+                                 GFP_ATOMIC);
+               if (!sg_list) {
+                       ret = -ENOMEM;
+                       goto bail;
+               }
+       } else
+               sg_list = NULL;
+
+       /* Check the buffer to send. */
+       ss.sg_list = sg_list;
+       ss.sge.mr = NULL;
+       ss.sge.vaddr = NULL;
+       ss.sge.length = 0;
+       ss.sge.sge_length = 0;
+       ss.num_sge = 0;
+       len = 0;
+       for (i = 0; i < wr->num_sge; i++) {
+               /* Check LKEY */
+               if (to_ipd(qp->ibqp.pd)->user && wr->sg_list[i].lkey == 0) {
+                       ret = -EINVAL;
+                       goto bail;
+               }
+
+               if (wr->sg_list[i].length == 0)
+                       continue;
+               if (!ipath_lkey_ok(&dev->lk_table, ss.num_sge ?
+                                  sg_list + ss.num_sge - 1 : &ss.sge,
+                                  &wr->sg_list[i], 0)) {
+                       ret = -EINVAL;
+                       goto bail;
+               }
+               len += wr->sg_list[i].length;
+               ss.num_sge++;
+       }
+       extra_bytes = (4 - len) & 3;
+       nwords = (len + extra_bytes) >> 2;
+
+       /* Construct the header. */
+       ah_attr = &to_iah(wr->wr.ud.ah)->attr;
+       if (ah_attr->dlid == 0) {
+               ret = -EINVAL;
+               goto bail;
+       }
+       if (ah_attr->dlid >= IPS_MULTICAST_LID_BASE) {
+               if (ah_attr->dlid != IPS_PERMISSIVE_LID)
+                       dev->n_multicast_xmit++;
+               else
+                       dev->n_unicast_xmit++;
+       } else {
+               dev->n_unicast_xmit++;
+               lid = ah_attr->dlid &
+                       ~((1 << (dev->mkeyprot_resv_lmc & 7)) - 1);
+               if (unlikely(lid == ipath_layer_get_lid(dev->dd))) {
+                       /*
+                        * Pass in an uninitialized ib_wc to save stack
+                        * space.
+                        */
+                       ipath_ud_loopback(qp, &ss, len, wr, &wc);
+                       goto done;
+               }
+       }
+       if (ah_attr->ah_flags & IB_AH_GRH) {
+               /* Header size in 32-bit words. */
+               hwords = 17;
+               lrh0 = IPS_LRH_GRH;
+               ohdr = &qp->s_hdr.u.l.oth;
+               qp->s_hdr.u.l.grh.version_tclass_flow =
+                       cpu_to_be32((6 << 28) |
+                                   (ah_attr->grh.traffic_class << 20) |
+                                   ah_attr->grh.flow_label);
+               qp->s_hdr.u.l.grh.paylen =
+                       cpu_to_be16(((wr->opcode ==
+                                     IB_WR_SEND_WITH_IMM ? 6 : 5) +
+                                    nwords + SIZE_OF_CRC) << 2);
+               /* next_hdr is defined by C8-7 in ch. 8.4.1 */
+               qp->s_hdr.u.l.grh.next_hdr = 0x1B;
+               qp->s_hdr.u.l.grh.hop_limit = ah_attr->grh.hop_limit;
+               /* The SGID is 32-bit aligned. */
+               qp->s_hdr.u.l.grh.sgid.global.subnet_prefix =
+                       dev->gid_prefix;
+               qp->s_hdr.u.l.grh.sgid.global.interface_id =
+                       ipath_layer_get_guid(dev->dd);
+               qp->s_hdr.u.l.grh.dgid = ah_attr->grh.dgid;
+               /*
+                * Don't worry about sending to locally attached multicast
+                * QPs.  The IB spec leaves that behavior unspecified.
+                */
+       } else {
+               /* Header size in 32-bit words. */
+               hwords = 7;
+               lrh0 = IPS_LRH_BTH;
+               ohdr = &qp->s_hdr.u.oth;
+       }
+       if (wr->opcode == IB_WR_SEND_WITH_IMM) {
+               ohdr->u.ud.imm_data = wr->imm_data;
+               wc.imm_data = wr->imm_data;
+               hwords += 1;
+               bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24;
+       } else if (wr->opcode == IB_WR_SEND) {
+               wc.imm_data = 0;
+               bth0 = IB_OPCODE_UD_SEND_ONLY << 24;
+       } else {
+               ret = -EINVAL;
+               goto bail;
+       }
+       lrh0 |= ah_attr->sl << 4;
+       if (qp->ibqp.qp_type == IB_QPT_SMI)
+               lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */
+       qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
+       qp->s_hdr.lrh[1] = cpu_to_be16(ah_attr->dlid);  /* DEST LID */
+       qp->s_hdr.lrh[2] = cpu_to_be16(hwords + nwords + SIZE_OF_CRC);
+       lid = ipath_layer_get_lid(dev->dd);
+       if (lid) {
+               lid |= ah_attr->src_path_bits &
+                       ((1 << (dev->mkeyprot_resv_lmc & 7)) - 1);
+               qp->s_hdr.lrh[3] = cpu_to_be16(lid);
+       } else
+               qp->s_hdr.lrh[3] = IB_LID_PERMISSIVE;
+       if (wr->send_flags & IB_SEND_SOLICITED)
+               bth0 |= 1 << 23;
+       bth0 |= extra_bytes << 20;
+       bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? IPS_DEFAULT_P_KEY :
+               ipath_layer_get_pkey(dev->dd, qp->s_pkey_index);
+       ohdr->bth[0] = cpu_to_be32(bth0);
+       /*
+        * Use the multicast QP if the destination LID is a multicast LID.
+        */
+       ohdr->bth[1] = ah_attr->dlid >= IPS_MULTICAST_LID_BASE &&
+               ah_attr->dlid != IPS_PERMISSIVE_LID ?
+               __constant_cpu_to_be32(IPS_MULTICAST_QPN) :
+               cpu_to_be32(wr->wr.ud.remote_qpn);
+       /* XXX Could lose a PSN count but not worth locking */
+       ohdr->bth[2] = cpu_to_be32(qp->s_next_psn++ & IPS_PSN_MASK);
+       /*
+        * Qkeys with the high order bit set mean use the
+        * qkey from the QP context instead of the WR (see 10.2.5).
+        */
+       ohdr->u.ud.deth[0] = cpu_to_be32((int)wr->wr.ud.remote_qkey < 0 ?
+                                        qp->qkey : wr->wr.ud.remote_qkey);
+       ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
+       if (ipath_verbs_send(dev->dd, hwords, (u32 *) &qp->s_hdr,
+                            len, &ss))
+               dev->n_no_piobuf++;
+
+done:
+       /* Queue the completion status entry. */
+       if (!test_bit(IPATH_S_SIGNAL_REQ_WR, &qp->s_flags) ||
+           (wr->send_flags & IB_SEND_SIGNALED)) {
+               wc.wr_id = wr->wr_id;
+               wc.status = IB_WC_SUCCESS;
+               wc.vendor_err = 0;
+               wc.opcode = IB_WC_SEND;
+               wc.byte_len = len;
+               wc.qp_num = qp->ibqp.qp_num;
+               wc.src_qp = 0;
+               wc.wc_flags = 0;
+               /* XXX initialize other fields? */
+               ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
+       }
+       ret = 0;
+
+bail:
+       /* kfree(NULL) is a no-op, so this also covers the error paths. */
+       kfree(sg_list);
+       return ret;
+}
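Both the loopback and the wire path above apply the qkey rule from IBA 10.2.5: a work-request qkey with the high-order bit set means the QP's own qkey is used instead. A minimal sketch of that selection; the function name is made up:

#include <stdio.h>
#include <stdint.h>

static uint32_t effective_qkey(uint32_t wr_qkey, uint32_t qp_qkey)
{
	/* High bit set in the WR qkey means "use the QP's own qkey". */
	return (int32_t) wr_qkey < 0 ? qp_qkey : wr_qkey;
}

int main(void)
{
	printf("0x%08x\n", effective_qkey(0x00001234, 0xdeadbeef));	/* WR qkey used */
	printf("0x%08x\n", effective_qkey(0x80000000, 0xdeadbeef));	/* QP qkey used */
	return 0;
}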
+
+/**
+ * ipath_ud_rcv - receive an incoming UD packet
+ * @dev: the device the packet came in on
+ * @hdr: the packet header
+ * @has_grh: true if the packet has a GRH
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP the packet came on
+ *
+ * This is called from ipath_qp_rcv() to process an incoming UD packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+                 int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
+{
+       struct ipath_other_headers *ohdr;
+       int opcode;
+       u32 hdrsize;
+       u32 pad;
+       unsigned long flags;
+       struct ib_wc wc;
+       u32 qkey;
+       u32 src_qp;
+       struct ipath_rq *rq;
+       struct ipath_srq *srq;
+       struct ipath_rwqe *wqe;
+       u16 dlid;
+       int header_in_data;
+
+       /* Check for GRH */
+       if (!has_grh) {
+               ohdr = &hdr->u.oth;
+               hdrsize = 8 + 12 + 8;   /* LRH + BTH + DETH */
+               qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
+               src_qp = be32_to_cpu(ohdr->u.ud.deth[1]);
+               header_in_data = 0;
+       } else {
+               ohdr = &hdr->u.l.oth;
+               hdrsize = 8 + 40 + 12 + 8; /* LRH + GRH + BTH + DETH */
+               /*
+                * The header with GRH is 68 bytes and the core driver sets
+                * the eager header buffer size to 56 bytes, so the last 12
+                * bytes of the IB header are in the data buffer.
+                */
+               header_in_data =
+                       ipath_layer_get_rcvhdrentsize(dev->dd) == 16;
+               if (header_in_data) {
+                       qkey = be32_to_cpu(((__be32 *) data)[1]);
+                       src_qp = be32_to_cpu(((__be32 *) data)[2]);
+                       data += 12;
+               } else {
+                       qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
+                       src_qp = be32_to_cpu(ohdr->u.ud.deth[1]);
+               }
+       }
+       src_qp &= IPS_QPN_MASK;
+
+       /*
+        * Check that the permissive LID is only used on QP0
+        * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1).
+        */
+       if (qp->ibqp.qp_num) {
+               if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE ||
+                            hdr->lrh[3] == IB_LID_PERMISSIVE)) {
+                       dev->n_pkt_drops++;
+                       goto bail;
+               }
+               if (unlikely(qkey != qp->qkey)) {
+                       /* XXX OK to lose a count once in a while. */
+                       dev->qkey_violations++;
+                       dev->n_pkt_drops++;
+                       goto bail;
+               }
+       } else if (hdr->lrh[1] == IB_LID_PERMISSIVE ||
+                  hdr->lrh[3] == IB_LID_PERMISSIVE) {
+               struct ib_smp *smp = (struct ib_smp *) data;
+
+               if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+                       dev->n_pkt_drops++;
+                       goto bail;
+               }
+       }
+
+       /* Get the number of bytes the message was padded by. */
+       pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+       if (unlikely(tlen < (hdrsize + pad + 4))) {
+               /* Drop incomplete packets. */
+               dev->n_pkt_drops++;
+               goto bail;
+       }
+       tlen -= hdrsize + pad + 4;
+
+       /* Drop invalid MAD packets (see 13.5.3.1). */
+       if (unlikely((qp->ibqp.qp_num == 0 &&
+                     (tlen != 256 ||
+                      (be16_to_cpu(hdr->lrh[0]) >> 12) != 15)) ||
+                    (qp->ibqp.qp_num == 1 &&
+                     (tlen != 256 ||
+                      (be16_to_cpu(hdr->lrh[0]) >> 12) == 15)))) {
+               dev->n_pkt_drops++;
+               goto bail;
+       }
+
+       /*
+        * A GRH is expected to precede the data even if not
+        * present on the wire.
+        */
+       wc.byte_len = tlen + sizeof(struct ib_grh);
+
+       /*
+        * The opcode is in the low byte when it's in network order
+        * (top byte when in host order).
+        */
+       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+       if (qp->ibqp.qp_num > 1 &&
+           opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
+               if (header_in_data) {
+                       wc.imm_data = *(__be32 *) data;
+                       data += sizeof(__be32);
+               } else
+                       wc.imm_data = ohdr->u.ud.imm_data;
+               wc.wc_flags = IB_WC_WITH_IMM;
+               hdrsize += sizeof(u32);
+       } else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
+               wc.imm_data = 0;
+               wc.wc_flags = 0;
+       } else {
+               dev->n_pkt_drops++;
+               goto bail;
+       }
+
+       /*
+        * Get the next work request entry to find where to put the data.
+        * Note that it is safe to drop the lock after changing rq->tail
+        * since ipath_post_receive() won't fill the empty slot.
+        */
+       if (qp->ibqp.srq) {
+               srq = to_isrq(qp->ibqp.srq);
+               rq = &srq->rq;
+       } else {
+               srq = NULL;
+               rq = &qp->r_rq;
+       }
+       spin_lock_irqsave(&rq->lock, flags);
+       if (rq->tail == rq->head) {
+               spin_unlock_irqrestore(&rq->lock, flags);
+               dev->n_pkt_drops++;
+               goto bail;
+       }
+       /* Silently drop packets which are too big. */
+       wqe = get_rwqe_ptr(rq, rq->tail);
+       if (wc.byte_len > wqe->length) {
+               spin_unlock_irqrestore(&rq->lock, flags);
+               dev->n_pkt_drops++;
+               goto bail;
+       }
+       wc.wr_id = wqe->wr_id;
+       qp->r_sge.sge = wqe->sg_list[0];
+       qp->r_sge.sg_list = wqe->sg_list + 1;
+       qp->r_sge.num_sge = wqe->num_sge;
+       if (++rq->tail >= rq->size)
+               rq->tail = 0;
+       if (srq && srq->ibsrq.event_handler) {
+               u32 n;
+
+               if (rq->head < rq->tail)
+                       n = rq->size + rq->head - rq->tail;
+               else
+                       n = rq->head - rq->tail;
+               if (n < srq->limit) {
+                       struct ib_event ev;
+
+                       srq->limit = 0;
+                       spin_unlock_irqrestore(&rq->lock, flags);
+                       ev.device = qp->ibqp.device;
+                       ev.element.srq = qp->ibqp.srq;
+                       ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+                       srq->ibsrq.event_handler(&ev,
+                                                srq->ibsrq.srq_context);
+               } else
+                       spin_unlock_irqrestore(&rq->lock, flags);
+       } else
+               spin_unlock_irqrestore(&rq->lock, flags);
+       if (has_grh) {
+               ipath_copy_sge(&qp->r_sge, &hdr->u.l.grh,
+                              sizeof(struct ib_grh));
+               wc.wc_flags |= IB_WC_GRH;
+       } else
+               ipath_skip_sge(&qp->r_sge, sizeof(struct ib_grh));
+       ipath_copy_sge(&qp->r_sge, data,
+                      wc.byte_len - sizeof(struct ib_grh));
+       wc.status = IB_WC_SUCCESS;
+       wc.opcode = IB_WC_RECV;
+       wc.vendor_err = 0;
+       wc.qp_num = qp->ibqp.qp_num;
+       wc.src_qp = src_qp;
+       /* XXX do we know which pkey matched? Only needed for GSI. */
+       wc.pkey_index = 0;
+       wc.slid = be16_to_cpu(hdr->lrh[3]);
+       wc.sl = (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF;
+       dlid = be16_to_cpu(hdr->lrh[1]);
+       /*
+        * Save the LMC lower bits if the destination LID is a unicast LID.
+        */
+       wc.dlid_path_bits = dlid >= IPS_MULTICAST_LID_BASE ? 0 :
+               dlid & ((1 << (dev->mkeyprot_resv_lmc & 7)) - 1);
+       /* Signal completion event if the solicited bit is set. */
+       ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+                      (ohdr->bth[0] &
+                       __constant_cpu_to_be32(1 << 23)) != 0);
+
+bail:;
+}
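The receive completion above keeps only the LMC low-order bits of a unicast DLID in dlid_path_bits (the LMC is taken from the low 3 bits of mkeyprot_resv_lmc in the driver). A stand-alone sketch of that masking, with made-up values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint16_t dlid = 0x4007;	/* unicast LID addressed to one of our paths */
	unsigned int lmc = 3;	/* port owns 2^3 = 8 consecutive LIDs */
	uint16_t path_bits = dlid & ((1 << lmc) - 1);
	uint16_t base_lid = dlid & ~((1 << lmc) - 1);

	printf("base LID 0x%04x, path bits 0x%x\n", base_lid, path_bits);
	return 0;
}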
diff --git a/drivers/infiniband/hw/ipath/ipath_user_pages.c b/drivers/infiniband/hw/ipath/ipath_user_pages.c
new file mode 100644 (file)
index 0000000..2bb08af
--- /dev/null
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mm.h>
+#include <linux/device.h>
+
+#include "ipath_kernel.h"
+
+static void __ipath_release_user_pages(struct page **p, size_t num_pages,
+                                  int dirty)
+{
+       size_t i;
+
+       for (i = 0; i < num_pages; i++) {
+               ipath_cdbg(MM, "%lu/%lu put_page %p\n", (unsigned long) i,
+                          (unsigned long) num_pages, p[i]);
+               if (dirty)
+                       set_page_dirty_lock(p[i]);
+               put_page(p[i]);
+       }
+}
+
+/* call with current->mm->mmap_sem held */
+static int __get_user_pages(unsigned long start_page, size_t num_pages,
+                       struct page **p, struct vm_area_struct **vma)
+{
+       unsigned long lock_limit;
+       size_t got;
+       int ret;
+
+#if 0
+       /*
+        * XXX - causes MPI programs to fail, haven't had time to check
+        * yet
+        */
+       if (!capable(CAP_IPC_LOCK)) {
+               ret = -EPERM;
+               goto bail;
+       }
+#endif
+
+       lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >>
+               PAGE_SHIFT;
+
+       if (num_pages > lock_limit) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+
+       ipath_cdbg(VERBOSE, "pin %lx pages from vaddr %lx\n",
+                  (unsigned long) num_pages, start_page);
+
+       for (got = 0; got < num_pages; got += ret) {
+               ret = get_user_pages(current, current->mm,
+                                    start_page + got * PAGE_SIZE,
+                                    num_pages - got, 1, 1,
+                                    p + got, vma);
+               if (ret < 0)
+                       goto bail_release;
+       }
+
+       current->mm->locked_vm += num_pages;
+
+       ret = 0;
+       goto bail;
+
+bail_release:
+       __ipath_release_user_pages(p, got, 0);
+bail:
+       return ret;
+}
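__get_user_pages() above rejects a pinning request that exceeds the caller's RLIMIT_MEMLOCK, converted from bytes to pages. Roughly the same check can be tried from user space; a hedged sketch with a made-up request size:

#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

int main(void)
{
	struct rlimit rl;
	size_t num_pages = 4096;	/* hypothetical pinning request */

	if (getrlimit(RLIMIT_MEMLOCK, &rl))
		return 1;

	unsigned long lock_limit = rl.rlim_cur / sysconf(_SC_PAGESIZE);

	if (num_pages > lock_limit)
		printf("would fail with -ENOMEM: %zu > %lu pages\n",
		       num_pages, lock_limit);
	else
		printf("within RLIMIT_MEMLOCK: %zu <= %lu pages\n",
		       num_pages, lock_limit);
	return 0;
}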
+
+/**
+ * ipath_get_user_pages - lock user pages into memory
+ * @start_page: the start page
+ * @num_pages: the number of pages
+ * @p: the output page structures
+ *
+ * This function takes a given start page (page aligned user virtual
+ * address) and pins it and the following specified number of pages.  For
+ * now, num_pages is always 1, but that will probably change at some point
+ * (because caller is doing expected sends on a single virtually contiguous
+ * buffer, so we can do all pages at once).
+ */
+int ipath_get_user_pages(unsigned long start_page, size_t num_pages,
+                        struct page **p)
+{
+       int ret;
+
+       down_write(&current->mm->mmap_sem);
+
+       ret = __get_user_pages(start_page, num_pages, p, NULL);
+
+       up_write(&current->mm->mmap_sem);
+
+       return ret;
+}
+
+/**
+ * ipath_get_user_pages_nocopy - lock a single page for I/O and mark shared
+ * @start_page: the page to lock
+ * @p: the output page structure
+ *
+ * This is similar to ipath_get_user_pages, but it's always one page, and we
+ * mark the page as locked for I/O, and shared.  This is used for the user
+ * process page that contains the destination address for the rcvhdrq tail
+ * update, so we need to have the vma. If we don't do this, the page can be
+ * taken away from us on fork, even if the child never touches it, and then
+ * the user process never sees the tail register updates.
+ */
+int ipath_get_user_pages_nocopy(unsigned long page, struct page **p)
+{
+       struct vm_area_struct *vma;
+       int ret;
+
+       down_write(&current->mm->mmap_sem);
+
+       ret = __get_user_pages(page, 1, p, &vma);
+
+       up_write(&current->mm->mmap_sem);
+
+       return ret;
+}
+
+void ipath_release_user_pages(struct page **p, size_t num_pages)
+{
+       down_write(&current->mm->mmap_sem);
+
+       __ipath_release_user_pages(p, num_pages, 1);
+
+       current->mm->locked_vm -= num_pages;
+
+       up_write(&current->mm->mmap_sem);
+}
+
+struct ipath_user_pages_work {
+       struct work_struct work;
+       struct mm_struct *mm;
+       unsigned long num_pages;
+};
+
+static void user_pages_account(void *ptr)
+{
+       struct ipath_user_pages_work *work = ptr;
+
+       down_write(&work->mm->mmap_sem);
+       work->mm->locked_vm -= work->num_pages;
+       up_write(&work->mm->mmap_sem);
+       mmput(work->mm);
+       kfree(work);
+}
+
+void ipath_release_user_pages_on_close(struct page **p, size_t num_pages)
+{
+       struct ipath_user_pages_work *work;
+       struct mm_struct *mm;
+
+       __ipath_release_user_pages(p, num_pages, 1);
+
+       mm = get_task_mm(current);
+       if (!mm)
+               goto bail;
+
+       work = kmalloc(sizeof(*work), GFP_KERNEL);
+       if (!work)
+               goto bail_mm;
+
+       INIT_WORK(&work->work, user_pages_account, work);
+       work->mm = mm;
+       work->num_pages = num_pages;
+       /* Defer the locked_vm accounting; the worker drops the mm reference. */
+       schedule_work(&work->work);
+       goto bail;
+
+bail_mm:
+       mmput(mm);
+bail:
+       return;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c
new file mode 100644 (file)
index 0000000..9f27fd3
--- /dev/null
@@ -0,0 +1,1222 @@
+/*
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_user_verbs.h>
+#include <linux/utsname.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+#include "ips_common.h"
+
+/* Not static, because we don't want the compiler removing it */
+const char ipath_verbs_version[] = "ipath_verbs " IPATH_IDSTR;
+
+unsigned int ib_ipath_qp_table_size = 251;
+module_param_named(qp_table_size, ib_ipath_qp_table_size, uint, S_IRUGO);
+MODULE_PARM_DESC(qp_table_size, "QP table size");
+
+unsigned int ib_ipath_lkey_table_size = 12;
+module_param_named(lkey_table_size, ib_ipath_lkey_table_size, uint,
+                  S_IRUGO);
+MODULE_PARM_DESC(lkey_table_size,
+                "LKEY table size in bits (2^n, 1 <= n <= 23)");
+
+unsigned int ib_ipath_debug;   /* debug mask */
+module_param_named(debug, ib_ipath_debug, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(debug, "Verbs debug mask");
+
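+/*
+ * Illustrative usage only (assuming the verbs module is built as
+ * ib_ipath): the read-only parameters above are set at load time, e.g.
+ *
+ *     modprobe ib_ipath qp_table_size=4096 lkey_table_size=16
+ *
+ * while the writable debug mask can also be changed at runtime via
+ * /sys/module/<module>/parameters/debug.
+ */
+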
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("PathScale <support@pathscale.com>");
+MODULE_DESCRIPTION("PathScale InfiniPath driver");
+
+const int ib_ipath_state_ops[IB_QPS_ERR + 1] = {
+       [IB_QPS_RESET] = 0,
+       [IB_QPS_INIT] = IPATH_POST_RECV_OK,
+       [IB_QPS_RTR] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK,
+       [IB_QPS_RTS] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
+           IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK,
+       [IB_QPS_SQD] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
+           IPATH_POST_SEND_OK,
+       [IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK,
+       [IB_QPS_ERR] = 0,
+};
+
+/*
+ * Translate ib_wr_opcode into ib_wc_opcode.
+ */
+const enum ib_wc_opcode ib_ipath_wc_opcode[] = {
+       [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
+       [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
+       [IB_WR_SEND] = IB_WC_SEND,
+       [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
+       [IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
+       [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
+       [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
+};
+
+/*
+ * System image GUID.
+ */
+__be64 sys_image_guid;
+
+/**
+ * ipath_copy_sge - copy data to SGE memory
+ * @ss: the SGE state
+ * @data: the data to copy
+ * @length: the length of the data
+ */
+void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length)
+{
+       struct ipath_sge *sge = &ss->sge;
+
+       while (length) {
+               u32 len = sge->length;
+
+               BUG_ON(len == 0);
+               if (len > length)
+                       len = length;
+               memcpy(sge->vaddr, data, len);
+               sge->vaddr += len;
+               sge->length -= len;
+               sge->sge_length -= len;
+               if (sge->sge_length == 0) {
+                       if (--ss->num_sge)
+                               *sge = *ss->sg_list++;
+               } else if (sge->length == 0 && sge->mr != NULL) {
+                       if (++sge->n >= IPATH_SEGSZ) {
+                               if (++sge->m >= sge->mr->mapsz)
+                                       break;
+                               sge->n = 0;
+                       }
+                       sge->vaddr =
+                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
+                       sge->length =
+                               sge->mr->map[sge->m]->segs[sge->n].length;
+               }
+               data += len;
+               length -= len;
+       }
+}
+
+/**
+ * ipath_skip_sge - skip over SGE memory - XXX almost dup of prev func
+ * @ss: the SGE state
+ * @length: the number of bytes to skip
+ */
+void ipath_skip_sge(struct ipath_sge_state *ss, u32 length)
+{
+       struct ipath_sge *sge = &ss->sge;
+
+       while (length > sge->sge_length) {
+               length -= sge->sge_length;
+               ss->sge = *ss->sg_list++;
+       }
+       while (length) {
+               u32 len = sge->length;
+
+               BUG_ON(len == 0);
+               if (len > length)
+                       len = length;
+               sge->vaddr += len;
+               sge->length -= len;
+               sge->sge_length -= len;
+               if (sge->sge_length == 0) {
+                       if (--ss->num_sge)
+                               *sge = *ss->sg_list++;
+               } else if (sge->length == 0 && sge->mr != NULL) {
+                       if (++sge->n >= IPATH_SEGSZ) {
+                               if (++sge->m >= sge->mr->mapsz)
+                                       break;
+                               sge->n = 0;
+                       }
+                       sge->vaddr =
+                               sge->mr->map[sge->m]->segs[sge->n].vaddr;
+                       sge->length =
+                               sge->mr->map[sge->m]->segs[sge->n].length;
+               }
+               length -= len;
+       }
+}
+
+/**
+ * ipath_post_send - post a send on a QP
+ * @ibqp: the QP to post the send on
+ * @wr: the list of work requests to post
+ * @bad_wr: the first bad WR is put here
+ *
+ * This may be called from interrupt context.
+ */
+static int ipath_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+                          struct ib_send_wr **bad_wr)
+{
+       struct ipath_qp *qp = to_iqp(ibqp);
+       int err = 0;
+
+       /* Check that state is OK to post send. */
+       if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK)) {
+               *bad_wr = wr;
+               err = -EINVAL;
+               goto bail;
+       }
+
+       for (; wr; wr = wr->next) {
+               switch (qp->ibqp.qp_type) {
+               case IB_QPT_UC:
+               case IB_QPT_RC:
+                       err = ipath_post_rc_send(qp, wr);
+                       break;
+
+               case IB_QPT_SMI:
+               case IB_QPT_GSI:
+               case IB_QPT_UD:
+                       err = ipath_post_ud_send(qp, wr);
+                       break;
+
+               default:
+                       err = -EINVAL;
+               }
+               if (err) {
+                       *bad_wr = wr;
+                       break;
+               }
+       }
+
+bail:
+       return err;
+}
+
+/**
+ * ipath_post_receive - post a receive on a QP
+ * @ibqp: the QP to post the receive on
+ * @wr: the WR to post
+ * @bad_wr: the first bad WR is put here
+ *
+ * This may be called from interrupt context.
+ */
+static int ipath_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+                             struct ib_recv_wr **bad_wr)
+{
+       struct ipath_qp *qp = to_iqp(ibqp);
+       unsigned long flags;
+       int ret;
+
+       /* Check that state is OK to post receive. */
+       if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_RECV_OK)) {
+               *bad_wr = wr;
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       for (; wr; wr = wr->next) {
+               struct ipath_rwqe *wqe;
+               u32 next;
+               int i, j;
+
+               if (wr->num_sge > qp->r_rq.max_sge) {
+                       *bad_wr = wr;
+                       ret = -ENOMEM;
+                       goto bail;
+               }
+
+               spin_lock_irqsave(&qp->r_rq.lock, flags);
+               next = qp->r_rq.head + 1;
+               if (next >= qp->r_rq.size)
+                       next = 0;
+               if (next == qp->r_rq.tail) {
+                       spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+                       *bad_wr = wr;
+                       ret = -ENOMEM;
+                       goto bail;
+               }
+
+               wqe = get_rwqe_ptr(&qp->r_rq, qp->r_rq.head);
+               wqe->wr_id = wr->wr_id;
+               wqe->sg_list[0].mr = NULL;
+               wqe->sg_list[0].vaddr = NULL;
+               wqe->sg_list[0].length = 0;
+               wqe->sg_list[0].sge_length = 0;
+               wqe->length = 0;
+               for (i = 0, j = 0; i < wr->num_sge; i++) {
+                       /* Check LKEY */
+                       if (to_ipd(qp->ibqp.pd)->user &&
+                           wr->sg_list[i].lkey == 0) {
+                               spin_unlock_irqrestore(&qp->r_rq.lock,
+                                                      flags);
+                               *bad_wr = wr;
+                               ret = -EINVAL;
+                               goto bail;
+                       }
+                       if (wr->sg_list[i].length == 0)
+                               continue;
+                       if (!ipath_lkey_ok(
+                                   &to_idev(qp->ibqp.device)->lk_table,
+                                   &wqe->sg_list[j], &wr->sg_list[i],
+                                   IB_ACCESS_LOCAL_WRITE)) {
+                               spin_unlock_irqrestore(&qp->r_rq.lock,
+                                                      flags);
+                               *bad_wr = wr;
+                               ret = -EINVAL;
+                               goto bail;
+                       }
+                       wqe->length += wr->sg_list[i].length;
+                       j++;
+               }
+               wqe->num_sge = j;
+               qp->r_rq.head = next;
+               spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+       }
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_qp_rcv - process an incoming packet on a QP
+ * @dev: the device the packet came on
+ * @hdr: the packet header
+ * @has_grh: true if the packet has a GRH
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP the packet came on
+ *
+ * This is called from ipath_ib_rcv() to process an incoming packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+static void ipath_qp_rcv(struct ipath_ibdev *dev,
+                        struct ipath_ib_header *hdr, int has_grh,
+                        void *data, u32 tlen, struct ipath_qp *qp)
+{
+       /* Check for valid receive state. */
+       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+               dev->n_pkt_drops++;
+               return;
+       }
+
+       switch (qp->ibqp.qp_type) {
+       case IB_QPT_SMI:
+       case IB_QPT_GSI:
+       case IB_QPT_UD:
+               ipath_ud_rcv(dev, hdr, has_grh, data, tlen, qp);
+               break;
+
+       case IB_QPT_RC:
+               ipath_rc_rcv(dev, hdr, has_grh, data, tlen, qp);
+               break;
+
+       case IB_QPT_UC:
+               ipath_uc_rcv(dev, hdr, has_grh, data, tlen, qp);
+               break;
+
+       default:
+               break;
+       }
+}
+
+/**
+ * ipath_ib_rcv - process an incoming packet
+ * @arg: the device pointer
+ * @rhdr: the header of the packet
+ * @data: the packet data
+ * @tlen: the packet length
+ *
+ * This is called from ipath_kreceive() to process an incoming packet at
+ * interrupt level. Tlen is the length of the header + data + CRC in bytes.
+ */
+static void ipath_ib_rcv(void *arg, void *rhdr, void *data, u32 tlen)
+{
+       struct ipath_ibdev *dev = (struct ipath_ibdev *) arg;
+       struct ipath_ib_header *hdr = rhdr;
+       struct ipath_other_headers *ohdr;
+       struct ipath_qp *qp;
+       u32 qp_num;
+       int lnh;
+       u8 opcode;
+       u16 lid;
+
+       if (unlikely(dev == NULL))
+               goto bail;
+
+       if (unlikely(tlen < 24)) {      /* LRH+BTH+CRC */
+               dev->rcv_errors++;
+               goto bail;
+       }
+
+       /* Check for a valid destination LID (see ch. 7.11.1). */
+       lid = be16_to_cpu(hdr->lrh[1]);
+       if (lid < IPS_MULTICAST_LID_BASE) {
+               lid &= ~((1 << (dev->mkeyprot_resv_lmc & 7)) - 1);
+               if (unlikely(lid != ipath_layer_get_lid(dev->dd))) {
+                       dev->rcv_errors++;
+                       goto bail;
+               }
+       }
+
+       /* Check for GRH */
+       lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+       if (lnh == IPS_LRH_BTH)
+               ohdr = &hdr->u.oth;
+       else if (lnh == IPS_LRH_GRH)
+               ohdr = &hdr->u.l.oth;
+       else {
+               dev->rcv_errors++;
+               goto bail;
+       }
+
+       opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+       dev->opstats[opcode].n_bytes += tlen;
+       dev->opstats[opcode].n_packets++;
+
+       /* Get the destination QP number. */
+       qp_num = be32_to_cpu(ohdr->bth[1]) & IPS_QPN_MASK;
+       if (qp_num == IPS_MULTICAST_QPN) {
+               struct ipath_mcast *mcast;
+               struct ipath_mcast_qp *p;
+
+               mcast = ipath_mcast_find(&hdr->u.l.grh.dgid);
+               if (mcast == NULL) {
+                       dev->n_pkt_drops++;
+                       goto bail;
+               }
+               dev->n_multicast_rcv++;
+               list_for_each_entry_rcu(p, &mcast->qp_list, list)
+                       ipath_qp_rcv(dev, hdr, lnh == IPS_LRH_GRH, data,
+                                    tlen, p->qp);
+               /*
+                * Notify ipath_multicast_detach() if it is waiting for us
+                * to finish.
+                */
+               if (atomic_dec_return(&mcast->refcount) <= 1)
+                       wake_up(&mcast->wait);
+       } else {
+               qp = ipath_lookup_qpn(&dev->qp_table, qp_num);
+               if (qp) {
+                       dev->n_unicast_rcv++;
+                       ipath_qp_rcv(dev, hdr, lnh == IPS_LRH_GRH, data,
+                                    tlen, qp);
+                       /*
+                        * Notify ipath_destroy_qp() if it is waiting
+                        * for us to finish.
+                        */
+                       if (atomic_dec_and_test(&qp->refcount))
+                               wake_up(&qp->wait);
+               } else
+                       dev->n_pkt_drops++;
+       }
+
+bail:;
+}
+
+/**
+ * ipath_ib_timer - verbs timer
+ * @arg: the device pointer
+ *
+ * This is called from ipath_do_rcv_timer() at interrupt level to check for
+ * QPs which need retransmits and to collect performance numbers.
+ */
+static void ipath_ib_timer(void *arg)
+{
+       struct ipath_ibdev *dev = (struct ipath_ibdev *) arg;
+       struct ipath_qp *resend = NULL;
+       struct ipath_qp *rnr = NULL;
+       struct list_head *last;
+       struct ipath_qp *qp;
+       unsigned long flags;
+
+       if (dev == NULL)
+               return;
+
+       spin_lock_irqsave(&dev->pending_lock, flags);
+       /* Start filling the next pending queue. */
+       if (++dev->pending_index >= ARRAY_SIZE(dev->pending))
+               dev->pending_index = 0;
+       /* Save any requests still in the new queue; they have timed out. */
+       last = &dev->pending[dev->pending_index];
+       while (!list_empty(last)) {
+               qp = list_entry(last->next, struct ipath_qp, timerwait);
+               if (last->next == LIST_POISON1 ||
+                   last->next != &qp->timerwait ||
+                   qp->timerwait.prev != last) {
+                       INIT_LIST_HEAD(last);
+               } else {
+                       list_del(&qp->timerwait);
+                       qp->timerwait.prev = (struct list_head *) resend;
+                       resend = qp;
+                       atomic_inc(&qp->refcount);
+               }
+       }
+       last = &dev->rnrwait;
+       if (!list_empty(last)) {
+               qp = list_entry(last->next, struct ipath_qp, timerwait);
+               if (--qp->s_rnr_timeout == 0) {
+                       do {
+                               if (last->next == LIST_POISON1 ||
+                                   last->next != &qp->timerwait ||
+                                   qp->timerwait.prev != last) {
+                                       INIT_LIST_HEAD(last);
+                                       break;
+                               }
+                               list_del(&qp->timerwait);
+                               qp->timerwait.prev =
+                                       (struct list_head *) rnr;
+                               rnr = qp;
+                               if (list_empty(last))
+                                       break;
+                               qp = list_entry(last->next, struct ipath_qp,
+                                               timerwait);
+                       } while (qp->s_rnr_timeout == 0);
+               }
+       }
+       /*
+        * We should only be in the started state if pma_sample_start != 0
+        */
+       if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_STARTED &&
+           --dev->pma_sample_start == 0) {
+               dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING;
+               ipath_layer_snapshot_counters(dev->dd, &dev->ipath_sword,
+                                             &dev->ipath_rword,
+                                             &dev->ipath_spkts,
+                                             &dev->ipath_rpkts,
+                                             &dev->ipath_xmit_wait);
+       }
+       if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_RUNNING) {
+               if (dev->pma_sample_interval == 0) {
+                       u64 ta, tb, tc, td, te;
+
+                       dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_DONE;
+                       ipath_layer_snapshot_counters(dev->dd, &ta, &tb,
+                                                     &tc, &td, &te);
+
+                       dev->ipath_sword = ta - dev->ipath_sword;
+                       dev->ipath_rword = tb - dev->ipath_rword;
+                       dev->ipath_spkts = tc - dev->ipath_spkts;
+                       dev->ipath_rpkts = td - dev->ipath_rpkts;
+                       dev->ipath_xmit_wait = te - dev->ipath_xmit_wait;
+               } else
+                       dev->pma_sample_interval--;
+       }
+       spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+       /* XXX What if timer fires again while this is running? */
+       for (qp = resend; qp != NULL;
+            qp = (struct ipath_qp *) qp->timerwait.prev) {
+               struct ib_wc wc;
+
+               spin_lock_irqsave(&qp->s_lock, flags);
+               if (qp->s_last != qp->s_tail && qp->state == IB_QPS_RTS) {
+                       dev->n_timeouts++;
+                       ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
+               }
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+
+               /* Notify ipath_destroy_qp() if it is waiting. */
+               if (atomic_dec_and_test(&qp->refcount))
+                       wake_up(&qp->wait);
+       }
+       for (qp = rnr; qp != NULL;
+            qp = (struct ipath_qp *) qp->timerwait.prev)
+               tasklet_hi_schedule(&qp->s_task);
+}
+
+/**
+ * ipath_ib_piobufavail - callback when a PIO buffer is available
+ * @arg: the device pointer
+ *
+ * This is called from ipath_intr() at interrupt level when a PIO buffer is
+ * available after ipath_verbs_send() returned an error that no buffers were
+ * available.  Return 0 if we consumed all the PIO buffers and we still have
+ * QPs waiting for buffers (for now, just do a tasklet_hi_schedule and
+ * return one).
+ */
+static int ipath_ib_piobufavail(void *arg)
+{
+       struct ipath_ibdev *dev = (struct ipath_ibdev *) arg;
+       struct ipath_qp *qp;
+       unsigned long flags;
+
+       if (dev == NULL)
+               goto bail;
+
+       spin_lock_irqsave(&dev->pending_lock, flags);
+       while (!list_empty(&dev->piowait)) {
+               qp = list_entry(dev->piowait.next, struct ipath_qp,
+                               piowait);
+               list_del(&qp->piowait);
+               tasklet_hi_schedule(&qp->s_task);
+       }
+       spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+bail:
+       return 1;
+}
+
+static int ipath_query_device(struct ib_device *ibdev,
+                             struct ib_device_attr *props)
+{
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       u32 vendor, boardrev, majrev, minrev;
+
+       memset(props, 0, sizeof(*props));
+
+       props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
+               IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
+               IB_DEVICE_SYS_IMAGE_GUID;
+       ipath_layer_query_device(dev->dd, &vendor, &boardrev,
+                                &majrev, &minrev);
+       props->vendor_id = vendor;
+       props->vendor_part_id = boardrev;
+       props->hw_ver = boardrev << 16 | majrev << 8 | minrev;
+
+       props->sys_image_guid = dev->sys_image_guid;
+
+       props->max_mr_size = ~0ull;
+       props->max_qp = 0xffff;
+       props->max_qp_wr = 0xffff;
+       props->max_sge = 255;
+       props->max_cq = 0xffff;
+       props->max_cqe = 0xffff;
+       props->max_mr = 0xffff;
+       props->max_pd = 0xffff;
+       props->max_qp_rd_atom = 1;
+       props->max_qp_init_rd_atom = 1;
+       /* props->max_res_rd_atom */
+       props->max_srq = 0xffff;
+       props->max_srq_wr = 0xffff;
+       props->max_srq_sge = 255;
+       /* props->local_ca_ack_delay */
+       props->atomic_cap = IB_ATOMIC_HCA;
+       props->max_pkeys = ipath_layer_get_npkeys(dev->dd);
+       props->max_mcast_grp = 0xffff;
+       props->max_mcast_qp_attach = 0xffff;
+       props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
+               props->max_mcast_grp;
+
+       return 0;
+}
+
+const u8 ipath_cvt_physportstate[16] = {
+       [INFINIPATH_IBCS_LT_STATE_DISABLED] = 3,
+       [INFINIPATH_IBCS_LT_STATE_LINKUP] = 5,
+       [INFINIPATH_IBCS_LT_STATE_POLLACTIVE] = 2,
+       [INFINIPATH_IBCS_LT_STATE_POLLQUIET] = 2,
+       [INFINIPATH_IBCS_LT_STATE_SLEEPDELAY] = 1,
+       [INFINIPATH_IBCS_LT_STATE_SLEEPQUIET] = 1,
+       [INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE] = 4,
+       [INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG] = 4,
+       [INFINIPATH_IBCS_LT_STATE_CFGWAITRMT] = 4,
+       [INFINIPATH_IBCS_LT_STATE_CFGIDLE] = 4,
+       [INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN] = 6,
+       [INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT] = 6,
+       [INFINIPATH_IBCS_LT_STATE_RECOVERIDLE] = 6,
+};
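+
+/*
+ * The values above are the PortInfo:PortPhysicalState encodings from
+ * the IB spec: 1 = Sleep, 2 = Polling, 3 = Disabled,
+ * 4 = PortConfigurationTraining, 5 = LinkUp, 6 = LinkErrorRecovery.
+ */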
+
+static int ipath_query_port(struct ib_device *ibdev,
+                           u8 port, struct ib_port_attr *props)
+{
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       enum ib_mtu mtu;
+       u16 lid = ipath_layer_get_lid(dev->dd);
+       u64 ibcstat;
+
+       memset(props, 0, sizeof(*props));
+       props->lid = lid ? lid : __constant_be16_to_cpu(IB_LID_PERMISSIVE);
+       props->lmc = dev->mkeyprot_resv_lmc & 7;
+       props->sm_lid = dev->sm_lid;
+       props->sm_sl = dev->sm_sl;
+       ibcstat = ipath_layer_get_lastibcstat(dev->dd);
+       props->state = ((ibcstat >> 4) & 0x3) + 1;
+       /* See phys_state_show() */
+       props->phys_state = ipath_cvt_physportstate[
+               ipath_layer_get_lastibcstat(dev->dd) & 0xf];
+       props->port_cap_flags = dev->port_cap_flags;
+       props->gid_tbl_len = 1;
+       props->max_msg_sz = 4096;
+       props->pkey_tbl_len = ipath_layer_get_npkeys(dev->dd);
+       props->bad_pkey_cntr = ipath_layer_get_cr_errpkey(dev->dd) -
+               dev->n_pkey_violations;
+       props->qkey_viol_cntr = dev->qkey_violations;
+       props->active_width = IB_WIDTH_4X;
+       /* See rate_show() */
+       props->active_speed = 1;        /* SDR: 2.5 Gb/sec per lane */
+       props->max_vl_num = 1;          /* VLCap = VL0 */
+       props->init_type_reply = 0;
+
+       props->max_mtu = IB_MTU_4096;
+       switch (ipath_layer_get_ibmtu(dev->dd)) {
+       case 4096:
+               mtu = IB_MTU_4096;
+               break;
+       case 2048:
+               mtu = IB_MTU_2048;
+               break;
+       case 1024:
+               mtu = IB_MTU_1024;
+               break;
+       case 512:
+               mtu = IB_MTU_512;
+               break;
+       case 256:
+               mtu = IB_MTU_256;
+               break;
+       default:
+               mtu = IB_MTU_2048;
+       }
+       props->active_mtu = mtu;
+       props->subnet_timeout = dev->subnet_timeout;
+
+       return 0;
+}
+
+static int ipath_modify_device(struct ib_device *device,
+                              int device_modify_mask,
+                              struct ib_device_modify *device_modify)
+{
+       int ret;
+
+       if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
+                                  IB_DEVICE_MODIFY_NODE_DESC)) {
+               ret = -EOPNOTSUPP;
+               goto bail;
+       }
+
+       if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC)
+               memcpy(device->node_desc, device_modify->node_desc, 64);
+
+       if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID)
+               to_idev(device)->sys_image_guid =
+                       cpu_to_be64(device_modify->sys_image_guid);
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+static int ipath_modify_port(struct ib_device *ibdev,
+                            u8 port, int port_modify_mask,
+                            struct ib_port_modify *props)
+{
+       struct ipath_ibdev *dev = to_idev(ibdev);
+
+       dev->port_cap_flags |= props->set_port_cap_mask;
+       dev->port_cap_flags &= ~props->clr_port_cap_mask;
+       if (port_modify_mask & IB_PORT_SHUTDOWN)
+               ipath_layer_set_linkstate(dev->dd, IPATH_IB_LINKDOWN);
+       if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR)
+               dev->qkey_violations = 0;
+       return 0;
+}
+
+static int ipath_query_gid(struct ib_device *ibdev, u8 port,
+                          int index, union ib_gid *gid)
+{
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       int ret;
+
+       if (index >= 1) {
+               ret = -EINVAL;
+               goto bail;
+       }
+       gid->global.subnet_prefix = dev->gid_prefix;
+       gid->global.interface_id = ipath_layer_get_guid(dev->dd);
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+static struct ib_pd *ipath_alloc_pd(struct ib_device *ibdev,
+                                   struct ib_ucontext *context,
+                                   struct ib_udata *udata)
+{
+       struct ipath_pd *pd;
+       struct ib_pd *ret;
+
+       pd = kmalloc(sizeof *pd, GFP_KERNEL);
+       if (!pd) {
+               ret = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+
+       /* ib_alloc_pd() will initialize pd->ibpd. */
+       pd->user = udata != NULL;
+
+       ret = &pd->ibpd;
+
+bail:
+       return ret;
+}
+
+static int ipath_dealloc_pd(struct ib_pd *ibpd)
+{
+       struct ipath_pd *pd = to_ipd(ibpd);
+
+       kfree(pd);
+
+       return 0;
+}
+
+/**
+ * ipath_create_ah - create an address handle
+ * @pd: the protection domain
+ * @ah_attr: the attributes of the AH
+ *
+ * This may be called from interrupt context.
+ */
+static struct ib_ah *ipath_create_ah(struct ib_pd *pd,
+                                    struct ib_ah_attr *ah_attr)
+{
+       struct ipath_ah *ah;
+       struct ib_ah *ret;
+
+       /* A multicast address requires a GRH (see ch. 8.4.1). */
+       if (ah_attr->dlid >= IPS_MULTICAST_LID_BASE &&
+           ah_attr->dlid != IPS_PERMISSIVE_LID &&
+           !(ah_attr->ah_flags & IB_AH_GRH)) {
+               ret = ERR_PTR(-EINVAL);
+               goto bail;
+       }
+
+       ah = kmalloc(sizeof *ah, GFP_ATOMIC);
+       if (!ah) {
+               ret = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+
+       /* ib_create_ah() will initialize ah->ibah. */
+       ah->attr = *ah_attr;
+
+       ret = &ah->ibah;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_destroy_ah - destroy an address handle
+ * @ibah: the AH to destroy
+ *
+ * This may be called from interrupt context.
+ */
+static int ipath_destroy_ah(struct ib_ah *ibah)
+{
+       struct ipath_ah *ah = to_iah(ibah);
+
+       kfree(ah);
+
+       return 0;
+}
+
+static int ipath_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
+{
+       struct ipath_ah *ah = to_iah(ibah);
+
+       *ah_attr = ah->attr;
+
+       return 0;
+}
+
+static int ipath_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+                           u16 *pkey)
+{
+       struct ipath_ibdev *dev = to_idev(ibdev);
+       int ret;
+
+       if (index >= ipath_layer_get_npkeys(dev->dd)) {
+               ret = -EINVAL;
+               goto bail;
+       }
+
+       *pkey = ipath_layer_get_pkey(dev->dd, index);
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+/**
+ * ipath_alloc_ucontext - allocate a ucontext
+ * @ibdev: the infiniband device
+ * @udata: not used by the InfiniPath driver
+ */
+static struct ib_ucontext *ipath_alloc_ucontext(struct ib_device *ibdev,
+                                               struct ib_udata *udata)
+{
+       struct ipath_ucontext *context;
+       struct ib_ucontext *ret;
+
+       context = kmalloc(sizeof *context, GFP_KERNEL);
+       if (!context) {
+               ret = ERR_PTR(-ENOMEM);
+               goto bail;
+       }
+
+       ret = &context->ibucontext;
+
+bail:
+       return ret;
+}
+
+static int ipath_dealloc_ucontext(struct ib_ucontext *context)
+{
+       kfree(to_iucontext(context));
+       return 0;
+}
+
+static int ipath_verbs_register_sysfs(struct ib_device *dev);
+
+/**
+ * ipath_register_ib_device - register our device with the infiniband core
+ * @unit: the device number to register
+ * @dd: the device data structure
+ * Return the allocated ipath_ibdev pointer or NULL on error.
+ */
+static void *ipath_register_ib_device(int unit, struct ipath_devdata *dd)
+{
+       struct ipath_ibdev *idev;
+       struct ib_device *dev;
+       int ret;
+
+       idev = (struct ipath_ibdev *)ib_alloc_device(sizeof *idev);
+       if (idev == NULL)
+               goto bail;
+
+       dev = &idev->ibdev;
+
+       /* Only need to initialize non-zero fields. */
+       spin_lock_init(&idev->qp_table.lock);
+       spin_lock_init(&idev->lk_table.lock);
+       idev->sm_lid = __constant_be16_to_cpu(IB_LID_PERMISSIVE);
+       /* Set the prefix to the default value (see ch. 4.1.1) */
+       idev->gid_prefix = __constant_cpu_to_be64(0xfe80000000000000ULL);
+
+       ret = ipath_init_qp_table(idev, ib_ipath_qp_table_size);
+       if (ret)
+               goto err_qp;
+
+       /*
+        * The top ib_ipath_lkey_table_size bits are used to index the
+        * table.  The lower 8 bits can be owned by the user (copied from
+        * the LKEY).  The remaining bits act as a generation number or tag.
+        */
+       idev->lk_table.max = 1 << ib_ipath_lkey_table_size;
+       idev->lk_table.table = kzalloc(idev->lk_table.max *
+                                      sizeof(*idev->lk_table.table),
+                                      GFP_KERNEL);
+       if (idev->lk_table.table == NULL) {
+               ret = -ENOMEM;
+               goto err_lk;
+       }
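+       /*
+        * Worked example (illustrative, not authoritative): with the
+        * default lkey_table_size of 12 and 32-bit LKEYs this allocates
+        * 1 << 12 = 4096 entries, and an LKEY splits into 12 index bits
+        * (31:20), 12 generation bits (19:8) and 8 user-owned bits (7:0).
+        */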
+       spin_lock_init(&idev->pending_lock);
+       INIT_LIST_HEAD(&idev->pending[0]);
+       INIT_LIST_HEAD(&idev->pending[1]);
+       INIT_LIST_HEAD(&idev->pending[2]);
+       INIT_LIST_HEAD(&idev->piowait);
+       INIT_LIST_HEAD(&idev->rnrwait);
+       idev->pending_index = 0;
+       idev->port_cap_flags =
+               IB_PORT_SYS_IMAGE_GUID_SUP | IB_PORT_CLIENT_REG_SUP;
+       idev->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
+       idev->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
+       idev->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
+       idev->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
+       idev->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
+       idev->link_width_enabled = 3;   /* 1x or 4x */
+
+       /*
+        * The system image GUID is supposed to be the same for all
+        * IB HCAs in a single system but since there can be other
+        * device types in the system, we can't be sure this is unique.
+        */
+       if (!sys_image_guid)
+               sys_image_guid = ipath_layer_get_guid(dd);
+       idev->sys_image_guid = sys_image_guid;
+       idev->ib_unit = unit;
+       idev->dd = dd;
+
+       strlcpy(dev->name, "ipath%d", IB_DEVICE_NAME_MAX);
+       dev->node_guid = ipath_layer_get_guid(dd);
+       dev->uverbs_abi_ver = IPATH_UVERBS_ABI_VERSION;
+       dev->uverbs_cmd_mask =
+               (1ull << IB_USER_VERBS_CMD_GET_CONTEXT)         |
+               (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)        |
+               (1ull << IB_USER_VERBS_CMD_QUERY_PORT)          |
+               (1ull << IB_USER_VERBS_CMD_ALLOC_PD)            |
+               (1ull << IB_USER_VERBS_CMD_DEALLOC_PD)          |
+               (1ull << IB_USER_VERBS_CMD_CREATE_AH)           |
+               (1ull << IB_USER_VERBS_CMD_DESTROY_AH)          |
+               (1ull << IB_USER_VERBS_CMD_QUERY_AH)            |
+               (1ull << IB_USER_VERBS_CMD_REG_MR)              |
+               (1ull << IB_USER_VERBS_CMD_DEREG_MR)            |
+               (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+               (1ull << IB_USER_VERBS_CMD_CREATE_CQ)           |
+               (1ull << IB_USER_VERBS_CMD_RESIZE_CQ)           |
+               (1ull << IB_USER_VERBS_CMD_DESTROY_CQ)          |
+               (1ull << IB_USER_VERBS_CMD_POLL_CQ)             |
+               (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ)       |
+               (1ull << IB_USER_VERBS_CMD_CREATE_QP)           |
+               (1ull << IB_USER_VERBS_CMD_QUERY_QP)            |
+               (1ull << IB_USER_VERBS_CMD_MODIFY_QP)           |
+               (1ull << IB_USER_VERBS_CMD_DESTROY_QP)          |
+               (1ull << IB_USER_VERBS_CMD_POST_SEND)           |
+               (1ull << IB_USER_VERBS_CMD_POST_RECV)           |
+               (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)        |
+               (1ull << IB_USER_VERBS_CMD_DETACH_MCAST)        |
+               (1ull << IB_USER_VERBS_CMD_CREATE_SRQ)          |
+               (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)          |
+               (1ull << IB_USER_VERBS_CMD_QUERY_SRQ)           |
+               (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)         |
+               (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV);
+       dev->node_type = IB_NODE_CA;
+       dev->phys_port_cnt = 1;
+       dev->dma_device = ipath_layer_get_device(dd);
+       dev->class_dev.dev = dev->dma_device;
+       dev->query_device = ipath_query_device;
+       dev->modify_device = ipath_modify_device;
+       dev->query_port = ipath_query_port;
+       dev->modify_port = ipath_modify_port;
+       dev->query_pkey = ipath_query_pkey;
+       dev->query_gid = ipath_query_gid;
+       dev->alloc_ucontext = ipath_alloc_ucontext;
+       dev->dealloc_ucontext = ipath_dealloc_ucontext;
+       dev->alloc_pd = ipath_alloc_pd;
+       dev->dealloc_pd = ipath_dealloc_pd;
+       dev->create_ah = ipath_create_ah;
+       dev->destroy_ah = ipath_destroy_ah;
+       dev->query_ah = ipath_query_ah;
+       dev->create_srq = ipath_create_srq;
+       dev->modify_srq = ipath_modify_srq;
+       dev->query_srq = ipath_query_srq;
+       dev->destroy_srq = ipath_destroy_srq;
+       dev->create_qp = ipath_create_qp;
+       dev->modify_qp = ipath_modify_qp;
+       dev->query_qp = ipath_query_qp;
+       dev->destroy_qp = ipath_destroy_qp;
+       dev->post_send = ipath_post_send;
+       dev->post_recv = ipath_post_receive;
+       dev->post_srq_recv = ipath_post_srq_receive;
+       dev->create_cq = ipath_create_cq;
+       dev->destroy_cq = ipath_destroy_cq;
+       dev->resize_cq = ipath_resize_cq;
+       dev->poll_cq = ipath_poll_cq;
+       dev->req_notify_cq = ipath_req_notify_cq;
+       dev->get_dma_mr = ipath_get_dma_mr;
+       dev->reg_phys_mr = ipath_reg_phys_mr;
+       dev->reg_user_mr = ipath_reg_user_mr;
+       dev->dereg_mr = ipath_dereg_mr;
+       dev->alloc_fmr = ipath_alloc_fmr;
+       dev->map_phys_fmr = ipath_map_phys_fmr;
+       dev->unmap_fmr = ipath_unmap_fmr;
+       dev->dealloc_fmr = ipath_dealloc_fmr;
+       dev->attach_mcast = ipath_multicast_attach;
+       dev->detach_mcast = ipath_multicast_detach;
+       dev->process_mad = ipath_process_mad;
+
+       snprintf(dev->node_desc, sizeof(dev->node_desc),
+                IPATH_IDSTR " %s kernel_SMA", system_utsname.nodename);
+
+       ret = ib_register_device(dev);
+       if (ret)
+               goto err_reg;
+
+       if (ipath_verbs_register_sysfs(dev))
+               goto err_class;
+
+       ipath_layer_enable_timer(dd);
+
+       goto bail;
+
+err_class:
+       ib_unregister_device(dev);
+err_reg:
+       kfree(idev->lk_table.table);
+err_lk:
+       kfree(idev->qp_table.table);
+err_qp:
+       ib_dealloc_device(dev);
+       _VERBS_ERROR("ib_ipath%d cannot register verbs (%d)!\n",
+                    unit, -ret);
+       idev = NULL;
+
+bail:
+       return idev;
+}
+
+static void ipath_unregister_ib_device(void *arg)
+{
+       struct ipath_ibdev *dev = (struct ipath_ibdev *) arg;
+       struct ib_device *ibdev = &dev->ibdev;
+
+       ipath_layer_disable_timer(dev->dd);
+
+       ib_unregister_device(ibdev);
+
+       if (!list_empty(&dev->pending[0]) ||
+           !list_empty(&dev->pending[1]) ||
+           !list_empty(&dev->pending[2]))
+               _VERBS_ERROR("ipath%d pending list not empty!\n",
+                            dev->ib_unit);
+       if (!list_empty(&dev->piowait))
+               _VERBS_ERROR("ipath%d piowait list not empty!\n",
+                            dev->ib_unit);
+       if (!list_empty(&dev->rnrwait))
+               _VERBS_ERROR("ipath%d rnrwait list not empty!\n",
+                            dev->ib_unit);
+       if (!ipath_mcast_tree_empty())
+               _VERBS_ERROR("ipath%d multicast table memory leak!\n",
+                            dev->ib_unit);
+       /*
+        * Note that ipath_unregister_ib_device() can be called before all
+        * the QPs are destroyed!
+        */
+       ipath_free_all_qps(&dev->qp_table);
+       kfree(dev->qp_table.table);
+       kfree(dev->lk_table.table);
+       ib_dealloc_device(ibdev);
+}
+
+int __init ipath_verbs_init(void)
+{
+       return ipath_verbs_register(ipath_register_ib_device,
+                                   ipath_unregister_ib_device,
+                                   ipath_ib_piobufavail, ipath_ib_rcv,
+                                   ipath_ib_timer);
+}
+
+void __exit ipath_verbs_cleanup(void)
+{
+       ipath_verbs_unregister();
+}
+
+static ssize_t show_rev(struct class_device *cdev, char *buf)
+{
+        struct ipath_ibdev *dev =
+                container_of(cdev, struct ipath_ibdev, ibdev.class_dev);
+        u32 vendor, boardrev, majrev, minrev;
+
+        ipath_layer_query_device(dev->dd, &vendor, &boardrev,
+                                 &majrev, &minrev);
+        return sprintf(buf, "%d.%d\n", majrev, minrev);
+}
+
+static ssize_t show_hca(struct class_device *cdev, char *buf)
+{
+        struct ipath_ibdev *dev =
+                container_of(cdev, struct ipath_ibdev, ibdev.class_dev);
+        int ret;
+
+        ret = ipath_layer_get_boardname(dev->dd, buf, 128);
+        if (ret < 0)
+                goto bail;
+        strcat(buf, "\n");
+        ret = strlen(buf);
+
+bail:
+       return ret;
+}
+
+static ssize_t show_stats(struct class_device *cdev, char *buf)
+{
+        struct ipath_ibdev *dev =
+                container_of(cdev, struct ipath_ibdev, ibdev.class_dev);
+        int i;
+        int len;
+
+        len = sprintf(buf,
+                      "RC resends  %d\n"
+                      "RC QACKs    %d\n"
+                      "RC ACKs     %d\n"
+                      "RC SEQ NAKs %d\n"
+                      "RC RDMA seq %d\n"
+                      "RC RNR NAKs %d\n"
+                      "RC OTH NAKs %d\n"
+                      "RC timeouts %d\n"
+                      "RC RDMA dup %d\n"
+                      "piobuf wait %d\n"
+                      "no piobuf   %d\n"
+                      "PKT drops   %d\n"
+                      "WQE errs    %d\n",
+                      dev->n_rc_resends, dev->n_rc_qacks, dev->n_rc_acks,
+                      dev->n_seq_naks, dev->n_rdma_seq, dev->n_rnr_naks,
+                      dev->n_other_naks, dev->n_timeouts,
+                      dev->n_rdma_dup_busy, dev->n_piowait,
+                      dev->n_no_piobuf, dev->n_pkt_drops, dev->n_wqe_errs);
+        for (i = 0; i < ARRAY_SIZE(dev->opstats); i++) {
+               const struct ipath_opcode_stats *si = &dev->opstats[i];
+
+                if (!si->n_packets && !si->n_bytes)
+                        continue;
+                len += sprintf(buf + len, "%02x %llu/%llu\n", i,
+                              (unsigned long long) si->n_packets,
+                               (unsigned long long) si->n_bytes);
+        }
+        return len;
+}
+
+static CLASS_DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static CLASS_DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static CLASS_DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL);
+static CLASS_DEVICE_ATTR(stats, S_IRUGO, show_stats, NULL);
+
+static struct class_device_attribute *ipath_class_attributes[] = {
+        &class_device_attr_hw_rev,
+        &class_device_attr_hca_type,
+        &class_device_attr_board_id,
+        &class_device_attr_stats
+};
+
+static int ipath_verbs_register_sysfs(struct ib_device *dev)
+{
+        int i;
+       int ret;
+
+        for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i)
+                if (class_device_create_file(&dev->class_dev,
+                                             ipath_class_attributes[i])) {
+                        ret = 1;
+                       goto bail;
+               }
+
+        ret = 0;
+
+bail:
+       return ret;
+}
+
+module_init(ipath_verbs_init);
+module_exit(ipath_verbs_cleanup);
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.h b/drivers/infiniband/hw/ipath/ipath_verbs.h
new file mode 100644 (file)
index 0000000..b824632
--- /dev/null
@@ -0,0 +1,697 @@
+/*
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IPATH_VERBS_H
+#define IPATH_VERBS_H
+
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <rdma/ib_pack.h>
+
+#include "ipath_layer.h"
+#include "verbs_debug.h"
+
+#define QPN_MAX                 (1 << 24)
+#define QPNMAP_ENTRIES          (QPN_MAX / PAGE_SIZE / BITS_PER_BYTE)
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define IPATH_UVERBS_ABI_VERSION       1
+
+/*
+ * Define an ib_cq_notify value that is not valid so we know when CQ
+ * notifications are armed.
+ */
+#define IB_CQ_NONE     (IB_CQ_NEXT_COMP + 1)
+
+#define IB_RNR_NAK                     0x20
+#define IB_NAK_PSN_ERROR               0x60
+#define IB_NAK_INVALID_REQUEST         0x61
+#define IB_NAK_REMOTE_ACCESS_ERROR     0x62
+#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63
+#define IB_NAK_INVALID_RD_REQUEST      0x64
+
+#define IPATH_POST_SEND_OK             0x01
+#define IPATH_POST_RECV_OK             0x02
+#define IPATH_PROCESS_RECV_OK          0x04
+#define IPATH_PROCESS_SEND_OK          0x08
+
+/* IB Performance Manager status values */
+#define IB_PMA_SAMPLE_STATUS_DONE      0x00
+#define IB_PMA_SAMPLE_STATUS_STARTED   0x01
+#define IB_PMA_SAMPLE_STATUS_RUNNING   0x02
+
+/* Mandatory IB performance counter select values. */
+#define IB_PMA_PORT_XMIT_DATA  __constant_htons(0x0001)
+#define IB_PMA_PORT_RCV_DATA   __constant_htons(0x0002)
+#define IB_PMA_PORT_XMIT_PKTS  __constant_htons(0x0003)
+#define IB_PMA_PORT_RCV_PKTS   __constant_htons(0x0004)
+#define IB_PMA_PORT_XMIT_WAIT  __constant_htons(0x0005)
+
+struct ib_reth {
+       __be64 vaddr;
+       __be32 rkey;
+       __be32 length;
+} __attribute__ ((packed));
+
+struct ib_atomic_eth {
+       __be64 vaddr;
+       __be32 rkey;
+       __be64 swap_data;
+       __be64 compare_data;
+} __attribute__ ((packed));
+
+struct ipath_other_headers {
+       __be32 bth[3];
+       union {
+               struct {
+                       __be32 deth[2];
+                       __be32 imm_data;
+               } ud;
+               struct {
+                       struct ib_reth reth;
+                       __be32 imm_data;
+               } rc;
+               struct {
+                       __be32 aeth;
+                       __be64 atomic_ack_eth;
+               } at;
+               __be32 imm_data;
+               __be32 aeth;
+               struct ib_atomic_eth atomic_eth;
+       } u;
+} __attribute__ ((packed));
+
+/*
+ * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes
+ * long (72 w/ imm_data).  Only the first 56 bytes of the IB header
+ * will be in the eager header buffer.  The remaining 12 or 16 bytes
+ * are in the data buffer.
+ */
+struct ipath_ib_header {
+       __be16 lrh[4];
+       union {
+               struct {
+                       struct ib_grh grh;
+                       struct ipath_other_headers oth;
+               } l;
+               struct ipath_other_headers oth;
+       } u;
+} __attribute__ ((packed));
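+
+/*
+ * For reference, the 8+40+12+8 arithmetic above corresponds to the
+ * standard IB header sizes: LRH (8 bytes) + GRH (40) + BTH (12) +
+ * DETH (8) = 68, plus 4 bytes of immediate data for the 72-byte case.
+ */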
+
+/*
+ * There is one struct ipath_mcast for each multicast GID.
+ * All attached QPs are then stored as a list of
+ * struct ipath_mcast_qp.
+ */
+struct ipath_mcast_qp {
+       struct list_head list;
+       struct ipath_qp *qp;
+};
+
+struct ipath_mcast {
+       struct rb_node rb_node;
+       union ib_gid mgid;
+       struct list_head qp_list;
+       wait_queue_head_t wait;
+       atomic_t refcount;
+};
+
+/* Memory region */
+struct ipath_mr {
+       struct ib_mr ibmr;
+       struct ipath_mregion mr;        /* must be last */
+};
+
+/* Fast memory region */
+struct ipath_fmr {
+       struct ib_fmr ibfmr;
+       u8 page_shift;
+       struct ipath_mregion mr;        /* must be last */
+};
+
+/* Protection domain */
+struct ipath_pd {
+       struct ib_pd ibpd;
+       int user;               /* non-zero if created from user space */
+};
+
+/* Address Handle */
+struct ipath_ah {
+       struct ib_ah ibah;
+       struct ib_ah_attr attr;
+};
+
+/*
+ * Quick description of our CQ/QP locking scheme:
+ *
+ * We have one global lock that protects dev->cq/qp_table.  Each
+ * struct ipath_cq/qp also has its own lock.  An individual qp lock
+ * may be taken inside of an individual cq lock.  Both cqs attached to
+ * a qp may be locked, with the send cq locked first.  No other
+ * nesting should be done.
+ *
+ * Each struct ipath_cq/qp also has an atomic_t ref count.  The
+ * pointer from the cq/qp_table to the struct counts as one reference.
+ * This reference also is good for access through the consumer API, so
+ * modifying the CQ/QP etc doesn't need to take another reference.
+ * Access because of a completion being polled does need a reference.
+ *
+ * Finally, each struct ipath_cq/qp has a wait_queue_head_t for the
+ * destroy function to sleep on.
+ *
+ * This means that access from the consumer API requires nothing but
+ * taking the struct's lock.
+ *
+ * Access because of a completion event should go as follows:
+ * - lock cq/qp_table and look up struct
+ * - increment ref count in struct
+ * - drop cq/qp_table lock
+ * - lock struct, do your thing, and unlock struct
+ * - decrement ref count; if zero, wake up waiters
+ *
+ * To destroy a CQ/QP, we can do the following:
+ * - lock cq/qp_table, remove pointer, unlock cq/qp_table lock
+ * - decrement ref count
+ * - wait_event until ref count is zero
+ *
+ * It is the consumer's responsibility to make sure that no QP
+ * operations (WQE posting or state modification) are pending when the
+ * QP is destroyed.  Also, the consumer must make sure that calls to
+ * qp_modify are serialized.
+ *
+ * Possible optimizations (wait for profile data to see if/where we
+ * have locks bouncing between CPUs):
+ * - split cq/qp table lock into n separate (cache-aligned) locks,
+ *   indexed (say) by the page in the table
+ */
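+
+/*
+ * A minimal sketch of the completion-event access pattern described
+ * above (illustrative only; the locks and fields named are the ones
+ * defined elsewhere in this driver):
+ *
+ *     spin_lock(&dev->qp_table.lock);
+ *     qp = <look up the QPN in dev->qp_table>;
+ *     if (qp)
+ *             atomic_inc(&qp->refcount);
+ *     spin_unlock(&dev->qp_table.lock);
+ *     if (qp) {
+ *             spin_lock_irqsave(&qp->s_lock, flags);
+ *             <do the work>;
+ *             spin_unlock_irqrestore(&qp->s_lock, flags);
+ *             if (atomic_dec_and_test(&qp->refcount))
+ *                     wake_up(&qp->wait);
+ *     }
+ */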
+
+struct ipath_cq {
+       struct ib_cq ibcq;
+       struct tasklet_struct comptask;
+       spinlock_t lock;
+       u8 notify;
+       u8 triggered;
+       u32 head;               /* new records added to the head */
+       u32 tail;               /* poll_cq() reads from here. */
+       struct ib_wc *queue;    /* this is actually ibcq.cqe + 1 */
+};
+
+/*
+ * Send work request queue entry.
+ * The size of the sg_list is determined when the QP is created and stored
+ * in qp->s_max_sge.
+ */
+struct ipath_swqe {
+       struct ib_send_wr wr;   /* don't use wr.sg_list */
+       u32 psn;                /* first packet sequence number */
+       u32 lpsn;               /* last packet sequence number */
+       u32 ssn;                /* send sequence number */
+       u32 length;             /* total length of data in sg_list */
+       struct ipath_sge sg_list[0];
+};
+
+/*
+ * Receive work request queue entry.
+ * The size of the sg_list is determined when the QP is created and stored
+ * in qp->r_max_sge.
+ */
+struct ipath_rwqe {
+       u64 wr_id;
+       u32 length;             /* total length of data in sg_list */
+       u8 num_sge;
+       struct ipath_sge sg_list[0];
+};
+
+struct ipath_rq {
+       spinlock_t lock;
+       u32 head;               /* new work requests posted to the head */
+       u32 tail;               /* receives pull requests from here. */
+       u32 size;               /* size of RWQE array */
+       u8 max_sge;
+       struct ipath_rwqe *wq;  /* RWQE array */
+};
+
+struct ipath_srq {
+       struct ib_srq ibsrq;
+       struct ipath_rq rq;
+       /* send signal when number of RWQEs < limit */
+       u32 limit;
+};
+
+/*
+ * Variables prefixed with s_ are for the requester (sender).
+ * Variables prefixed with r_ are for the responder (receiver).
+ * Variables prefixed with ack_ are for responder replies.
+ *
+ * Common variables are protected by both r_rq.lock and s_lock, taken in that
+ * order, which only happens in modify_qp() or when changing the QP 'state'.
+ */
+struct ipath_qp {
+       struct ib_qp ibqp;
+       struct ipath_qp *next;  /* link list for QPN hash table */
+       struct list_head piowait;       /* link for wait PIO buf */
+       struct list_head timerwait;     /* link for waiting for timeouts */
+       struct ib_ah_attr remote_ah_attr;
+       struct ipath_ib_header s_hdr;   /* next packet header to send */
+       atomic_t refcount;
+       wait_queue_head_t wait;
+       struct tasklet_struct s_task;
+       struct ipath_sge_state *s_cur_sge;
+       struct ipath_sge_state s_sge;   /* current send request data */
+       /* current RDMA read send data */
+       struct ipath_sge_state s_rdma_sge;
+       struct ipath_sge_state r_sge;   /* current receive data */
+       spinlock_t s_lock;
+       unsigned long s_flags;
+       u32 s_hdrwords;         /* size of s_hdr in 32 bit words */
+       u32 s_cur_size;         /* size of send packet in bytes */
+       u32 s_len;              /* total length of s_sge */
+       u32 s_rdma_len;         /* total length of s_rdma_sge */
+       u32 s_next_psn;         /* PSN for next request */
+       u32 s_last_psn;         /* last response PSN processed */
+       u32 s_psn;              /* current packet sequence number */
+       u32 s_rnr_timeout;      /* number of milliseconds for RNR timeout */
+       u32 s_ack_psn;          /* PSN for next ACK or RDMA_READ */
+       u64 s_ack_atomic;       /* data for atomic ACK */
+       u64 r_wr_id;            /* ID for current receive WQE */
+       u64 r_atomic_data;      /* data for last atomic op */
+       u32 r_atomic_psn;       /* PSN of last atomic op */
+       u32 r_len;              /* total length of r_sge */
+       u32 r_rcv_len;          /* receive data len processed */
+       u32 r_psn;              /* expected rcv packet sequence number */
+       u8 state;               /* QP state */
+       u8 s_state;             /* opcode of last packet sent */
+       u8 s_ack_state;         /* opcode of packet to ACK */
+       u8 s_nak_state;         /* non-zero if NAK is pending */
+       u8 r_state;             /* opcode of last packet received */
+       u8 r_reuse_sge;         /* for UC receive errors */
+       u8 r_sge_inx;           /* current index into sg_list */
+       u8 s_max_sge;           /* size of s_wq->sg_list */
+       u8 qp_access_flags;
+       u8 s_retry_cnt;         /* number of times to retry */
+       u8 s_rnr_retry_cnt;
+       u8 s_min_rnr_timer;
+       u8 s_retry;             /* requester retry counter */
+       u8 s_rnr_retry;         /* requester RNR retry counter */
+       u8 s_pkey_index;        /* PKEY index to use */
+       enum ib_mtu path_mtu;
+       atomic_t msn;           /* message sequence number */
+       u32 remote_qpn;
+       u32 qkey;               /* QKEY for this QP (for UD or RD) */
+       u32 s_size;             /* send work queue size */
+       u32 s_head;             /* new entries added here */
+       u32 s_tail;             /* next entry to process */
+       u32 s_cur;              /* current work queue entry */
+       u32 s_last;             /* last un-ACK'ed entry */
+       u32 s_ssn;              /* SSN of tail entry */
+       u32 s_lsn;              /* limit sequence number (credit) */
+       struct ipath_swqe *s_wq;        /* send work queue */
+       struct ipath_rq r_rq;   /* receive work queue */
+};
+
+/*
+ * Bit definitions for s_flags.
+ */
+#define IPATH_S_BUSY           0
+#define IPATH_S_SIGNAL_REQ_WR  1
+
+/*
+ * Since struct ipath_swqe is not a fixed size, we can't simply index into
+ * struct ipath_qp.s_wq.  This function does the array index computation.
+ */
+static inline struct ipath_swqe *get_swqe_ptr(struct ipath_qp *qp,
+                                             unsigned n)
+{
+       return (struct ipath_swqe *)((char *)qp->s_wq +
+                                    (sizeof(struct ipath_swqe) +
+                                     qp->s_max_sge *
+                                     sizeof(struct ipath_sge)) * n);
+}
+
+/*
+ * Since struct ipath_rwqe is not a fixed size, we can't simply index into
+ * struct ipath_rq.wq.  This function does the array index computation.
+ */
+static inline struct ipath_rwqe *get_rwqe_ptr(struct ipath_rq *rq,
+                                             unsigned n)
+{
+       return (struct ipath_rwqe *)
+               ((char *) rq->wq +
+                (sizeof(struct ipath_rwqe) +
+                 rq->max_sge * sizeof(struct ipath_sge)) * n);
+}
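+
+/*
+ * Illustrative consumer-side sketch (a sketch only; see
+ * ipath_post_receive() in ipath_verbs.c for the producer side):
+ * receive processing pulls an RWQE from r_rq.tail under r_rq.lock:
+ *
+ *     spin_lock_irqsave(&rq->lock, flags);
+ *     if (rq->tail != rq->head) {
+ *             wqe = get_rwqe_ptr(rq, rq->tail);
+ *             if (++rq->tail >= rq->size)
+ *                     rq->tail = 0;
+ *             <consume wqe>;
+ *     }
+ *     spin_unlock_irqrestore(&rq->lock, flags);
+ */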
+
+/*
+ * QPN-map pages start out as NULL, they get allocated upon
+ * first use and are never deallocated. This way,
+ * large bitmaps are not allocated unless large numbers of QPs are used.
+ */
+struct qpn_map {
+       atomic_t n_free;
+       void *page;
+};
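+
+/*
+ * Sizing note (assuming 4 KiB pages): each bitmap page covers
+ * PAGE_SIZE * BITS_PER_BYTE = 32768 QPNs, so QPNMAP_ENTRIES above
+ * works out to (1 << 24) / 32768 = 512 map entries.
+ */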
+
+struct ipath_qp_table {
+       spinlock_t lock;
+       u32 last;               /* last QP number allocated */
+       u32 max;                /* size of the hash table */
+       u32 nmaps;              /* size of the map table */
+       struct ipath_qp **table;
+       /* bit map of free numbers */
+       struct qpn_map map[QPNMAP_ENTRIES];
+};
+
+struct ipath_lkey_table {
+       spinlock_t lock;
+       u32 next;               /* next unused index (speeds search) */
+       u32 gen;                /* generation count */
+       u32 max;                /* size of the table */
+       struct ipath_mregion **table;
+};
+
+struct ipath_opcode_stats {
+       u64 n_packets;          /* number of packets */
+       u64 n_bytes;            /* total number of bytes */
+};
+
+struct ipath_ibdev {
+       struct ib_device ibdev;
+       struct list_head dev_list;
+       struct ipath_devdata *dd;
+       int ib_unit;            /* This is the device number */
+       u16 sm_lid;             /* in host order */
+       u8 sm_sl;
+       u8 mkeyprot_resv_lmc;
+       /* non-zero when timer is set */
+       unsigned long mkey_lease_timeout;
+
+       /* The following fields are really per port. */
+       struct ipath_qp_table qp_table;
+       struct ipath_lkey_table lk_table;
+       struct list_head pending[3];    /* FIFO of QPs waiting for ACKs */
+       struct list_head piowait;       /* QPs waiting for a PIO buffer */
+       /* list of QPs waiting for RNR timer */
+       struct list_head rnrwait;
+       spinlock_t pending_lock;
+       __be64 sys_image_guid;  /* in network order */
+       __be64 gid_prefix;      /* in network order */
+       __be64 mkey;
+       u64 ipath_sword;        /* total dwords sent (sample result) */
+       u64 ipath_rword;        /* total dwords received (sample result) */
+       u64 ipath_spkts;        /* total packets sent (sample result) */
+       u64 ipath_rpkts;        /* total packets received (sample result) */
+       /* # of ticks no data sent (sample result) */
+       u64 ipath_xmit_wait;
+       u64 rcv_errors;         /* # of packets with SW detected rcv errs */
+       u64 n_unicast_xmit;     /* total unicast packets sent */
+       u64 n_unicast_rcv;      /* total unicast packets received */
+       u64 n_multicast_xmit;   /* total multicast packets sent */
+       u64 n_multicast_rcv;    /* total multicast packets received */
+       u64 n_symbol_error_counter;     /* starting count for PMA */
+       u64 n_link_error_recovery_counter;      /* starting count for PMA */
+       u64 n_link_downed_counter;      /* starting count for PMA */
+       u64 n_port_rcv_errors;  /* starting count for PMA */
+       u64 n_port_rcv_remphys_errors;  /* starting count for PMA */
+       u64 n_port_xmit_discards;       /* starting count for PMA */
+       u64 n_port_xmit_data;   /* starting count for PMA */
+       u64 n_port_rcv_data;    /* starting count for PMA */
+       u64 n_port_xmit_packets;        /* starting count for PMA */
+       u64 n_port_rcv_packets; /* starting count for PMA */
+       u32 n_pkey_violations;  /* starting count for PMA */
+       u32 n_rc_resends;
+       u32 n_rc_acks;
+       u32 n_rc_qacks;
+       u32 n_seq_naks;
+       u32 n_rdma_seq;
+       u32 n_rnr_naks;
+       u32 n_other_naks;
+       u32 n_timeouts;
+       u32 n_pkt_drops;
+       u32 n_wqe_errs;
+       u32 n_rdma_dup_busy;
+       u32 n_piowait;
+       u32 n_no_piobuf;
+       u32 port_cap_flags;
+       u32 pma_sample_start;
+       u32 pma_sample_interval;
+       __be16 pma_counter_select[5];
+       u16 pma_tag;
+       u16 qkey_violations;
+       u16 mkey_violations;
+       u16 mkey_lease_period;
+       u16 pending_index;      /* which pending queue is active */
+       u8 pma_sample_status;
+       u8 subnet_timeout;
+       u8 link_width_enabled;
+       u8 vl_high_limit;
+       struct ipath_opcode_stats opstats[128];
+};
+
+struct ipath_ucontext {
+       struct ib_ucontext ibucontext;
+};
+
+static inline struct ipath_mr *to_imr(struct ib_mr *ibmr)
+{
+       return container_of(ibmr, struct ipath_mr, ibmr);
+}
+
+static inline struct ipath_fmr *to_ifmr(struct ib_fmr *ibfmr)
+{
+       return container_of(ibfmr, struct ipath_fmr, ibfmr);
+}
+
+static inline struct ipath_pd *to_ipd(struct ib_pd *ibpd)
+{
+       return container_of(ibpd, struct ipath_pd, ibpd);
+}
+
+static inline struct ipath_ah *to_iah(struct ib_ah *ibah)
+{
+       return container_of(ibah, struct ipath_ah, ibah);
+}
+
+static inline struct ipath_cq *to_icq(struct ib_cq *ibcq)
+{
+       return container_of(ibcq, struct ipath_cq, ibcq);
+}
+
+static inline struct ipath_srq *to_isrq(struct ib_srq *ibsrq)
+{
+       return container_of(ibsrq, struct ipath_srq, ibsrq);
+}
+
+static inline struct ipath_qp *to_iqp(struct ib_qp *ibqp)
+{
+       return container_of(ibqp, struct ipath_qp, ibqp);
+}
+
+static inline struct ipath_ibdev *to_idev(struct ib_device *ibdev)
+{
+       return container_of(ibdev, struct ipath_ibdev, ibdev);
+}
+
+int ipath_process_mad(struct ib_device *ibdev,
+                     int mad_flags,
+                     u8 port_num,
+                     struct ib_wc *in_wc,
+                     struct ib_grh *in_grh,
+                     struct ib_mad *in_mad, struct ib_mad *out_mad);
+
+static inline struct ipath_ucontext *to_iucontext(struct ib_ucontext
+                                                 *ibucontext)
+{
+       return container_of(ibucontext, struct ipath_ucontext, ibucontext);
+}
+
+/*
+ * Compare the lower 24 bits of the two values.
+ * Returns an integer less than, equal to, or greater than zero.
+ */
+static inline int ipath_cmp24(u32 a, u32 b)
+{
+       return (((int) a) - ((int) b)) << 8;
+}
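+
+/*
+ * The left shift moves bit 23 of the 24-bit difference into the sign bit
+ * of the 32-bit result, so two values that are within 2^23 of each other
+ * compare correctly even across a 24-bit wrap.  For example, with
+ * a == 0xFFFFFE and b == 0x000001 (b has just wrapped past a), a - b is
+ * 0xFFFFFD and the shifted result is negative: a is correctly treated as
+ * three behind b, modulo 2^24.
+ */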
+
+struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid);
+
+int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int ipath_mcast_tree_empty(void);
+
+__be32 ipath_compute_aeth(struct ipath_qp *qp);
+
+struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn);
+
+struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
+                             struct ib_qp_init_attr *init_attr,
+                             struct ib_udata *udata);
+
+int ipath_destroy_qp(struct ib_qp *ibqp);
+
+int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+                   int attr_mask);
+
+int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+                  int attr_mask, struct ib_qp_init_attr *init_attr);
+
+void ipath_free_all_qps(struct ipath_qp_table *qpt);
+
+int ipath_init_qp_table(struct ipath_ibdev *idev, int size);
+
+void ipath_sqerror_qp(struct ipath_qp *qp, struct ib_wc *wc);
+
+void ipath_error_qp(struct ipath_qp *qp);
+
+void ipath_get_credit(struct ipath_qp *qp, u32 aeth);
+
+void ipath_do_rc_send(unsigned long data);
+
+void ipath_do_uc_send(unsigned long data);
+
+void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int sig);
+
+int ipath_rkey_ok(struct ipath_ibdev *dev, struct ipath_sge_state *ss,
+                 u32 len, u64 vaddr, u32 rkey, int acc);
+
+int ipath_lkey_ok(struct ipath_lkey_table *rkt, struct ipath_sge *isge,
+                 struct ib_sge *sge, int acc);
+
+void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length);
+
+void ipath_skip_sge(struct ipath_sge_state *ss, u32 length);
+
+int ipath_post_rc_send(struct ipath_qp *qp, struct ib_send_wr *wr);
+
+void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+                 int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
+
+void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+                 int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
+
+void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc);
+
+void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_sge_state *ss,
+                      u32 length, struct ib_send_wr *wr, struct ib_wc *wc);
+
+int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr);
+
+void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+                 int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
+
+int ipath_alloc_lkey(struct ipath_lkey_table *rkt,
+                    struct ipath_mregion *mr);
+
+void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey);
+
+int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+                          struct ib_recv_wr **bad_wr);
+
+struct ib_srq *ipath_create_srq(struct ib_pd *ibpd,
+                               struct ib_srq_init_attr *srq_init_attr,
+                               struct ib_udata *udata);
+
+int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+                    enum ib_srq_attr_mask attr_mask);
+
+int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr);
+
+int ipath_destroy_srq(struct ib_srq *ibsrq);
+
+int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
+
+struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries,
+                             struct ib_ucontext *context,
+                             struct ib_udata *udata);
+
+int ipath_destroy_cq(struct ib_cq *ibcq);
+
+int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify notify);
+
+int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
+
+struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc);
+
+struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd,
+                               struct ib_phys_buf *buffer_list,
+                               int num_phys_buf, int acc, u64 *iova_start);
+
+struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, struct ib_umem *region,
+                               int mr_access_flags,
+                               struct ib_udata *udata);
+
+int ipath_dereg_mr(struct ib_mr *ibmr);
+
+struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+                              struct ib_fmr_attr *fmr_attr);
+
+int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 * page_list,
+                      int list_len, u64 iova);
+
+int ipath_unmap_fmr(struct list_head *fmr_list);
+
+int ipath_dealloc_fmr(struct ib_fmr *ibfmr);
+
+void ipath_no_bufs_available(struct ipath_qp *qp, struct ipath_ibdev *dev);
+
+void ipath_insert_rnr_queue(struct ipath_qp *qp);
+
+int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only);
+
+void ipath_ruc_loopback(struct ipath_qp *sqp, struct ib_wc *wc);
+
+extern const enum ib_wc_opcode ib_ipath_wc_opcode[];
+
+extern const u8 ipath_cvt_physportstate[];
+
+extern const int ib_ipath_state_ops[];
+
+extern unsigned int ib_ipath_lkey_table_size;
+
+extern const u32 ib_ipath_rnr_table[];
+
+#endif                         /* IPATH_VERBS_H */
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c
new file mode 100644 (file)
index 0000000..10b31d2
--- /dev/null
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+
+#include "ipath_verbs.h"
+
+/*
+ * Global table mapping multicast GIDs to their attached QPs.
+ * The table is global to all ipath devices since a send from one QP/device
+ * needs to be locally routed to any locally attached QPs on the same
+ * or different device.
+ */
+static struct rb_root mcast_tree;
+static DEFINE_SPINLOCK(mcast_lock);
+
+/**
+ * ipath_mcast_qp_alloc - alloc a struct to link a QP to mcast GID struct
+ * @qp: the QP to link
+ */
+static struct ipath_mcast_qp *ipath_mcast_qp_alloc(struct ipath_qp *qp)
+{
+       struct ipath_mcast_qp *mqp;
+
+       mqp = kmalloc(sizeof *mqp, GFP_KERNEL);
+       if (!mqp)
+               goto bail;
+
+       mqp->qp = qp;
+       atomic_inc(&qp->refcount);
+
+bail:
+       return mqp;
+}
+
+static void ipath_mcast_qp_free(struct ipath_mcast_qp *mqp)
+{
+       struct ipath_qp *qp = mqp->qp;
+
+       /* Notify ipath_destroy_qp() if it is waiting. */
+       if (atomic_dec_and_test(&qp->refcount))
+               wake_up(&qp->wait);
+
+       kfree(mqp);
+}
+
+/**
+ * ipath_mcast_alloc - allocate the multicast GID structure
+ * @mgid: the multicast GID
+ *
+ * A list of QPs will be attached to this structure.
+ */
+static struct ipath_mcast *ipath_mcast_alloc(union ib_gid *mgid)
+{
+       struct ipath_mcast *mcast;
+
+       mcast = kmalloc(sizeof *mcast, GFP_KERNEL);
+       if (!mcast)
+               goto bail;
+
+       mcast->mgid = *mgid;
+       INIT_LIST_HEAD(&mcast->qp_list);
+       init_waitqueue_head(&mcast->wait);
+       atomic_set(&mcast->refcount, 0);
+
+bail:
+       return mcast;
+}
+
+static void ipath_mcast_free(struct ipath_mcast *mcast)
+{
+       struct ipath_mcast_qp *p, *tmp;
+
+       list_for_each_entry_safe(p, tmp, &mcast->qp_list, list)
+               ipath_mcast_qp_free(p);
+
+       kfree(mcast);
+}
+
+/**
+ * ipath_mcast_find - search the global table for the given multicast GID
+ * @mgid: the multicast GID to search for
+ *
+ * Returns NULL if not found.
+ *
+ * The caller is responsible for decrementing the reference count if found.
+ */
+struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid)
+{
+       struct rb_node *n;
+       unsigned long flags;
+       struct ipath_mcast *mcast;
+
+       spin_lock_irqsave(&mcast_lock, flags);
+       n = mcast_tree.rb_node;
+       while (n) {
+               int ret;
+
+               mcast = rb_entry(n, struct ipath_mcast, rb_node);
+
+               ret = memcmp(mgid->raw, mcast->mgid.raw,
+                            sizeof(union ib_gid));
+               if (ret < 0)
+                       n = n->rb_left;
+               else if (ret > 0)
+                       n = n->rb_right;
+               else {
+                       atomic_inc(&mcast->refcount);
+                       spin_unlock_irqrestore(&mcast_lock, flags);
+                       goto bail;
+               }
+       }
+       spin_unlock_irqrestore(&mcast_lock, flags);
+
+       mcast = NULL;
+
+bail:
+       return mcast;
+}
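+
+/*
+ * A typical caller (e.g. the multicast receive path) looks up the GID,
+ * delivers the packet to each QP on mcast->qp_list, and then drops the
+ * reference taken here, waking mcast->wait if a detach is in progress
+ * and waiting for the count to fall.
+ */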
+
+/**
+ * ipath_mcast_add - insert mcast GID into table and attach QP struct
+ * @mcast: the mcast GID table
+ * @mqp: the QP to attach
+ *
+ * Return zero if both were added.  Return EEXIST if the GID was already in
+ * the table but the QP was added.  Return ESRCH if the QP was already
+ * attached and neither structure was added.
+ */
+static int ipath_mcast_add(struct ipath_mcast *mcast,
+                          struct ipath_mcast_qp *mqp)
+{
+       struct rb_node **n = &mcast_tree.rb_node;
+       struct rb_node *pn = NULL;
+       unsigned long flags;
+       int ret;
+
+       spin_lock_irqsave(&mcast_lock, flags);
+
+       while (*n) {
+               struct ipath_mcast *tmcast;
+               struct ipath_mcast_qp *p;
+
+               pn = *n;
+               tmcast = rb_entry(pn, struct ipath_mcast, rb_node);
+
+               ret = memcmp(mcast->mgid.raw, tmcast->mgid.raw,
+                            sizeof(union ib_gid));
+               if (ret < 0) {
+                       n = &pn->rb_left;
+                       continue;
+               }
+               if (ret > 0) {
+                       n = &pn->rb_right;
+                       continue;
+               }
+
+               /* Search the QP list to see if this is already there. */
+               list_for_each_entry_rcu(p, &tmcast->qp_list, list) {
+                       if (p->qp == mqp->qp) {
+                               spin_unlock_irqrestore(&mcast_lock, flags);
+                               ret = ESRCH;
+                               goto bail;
+                       }
+               }
+               list_add_tail_rcu(&mqp->list, &tmcast->qp_list);
+               spin_unlock_irqrestore(&mcast_lock, flags);
+               ret = EEXIST;
+               goto bail;
+       }
+
+       list_add_tail_rcu(&mqp->list, &mcast->qp_list);
+
+       atomic_inc(&mcast->refcount);
+       rb_link_node(&mcast->rb_node, pn, n);
+       rb_insert_color(&mcast->rb_node, &mcast_tree);
+
+       spin_unlock_irqrestore(&mcast_lock, flags);
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+       struct ipath_qp *qp = to_iqp(ibqp);
+       struct ipath_mcast *mcast;
+       struct ipath_mcast_qp *mqp;
+       int ret;
+
+       /*
+        * Allocate the data structures now, since it is better to do this
+        * outside of spin locks and they will most likely be needed anyway.
+        */
+       mcast = ipath_mcast_alloc(gid);
+       if (mcast == NULL) {
+               ret = -ENOMEM;
+               goto bail;
+       }
+       mqp = ipath_mcast_qp_alloc(qp);
+       if (mqp == NULL) {
+               ipath_mcast_free(mcast);
+               ret = -ENOMEM;
+               goto bail;
+       }
+       switch (ipath_mcast_add(mcast, mqp)) {
+       case ESRCH:
+               /* Neither was used: can't attach the same QP twice. */
+               ipath_mcast_qp_free(mqp);
+               ipath_mcast_free(mcast);
+               ret = -EINVAL;
+               goto bail;
+       case EEXIST:            /* The mcast wasn't used */
+               ipath_mcast_free(mcast);
+               break;
+       default:
+               break;
+       }
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+       struct ipath_qp *qp = to_iqp(ibqp);
+       struct ipath_mcast *mcast = NULL;
+       struct ipath_mcast_qp *p, *tmp;
+       struct rb_node *n;
+       unsigned long flags;
+       int last = 0;
+       int ret;
+
+       spin_lock_irqsave(&mcast_lock, flags);
+
+       /* Find the GID in the mcast table. */
+       n = mcast_tree.rb_node;
+       while (1) {
+               if (n == NULL) {
+                       spin_unlock_irqrestore(&mcast_lock, flags);
+                       ret = 0;
+                       goto bail;
+               }
+
+               mcast = rb_entry(n, struct ipath_mcast, rb_node);
+               ret = memcmp(gid->raw, mcast->mgid.raw,
+                            sizeof(union ib_gid));
+               if (ret < 0)
+                       n = n->rb_left;
+               else if (ret > 0)
+                       n = n->rb_right;
+               else
+                       break;
+       }
+
+       /* Search the QP list. */
+       list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) {
+               if (p->qp != qp)
+                       continue;
+               /*
+                * We found it, so remove it, but don't poison the forward
+                * link until we are sure there are no list walkers.
+                */
+               list_del_rcu(&p->list);
+
+               /* If this was the last attached QP, remove the GID too. */
+               if (list_empty(&mcast->qp_list)) {
+                       rb_erase(&mcast->rb_node, &mcast_tree);
+                       last = 1;
+               }
+               break;
+       }
+
+       spin_unlock_irqrestore(&mcast_lock, flags);
+
+       if (p) {
+               /*
+                * Wait for any list walkers to finish before freeing the
+                * list element.
+                */
+               wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1);
+               ipath_mcast_qp_free(p);
+       }
+       if (last) {
+               atomic_dec(&mcast->refcount);
+               wait_event(mcast->wait, !atomic_read(&mcast->refcount));
+               ipath_mcast_free(mcast);
+       }
+
+       ret = 0;
+
+bail:
+       return ret;
+}
+
+int ipath_mcast_tree_empty(void)
+{
+       return mcast_tree.rb_node == NULL;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c b/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c
new file mode 100644 (file)
index 0000000..adc5322
--- /dev/null
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file is conditionally built on x86_64 only.  Otherwise weak symbol
+ * versions of the functions exported from here are used.
+ */
+
+#include <linux/pci.h>
+#include <asm/mtrr.h>
+#include <asm/processor.h>
+
+#include "ipath_kernel.h"
+
+/**
+ * ipath_enable_wc - enable write combining for MMIO writes to the device
+ * @dd: infinipath device
+ *
+ * This routine is x86_64-specific; it twiddles the CPU's MTRRs to enable
+ * write combining.
+ */
+int ipath_enable_wc(struct ipath_devdata *dd)
+{
+       int ret = 0;
+       u64 pioaddr, piolen;
+       unsigned bits;
+       const unsigned long addr = pci_resource_start(dd->pcidev, 0);
+       const size_t len = pci_resource_len(dd->pcidev, 0);
+
+       /*
+        * Set the PIO buffers to be WCCOMB, so we get HT bursts to the
+        * chip.  Linux (and possibly the hardware) requires the region to
+        * start on an address that is a multiple of its length, and the
+        * length has to be a power of 2.  For rev1 that means the whole
+        * base address range; for rev2 it will be just the PIO buffers
+        * themselves.
+        */
+       pioaddr = addr + dd->ipath_piobufbase;
+       piolen = (dd->ipath_piobcnt2k +
+                 dd->ipath_piobcnt4k) *
+               ALIGN(dd->ipath_piobcnt2k +
+                     dd->ipath_piobcnt4k, dd->ipath_palign);
+
+       for (bits = 0; !(piolen & (1ULL << bits)); bits++)
+               /* do nothing */ ;
+
+       if (piolen != (1ULL << bits)) {
+               piolen >>= bits;
+               while (piolen >>= 1)
+                       bits++;
+               piolen = 1ULL << (bits + 1);
+       }
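+       /*
+        * piolen is now a power of two, e.g. a 0x3000 byte PIO region is
+        * rounded up to 0x4000, since an MTRR must cover a
+        * power-of-2-sized range.
+        */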
+       if (pioaddr & (piolen - 1)) {
+               u64 atmp;
+               ipath_dbg("pioaddr %llx not on right boundary for size "
+                         "%llx, fixing\n",
+                         (unsigned long long) pioaddr,
+                         (unsigned long long) piolen);
+               atmp = pioaddr & ~(piolen - 1);
+               if (atmp < addr || (atmp + piolen) > (addr + len)) {
+                       ipath_dev_err(dd, "No way to align address/size "
+                                     "(%llx/%llx), no WC mtrr\n",
+                                     (unsigned long long) atmp,
+                                     (unsigned long long) piolen << 1);
+                       ret = -ENODEV;
+               } else {
+                       ipath_dbg("changing WC base from %llx to %llx, "
+                                 "len from %llx to %llx\n",
+                                 (unsigned long long) pioaddr,
+                                 (unsigned long long) atmp,
+                                 (unsigned long long) piolen,
+                                 (unsigned long long) piolen << 1);
+                       pioaddr = atmp;
+                       piolen <<= 1;
+               }
+       }
+
+       if (!ret) {
+               int cookie;
+               ipath_cdbg(VERBOSE, "Setting mtrr for chip to WC "
+                          "(addr %llx, len=0x%llx)\n",
+                          (unsigned long long) pioaddr,
+                          (unsigned long long) piolen);
+               cookie = mtrr_add(pioaddr, piolen, MTRR_TYPE_WRCOMB, 0);
+               if (cookie < 0) {
+                       dev_info(&dd->pcidev->dev,
+                                "mtrr_add() WC for PIO bufs "
+                                "failed (%d)\n",
+                                cookie);
+                       ret = -EINVAL;
+               } else {
+                       ipath_cdbg(VERBOSE, "Set mtrr for chip to WC, "
+                                  "cookie is %d\n", cookie);
+                       dd->ipath_wc_cookie = cookie;
+               }
+       }
+
+       return ret;
+}
+
+/**
+ * ipath_disable_wc - disable write combining for MMIO writes to the device
+ * @dd: infinipath device
+ */
+void ipath_disable_wc(struct ipath_devdata *dd)
+{
+       if (dd->ipath_wc_cookie) {
+               ipath_cdbg(VERBOSE, "undoing WCCOMB on pio buffers\n");
+               mtrr_del(dd->ipath_wc_cookie, 0, 0);
+               dd->ipath_wc_cookie = 0;
+       }
+}
+
+/**
+ * ipath_unordered_wc - indicate whether write combining is ordered
+ *
+ * Because our performance depends on our ability to do write combining mmio
+ * writes in the most efficient way, we need to know if we are on an Intel
+ * or AMD x86_64 processor.  AMD x86_64 processors flush WC buffers out in
+ * the order completed, and so no special flushing is required to get
+ * correct ordering.  Intel processors, however, will flush write buffers
+ * out in "random" orders, and so explicit ordering is needed at times.
+ */
+int ipath_unordered_wc(void)
+{
+       return boot_cpu_data.x86_vendor != X86_VENDOR_AMD;
+}
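+
+/*
+ * Callers can sample this once at initialization and, on CPUs where it
+ * returns non-zero, insert whatever explicit ordering (e.g. a write
+ * barrier) the PIO copy routine needs between writes; on AMD parts that
+ * extra flushing can be skipped entirely.
+ */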
diff --git a/drivers/infiniband/hw/ipath/ips_common.h b/drivers/infiniband/hw/ipath/ips_common.h
new file mode 100644 (file)
index 0000000..410a764
--- /dev/null
@@ -0,0 +1,263 @@
+#ifndef IPS_COMMON_H
+#define IPS_COMMON_H
+/*
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ipath_common.h"
+
+struct ipath_header {
+       /*
+        * Version - 4 bits, Port - 4 bits, TID - 10 bits and Offset -
+        * 14 bits before ECO change ~28 Dec 03.  After that, Vers 4,
+        * Port 3, TID 11, offset 14.
+        */
+       __le32 ver_port_tid_offset;
+       __le16 chksum;
+       __le16 pkt_flags;
+};
+
+struct ips_message_header {
+       __be16 lrh[4];
+       __be32 bth[3];
+       /* fields below this point are in host byte order */
+       struct ipath_header iph;
+       __u8 sub_opcode;
+       __u8 flags;
+       __u16 src_rank;
+       /* 24 bits; the upper 8 bits are available for other use */
+       union {
+               struct {
+                       unsigned ack_seq_num:24;
+                       unsigned port:4;
+                       unsigned unused:4;
+               };
+               __u32 ack_seq_num_org;
+       };
+       __u8 expected_tid_session_id;
+       __u8 tinylen;           /* to aid MPI */
+       union {
+           __u16 tag;          /* to aid MPI */
+           __u16 mqhdr;        /* for PSM MQ */
+       };
+       union {
+               __u32 mpi[4];   /* to aid MPI */
+               __u32 data[4];
+               __u64 mq[2];    /* for PSM MQ */
+               struct {
+                       __u16 mtu;
+                       __u8 major_ver;
+                       __u8 minor_ver;
+                       __u32 not_used;         /* free */
+                       __u32 run_id;
+                       __u32 client_ver;
+               };
+       };
+};
+
+struct ether_header {
+       __be16 lrh[4];
+       __be32 bth[3];
+       struct ipath_header iph;
+       __u8 sub_opcode;
+       __u8 cmd;
+       __be16 lid;
+       __u16 mac[3];
+       __u8 frag_num;
+       __u8 seq_num;
+       __le32 len;
+       /* MUST be of word size due to PIO write requirements */
+       __u32 csum;
+       __le16 csum_offset;
+       __le16 flags;
+       __u16 first_2_bytes;
+       __u8 unused[2];         /* currently unused */
+};
+
+/*
+ * The PIO buffer used for sending infinipath messages must only be written
+ * in 32-bit words, all the data must be written, and no writes can occur
+ * after the last word is written (which transfers "ownership" of the buffer
+ * to the chip and triggers the message to be sent).
+ * Since the Linux sk_buff structure can be recursive, unaligned, and can
+ * contain any number of bytes in each segment, we use the following
+ * structure to keep information about the overall state of the copy
+ * operation.
+ * This is used to save the information needed to store the checksum
+ * in the right place before sending the last word to the hardware and
+ * to buffer the last 0-3 bytes of non-word sized segments.
+ */
+struct copy_data_s {
+       struct ether_header *hdr;
+       /* addr of PIO buf to write csum to */
+       __u32 __iomem *csum_pio;
+       __u32 __iomem *to;      /* addr of PIO buf to write data to */
+       __u32 device;           /* which device to allocate PIO bufs from */
+       __s32 error;            /* set if there is an error. */
+       __s32 extra;            /* amount of data saved in u.buf below */
+       __u32 len;              /* total length to send in bytes */
+       __u32 flen;             /* fragment length in words */
+       __u32 csum;             /* partial IP checksum */
+       __u32 pos;              /* position for partial checksum */
+       __u32 offset;           /* offset to where data currently starts */
+       __s32 checksum_calc;    /* set to 1 when csum has been calculated */
+       struct sk_buff *skb;
+       union {
+               __u32 w;
+               __u8 buf[4];
+       } u;
+};
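+
+/*
+ * For example, if a segment ends with 3 leftover bytes, they are parked
+ * in u.buf[] and extra is set to 3; the copy routine completes that word
+ * with the first byte(s) of the next segment before issuing the 32-bit
+ * PIO write, so every store to the buffer stays word sized.
+ */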
+
+/* IB - LRH header consts */
+#define IPS_LRH_GRH 0x0003     /* 1. word of IB LRH - next header: GRH */
+#define IPS_LRH_BTH 0x0002     /* 1. word of IB LRH - next header: BTH */
+
+#define IPS_OFFSET  0
+
+/*
+ * defines the cut-off point between the header queue and eager/expected
+ * TID queue
+ */
+#define NUM_OF_EXTRA_WORDS_IN_HEADER_QUEUE \
+       ((sizeof(struct ips_message_header) - \
+         offsetof(struct ips_message_header, iph)) >> 2)
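+/*
+ * (The subtraction gives the size in bytes of everything from iph to the
+ * end of struct ips_message_header, and the >> 2 converts bytes to
+ * 32-bit words.)
+ */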
+
+/* OpCodes  */
+#define OPCODE_IPS 0xC0
+#define OPCODE_ITH4X 0xC1
+
+/* OpCode 30 is used by stand-alone test programs */
+#define OPCODE_RAW_DATA 0xDE
+/* last OpCode (31) is reserved for test  */
+#define OPCODE_TEST 0xDF
+
+/* sub OpCodes - ips  */
+#define OPCODE_SEQ_DATA 0x01
+#define OPCODE_SEQ_CTRL 0x02
+
+#define OPCODE_SEQ_MQ_DATA 0x03
+#define OPCODE_SEQ_MQ_CTRL 0x04
+
+#define OPCODE_ACK 0x10
+#define OPCODE_NAK 0x11
+
+#define OPCODE_ERR_CHK 0x20
+#define OPCODE_ERR_CHK_PLS 0x21
+
+#define OPCODE_STARTUP 0x30
+#define OPCODE_STARTUP_ACK 0x31
+#define OPCODE_STARTUP_NAK 0x32
+
+#define OPCODE_STARTUP_EXT 0x34
+#define OPCODE_STARTUP_ACK_EXT 0x35
+#define OPCODE_STARTUP_NAK_EXT 0x36
+
+#define OPCODE_TIDS_RELEASE 0x40
+#define OPCODE_TIDS_RELEASE_CONFIRM 0x41
+
+#define OPCODE_CLOSE 0x50
+#define OPCODE_CLOSE_ACK 0x51
+/*
+ * like OPCODE_CLOSE, but no complaint if other side has already closed.
+ * Used when doing abort(), MPI_Abort(), etc.
+ */
+#define OPCODE_ABORT 0x52
+
+/* sub OpCodes - ith4x  */
+#define OPCODE_ENCAP 0x81
+#define OPCODE_LID_ARP 0x82
+
+/* Receive Header Queue: receive type (from infinipath) */
+#define RCVHQ_RCV_TYPE_EXPECTED  0
+#define RCVHQ_RCV_TYPE_EAGER     1
+#define RCVHQ_RCV_TYPE_NON_KD    2
+#define RCVHQ_RCV_TYPE_ERROR     3
+
+/* misc. */
+#define SIZE_OF_CRC 1
+
+#define EAGER_TID_ID INFINIPATH_I_TID_MASK
+
+#define IPS_DEFAULT_P_KEY 0xFFFF
+
+#define IPS_PERMISSIVE_LID 0xFFFF
+#define IPS_MULTICAST_LID_BASE 0xC000
+
+#define IPS_AETH_CREDIT_SHIFT 24
+#define IPS_AETH_CREDIT_MASK 0x1F
+#define IPS_AETH_CREDIT_INVAL 0x1F
+
+#define IPS_PSN_MASK 0xFFFFFF
+#define IPS_MSN_MASK 0xFFFFFF
+#define IPS_QPN_MASK 0xFFFFFF
+#define IPS_MULTICAST_QPN 0xFFFFFF
+
+/* functions for extracting fields from rcvhdrq entries */
+static inline __u32 ips_get_hdr_err_flags(const __le32 * rbuf)
+{
+       return __le32_to_cpu(rbuf[1]);
+}
+
+static inline __u32 ips_get_index(const __le32 * rbuf)
+{
+       return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_EGRINDEX_SHIFT)
+           & INFINIPATH_RHF_EGRINDEX_MASK;
+}
+
+static inline __u32 ips_get_rcv_type(const __le32 * rbuf)
+{
+       return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_RCVTYPE_SHIFT)
+           & INFINIPATH_RHF_RCVTYPE_MASK;
+}
+
+static inline __u32 ips_get_length_in_bytes(const __le32 * rbuf)
+{
+       return ((__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_LENGTH_SHIFT)
+               & INFINIPATH_RHF_LENGTH_MASK) << 2;
+}
+
+static inline void *ips_get_first_protocol_header(const __u32 * rbuf)
+{
+       return (void *)&rbuf[2];
+}
+
+static inline struct ips_message_header *ips_get_ips_header(const __u32 *
+                                                           rbuf)
+{
+       return (struct ips_message_header *)&rbuf[2];
+}
+
+static inline __u32 ips_get_ipath_ver(__le32 hdrword)
+{
+       return (__le32_to_cpu(hdrword) >> INFINIPATH_I_VERS_SHIFT)
+           & INFINIPATH_I_VERS_MASK;
+}
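+
+/*
+ * A receive-path user of these helpers would typically do something like:
+ *
+ *      err_flags = ips_get_hdr_err_flags(rhf);
+ *      if (ips_get_rcv_type(rhf) == RCVHQ_RCV_TYPE_EAGER)
+ *              buf = eager_bufs[ips_get_index(rhf)];
+ *      len = ips_get_length_in_bytes(rhf);
+ *
+ * where rhf points at the two receive-header-flag words of an rcvhdrq
+ * entry, and eager_bufs stands in for however the caller tracks its
+ * eager receive buffers.
+ */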
+
+#endif                         /* IPS_COMMON_H */
diff --git a/drivers/infiniband/hw/ipath/verbs_debug.h b/drivers/infiniband/hw/ipath/verbs_debug.h
new file mode 100644 (file)
index 0000000..40d693c
--- /dev/null
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _VERBS_DEBUG_H
+#define _VERBS_DEBUG_H
+
+/*
+ * This file contains tracing code for the ib_ipath kernel module.
+ */
+#ifndef _VERBS_DEBUGGING       /* tracing enabled or not */
+#define _VERBS_DEBUGGING 1
+#endif
+
+extern unsigned ib_ipath_debug;
+
+#define _VERBS_ERROR(fmt,...) \
+       do { \
+               printk(KERN_ERR "%s: " fmt, "ib_ipath", ##__VA_ARGS__); \
+       } while(0)
+
+#define _VERBS_UNIT_ERROR(unit,fmt,...) \
+       do { \
+               printk(KERN_ERR "%s: " fmt, "ib_ipath", ##__VA_ARGS__); \
+       } while(0)
+
+#if _VERBS_DEBUGGING
+
+/*
+ * Mask values for debugging.  The scheme allows us to compile out any of
+ * the debug tracing stuff, and if compiled in, to enable or disable it
+ * dynamically.
+ * The mask can also be set at modprobe time:
+ *      modprobe ib_ipath ib_ipath_debug=3
+ */
+
+#define __VERBS_INFO        0x1        /* generic low verbosity stuff */
+#define __VERBS_DBG         0x2        /* generic debug */
+#define __VERBS_VDBG        0x4        /* verbose debug */
+#define __VERBS_SMADBG      0x8000     /* sma packet debug */
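+
+/*
+ * The mask bits can be OR'd together, so ib_ipath_debug=3 in the example
+ * above enables both __VERBS_INFO and __VERBS_DBG output.
+ */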
+
+#define _VERBS_INFO(fmt,...) \
+       do { \
+               if (unlikely(ib_ipath_debug&__VERBS_INFO)) \
+                       printk(KERN_INFO "%s: " fmt,"ib_ipath", \
+                              ##__VA_ARGS__); \
+       } while(0)
+
+#define _VERBS_DBG(fmt,...) \
+       do { \
+               if (unlikely(ib_ipath_debug&__VERBS_DBG)) \
+                       printk(KERN_DEBUG "%s: " fmt, __func__, \
+                              ##__VA_ARGS__); \
+       } while(0)
+
+#define _VERBS_VDBG(fmt,...) \
+       do { \
+               if (unlikely(ib_ipath_debug&__VERBS_VDBG)) \
+                       printk(KERN_DEBUG "%s: " fmt, __func__, \
+                              ##__VA_ARGS__); \
+       } while(0)
+
+#define _VERBS_SMADBG(fmt,...) \
+       do { \
+               if (unlikely(ib_ipath_debug&__VERBS_SMADBG)) \
+                       printk(KERN_DEBUG "%s: " fmt, __func__, \
+                              ##__VA_ARGS__); \
+       } while(0)
+
+#else /* ! _VERBS_DEBUGGING */
+
+#define _VERBS_INFO(fmt,...)
+#define _VERBS_DBG(fmt,...)
+#define _VERBS_VDBG(fmt,...)
+#define _VERBS_SMADBG(fmt,...)
+
+#endif /* _VERBS_DEBUGGING */
+
+#endif /* _VERBS_DEBUG_H */