int t4_sge_init(struct adapter *adap)
 {
        struct sge *s = &adap->sge;
-       u32 sge_control, sge_conm_ctrl;
+       u32 sge_control, sge_control2, sge_conm_ctrl;
+       unsigned int ingpadboundary, ingpackboundary;
        int ret, egress_threshold;
 
        /*
        sge_control = t4_read_reg(adap, SGE_CONTROL);
        s->pktshift = PKTSHIFT_GET(sge_control);
        s->stat_len = (sge_control & EGRSTATUSPAGESIZE_MASK) ? 128 : 64;
-       s->fl_align = 1 << (INGPADBOUNDARY_GET(sge_control) +
-                           X_INGPADBOUNDARY_SHIFT);
+
+       /* T4 uses a single control field to specify both the PCIe Padding and
+        * Packing Boundary.  T5 introduced the ability to specify these
+        * separately.  The actual Ingress Packet Data alignment boundary
+        * within Packed Buffer Mode is the maximum of these two
+        * specifications.
+        */
+       ingpadboundary = 1 << (INGPADBOUNDARY_GET(sge_control) +
+                              X_INGPADBOUNDARY_SHIFT);
+       if (is_t4(adap->params.chip)) {
+               s->fl_align = ingpadboundary;
+       } else {
+               /* T5 has a different interpretation of one of the PCIe Packing
+                * Boundary values.
+                */
+               sge_control2 = t4_read_reg(adap, SGE_CONTROL2_A);
+               ingpackboundary = INGPACKBOUNDARY_G(sge_control2);
+               if (ingpackboundary == INGPACKBOUNDARY_16B_X)
+                       ingpackboundary = 16;
+               else
+                       ingpackboundary = 1 << (ingpackboundary +
+                                               INGPACKBOUNDARY_SHIFT_X);
+
+               s->fl_align = max(ingpadboundary, ingpackboundary);
+       }
 
        if (adap->flags & USING_SOFT_PARAMS)
                ret = t4_sge_init_soft(adap);
 
                     HOSTPAGESIZEPF6(sge_hps) |
                     HOSTPAGESIZEPF7(sge_hps));
 
-       t4_set_reg_field(adap, SGE_CONTROL,
-                        INGPADBOUNDARY_MASK |
-                        EGRSTATUSPAGESIZE_MASK,
-                        INGPADBOUNDARY(fl_align_log - 5) |
-                        EGRSTATUSPAGESIZE(stat_len != 64));
-
+       if (is_t4(adap->params.chip)) {
+               t4_set_reg_field(adap, SGE_CONTROL,
+                                INGPADBOUNDARY_MASK |
+                                EGRSTATUSPAGESIZE_MASK,
+                                INGPADBOUNDARY(fl_align_log - 5) |
+                                EGRSTATUSPAGESIZE(stat_len != 64));
+       } else {
+               /* T5 introduced the separation of the Free List Padding and
+                * Packing Boundaries.  Thus, we can select a smaller Padding
+                * Boundary to avoid uselessly chewing up PCIe Link and Memory
+                * Bandwidth, and use a Packing Boundary which is large enough
+                * to avoid false sharing between CPUs, etc.
+                *
+                * For the PCI Link, the smaller the Padding Boundary the
+                * better.  For the Memory Controller, a smaller Padding
+                * Boundary is better until we cross under the Memory Line
+                * Size (the minimum unit of transfer to/from Memory).  If we
+                * have a Padding Boundary which is smaller than the Memory
+                * Line Size, that'll involve a Read-Modify-Write cycle on the
+                * Memory Controller which is never good.  For T5 the smallest
+                * Padding Boundary which we can select is 32 bytes which is
+                * larger than any known Memory Controller Line Size so we'll
+                * use that.
+                *
+                * T5 has a different interpretation of the "0" value for the
+                * Packing Boundary.  This corresponds to 16 bytes instead of
+                * the expected 32 bytes.  We never have a Packing Boundary
+                * less than 32 bytes so we can't use that special value but
+                * on the other hand, if we wanted 32 bytes, the best we can
+                * really do is 64 bytes.
+               */
+               if (fl_align <= 32) {
+                       fl_align = 64;
+                       fl_align_log = 6;
+               }
+               t4_set_reg_field(adap, SGE_CONTROL,
+                                INGPADBOUNDARY_MASK |
+                                EGRSTATUSPAGESIZE_MASK,
+                                INGPADBOUNDARY(INGPCIEBOUNDARY_32B_X) |
+                                EGRSTATUSPAGESIZE(stat_len != 64));
+               t4_set_reg_field(adap, SGE_CONTROL2_A,
+                                INGPACKBOUNDARY_V(INGPACKBOUNDARY_M),
+                                INGPACKBOUNDARY_V(fl_align_log -
+                                                INGPACKBOUNDARY_SHIFT_X));
+       }
        /*
         * Adjust various SGE Free List Host Buffer Sizes.
         *
 
 #define X_INGPADBOUNDARY_SHIFT 5
 
 #define SGE_CONTROL 0x1008
+#define SGE_CONTROL2_A         0x1124
 #define  DCASYSTYPE             0x00080000U
 #define  RXPKTCPLMODE_MASK      0x00040000U
 #define  RXPKTCPLMODE_SHIFT     18
 #define  PKTSHIFT_SHIFT         10
 #define  PKTSHIFT(x)            ((x) << PKTSHIFT_SHIFT)
 #define  PKTSHIFT_GET(x)       (((x) & PKTSHIFT_MASK) >> PKTSHIFT_SHIFT)
+#define  INGPCIEBOUNDARY_32B_X 0
 #define  INGPCIEBOUNDARY_MASK   0x00000380U
 #define  INGPCIEBOUNDARY_SHIFT  7
 #define  INGPCIEBOUNDARY(x)     ((x) << INGPCIEBOUNDARY_SHIFT)
 #define  INGPADBOUNDARY(x)      ((x) << INGPADBOUNDARY_SHIFT)
 #define  INGPADBOUNDARY_GET(x) (((x) & INGPADBOUNDARY_MASK) \
                                 >> INGPADBOUNDARY_SHIFT)
+#define  INGPACKBOUNDARY_16B_X 0
+#define  INGPACKBOUNDARY_SHIFT_X 5
+
+#define  INGPACKBOUNDARY_S     16
+#define  INGPACKBOUNDARY_M     0x7U
+#define  INGPACKBOUNDARY_V(x)  ((x) << INGPACKBOUNDARY_S)
+#define  INGPACKBOUNDARY_G(x)  (((x) >> INGPACKBOUNDARY_S) \
+                                & INGPACKBOUNDARY_M)
 #define  EGRPCIEBOUNDARY_MASK   0x0000000eU
 #define  EGRPCIEBOUNDARY_SHIFT  1
 #define  EGRPCIEBOUNDARY(x)     ((x) << EGRPCIEBOUNDARY_SHIFT)
 
        u32 fl0 = sge_params->sge_fl_buffer_size[0];
        u32 fl1 = sge_params->sge_fl_buffer_size[1];
        struct sge *s = &adapter->sge;
+       unsigned int ingpadboundary, ingpackboundary;
 
        /*
         * Start by vetting the basic SGE parameters which have been set up by
        s->stat_len = ((sge_params->sge_control & EGRSTATUSPAGESIZE_MASK)
                        ? 128 : 64);
        s->pktshift = PKTSHIFT_GET(sge_params->sge_control);
-       s->fl_align = 1 << (INGPADBOUNDARY_GET(sge_params->sge_control) +
-                           SGE_INGPADBOUNDARY_SHIFT);
+
+       /* T4 uses a single control field to specify both the PCIe Padding and
+        * Packing Boundary.  T5 introduced the ability to specify these
+        * separately.  The actual Ingress Packet Data alignment boundary
+        * within Packed Buffer Mode is the maximum of these two
+        * specifications.  (Note that it makes no real practical sense to
+        * have the Pading Boudary be larger than the Packing Boundary but you
+        * could set the chip up that way and, in fact, legacy T4 code would
+        * end doing this because it would initialize the Padding Boundary and
+        * leave the Packing Boundary initialized to 0 (16 bytes).)
+        */
+       ingpadboundary = 1 << (INGPADBOUNDARY_GET(sge_params->sge_control) +
+                              X_INGPADBOUNDARY_SHIFT);
+       if (is_t4(adapter->params.chip)) {
+               s->fl_align = ingpadboundary;
+       } else {
+               /* T5 has a different interpretation of one of the PCIe Packing
+                * Boundary values.
+                */
+               ingpackboundary = INGPACKBOUNDARY_G(sge_params->sge_control2);
+               if (ingpackboundary == INGPACKBOUNDARY_16B_X)
+                       ingpackboundary = 16;
+               else
+                       ingpackboundary = 1 << (ingpackboundary +
+                                               INGPACKBOUNDARY_SHIFT_X);
+
+               s->fl_align = max(ingpadboundary, ingpackboundary);
+       }
 
        /*
         * Set up tasklet timers.
 
  */
 struct sge_params {
        u32 sge_control;                /* padding, boundaries, lengths, etc. */
+       u32 sge_control2;               /* T5: more of the same */
        u32 sge_host_page_size;         /* RDMA page sizes */
        u32 sge_queues_per_page;        /* RDMA queues/page */
        u32 sge_user_mode_limits;       /* limits for BAR2 user mode accesses */
 
        sge_params->sge_timer_value_2_and_3 = vals[5];
        sge_params->sge_timer_value_4_and_5 = vals[6];
 
+       /* T4 uses a single control field to specify both the PCIe Padding and
+        * Packing Boundary.  T5 introduced the ability to specify these
+        * separately with the Padding Boundary in SGE_CONTROL and and Packing
+        * Boundary in SGE_CONTROL2.  So for T5 and later we need to grab
+        * SGE_CONTROL in order to determine how ingress packet data will be
+        * laid out in Packed Buffer Mode.  Unfortunately, older versions of
+        * the firmware won't let us retrieve SGE_CONTROL2 so if we get a
+        * failure grabbing it we throw an error since we can't figure out the
+        * right value.
+        */
+       if (!is_t4(adapter->params.chip)) {
+               params[0] = (FW_PARAMS_MNEM(FW_PARAMS_MNEM_REG) |
+                            FW_PARAMS_PARAM_XYZ(SGE_CONTROL2_A));
+               v = t4vf_query_params(adapter, 1, params, vals);
+               if (v != FW_SUCCESS) {
+                       dev_err(adapter->pdev_dev,
+                               "Unable to get SGE Control2; "
+                               "probably old firmware.\n");
+                       return v;
+               }
+               sge_params->sge_control2 = vals[0];
+       }
+
        params[0] = (FW_PARAMS_MNEM(FW_PARAMS_MNEM_REG) |
                     FW_PARAMS_PARAM_XYZ(SGE_INGRESS_RX_THRESHOLD));
        v = t4vf_query_params(adapter, 1, params, vals);