linux/drivers
Eric Dumazet e3f42f8453 mlx4: reorganize struct mlx4_en_tx_ring
Goal is to reorganize this critical structure to increase performance.

ndo_start_xmit() should only dirty one cache line, and access as few
cache lines as possible.

Add sp_ (Slow Path) prefix to fields that are not used in fast path,
to make clear what is going on.

After this patch pahole reports something much better, as all
ndo_start_xmit() needed fields are packed into two cache lines instead
of seven or eight

struct mlx4_en_tx_ring {
	u32                        last_nr_txbb;         /*     0   0x4 */
	u32                        cons;                 /*   0x4   0x4 */
	long unsigned int          wake_queue;           /*   0x8   0x8 */
	struct netdev_queue *      tx_queue;             /*  0x10   0x8 */
	u32                        (*free_tx_desc)(struct mlx4_en_priv *, struct mlx4_en_tx_ring *, int, u8, u64, int); /*  0x18   0x8 */
	struct mlx4_en_rx_ring *   recycle_ring;         /*  0x20   0x8 */

	/* XXX 24 bytes hole, try to pack */

	/* --- cacheline 1 boundary (64 bytes) --- */
	u32                        prod;                 /*  0x40   0x4 */
	unsigned int               tx_dropped;           /*  0x44   0x4 */
	long unsigned int          bytes;                /*  0x48   0x8 */
	long unsigned int          packets;              /*  0x50   0x8 */
	long unsigned int          tx_csum;              /*  0x58   0x8 */
	long unsigned int          tso_packets;          /*  0x60   0x8 */
	long unsigned int          xmit_more;            /*  0x68   0x8 */
	struct mlx4_bf             bf;                   /*  0x70  0x18 */
	/* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */
	__be32                     doorbell_qpn;         /*  0x88   0x4 */
	__be32                     mr_key;               /*  0x8c   0x4 */
	u32                        size;                 /*  0x90   0x4 */
	u32                        size_mask;            /*  0x94   0x4 */
	u32                        full_size;            /*  0x98   0x4 */
	u32                        buf_size;             /*  0x9c   0x4 */
	void *                     buf;                  /*  0xa0   0x8 */
	struct mlx4_en_tx_info *   tx_info;              /*  0xa8   0x8 */
	int                        qpn;                  /*  0xb0   0x4 */
	u8                         queue_index;          /*  0xb4   0x1 */
	bool                       bf_enabled;           /*  0xb5   0x1 */
	bool                       bf_alloced;           /*  0xb6   0x1 */
	u8                         hwtstamp_tx_type;     /*  0xb7   0x1 */
	u8 *                       bounce_buf;           /*  0xb8   0x8 */
	/* --- cacheline 3 boundary (192 bytes) --- */
	long unsigned int          queue_stopped;        /*  0xc0   0x8 */
	struct mlx4_hwq_resources  sp_wqres;             /*  0xc8  0x58 */
	/* --- cacheline 4 boundary (256 bytes) was 32 bytes ago --- */
	struct mlx4_qp             sp_qp;                /* 0x120  0x30 */
	/* --- cacheline 5 boundary (320 bytes) was 16 bytes ago --- */
	struct mlx4_qp_context     sp_context;           /* 0x150  0xf8 */
	/* --- cacheline 9 boundary (576 bytes) was 8 bytes ago --- */
	cpumask_t                  sp_affinity_mask;     /* 0x248  0x20 */
	enum mlx4_qp_state         sp_qp_state;          /* 0x268   0x4 */
	u16                        sp_stride;            /* 0x26c   0x2 */
	u16                        sp_cqn;               /* 0x26e   0x2 */

	/* size: 640, cachelines: 10, members: 36 */
	/* sum members: 600, holes: 1, sum holes: 24 */
	/* padding: 16 */
};

Instead of this silly placement :

struct mlx4_en_tx_ring {
	u32                        last_nr_txbb;         /*     0   0x4 */
	u32                        cons;                 /*   0x4   0x4 */
	long unsigned int          wake_queue;           /*   0x8   0x8 */

	/* XXX 48 bytes hole, try to pack */

	/* --- cacheline 1 boundary (64 bytes) --- */
	u32                        prod;                 /*  0x40   0x4 */

	/* XXX 4 bytes hole, try to pack */

	long unsigned int          bytes;                /*  0x48   0x8 */
	long unsigned int          packets;              /*  0x50   0x8 */
	long unsigned int          tx_csum;              /*  0x58   0x8 */
	long unsigned int          tso_packets;          /*  0x60   0x8 */
	long unsigned int          xmit_more;            /*  0x68   0x8 */
	unsigned int               tx_dropped;           /*  0x70   0x4 */

	/* XXX 4 bytes hole, try to pack */

	struct mlx4_bf             bf;                   /*  0x78  0x18 */
	/* --- cacheline 2 boundary (128 bytes) was 16 bytes ago --- */
	long unsigned int          queue_stopped;        /*  0x90   0x8 */
	cpumask_t                  affinity_mask;        /*  0x98  0x10 */
	struct mlx4_qp             qp;                   /*  0xa8  0x30 */
	/* --- cacheline 3 boundary (192 bytes) was 24 bytes ago --- */
	struct mlx4_hwq_resources  wqres;                /*  0xd8  0x58 */
	/* --- cacheline 4 boundary (256 bytes) was 48 bytes ago --- */
	u32                        size;                 /* 0x130   0x4 */
	u32                        size_mask;            /* 0x134   0x4 */
	u16                        stride;               /* 0x138   0x2 */

	/* XXX 2 bytes hole, try to pack */

	u32                        full_size;            /* 0x13c   0x4 */
	/* --- cacheline 5 boundary (320 bytes) --- */
	u16                        cqn;                  /* 0x140   0x2 */

	/* XXX 2 bytes hole, try to pack */

	u32                        buf_size;             /* 0x144   0x4 */
	__be32                     doorbell_qpn;         /* 0x148   0x4 */
	__be32                     mr_key;               /* 0x14c   0x4 */
	void *                     buf;                  /* 0x150   0x8 */
	struct mlx4_en_tx_info *   tx_info;              /* 0x158   0x8 */
	struct mlx4_en_rx_ring *   recycle_ring;         /* 0x160   0x8 */
	u32                        (*free_tx_desc)(struct mlx4_en_priv *, struct mlx4_en_tx_ring *, int, u8, u64, int); /* 0x168   0x8 */
	u8 *                       bounce_buf;           /* 0x170   0x8 */
	struct mlx4_qp_context     context;              /* 0x178  0xf8 */
	/* --- cacheline 9 boundary (576 bytes) was 48 bytes ago --- */
	int                        qpn;                  /* 0x270   0x4 */
	enum mlx4_qp_state         qp_state;             /* 0x274   0x4 */
	u8                         queue_index;          /* 0x278   0x1 */
	bool                       bf_enabled;           /* 0x279   0x1 */
	bool                       bf_alloced;           /* 0x27a   0x1 */

	/* XXX 5 bytes hole, try to pack */

	/* --- cacheline 10 boundary (640 bytes) --- */
	struct netdev_queue *      tx_queue;             /* 0x280   0x8 */
	int                        hwtstamp_tx_type;     /* 0x288   0x4 */

	/* size: 704, cachelines: 11, members: 36 */
	/* sum members: 587, holes: 6, sum holes: 65 */
	/* padding: 52 */
};

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-11-24 16:03:37 -05:00
..
accessibility
acpi Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net 2016-11-22 13:27:16 -05:00
amba
android ANDROID: binder: Clear binder and cookie when setting handle in flat binder struct 2016-10-24 19:37:48 +02:00
ata ahci: fix the single MSI-X case in ahci_init_one 2016-10-25 11:43:07 -04:00
atm solos-pci: use permission-specific DEVICE_ATTR variants 2016-10-31 15:32:12 -04:00
auxdisplay auxdisplay: img-ascii-lcd: driver for simple ASCII LCD displays 2016-10-06 17:03:41 +02:00
base devres: add devm_alloc_percpu() 2016-11-15 22:34:25 -05:00
bcma
block aoe: fix crash in page count manipulation 2016-11-12 08:27:07 -07:00
bluetooth Bluetooth: btwilink: Fix probe return value 2016-10-20 10:14:49 +02:00
bus bus: qcom-ebi2: depend on ARCH_QCOM or COMPILE_TEST 2016-10-17 13:46:09 -07:00
cdrom
char Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net 2016-11-22 13:27:16 -05:00
clk clk: mmp: pxa910: fix return value check in pxa910_clk_init() 2016-11-01 17:41:20 -07:00
clocksource Revert "clocksource/drivers/timer_sun5i: Replace code by clocksource_mmio_init" 2016-10-20 21:58:58 +02:00
connector
cpufreq Merge branches 'pm-cpufreq-fixes' and 'pm-sleep-fixes' 2016-10-29 01:29:17 +02:00
cpuidle Merge branch 'upstream' of git://git.linux-mips.org/pub/scm/ralf/upstream-linus 2016-10-15 09:26:12 -07:00
crypto Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net 2016-11-22 13:27:16 -05:00
dax device-dax: fix percpu_ref_exit ordering 2016-10-27 17:04:05 -07:00
dca
devfreq PM / devfreq: Skip status update on uninitialized previous_freq 2016-10-11 00:01:20 +02:00
dio
dma dmaengine: cppi41: More PM runtime fixes 2016-11-17 16:09:23 +05:30
dma-buf Merge tag 'drm-for-v4.9' of git://people.freedesktop.org/~airlied/linux 2016-10-11 18:12:22 -07:00
edac * Altera Arria10 enablement of NAND, DMA, USB, QSPI and SD-MMC FIFO 2016-10-04 12:06:26 -07:00
eisa
extcon extcon: qcom-spmi-misc: Sync the extcon state on interrupt 2016-10-26 16:04:29 +09:00
firewire Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net 2016-11-15 10:54:36 -05:00
firmware efi/arm: Fix absolute relocation detection for older toolchains 2016-10-19 14:49:44 +02:00
fmc
fpga
gpio gpio: Remove GPIO_DEVRES option 2016-11-16 20:46:32 +01:00
gpu Merge tag 'drm-intel-fixes-2016-11-17' of ssh://git.freedesktop.org/git/drm-intel into drm-fixes 2016-11-18 10:33:28 +10:00
hid HID: sensor: fix attributes in HID sensor interface 2016-11-05 16:56:09 +01:00
hsi net: use core MTU range checking in misc drivers 2016-10-20 14:51:10 -04:00
hv vmbus: make sysfs names consistent with PCI 2016-11-01 09:07:13 -06:00
hwmon hwmon: (core) fix resource leak on devm_kcalloc failure 2016-10-24 06:05:13 -07:00
hwspinlock
hwtracing
i2c i2c: i2c-mux-pca954x: fix deselect enabling for device-tree 2016-11-19 21:49:29 +01:00
ide
idle nmi_backtrace: generate one-line reports for idle cpus 2016-10-07 18:46:30 -07:00
iio iio: maxim_thermocouple: detect invalid storage size in read() 2016-11-13 10:08:32 +01:00
infiniband Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net 2016-11-22 13:27:16 -05:00
input Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dtor/input 2016-11-05 11:26:11 -07:00
iommu iommu/vt-d: Fix dead-locks in disable_dmar_iommu() path 2016-11-08 15:08:26 +01:00
ipack ipack: print a hex number after a 0x prefix 2016-10-27 18:43:43 -07:00
irqchip GIC updates for Linux 4.9-rc2 2016-10-21 21:40:29 +02:00
isdn net: deprecate eth_change_mtu, remove usage 2016-10-13 09:36:57 -04:00
leds leds: triggers: Check return value of kobject_uevent_env() 2016-09-20 10:22:10 +02:00
lguest
lightnvm Merge branch 'for-4.9/block' of git://git.kernel.dk/linux-block 2016-10-07 14:42:05 -07:00
macintosh powerpc: Remove all usages of NO_IRQ 2016-09-20 20:57:12 +10:00
mailbox mailbox: PCC: Fix lockdep warning when request PCC channel 2016-11-14 22:07:38 +01:00
mcb mcb: Add a dma_device to mcb_device 2016-09-27 12:33:47 +02:00
md Merge tag 'md/4.9-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md 2016-11-05 11:34:07 -07:00
media Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net 2016-11-15 10:54:36 -05:00
memory ARM: SoC driver updates for v4.9 2016-10-07 21:23:40 -07:00
memstick memstick: rtsx_usb_ms: Manage runtime PM when accessing the device 2016-10-17 15:43:05 +02:00
message net: use core MTU range checking in misc drivers 2016-10-20 14:51:10 -04:00
mfd mfd: core: Fix device reference leak in mfd_clone_cell 2016-11-16 09:50:41 +00:00
misc Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net 2016-11-15 10:54:36 -05:00
mmc mmc: mxs: Initialize the spinlock prior to using it 2016-11-07 13:30:08 +01:00
mtd MTD updates for 4.9-rc4: 2016-11-05 10:52:29 -07:00
net mlx4: reorganize struct mlx4_en_tx_ring 2016-11-24 16:03:37 -05:00
nfc mei: bus: fix received data size check in NFC fixup 2016-10-31 10:25:22 -06:00
ntb ntb_perf: potential info leak in debugfs 2016-11-13 16:48:30 -05:00
nubus
nvdimm nvdimm: make CONFIG_NVDIMM_DAX 'bool' 2016-10-27 16:16:21 -07:00
nvme nvme/pci: Don't free queues on error 2016-11-16 12:39:57 -07:00
nvmem ARM: SoC driver updates for v4.9 2016-10-07 21:23:40 -07:00
of of_mdio: fix device reference leak in of_phy_find_device 2016-11-17 12:05:04 -05:00
oprofile Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs 2016-10-10 20:16:43 -07:00
parisc
parport
pci Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip 2016-11-14 08:39:56 -08:00
pcmcia pcmcia: fix return value of soc_pcmcia_regulator_set 2016-11-11 08:45:08 -08:00
perf perf: xgene: Remove bogus IS_ERR() check 2016-10-17 15:50:07 +01:00
phy phy: sun4i: check PMU presence when poking unknown bit of pmu 2016-11-05 13:45:02 +05:30
pinctrl pinctrl-aspeed-g5: Never set SCU90[6] 2016-11-07 10:31:33 +01:00
platform ACPI fix for v4.9-rc5 2016-11-11 17:02:01 -08:00
pnp
power power supply and reset changes for the v4.9 series 2016-10-06 18:21:15 -07:00
powercap
pps pps: kc: fix non-tickless system config dependency 2016-10-11 15:06:32 -07:00
ps3 powerpc: Remove all usages of NO_IRQ 2016-09-20 20:57:12 +10:00
ptp ptp: Introduce a high resolution frequency adjustment method. 2016-11-09 21:19:53 -05:00
pwm
rapidio mm: replace get_user_pages() write/force parameters with gup_flags 2016-10-19 08:11:43 -07:00
ras
regulator regulator: core: silence warning: "VDD1: ramp_delay not set" 2016-10-28 18:22:40 +01:00
remoteproc rpmsg updates for v4.9 2016-10-06 17:03:49 -07:00
reset reset: uniphier: rename MIO reset to SD reset for Pro5, PXs2, LD20 SoCs 2016-10-22 18:31:42 +09:00
rpmsg
rtc rtc: omap: prevent disabling of clock/module during suspend 2016-11-04 23:11:39 +01:00
s390 Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net 2016-10-30 12:42:58 -04:00
sbus
scsi cxgb4: Allocate Tx queues dynamically 2016-11-18 14:04:29 -05:00
sfi
sh
sn
soc powerpc updates for 4.9 #2 2016-10-14 11:07:42 -07:00
spi Merge remote-tracking branches 'spi/fix/dt', 'spi/fix/fsl-dspi' and 'spi/fix/fsl-espi' into spi-linus 2016-10-29 12:51:55 -06:00
spmi spmi: pmic-arb: Return an error code if sanity check fails 2016-09-27 12:43:34 +02:00
ssb
staging Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net 2016-11-15 10:54:36 -05:00
target cxgb4: Allocate Tx queues dynamically 2016-11-18 14:04:29 -05:00
tc
thermal Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net 2016-10-30 12:42:58 -04:00
thunderbolt
tty Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net 2016-10-30 12:42:58 -04:00
uio
usb Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net 2016-11-15 10:54:36 -05:00
uwb uwb: fix device reference leaks 2016-11-01 09:04:04 -06:00
vfio vfio/pci: Fix integer overflows, bitmask check 2016-10-26 13:49:29 -06:00
vhost
video video: ARM CLCD: fix Vexpress regression 2016-11-03 12:20:14 +02:00
virt mm: replace get_user_pages() write/force parameters with gup_flags 2016-10-19 08:11:43 -07:00
virtio virtio_ring: mark vring_dma_dev inline 2016-10-31 00:40:08 +02:00
vlynq
vme vme: vme_get_size potentially returning incorrect value on failure 2016-10-28 08:25:18 -04:00
w1
watchdog Merge branches 'acpi-wdat' and 'acpi-cppc' 2016-10-21 22:24:23 +02:00
xen xen: fixes for 4.9-rc2 2016-10-24 19:52:24 -07:00
zorro
Kconfig
Makefile A small bug fix and a new driver for acting as an IPMI device. 2016-10-23 15:56:23 -07:00